2 Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010
3 Free Software Foundation, Inc.
4 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
5 Ira Rosen <irar@il.ibm.com>
7 This file is part of GCC.
9 GCC is free software; you can redistribute it and/or modify it under
10 the terms of the GNU General Public License as published by the Free
11 Software Foundation; either version 3, or (at your option) any later
14 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
15 WARRANTY; without even the implied warranty of MERCHANTABILITY or
16 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
19 You should have received a copy of the GNU General Public License
20 along with GCC; see the file COPYING3. If not see
21 <http://www.gnu.org/licenses/>. */
25 #include "coretypes.h"
29 #include "basic-block.h"
30 #include "tree-pretty-print.h"
31 #include "gimple-pretty-print.h"
32 #include "tree-flow.h"
33 #include "tree-dump.h"
35 #include "cfglayout.h"
40 #include "diagnostic-core.h"
42 #include "tree-chrec.h"
43 #include "tree-scalar-evolution.h"
44 #include "tree-vectorizer.h"
47 /* Loop Vectorization Pass.
49 This pass tries to vectorize loops.
51 For example, the vectorizer transforms the following simple loop:
53 short a[N]; short b[N]; short c[N]; int i;
59 as if it was manually vectorized by rewriting the source code into:
61 typedef int __attribute__((mode(V8HI))) v8hi;
62 short a[N]; short b[N]; short c[N]; int i;
63 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
66 for (i=0; i<N/8; i++){
73 The main entry to this pass is vectorize_loops(), in which
74 the vectorizer applies a set of analyses on a given set of loops,
75 followed by the actual vectorization transformation for the loops that
76 had successfully passed the analysis phase.
77 Throughout this pass we make a distinction between two types of
78 data: scalars (which are represented by SSA_NAMES), and memory references
79 ("data-refs"). These two types of data require different handling both
80 during analysis and transformation. The types of data-refs that the
81 vectorizer currently supports are ARRAY_REFS which base is an array DECL
82 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
83 accesses are required to have a simple (consecutive) access pattern.
87 The driver for the analysis phase is vect_analyze_loop().
88 It applies a set of analyses, some of which rely on the scalar evolution
89 analyzer (scev) developed by Sebastian Pop.
91 During the analysis phase the vectorizer records some information
92 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
93 loop, as well as general information about the loop as a whole, which is
94 recorded in a "loop_vec_info" struct attached to each loop.
98 The loop transformation phase scans all the stmts in the loop, and
99 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
100 the loop that needs to be vectorized. It inserts the vector code sequence
101 just before the scalar stmt S, and records a pointer to the vector code
102 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
103 attached to S). This pointer will be used for the vectorization of following
104 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
105 otherwise, we rely on dead code elimination for removing it.
107 For example, say stmt S1 was vectorized into stmt VS1:
110 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
113 To vectorize stmt S2, the vectorizer first finds the stmt that defines
114 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
115 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
116 resulting sequence would be:
119 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
121 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
123 Operands that are not SSA_NAMEs, are data-refs that appear in
124 load/store operations (like 'x[i]' in S1), and are handled differently.
128 Currently the only target specific information that is used is the
129 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
130 Targets that can support different sizes of vectors, for now will need
131 to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
132 flexibility will be added in the future.
134 Since we only vectorize operations which vector form can be
135 expressed using existing tree codes, to verify that an operation is
136 supported, the vectorizer checks the relevant optab at the relevant
137 machine_mode (e.g, optab_handler (add_optab, V8HImode)). If
138 the value found is CODE_FOR_nothing, then there's no target support, and
139 we can't vectorize the stmt.
141 For additional information on this project see:
142 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
145 /* Function vect_determine_vectorization_factor
147 Determine the vectorization factor (VF). VF is the number of data elements
148 that are operated upon in parallel in a single iteration of the vectorized
149 loop. For example, when vectorizing a loop that operates on 4byte elements,
150 on a target with vector size (VS) 16byte, the VF is set to 4, since 4
151 elements can fit in a single vector register.
153 We currently support vectorization of loops in which all types operated upon
154 are of the same size. Therefore this function currently sets VF according to
155 the size of the types operated upon, and fails if there are multiple sizes
158 VF is also the factor by which the loop iterations are strip-mined, e.g.:
165 for (i=0; i<N; i+=VF){
166 a[i:VF] = b[i:VF] + c[i:VF];
171 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
173 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
174 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
175 int nbbs = loop->num_nodes;
176 gimple_stmt_iterator si;
177 unsigned int vectorization_factor = 0;
182 stmt_vec_info stmt_info;
186 if (vect_print_dump_info (REPORT_DETAILS))
187 fprintf (vect_dump, "=== vect_determine_vectorization_factor ===");
189 for (i = 0; i < nbbs; i++)
191 basic_block bb = bbs[i];
193 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
196 stmt_info = vinfo_for_stmt (phi);
197 if (vect_print_dump_info (REPORT_DETAILS))
199 fprintf (vect_dump, "==> examining phi: ");
200 print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
203 gcc_assert (stmt_info);
205 if (STMT_VINFO_RELEVANT_P (stmt_info))
207 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
208 scalar_type = TREE_TYPE (PHI_RESULT (phi));
210 if (vect_print_dump_info (REPORT_DETAILS))
212 fprintf (vect_dump, "get vectype for scalar type: ");
213 print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
216 vectype = get_vectype_for_scalar_type (scalar_type);
219 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
222 "not vectorized: unsupported data-type ");
223 print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
227 STMT_VINFO_VECTYPE (stmt_info) = vectype;
229 if (vect_print_dump_info (REPORT_DETAILS))
231 fprintf (vect_dump, "vectype: ");
232 print_generic_expr (vect_dump, vectype, TDF_SLIM);
235 nunits = TYPE_VECTOR_SUBPARTS (vectype);
236 if (vect_print_dump_info (REPORT_DETAILS))
237 fprintf (vect_dump, "nunits = %d", nunits);
239 if (!vectorization_factor
240 || (nunits > vectorization_factor))
241 vectorization_factor = nunits;
245 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
248 gimple stmt = gsi_stmt (si);
249 stmt_info = vinfo_for_stmt (stmt);
251 if (vect_print_dump_info (REPORT_DETAILS))
253 fprintf (vect_dump, "==> examining statement: ");
254 print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
257 gcc_assert (stmt_info);
259 /* skip stmts which do not need to be vectorized. */
260 if (!STMT_VINFO_RELEVANT_P (stmt_info)
261 && !STMT_VINFO_LIVE_P (stmt_info))
263 if (vect_print_dump_info (REPORT_DETAILS))
264 fprintf (vect_dump, "skip.");
268 if (gimple_get_lhs (stmt) == NULL_TREE)
270 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
272 fprintf (vect_dump, "not vectorized: irregular stmt.");
273 print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
278 if (VECTOR_MODE_P (TYPE_MODE (gimple_expr_type (stmt))))
280 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
282 fprintf (vect_dump, "not vectorized: vector stmt in loop:");
283 print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
288 if (STMT_VINFO_VECTYPE (stmt_info))
290 /* The only case when a vectype had been already set is for stmts
291 that contain a dataref, or for "pattern-stmts" (stmts generated
292 by the vectorizer to represent/replace a certain idiom). */
293 gcc_assert (STMT_VINFO_DATA_REF (stmt_info)
294 || is_pattern_stmt_p (stmt_info));
295 vectype = STMT_VINFO_VECTYPE (stmt_info);
299 gcc_assert (!STMT_VINFO_DATA_REF (stmt_info)
300 && !is_pattern_stmt_p (stmt_info));
302 scalar_type = TREE_TYPE (gimple_get_lhs (stmt));
303 if (vect_print_dump_info (REPORT_DETAILS))
305 fprintf (vect_dump, "get vectype for scalar type: ");
306 print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
308 vectype = get_vectype_for_scalar_type (scalar_type);
311 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
314 "not vectorized: unsupported data-type ");
315 print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
320 STMT_VINFO_VECTYPE (stmt_info) = vectype;
323 /* The vectorization factor is according to the smallest
324 scalar type (or the largest vector size, but we only
325 support one vector size per loop). */
326 scalar_type = vect_get_smallest_scalar_type (stmt, &dummy,
328 if (vect_print_dump_info (REPORT_DETAILS))
330 fprintf (vect_dump, "get vectype for scalar type: ");
331 print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
333 vf_vectype = get_vectype_for_scalar_type (scalar_type);
336 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
339 "not vectorized: unsupported data-type ");
340 print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
345 if ((GET_MODE_SIZE (TYPE_MODE (vectype))
346 != GET_MODE_SIZE (TYPE_MODE (vf_vectype))))
348 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
351 "not vectorized: different sized vector "
352 "types in statement, ");
353 print_generic_expr (vect_dump, vectype, TDF_SLIM);
354 fprintf (vect_dump, " and ");
355 print_generic_expr (vect_dump, vf_vectype, TDF_SLIM);
360 if (vect_print_dump_info (REPORT_DETAILS))
362 fprintf (vect_dump, "vectype: ");
363 print_generic_expr (vect_dump, vf_vectype, TDF_SLIM);
366 nunits = TYPE_VECTOR_SUBPARTS (vf_vectype);
367 if (vect_print_dump_info (REPORT_DETAILS))
368 fprintf (vect_dump, "nunits = %d", nunits);
370 if (!vectorization_factor
371 || (nunits > vectorization_factor))
372 vectorization_factor = nunits;
376 /* TODO: Analyze cost. Decide if worth while to vectorize. */
377 if (vect_print_dump_info (REPORT_DETAILS))
378 fprintf (vect_dump, "vectorization factor = %d", vectorization_factor);
379 if (vectorization_factor <= 1)
381 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
382 fprintf (vect_dump, "not vectorized: unsupported data-type");
385 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
391 /* Function vect_is_simple_iv_evolution.
393 FORNOW: A simple evolution of an induction variables in the loop is
394 considered a polynomial evolution with constant step. */
397 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
402 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
404 /* When there is no evolution in this loop, the evolution function
406 if (evolution_part == NULL_TREE)
409 /* When the evolution is a polynomial of degree >= 2
410 the evolution function is not "simple". */
411 if (tree_is_chrec (evolution_part))
414 step_expr = evolution_part;
415 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
417 if (vect_print_dump_info (REPORT_DETAILS))
419 fprintf (vect_dump, "step: ");
420 print_generic_expr (vect_dump, step_expr, TDF_SLIM);
421 fprintf (vect_dump, ", init: ");
422 print_generic_expr (vect_dump, init_expr, TDF_SLIM);
428 if (TREE_CODE (step_expr) != INTEGER_CST)
430 if (vect_print_dump_info (REPORT_DETAILS))
431 fprintf (vect_dump, "step unknown.");
438 /* Function vect_analyze_scalar_cycles_1.
440 Examine the cross iteration def-use cycles of scalar variables
441 in LOOP. LOOP_VINFO represents the loop that is now being
442 considered for vectorization (can be LOOP, or an outer-loop
446 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
448 basic_block bb = loop->header;
450 VEC(gimple,heap) *worklist = VEC_alloc (gimple, heap, 64);
451 gimple_stmt_iterator gsi;
454 if (vect_print_dump_info (REPORT_DETAILS))
455 fprintf (vect_dump, "=== vect_analyze_scalar_cycles ===");
457 /* First - identify all inductions. Reduction detection assumes that all the
458 inductions have been identified, therefore, this order must not be
460 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
462 gimple phi = gsi_stmt (gsi);
463 tree access_fn = NULL;
464 tree def = PHI_RESULT (phi);
465 stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
467 if (vect_print_dump_info (REPORT_DETAILS))
469 fprintf (vect_dump, "Analyze phi: ");
470 print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
473 /* Skip virtual phi's. The data dependences that are associated with
474 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
475 if (!is_gimple_reg (SSA_NAME_VAR (def)))
478 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
480 /* Analyze the evolution function. */
481 access_fn = analyze_scalar_evolution (loop, def);
482 if (access_fn && vect_print_dump_info (REPORT_DETAILS))
484 fprintf (vect_dump, "Access function of PHI: ");
485 print_generic_expr (vect_dump, access_fn, TDF_SLIM);
489 || !vect_is_simple_iv_evolution (loop->num, access_fn, &dumy, &dumy))
491 VEC_safe_push (gimple, heap, worklist, phi);
495 if (vect_print_dump_info (REPORT_DETAILS))
496 fprintf (vect_dump, "Detected induction.");
497 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
501 /* Second - identify all reductions and nested cycles. */
502 while (VEC_length (gimple, worklist) > 0)
504 gimple phi = VEC_pop (gimple, worklist);
505 tree def = PHI_RESULT (phi);
506 stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
510 if (vect_print_dump_info (REPORT_DETAILS))
512 fprintf (vect_dump, "Analyze phi: ");
513 print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
516 gcc_assert (is_gimple_reg (SSA_NAME_VAR (def)));
517 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
519 nested_cycle = (loop != LOOP_VINFO_LOOP (loop_vinfo));
520 reduc_stmt = vect_force_simple_reduction (loop_vinfo, phi, !nested_cycle,
526 if (vect_print_dump_info (REPORT_DETAILS))
527 fprintf (vect_dump, "Detected double reduction.");
529 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
530 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
531 vect_double_reduction_def;
537 if (vect_print_dump_info (REPORT_DETAILS))
538 fprintf (vect_dump, "Detected vectorizable nested cycle.");
540 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
541 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
546 if (vect_print_dump_info (REPORT_DETAILS))
547 fprintf (vect_dump, "Detected reduction.");
549 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
550 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
552 /* Store the reduction cycles for possible vectorization in
554 VEC_safe_push (gimple, heap,
555 LOOP_VINFO_REDUCTIONS (loop_vinfo),
561 if (vect_print_dump_info (REPORT_DETAILS))
562 fprintf (vect_dump, "Unknown def-use cycle pattern.");
565 VEC_free (gimple, heap, worklist);
569 /* Function vect_analyze_scalar_cycles.
571 Examine the cross iteration def-use cycles of scalar variables, by
572 analyzing the loop-header PHIs of scalar variables. Classify each
573 cycle as one of the following: invariant, induction, reduction, unknown.
574 We do that for the loop represented by LOOP_VINFO, and also to its
575 inner-loop, if exists.
576 Examples for scalar cycles:
591 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
593 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
595 vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
597 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
598 Reductions in such inner-loop therefore have different properties than
599 the reductions in the nest that gets vectorized:
600 1. When vectorized, they are executed in the same order as in the original
601 scalar loop, so we can't change the order of computation when
603 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
604 current checks are too strict. */
607 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
610 /* Function vect_get_loop_niters.
612 Determine how many iterations the loop is executed.
613 If an expression that represents the number of iterations
614 can be constructed, place it in NUMBER_OF_ITERATIONS.
615 Return the loop exit condition. */
618 vect_get_loop_niters (struct loop *loop, tree *number_of_iterations)
622 if (vect_print_dump_info (REPORT_DETAILS))
623 fprintf (vect_dump, "=== get_loop_niters ===");
625 niters = number_of_exit_cond_executions (loop);
627 if (niters != NULL_TREE
628 && niters != chrec_dont_know)
630 *number_of_iterations = niters;
632 if (vect_print_dump_info (REPORT_DETAILS))
634 fprintf (vect_dump, "==> get_loop_niters:" );
635 print_generic_expr (vect_dump, *number_of_iterations, TDF_SLIM);
639 return get_loop_exit_condition (loop);
643 /* Function bb_in_loop_p
645 Used as predicate for dfs order traversal of the loop bbs. */
648 bb_in_loop_p (const_basic_block bb, const void *data)
650 const struct loop *const loop = (const struct loop *)data;
651 if (flow_bb_inside_loop_p (loop, bb))
657 /* Function new_loop_vec_info.
659 Create and initialize a new loop_vec_info struct for LOOP, as well as
660 stmt_vec_info structs for all the stmts in LOOP. */
663 new_loop_vec_info (struct loop *loop)
667 gimple_stmt_iterator si;
668 unsigned int i, nbbs;
670 res = (loop_vec_info) xcalloc (1, sizeof (struct _loop_vec_info));
671 LOOP_VINFO_LOOP (res) = loop;
673 bbs = get_loop_body (loop);
675 /* Create/Update stmt_info for all stmts in the loop. */
676 for (i = 0; i < loop->num_nodes; i++)
678 basic_block bb = bbs[i];
680 /* BBs in a nested inner-loop will have been already processed (because
681 we will have called vect_analyze_loop_form for any nested inner-loop).
682 Therefore, for stmts in an inner-loop we just want to update the
683 STMT_VINFO_LOOP_VINFO field of their stmt_info to point to the new
684 loop_info of the outer-loop we are currently considering to vectorize
685 (instead of the loop_info of the inner-loop).
686 For stmts in other BBs we need to create a stmt_info from scratch. */
687 if (bb->loop_father != loop)
690 gcc_assert (loop->inner && bb->loop_father == loop->inner);
691 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
693 gimple phi = gsi_stmt (si);
694 stmt_vec_info stmt_info = vinfo_for_stmt (phi);
695 loop_vec_info inner_loop_vinfo =
696 STMT_VINFO_LOOP_VINFO (stmt_info);
697 gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
698 STMT_VINFO_LOOP_VINFO (stmt_info) = res;
700 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
702 gimple stmt = gsi_stmt (si);
703 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
704 loop_vec_info inner_loop_vinfo =
705 STMT_VINFO_LOOP_VINFO (stmt_info);
706 gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
707 STMT_VINFO_LOOP_VINFO (stmt_info) = res;
712 /* bb in current nest. */
713 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
715 gimple phi = gsi_stmt (si);
716 gimple_set_uid (phi, 0);
717 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, res, NULL));
720 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
722 gimple stmt = gsi_stmt (si);
723 gimple_set_uid (stmt, 0);
724 set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, res, NULL));
729 /* CHECKME: We want to visit all BBs before their successors (except for
730 latch blocks, for which this assertion wouldn't hold). In the simple
731 case of the loop forms we allow, a dfs order of the BBs would the same
732 as reversed postorder traversal, so we are safe. */
735 bbs = XCNEWVEC (basic_block, loop->num_nodes);
736 nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
737 bbs, loop->num_nodes, loop);
738 gcc_assert (nbbs == loop->num_nodes);
740 LOOP_VINFO_BBS (res) = bbs;
741 LOOP_VINFO_NITERS (res) = NULL;
742 LOOP_VINFO_NITERS_UNCHANGED (res) = NULL;
743 LOOP_VINFO_COST_MODEL_MIN_ITERS (res) = 0;
744 LOOP_VINFO_VECTORIZABLE_P (res) = 0;
745 LOOP_PEELING_FOR_ALIGNMENT (res) = 0;
746 LOOP_VINFO_VECT_FACTOR (res) = 0;
747 LOOP_VINFO_DATAREFS (res) = VEC_alloc (data_reference_p, heap, 10);
748 LOOP_VINFO_DDRS (res) = VEC_alloc (ddr_p, heap, 10 * 10);
749 LOOP_VINFO_UNALIGNED_DR (res) = NULL;
750 LOOP_VINFO_MAY_MISALIGN_STMTS (res) =
751 VEC_alloc (gimple, heap,
752 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS));
753 LOOP_VINFO_MAY_ALIAS_DDRS (res) =
754 VEC_alloc (ddr_p, heap,
755 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
756 LOOP_VINFO_STRIDED_STORES (res) = VEC_alloc (gimple, heap, 10);
757 LOOP_VINFO_REDUCTIONS (res) = VEC_alloc (gimple, heap, 10);
758 LOOP_VINFO_SLP_INSTANCES (res) = VEC_alloc (slp_instance, heap, 10);
759 LOOP_VINFO_SLP_UNROLLING_FACTOR (res) = 1;
760 LOOP_VINFO_PEELING_HTAB (res) = NULL;
766 /* Function destroy_loop_vec_info.
768 Free LOOP_VINFO struct, as well as all the stmt_vec_info structs of all the
769 stmts in the loop. */
772 destroy_loop_vec_info (loop_vec_info loop_vinfo, bool clean_stmts)
777 gimple_stmt_iterator si;
779 VEC (slp_instance, heap) *slp_instances;
780 slp_instance instance;
785 loop = LOOP_VINFO_LOOP (loop_vinfo);
787 bbs = LOOP_VINFO_BBS (loop_vinfo);
788 nbbs = loop->num_nodes;
792 free (LOOP_VINFO_BBS (loop_vinfo));
793 free_data_refs (LOOP_VINFO_DATAREFS (loop_vinfo));
794 free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
795 VEC_free (gimple, heap, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));
802 for (j = 0; j < nbbs; j++)
804 basic_block bb = bbs[j];
805 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
806 free_stmt_vec_info (gsi_stmt (si));
808 for (si = gsi_start_bb (bb); !gsi_end_p (si); )
810 gimple stmt = gsi_stmt (si);
811 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
815 /* Check if this is a "pattern stmt" (introduced by the
816 vectorizer during the pattern recognition pass). */
817 bool remove_stmt_p = false;
818 gimple orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
821 stmt_vec_info orig_stmt_info = vinfo_for_stmt (orig_stmt);
823 && STMT_VINFO_IN_PATTERN_P (orig_stmt_info))
824 remove_stmt_p = true;
827 /* Free stmt_vec_info. */
828 free_stmt_vec_info (stmt);
830 /* Remove dead "pattern stmts". */
832 gsi_remove (&si, true);
838 free (LOOP_VINFO_BBS (loop_vinfo));
839 free_data_refs (LOOP_VINFO_DATAREFS (loop_vinfo));
840 free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
841 VEC_free (gimple, heap, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));
842 VEC_free (ddr_p, heap, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo));
843 slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
844 FOR_EACH_VEC_ELT (slp_instance, slp_instances, j, instance)
845 vect_free_slp_instance (instance);
847 VEC_free (slp_instance, heap, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
848 VEC_free (gimple, heap, LOOP_VINFO_STRIDED_STORES (loop_vinfo));
849 VEC_free (gimple, heap, LOOP_VINFO_REDUCTIONS (loop_vinfo));
851 if (LOOP_VINFO_PEELING_HTAB (loop_vinfo))
852 htab_delete (LOOP_VINFO_PEELING_HTAB (loop_vinfo));
859 /* Function vect_analyze_loop_1.
861 Apply a set of analyses on LOOP, and create a loop_vec_info struct
862 for it. The different analyses will record information in the
863 loop_vec_info struct. This is a subset of the analyses applied in
864 vect_analyze_loop, to be applied on an inner-loop nested in the loop
865 that is now considered for (outer-loop) vectorization. */
868 vect_analyze_loop_1 (struct loop *loop)
870 loop_vec_info loop_vinfo;
872 if (vect_print_dump_info (REPORT_DETAILS))
873 fprintf (vect_dump, "===== analyze_loop_nest_1 =====");
875 /* Check the CFG characteristics of the loop (nesting, entry/exit, etc. */
877 loop_vinfo = vect_analyze_loop_form (loop);
880 if (vect_print_dump_info (REPORT_DETAILS))
881 fprintf (vect_dump, "bad inner-loop form.");
889 /* Function vect_analyze_loop_form.
891 Verify that certain CFG restrictions hold, including:
892 - the loop has a pre-header
893 - the loop has a single entry and exit
894 - the loop exit condition is simple enough, and the number of iterations
895 can be analyzed (a countable loop). */
898 vect_analyze_loop_form (struct loop *loop)
900 loop_vec_info loop_vinfo;
902 tree number_of_iterations = NULL;
903 loop_vec_info inner_loop_vinfo = NULL;
905 if (vect_print_dump_info (REPORT_DETAILS))
906 fprintf (vect_dump, "=== vect_analyze_loop_form ===");
908 /* Different restrictions apply when we are considering an inner-most loop,
909 vs. an outer (nested) loop.
910 (FORNOW. May want to relax some of these restrictions in the future). */
914 /* Inner-most loop. We currently require that the number of BBs is
915 exactly 2 (the header and latch). Vectorizable inner-most loops
926 if (loop->num_nodes != 2)
928 if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
929 fprintf (vect_dump, "not vectorized: control flow in loop.");
933 if (empty_block_p (loop->header))
935 if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
936 fprintf (vect_dump, "not vectorized: empty loop.");
942 struct loop *innerloop = loop->inner;
945 /* Nested loop. We currently require that the loop is doubly-nested,
946 contains a single inner loop, and the number of BBs is exactly 5.
947 Vectorizable outer-loops look like this:
959 The inner-loop has the properties expected of inner-most loops
960 as described above. */
962 if ((loop->inner)->inner || (loop->inner)->next)
964 if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
965 fprintf (vect_dump, "not vectorized: multiple nested loops.");
969 /* Analyze the inner-loop. */
970 inner_loop_vinfo = vect_analyze_loop_1 (loop->inner);
971 if (!inner_loop_vinfo)
973 if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
974 fprintf (vect_dump, "not vectorized: Bad inner loop.");
978 if (!expr_invariant_in_loop_p (loop,
979 LOOP_VINFO_NITERS (inner_loop_vinfo)))
981 if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
983 "not vectorized: inner-loop count not invariant.");
984 destroy_loop_vec_info (inner_loop_vinfo, true);
988 if (loop->num_nodes != 5)
990 if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
991 fprintf (vect_dump, "not vectorized: control flow in loop.");
992 destroy_loop_vec_info (inner_loop_vinfo, true);
996 gcc_assert (EDGE_COUNT (innerloop->header->preds) == 2);
997 entryedge = EDGE_PRED (innerloop->header, 0);
998 if (EDGE_PRED (innerloop->header, 0)->src == innerloop->latch)
999 entryedge = EDGE_PRED (innerloop->header, 1);
1001 if (entryedge->src != loop->header
1002 || !single_exit (innerloop)
1003 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1005 if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1006 fprintf (vect_dump, "not vectorized: unsupported outerloop form.");
1007 destroy_loop_vec_info (inner_loop_vinfo, true);
1011 if (vect_print_dump_info (REPORT_DETAILS))
1012 fprintf (vect_dump, "Considering outer-loop vectorization.");
1015 if (!single_exit (loop)
1016 || EDGE_COUNT (loop->header->preds) != 2)
1018 if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1020 if (!single_exit (loop))
1021 fprintf (vect_dump, "not vectorized: multiple exits.");
1022 else if (EDGE_COUNT (loop->header->preds) != 2)
1023 fprintf (vect_dump, "not vectorized: too many incoming edges.");
1025 if (inner_loop_vinfo)
1026 destroy_loop_vec_info (inner_loop_vinfo, true);
1030 /* We assume that the loop exit condition is at the end of the loop. i.e,
1031 that the loop is represented as a do-while (with a proper if-guard
1032 before the loop if needed), where the loop header contains all the
1033 executable statements, and the latch is empty. */
1034 if (!empty_block_p (loop->latch)
1035 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1037 if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1038 fprintf (vect_dump, "not vectorized: unexpected loop form.");
1039 if (inner_loop_vinfo)
1040 destroy_loop_vec_info (inner_loop_vinfo, true);
1044 /* Make sure there exists a single-predecessor exit bb: */
1045 if (!single_pred_p (single_exit (loop)->dest))
1047 edge e = single_exit (loop);
1048 if (!(e->flags & EDGE_ABNORMAL))
1050 split_loop_exit_edge (e);
1051 if (vect_print_dump_info (REPORT_DETAILS))
1052 fprintf (vect_dump, "split exit edge.");
1056 if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1057 fprintf (vect_dump, "not vectorized: abnormal loop exit edge.");
1058 if (inner_loop_vinfo)
1059 destroy_loop_vec_info (inner_loop_vinfo, true);
1064 loop_cond = vect_get_loop_niters (loop, &number_of_iterations);
1067 if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1068 fprintf (vect_dump, "not vectorized: complicated exit condition.");
1069 if (inner_loop_vinfo)
1070 destroy_loop_vec_info (inner_loop_vinfo, true);
1074 if (!number_of_iterations)
1076 if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1078 "not vectorized: number of iterations cannot be computed.");
1079 if (inner_loop_vinfo)
1080 destroy_loop_vec_info (inner_loop_vinfo, true);
1084 if (chrec_contains_undetermined (number_of_iterations))
1086 if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1087 fprintf (vect_dump, "Infinite number of iterations.");
1088 if (inner_loop_vinfo)
1089 destroy_loop_vec_info (inner_loop_vinfo, true);
1093 if (!NITERS_KNOWN_P (number_of_iterations))
1095 if (vect_print_dump_info (REPORT_DETAILS))
1097 fprintf (vect_dump, "Symbolic number of iterations is ");
1098 print_generic_expr (vect_dump, number_of_iterations, TDF_DETAILS);
1101 else if (TREE_INT_CST_LOW (number_of_iterations) == 0)
1103 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1104 fprintf (vect_dump, "not vectorized: number of iterations = 0.");
1105 if (inner_loop_vinfo)
1106 destroy_loop_vec_info (inner_loop_vinfo, false);
1110 loop_vinfo = new_loop_vec_info (loop);
1111 LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1112 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1114 STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond)) = loop_exit_ctrl_vec_info_type;
1116 /* CHECKME: May want to keep it around it in the future. */
1117 if (inner_loop_vinfo)
1118 destroy_loop_vec_info (inner_loop_vinfo, false);
1120 gcc_assert (!loop->aux);
1121 loop->aux = loop_vinfo;
1126 /* Get cost by calling cost target builtin. */
1129 vect_get_cost (enum vect_cost_for_stmt type_of_cost)
1131 tree dummy_type = NULL;
1134 return targetm.vectorize.builtin_vectorization_cost (type_of_cost,
1139 /* Function vect_analyze_loop_operations.
1141 Scan the loop stmts and make sure they are all vectorizable. */
1144 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1146 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1147 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1148 int nbbs = loop->num_nodes;
1149 gimple_stmt_iterator si;
1150 unsigned int vectorization_factor = 0;
1153 stmt_vec_info stmt_info;
1154 bool need_to_vectorize = false;
1155 int min_profitable_iters;
1156 int min_scalar_loop_bound;
1158 bool only_slp_in_loop = true, ok;
1160 if (vect_print_dump_info (REPORT_DETAILS))
1161 fprintf (vect_dump, "=== vect_analyze_loop_operations ===");
1163 gcc_assert (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
1164 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1166 for (i = 0; i < nbbs; i++)
1168 basic_block bb = bbs[i];
1170 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1172 phi = gsi_stmt (si);
1175 stmt_info = vinfo_for_stmt (phi);
1176 if (vect_print_dump_info (REPORT_DETAILS))
1178 fprintf (vect_dump, "examining phi: ");
1179 print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
1182 if (! is_loop_header_bb_p (bb))
1184 /* inner-loop loop-closed exit phi in outer-loop vectorization
1185 (i.e. a phi in the tail of the outer-loop).
1186 FORNOW: we currently don't support the case that these phis
1187 are not used in the outerloop (unless it is double reduction,
1188 i.e., this phi is vect_reduction_def), cause this case
1189 requires to actually do something here. */
1190 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
1191 || STMT_VINFO_LIVE_P (stmt_info))
1192 && STMT_VINFO_DEF_TYPE (stmt_info)
1193 != vect_double_reduction_def)
1195 if (vect_print_dump_info (REPORT_DETAILS))
1197 "Unsupported loop-closed phi in outer-loop.");
1203 gcc_assert (stmt_info);
1205 if (STMT_VINFO_LIVE_P (stmt_info))
1207 /* FORNOW: not yet supported. */
1208 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1209 fprintf (vect_dump, "not vectorized: value used after loop.");
1213 if (STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1214 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1216 /* A scalar-dependence cycle that we don't support. */
1217 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1218 fprintf (vect_dump, "not vectorized: scalar dependence cycle.");
1222 if (STMT_VINFO_RELEVANT_P (stmt_info))
1224 need_to_vectorize = true;
1225 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
1226 ok = vectorizable_induction (phi, NULL, NULL);
1231 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1234 "not vectorized: relevant phi not supported: ");
1235 print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
1241 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1243 gimple stmt = gsi_stmt (si);
1244 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1246 gcc_assert (stmt_info);
1248 if (!vect_analyze_stmt (stmt, &need_to_vectorize, NULL))
1251 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1252 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1253 && !PURE_SLP_STMT (stmt_info))
1254 /* STMT needs both SLP and loop-based vectorization. */
1255 only_slp_in_loop = false;
1259 /* All operations in the loop are either irrelevant (deal with loop
1260 control, or dead), or only used outside the loop and can be moved
1261 out of the loop (e.g. invariants, inductions). The loop can be
1262 optimized away by scalar optimizations. We're better off not
1263 touching this loop. */
1264 if (!need_to_vectorize)
1266 if (vect_print_dump_info (REPORT_DETAILS))
1268 "All the computation can be taken out of the loop.");
1269 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1271 "not vectorized: redundant loop. no profit to vectorize.");
1275 /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1276 vectorization factor of the loop is the unrolling factor required by the
1277 SLP instances. If that unrolling factor is 1, we say, that we perform
1278 pure SLP on loop - cross iteration parallelism is not exploited. */
1279 if (only_slp_in_loop)
1280 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1282 vectorization_factor = least_common_multiple (vectorization_factor,
1283 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1285 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1287 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1288 && vect_print_dump_info (REPORT_DETAILS))
1290 "vectorization_factor = %d, niters = " HOST_WIDE_INT_PRINT_DEC,
1291 vectorization_factor, LOOP_VINFO_INT_NITERS (loop_vinfo));
1293 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1294 && (LOOP_VINFO_INT_NITERS (loop_vinfo) < vectorization_factor))
1296 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1297 fprintf (vect_dump, "not vectorized: iteration count too small.");
1298 if (vect_print_dump_info (REPORT_DETAILS))
1299 fprintf (vect_dump,"not vectorized: iteration count smaller than "
1300 "vectorization factor.");
1304 /* Analyze cost. Decide if worth while to vectorize. */
1306 /* Once VF is set, SLP costs should be updated since the number of created
1307 vector stmts depends on VF. */
1308 vect_update_slp_costs_according_to_vf (loop_vinfo);
1310 min_profitable_iters = vect_estimate_min_profitable_iters (loop_vinfo);
1311 LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo) = min_profitable_iters;
1313 if (min_profitable_iters < 0)
1315 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1316 fprintf (vect_dump, "not vectorized: vectorization not profitable.");
1317 if (vect_print_dump_info (REPORT_DETAILS))
1318 fprintf (vect_dump, "not vectorized: vector version will never be "
1323 min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
1324 * vectorization_factor) - 1);
1326 /* Use the cost model only if it is more conservative than user specified
1329 th = (unsigned) min_scalar_loop_bound;
1330 if (min_profitable_iters
1331 && (!min_scalar_loop_bound
1332 || min_profitable_iters > min_scalar_loop_bound))
1333 th = (unsigned) min_profitable_iters;
1335 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1336 && LOOP_VINFO_INT_NITERS (loop_vinfo) <= th)
1338 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1339 fprintf (vect_dump, "not vectorized: vectorization not "
1341 if (vect_print_dump_info (REPORT_DETAILS))
1342 fprintf (vect_dump, "not vectorized: iteration count smaller than "
1343 "user specified loop bound parameter or minimum "
1344 "profitable iterations (whichever is more conservative).");
1348 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1349 || LOOP_VINFO_INT_NITERS (loop_vinfo) % vectorization_factor != 0
1350 || LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
1352 if (vect_print_dump_info (REPORT_DETAILS))
1353 fprintf (vect_dump, "epilog loop required.");
1354 if (!vect_can_advance_ivs_p (loop_vinfo))
1356 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1358 "not vectorized: can't create epilog loop 1.");
1361 if (!slpeel_can_duplicate_loop_p (loop, single_exit (loop)))
1363 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1365 "not vectorized: can't create epilog loop 2.");
1374 /* Function vect_analyze_loop_2.
1376 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1377 for it. The different analyses will record information in the
1378 loop_vec_info struct. */
1380 vect_analyze_loop_2 (loop_vec_info loop_vinfo)
1383 int max_vf = MAX_VECTORIZATION_FACTOR;
1386 /* Find all data references in the loop (which correspond to vdefs/vuses)
1387 and analyze their evolution in the loop. Also adjust the minimal
1388 vectorization factor according to the loads and stores.
1390 FORNOW: Handle only simple, array references, which
1391 alignment can be forced, and aligned pointer-references. */
1393 ok = vect_analyze_data_refs (loop_vinfo, NULL, &min_vf);
1396 if (vect_print_dump_info (REPORT_DETAILS))
1397 fprintf (vect_dump, "bad data references.");
1401 /* Classify all cross-iteration scalar data-flow cycles.
1402 Cross-iteration cycles caused by virtual phis are analyzed separately. */
1404 vect_analyze_scalar_cycles (loop_vinfo);
1406 vect_pattern_recog (loop_vinfo);
1408 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
1410 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
1413 if (vect_print_dump_info (REPORT_DETAILS))
1414 fprintf (vect_dump, "unexpected pattern.");
1418 /* Analyze data dependences between the data-refs in the loop
1419 and adjust the maximum vectorization factor according to
1421 FORNOW: fail at the first data dependence that we encounter. */
1423 ok = vect_analyze_data_ref_dependences (loop_vinfo, NULL, &max_vf, &dummy);
1427 if (vect_print_dump_info (REPORT_DETAILS))
1428 fprintf (vect_dump, "bad data dependence.");
1432 ok = vect_determine_vectorization_factor (loop_vinfo);
1435 if (vect_print_dump_info (REPORT_DETAILS))
1436 fprintf (vect_dump, "can't determine vectorization factor.");
1439 if (max_vf < LOOP_VINFO_VECT_FACTOR (loop_vinfo))
1441 if (vect_print_dump_info (REPORT_DETAILS))
1442 fprintf (vect_dump, "bad data dependence.");
1446 /* Analyze the alignment of the data-refs in the loop.
1447 Fail if a data reference is found that cannot be vectorized. */
1449 ok = vect_analyze_data_refs_alignment (loop_vinfo, NULL);
1452 if (vect_print_dump_info (REPORT_DETAILS))
1453 fprintf (vect_dump, "bad data alignment.");
1457 /* Analyze the access patterns of the data-refs in the loop (consecutive,
1458 complex, etc.). FORNOW: Only handle consecutive access pattern. */
1460 ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
1463 if (vect_print_dump_info (REPORT_DETAILS))
1464 fprintf (vect_dump, "bad data access.");
1468 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
1469 It is important to call pruning after vect_analyze_data_ref_accesses,
1470 since we use grouping information gathered by interleaving analysis. */
1471 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
1474 if (vect_print_dump_info (REPORT_DETAILS))
1475 fprintf (vect_dump, "too long list of versioning for alias "
1480 /* This pass will decide on using loop versioning and/or loop peeling in
1481 order to enhance the alignment of data references in the loop. */
1483 ok = vect_enhance_data_refs_alignment (loop_vinfo);
1486 if (vect_print_dump_info (REPORT_DETAILS))
1487 fprintf (vect_dump, "bad data alignment.");
1491 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
1492 ok = vect_analyze_slp (loop_vinfo, NULL);
1495 /* Decide which possible SLP instances to SLP. */
1496 vect_make_slp_decision (loop_vinfo);
1498 /* Find stmts that need to be both vectorized and SLPed. */
1499 vect_detect_hybrid_slp (loop_vinfo);
1502 /* Scan all the operations in the loop and make sure they are
1505 ok = vect_analyze_loop_operations (loop_vinfo);
1508 if (vect_print_dump_info (REPORT_DETAILS))
1509 fprintf (vect_dump, "bad operation or unsupported loop bound.");
1516 /* Function vect_analyze_loop.
1518 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1519 for it. The different analyses will record information in the
1520 loop_vec_info struct. */
1522 vect_analyze_loop (struct loop *loop)
1524 loop_vec_info loop_vinfo;
1525 unsigned int vector_sizes;
1527 /* Autodetect first vector size we try. */
1528 current_vector_size = 0;
1529 vector_sizes = targetm.vectorize.autovectorize_vector_sizes ();
1531 if (vect_print_dump_info (REPORT_DETAILS))
1532 fprintf (vect_dump, "===== analyze_loop_nest =====");
1534 if (loop_outer (loop)
1535 && loop_vec_info_for_loop (loop_outer (loop))
1536 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
1538 if (vect_print_dump_info (REPORT_DETAILS))
1539 fprintf (vect_dump, "outer-loop already vectorized.");
1545 /* Check the CFG characteristics of the loop (nesting, entry/exit). */
1546 loop_vinfo = vect_analyze_loop_form (loop);
1549 if (vect_print_dump_info (REPORT_DETAILS))
1550 fprintf (vect_dump, "bad loop form.");
1554 if (vect_analyze_loop_2 (loop_vinfo))
1556 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
1561 destroy_loop_vec_info (loop_vinfo, true);
1563 vector_sizes &= ~current_vector_size;
1564 if (vector_sizes == 0
1565 || current_vector_size == 0)
1568 /* Try the next biggest vector size. */
1569 current_vector_size = 1 << floor_log2 (vector_sizes);
1570 if (vect_print_dump_info (REPORT_DETAILS))
1571 fprintf (vect_dump, "***** Re-trying analysis with "
1572 "vector size %d\n", current_vector_size);
1577 /* Function reduction_code_for_scalar_code
1580 CODE - tree_code of a reduction operations.
1583 REDUC_CODE - the corresponding tree-code to be used to reduce the
1584 vector of partial results into a single scalar result (which
1585 will also reside in a vector) or ERROR_MARK if the operation is
1586 a supported reduction operation, but does not have such tree-code.
1588 Return FALSE if CODE currently cannot be vectorized as reduction. */
1591 reduction_code_for_scalar_code (enum tree_code code,
1592 enum tree_code *reduc_code)
1597 *reduc_code = REDUC_MAX_EXPR;
1601 *reduc_code = REDUC_MIN_EXPR;
1605 *reduc_code = REDUC_PLUS_EXPR;
1613 *reduc_code = ERROR_MARK;
1622 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
1623 STMT is printed with a message MSG. */
1626 report_vect_op (gimple stmt, const char *msg)
1628 fprintf (vect_dump, "%s", msg);
1629 print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
1633 /* Function vect_is_simple_reduction_1
1635 (1) Detect a cross-iteration def-use cycle that represents a simple
1636 reduction computation. We look for the following pattern:
1641 a2 = operation (a3, a1)
1644 1. operation is commutative and associative and it is safe to
1645 change the order of the computation (if CHECK_REDUCTION is true)
1646 2. no uses for a2 in the loop (a2 is used out of the loop)
1647 3. no uses of a1 in the loop besides the reduction operation.
1649 Condition 1 is tested here.
1650 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
1652 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
1653 nested cycles, if CHECK_REDUCTION is false.
1655 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
1659 inner loop (def of a3)
1662 If MODIFY is true it tries also to rework the code in-place to enable
1663 detection of more reduction patterns. For the time being we rewrite
1664 "res -= RHS" into "rhs += -RHS" when it seems worthwhile.
1668 vect_is_simple_reduction_1 (loop_vec_info loop_info, gimple phi,
1669 bool check_reduction, bool *double_reduc,
1672 struct loop *loop = (gimple_bb (phi))->loop_father;
1673 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
1674 edge latch_e = loop_latch_edge (loop);
1675 tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
1676 gimple def_stmt, def1 = NULL, def2 = NULL;
1677 enum tree_code orig_code, code;
1678 tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
1682 imm_use_iterator imm_iter;
1683 use_operand_p use_p;
1686 *double_reduc = false;
1688 /* If CHECK_REDUCTION is true, we assume inner-most loop vectorization,
1689 otherwise, we assume outer loop vectorization. */
1690 gcc_assert ((check_reduction && loop == vect_loop)
1691 || (!check_reduction && flow_loop_nested_p (vect_loop, loop)));
1693 name = PHI_RESULT (phi);
1695 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
1697 gimple use_stmt = USE_STMT (use_p);
1698 if (is_gimple_debug (use_stmt))
1700 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
1701 && vinfo_for_stmt (use_stmt)
1702 && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
1706 if (vect_print_dump_info (REPORT_DETAILS))
1707 fprintf (vect_dump, "reduction used in loop.");
1712 if (TREE_CODE (loop_arg) != SSA_NAME)
1714 if (vect_print_dump_info (REPORT_DETAILS))
1716 fprintf (vect_dump, "reduction: not ssa_name: ");
1717 print_generic_expr (vect_dump, loop_arg, TDF_SLIM);
1722 def_stmt = SSA_NAME_DEF_STMT (loop_arg);
1725 if (vect_print_dump_info (REPORT_DETAILS))
1726 fprintf (vect_dump, "reduction: no def_stmt.");
1730 if (!is_gimple_assign (def_stmt) && gimple_code (def_stmt) != GIMPLE_PHI)
1732 if (vect_print_dump_info (REPORT_DETAILS))
1733 print_gimple_stmt (vect_dump, def_stmt, 0, TDF_SLIM);
1737 if (is_gimple_assign (def_stmt))
1739 name = gimple_assign_lhs (def_stmt);
1744 name = PHI_RESULT (def_stmt);
1749 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
1751 gimple use_stmt = USE_STMT (use_p);
1752 if (is_gimple_debug (use_stmt))
1754 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
1755 && vinfo_for_stmt (use_stmt)
1756 && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
1760 if (vect_print_dump_info (REPORT_DETAILS))
1761 fprintf (vect_dump, "reduction used in loop.");
1766 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
1767 defined in the inner loop. */
1770 op1 = PHI_ARG_DEF (def_stmt, 0);
1772 if (gimple_phi_num_args (def_stmt) != 1
1773 || TREE_CODE (op1) != SSA_NAME)
1775 if (vect_print_dump_info (REPORT_DETAILS))
1776 fprintf (vect_dump, "unsupported phi node definition.");
1781 def1 = SSA_NAME_DEF_STMT (op1);
1782 if (flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
1784 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
1785 && is_gimple_assign (def1))
1787 if (vect_print_dump_info (REPORT_DETAILS))
1788 report_vect_op (def_stmt, "detected double reduction: ");
1790 *double_reduc = true;
1797 code = orig_code = gimple_assign_rhs_code (def_stmt);
1799 /* We can handle "res -= x[i]", which is non-associative by
1800 simply rewriting this into "res += -x[i]". Avoid changing
1801 gimple instruction for the first simple tests and only do this
1802 if we're allowed to change code at all. */
1803 if (code == MINUS_EXPR && modify)
1807 && (!commutative_tree_code (code) || !associative_tree_code (code)))
1809 if (vect_print_dump_info (REPORT_DETAILS))
1810 report_vect_op (def_stmt, "reduction: not commutative/associative: ");
1814 if (get_gimple_rhs_class (code) != GIMPLE_BINARY_RHS)
1816 if (code != COND_EXPR)
1818 if (vect_print_dump_info (REPORT_DETAILS))
1819 report_vect_op (def_stmt, "reduction: not binary operation: ");
1824 op3 = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
1825 if (COMPARISON_CLASS_P (op3))
1827 op4 = TREE_OPERAND (op3, 1);
1828 op3 = TREE_OPERAND (op3, 0);
1831 op1 = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 1);
1832 op2 = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 2);
1834 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
1836 if (vect_print_dump_info (REPORT_DETAILS))
1837 report_vect_op (def_stmt, "reduction: uses not ssa_names: ");
1844 op1 = gimple_assign_rhs1 (def_stmt);
1845 op2 = gimple_assign_rhs2 (def_stmt);
1847 if (TREE_CODE (op1) != SSA_NAME || TREE_CODE (op2) != SSA_NAME)
1849 if (vect_print_dump_info (REPORT_DETAILS))
1850 report_vect_op (def_stmt, "reduction: uses not ssa_names: ");
1856 type = TREE_TYPE (gimple_assign_lhs (def_stmt));
1857 if ((TREE_CODE (op1) == SSA_NAME
1858 && !types_compatible_p (type,TREE_TYPE (op1)))
1859 || (TREE_CODE (op2) == SSA_NAME
1860 && !types_compatible_p (type, TREE_TYPE (op2)))
1861 || (op3 && TREE_CODE (op3) == SSA_NAME
1862 && !types_compatible_p (type, TREE_TYPE (op3)))
1863 || (op4 && TREE_CODE (op4) == SSA_NAME
1864 && !types_compatible_p (type, TREE_TYPE (op4))))
1866 if (vect_print_dump_info (REPORT_DETAILS))
1868 fprintf (vect_dump, "reduction: multiple types: operation type: ");
1869 print_generic_expr (vect_dump, type, TDF_SLIM);
1870 fprintf (vect_dump, ", operands types: ");
1871 print_generic_expr (vect_dump, TREE_TYPE (op1), TDF_SLIM);
1872 fprintf (vect_dump, ",");
1873 print_generic_expr (vect_dump, TREE_TYPE (op2), TDF_SLIM);
1876 fprintf (vect_dump, ",");
1877 print_generic_expr (vect_dump, TREE_TYPE (op3), TDF_SLIM);
1882 fprintf (vect_dump, ",");
1883 print_generic_expr (vect_dump, TREE_TYPE (op4), TDF_SLIM);
1890 /* Check that it's ok to change the order of the computation.
1891 Generally, when vectorizing a reduction we change the order of the
1892 computation. This may change the behavior of the program in some
1893 cases, so we need to check that this is ok. One exception is when
1894 vectorizing an outer-loop: the inner-loop is executed sequentially,
1895 and therefore vectorizing reductions in the inner-loop during
1896 outer-loop vectorization is safe. */
1898 /* CHECKME: check for !flag_finite_math_only too? */
1899 if (SCALAR_FLOAT_TYPE_P (type) && !flag_associative_math
1902 /* Changing the order of operations changes the semantics. */
1903 if (vect_print_dump_info (REPORT_DETAILS))
1904 report_vect_op (def_stmt, "reduction: unsafe fp math optimization: ");
1907 else if (INTEGRAL_TYPE_P (type) && TYPE_OVERFLOW_TRAPS (type)
1910 /* Changing the order of operations changes the semantics. */
1911 if (vect_print_dump_info (REPORT_DETAILS))
1912 report_vect_op (def_stmt, "reduction: unsafe int math optimization: ");
1915 else if (SAT_FIXED_POINT_TYPE_P (type) && check_reduction)
1917 /* Changing the order of operations changes the semantics. */
1918 if (vect_print_dump_info (REPORT_DETAILS))
1919 report_vect_op (def_stmt,
1920 "reduction: unsafe fixed-point math optimization: ");
1924 /* If we detected "res -= x[i]" earlier, rewrite it into
1925 "res += -x[i]" now. If this turns out to be useless reassoc
1926 will clean it up again. */
1927 if (orig_code == MINUS_EXPR)
1929 tree rhs = gimple_assign_rhs2 (def_stmt);
1930 tree negrhs = make_ssa_name (SSA_NAME_VAR (rhs), NULL);
1931 gimple negate_stmt = gimple_build_assign_with_ops (NEGATE_EXPR, negrhs,
1933 gimple_stmt_iterator gsi = gsi_for_stmt (def_stmt);
1934 set_vinfo_for_stmt (negate_stmt, new_stmt_vec_info (negate_stmt,
1936 gsi_insert_before (&gsi, negate_stmt, GSI_NEW_STMT);
1937 gimple_assign_set_rhs2 (def_stmt, negrhs);
1938 gimple_assign_set_rhs_code (def_stmt, PLUS_EXPR);
1939 update_stmt (def_stmt);
1942 /* Reduction is safe. We're dealing with one of the following:
1943 1) integer arithmetic and no trapv
1944 2) floating point arithmetic, and special flags permit this optimization
1945 3) nested cycle (i.e., outer loop vectorization). */
1946 if (TREE_CODE (op1) == SSA_NAME)
1947 def1 = SSA_NAME_DEF_STMT (op1);
1949 if (TREE_CODE (op2) == SSA_NAME)
1950 def2 = SSA_NAME_DEF_STMT (op2);
1952 if (code != COND_EXPR
1953 && (!def1 || !def2 || gimple_nop_p (def1) || gimple_nop_p (def2)))
1955 if (vect_print_dump_info (REPORT_DETAILS))
1956 report_vect_op (def_stmt, "reduction: no defs for operands: ");
1960 /* Check that one def is the reduction def, defined by PHI,
1961 the other def is either defined in the loop ("vect_internal_def"),
1962 or it's an induction (defined by a loop-header phi-node). */
1964 if (def2 && def2 == phi
1965 && (code == COND_EXPR
1966 || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1))
1967 && (is_gimple_assign (def1)
1968 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
1969 == vect_induction_def
1970 || (gimple_code (def1) == GIMPLE_PHI
1971 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
1972 == vect_internal_def
1973 && !is_loop_header_bb_p (gimple_bb (def1)))))))
1975 if (vect_print_dump_info (REPORT_DETAILS))
1976 report_vect_op (def_stmt, "detected reduction: ");
1979 else if (def1 && def1 == phi
1980 && (code == COND_EXPR
1981 || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2))
1982 && (is_gimple_assign (def2)
1983 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
1984 == vect_induction_def
1985 || (gimple_code (def2) == GIMPLE_PHI
1986 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
1987 == vect_internal_def
1988 && !is_loop_header_bb_p (gimple_bb (def2)))))))
1990 if (check_reduction)
1992 /* Swap operands (just for simplicity - so that the rest of the code
1993 can assume that the reduction variable is always the last (second)
1995 if (vect_print_dump_info (REPORT_DETAILS))
1996 report_vect_op (def_stmt,
1997 "detected reduction: need to swap operands: ");
1999 swap_tree_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
2000 gimple_assign_rhs2_ptr (def_stmt));
2004 if (vect_print_dump_info (REPORT_DETAILS))
2005 report_vect_op (def_stmt, "detected reduction: ");
2012 if (vect_print_dump_info (REPORT_DETAILS))
2013 report_vect_op (def_stmt, "reduction: unknown pattern: ");
2019 /* Wrapper around vect_is_simple_reduction_1, that won't modify code
2020 in-place. Arguments as there. */
2023 vect_is_simple_reduction (loop_vec_info loop_info, gimple phi,
2024 bool check_reduction, bool *double_reduc)
2026 return vect_is_simple_reduction_1 (loop_info, phi, check_reduction,
2027 double_reduc, false);
2030 /* Wrapper around vect_is_simple_reduction_1, which will modify code
2031 in-place if it enables detection of more reductions. Arguments
2035 vect_force_simple_reduction (loop_vec_info loop_info, gimple phi,
2036 bool check_reduction, bool *double_reduc)
2038 return vect_is_simple_reduction_1 (loop_info, phi, check_reduction,
2039 double_reduc, true);
2042 /* Calculate the cost of one scalar iteration of the loop. */
2044 vect_get_single_scalar_iteraion_cost (loop_vec_info loop_vinfo)
2046 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2047 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
2048 int nbbs = loop->num_nodes, factor, scalar_single_iter_cost = 0;
2049 int innerloop_iters, i, stmt_cost;
2051 /* Count statements in scalar loop. Using this as scalar cost for a single
2054 TODO: Add outer loop support.
2056 TODO: Consider assigning different costs to different scalar
2060 innerloop_iters = 1;
2062 innerloop_iters = 50; /* FIXME */
2064 for (i = 0; i < nbbs; i++)
2066 gimple_stmt_iterator si;
2067 basic_block bb = bbs[i];
2069 if (bb->loop_father == loop->inner)
2070 factor = innerloop_iters;
2074 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
2076 gimple stmt = gsi_stmt (si);
2077 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2079 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
2082 /* Skip stmts that are not vectorized inside the loop. */
2084 && !STMT_VINFO_RELEVANT_P (stmt_info)
2085 && (!STMT_VINFO_LIVE_P (stmt_info)
2086 || STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def))
2089 if (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt)))
2091 if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))))
2092 stmt_cost = vect_get_cost (scalar_load);
2094 stmt_cost = vect_get_cost (scalar_store);
2097 stmt_cost = vect_get_cost (scalar_stmt);
2099 scalar_single_iter_cost += stmt_cost * factor;
2102 return scalar_single_iter_cost;
2105 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
2107 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
2108 int *peel_iters_epilogue,
2109 int scalar_single_iter_cost)
2111 int peel_guard_costs = 0;
2112 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2114 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2116 *peel_iters_epilogue = vf/2;
2117 if (vect_print_dump_info (REPORT_COST))
2118 fprintf (vect_dump, "cost model: "
2119 "epilogue peel iters set to vf/2 because "
2120 "loop iterations are unknown .");
2122 /* If peeled iterations are known but number of scalar loop
2123 iterations are unknown, count a taken branch per peeled loop. */
2124 peel_guard_costs = 2 * vect_get_cost (cond_branch_taken);
2128 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
2129 peel_iters_prologue = niters < peel_iters_prologue ?
2130 niters : peel_iters_prologue;
2131 *peel_iters_epilogue = (niters - peel_iters_prologue) % vf;
2134 return (peel_iters_prologue * scalar_single_iter_cost)
2135 + (*peel_iters_epilogue * scalar_single_iter_cost)
2139 /* Function vect_estimate_min_profitable_iters
2141 Return the number of iterations required for the vector version of the
2142 loop to be profitable relative to the cost of the scalar version of the
2145 TODO: Take profile info into account before making vectorization
2146 decisions, if available. */
2149 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo)
2152 int min_profitable_iters;
2153 int peel_iters_prologue;
2154 int peel_iters_epilogue;
2155 int vec_inside_cost = 0;
2156 int vec_outside_cost = 0;
2157 int scalar_single_iter_cost = 0;
2158 int scalar_outside_cost = 0;
2159 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2160 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2161 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
2162 int nbbs = loop->num_nodes;
2163 int npeel = LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo);
2164 int peel_guard_costs = 0;
2165 int innerloop_iters = 0, factor;
2166 VEC (slp_instance, heap) *slp_instances;
2167 slp_instance instance;
2169 /* Cost model disabled. */
2170 if (!flag_vect_cost_model)
2172 if (vect_print_dump_info (REPORT_COST))
2173 fprintf (vect_dump, "cost model disabled.");
2177 /* Requires loop versioning tests to handle misalignment. */
2178 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2180 /* FIXME: Make cost depend on complexity of individual check. */
2182 VEC_length (gimple, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));
2183 if (vect_print_dump_info (REPORT_COST))
2184 fprintf (vect_dump, "cost model: Adding cost of checks for loop "
2185 "versioning to treat misalignment.\n");
2188 /* Requires loop versioning with alias checks. */
2189 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2191 /* FIXME: Make cost depend on complexity of individual check. */
2193 VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo));
2194 if (vect_print_dump_info (REPORT_COST))
2195 fprintf (vect_dump, "cost model: Adding cost of checks for loop "
2196 "versioning aliasing.\n");
2199 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2200 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2201 vec_outside_cost += vect_get_cost (cond_branch_taken);
2203 /* Count statements in scalar loop. Using this as scalar cost for a single
2206 TODO: Add outer loop support.
2208 TODO: Consider assigning different costs to different scalar
2213 innerloop_iters = 50; /* FIXME */
2215 for (i = 0; i < nbbs; i++)
2217 gimple_stmt_iterator si;
2218 basic_block bb = bbs[i];
2220 if (bb->loop_father == loop->inner)
2221 factor = innerloop_iters;
2225 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
2227 gimple stmt = gsi_stmt (si);
2228 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2229 /* Skip stmts that are not vectorized inside the loop. */
2230 if (!STMT_VINFO_RELEVANT_P (stmt_info)
2231 && (!STMT_VINFO_LIVE_P (stmt_info)
2232 || STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def))
2234 vec_inside_cost += STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) * factor;
2235 /* FIXME: for stmts in the inner-loop in outer-loop vectorization,
2236 some of the "outside" costs are generated inside the outer-loop. */
2237 vec_outside_cost += STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info);
2241 scalar_single_iter_cost = vect_get_single_scalar_iteraion_cost (loop_vinfo);
2243 /* Add additional cost for the peeled instructions in prologue and epilogue
2246 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
2247 at compile-time - we assume it's vf/2 (the worst would be vf-1).
2249 TODO: Build an expression that represents peel_iters for prologue and
2250 epilogue to be used in a run-time test. */
2254 peel_iters_prologue = vf/2;
2255 if (vect_print_dump_info (REPORT_COST))
2256 fprintf (vect_dump, "cost model: "
2257 "prologue peel iters set to vf/2.");
2259 /* If peeling for alignment is unknown, loop bound of main loop becomes
2261 peel_iters_epilogue = vf/2;
2262 if (vect_print_dump_info (REPORT_COST))
2263 fprintf (vect_dump, "cost model: "
2264 "epilogue peel iters set to vf/2 because "
2265 "peeling for alignment is unknown .");
2267 /* If peeled iterations are unknown, count a taken branch and a not taken
2268 branch per peeled loop. Even if scalar loop iterations are known,
2269 vector iterations are not known since peeled prologue iterations are
2270 not known. Hence guards remain the same. */
2271 peel_guard_costs += 2 * (vect_get_cost (cond_branch_taken)
2272 + vect_get_cost (cond_branch_not_taken));
2273 vec_outside_cost += (peel_iters_prologue * scalar_single_iter_cost)
2274 + (peel_iters_epilogue * scalar_single_iter_cost)
2279 peel_iters_prologue = npeel;
2280 vec_outside_cost += vect_get_known_peeling_cost (loop_vinfo,
2281 peel_iters_prologue, &peel_iters_epilogue,
2282 scalar_single_iter_cost);
2285 /* FORNOW: The scalar outside cost is incremented in one of the
2288 1. The vectorizer checks for alignment and aliasing and generates
2289 a condition that allows dynamic vectorization. A cost model
2290 check is ANDED with the versioning condition. Hence scalar code
2291 path now has the added cost of the versioning check.
2293 if (cost > th & versioning_check)
2296 Hence run-time scalar is incremented by not-taken branch cost.
2298 2. The vectorizer then checks if a prologue is required. If the
2299 cost model check was not done before during versioning, it has to
2300 be done before the prologue check.
2303 prologue = scalar_iters
2308 if (prologue == num_iters)
2311 Hence the run-time scalar cost is incremented by a taken branch,
2312 plus a not-taken branch, plus a taken branch cost.
2314 3. The vectorizer then checks if an epilogue is required. If the
2315 cost model check was not done before during prologue check, it
2316 has to be done with the epilogue check.
2322 if (prologue == num_iters)
2325 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
2328 Hence the run-time scalar cost should be incremented by 2 taken
2331 TODO: The back end may reorder the BBS's differently and reverse
2332 conditions/branch directions. Change the estimates below to
2333 something more reasonable. */
2335 /* If the number of iterations is known and we do not do versioning, we can
2336 decide whether to vectorize at compile time. Hence the scalar version
2337 do not carry cost model guard costs. */
2338 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2339 || LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2340 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2342 /* Cost model check occurs at versioning. */
2343 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2344 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2345 scalar_outside_cost += vect_get_cost (cond_branch_not_taken);
2348 /* Cost model check occurs at prologue generation. */
2349 if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2350 scalar_outside_cost += 2 * vect_get_cost (cond_branch_taken)
2351 + vect_get_cost (cond_branch_not_taken);
2352 /* Cost model check occurs at epilogue generation. */
2354 scalar_outside_cost += 2 * vect_get_cost (cond_branch_taken);
2358 /* Add SLP costs. */
2359 slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
2360 FOR_EACH_VEC_ELT (slp_instance, slp_instances, i, instance)
2362 vec_outside_cost += SLP_INSTANCE_OUTSIDE_OF_LOOP_COST (instance);
2363 vec_inside_cost += SLP_INSTANCE_INSIDE_OF_LOOP_COST (instance);
2366 /* Calculate number of iterations required to make the vector version
2367 profitable, relative to the loop bodies only. The following condition
2369 SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
2371 SIC = scalar iteration cost, VIC = vector iteration cost,
2372 VOC = vector outside cost, VF = vectorization factor,
2373 PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
2374 SOC = scalar outside cost for run time cost model check. */
2376 if ((scalar_single_iter_cost * vf) > vec_inside_cost)
2378 if (vec_outside_cost <= 0)
2379 min_profitable_iters = 1;
2382 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost) * vf
2383 - vec_inside_cost * peel_iters_prologue
2384 - vec_inside_cost * peel_iters_epilogue)
2385 / ((scalar_single_iter_cost * vf)
2388 if ((scalar_single_iter_cost * vf * min_profitable_iters)
2389 <= ((vec_inside_cost * min_profitable_iters)
2390 + ((vec_outside_cost - scalar_outside_cost) * vf)))
2391 min_profitable_iters++;
2394 /* vector version will never be profitable. */
2397 if (vect_print_dump_info (REPORT_COST))
2398 fprintf (vect_dump, "cost model: the vector iteration cost = %d "
2399 "divided by the scalar iteration cost = %d "
2400 "is greater or equal to the vectorization factor = %d.",
2401 vec_inside_cost, scalar_single_iter_cost, vf);
2405 if (vect_print_dump_info (REPORT_COST))
2407 fprintf (vect_dump, "Cost model analysis: \n");
2408 fprintf (vect_dump, " Vector inside of loop cost: %d\n",
2410 fprintf (vect_dump, " Vector outside of loop cost: %d\n",
2412 fprintf (vect_dump, " Scalar iteration cost: %d\n",
2413 scalar_single_iter_cost);
2414 fprintf (vect_dump, " Scalar outside cost: %d\n", scalar_outside_cost);
2415 fprintf (vect_dump, " prologue iterations: %d\n",
2416 peel_iters_prologue);
2417 fprintf (vect_dump, " epilogue iterations: %d\n",
2418 peel_iters_epilogue);
2419 fprintf (vect_dump, " Calculated minimum iters for profitability: %d\n",
2420 min_profitable_iters);
2423 min_profitable_iters =
2424 min_profitable_iters < vf ? vf : min_profitable_iters;
2426 /* Because the condition we create is:
2427 if (niters <= min_profitable_iters)
2428 then skip the vectorized loop. */
2429 min_profitable_iters--;
2431 if (vect_print_dump_info (REPORT_COST))
2432 fprintf (vect_dump, " Profitability threshold = %d\n",
2433 min_profitable_iters);
2435 return min_profitable_iters;
2439 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
2440 functions. Design better to avoid maintenance issues. */
2442 /* Function vect_model_reduction_cost.
2444 Models cost for a reduction operation, including the vector ops
2445 generated within the strip-mine loop, the initial definition before
2446 the loop, and the epilogue code that must be generated. */
2449 vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
2453 enum tree_code code;
2456 gimple stmt, orig_stmt;
2458 enum machine_mode mode;
2459 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2460 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2463 /* Cost of reduction op inside loop. */
2464 STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info)
2465 += ncopies * vect_get_cost (vector_stmt);
2467 stmt = STMT_VINFO_STMT (stmt_info);
2469 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
2471 case GIMPLE_SINGLE_RHS:
2472 gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt)) == ternary_op);
2473 reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), 2);
2475 case GIMPLE_UNARY_RHS:
2476 reduction_op = gimple_assign_rhs1 (stmt);
2478 case GIMPLE_BINARY_RHS:
2479 reduction_op = gimple_assign_rhs2 (stmt);
2485 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
2488 if (vect_print_dump_info (REPORT_COST))
2490 fprintf (vect_dump, "unsupported data-type ");
2491 print_generic_expr (vect_dump, TREE_TYPE (reduction_op), TDF_SLIM);
2496 mode = TYPE_MODE (vectype);
2497 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
2500 orig_stmt = STMT_VINFO_STMT (stmt_info);
2502 code = gimple_assign_rhs_code (orig_stmt);
2504 /* Add in cost for initial definition. */
2505 outer_cost += vect_get_cost (scalar_to_vec);
2507 /* Determine cost of epilogue code.
2509 We have a reduction operator that will reduce the vector in one statement.
2510 Also requires scalar extract. */
2512 if (!nested_in_vect_loop_p (loop, orig_stmt))
2514 if (reduc_code != ERROR_MARK)
2515 outer_cost += vect_get_cost (vector_stmt)
2516 + vect_get_cost (vec_to_scalar);
2519 int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
2521 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
2522 int element_bitsize = tree_low_cst (bitsize, 1);
2523 int nelements = vec_size_in_bits / element_bitsize;
2525 optab = optab_for_tree_code (code, vectype, optab_default);
2527 /* We have a whole vector shift available. */
2528 if (VECTOR_MODE_P (mode)
2529 && optab_handler (optab, mode) != CODE_FOR_nothing
2530 && optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
2531 /* Final reduction via vector shifts and the reduction operator. Also
2532 requires scalar extract. */
2533 outer_cost += ((exact_log2(nelements) * 2)
2534 * vect_get_cost (vector_stmt)
2535 + vect_get_cost (vec_to_scalar));
2537 /* Use extracts and reduction op for final reduction. For N elements,
2538 we have N extracts and N-1 reduction ops. */
2539 outer_cost += ((nelements + nelements - 1)
2540 * vect_get_cost (vector_stmt));
2544 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info) = outer_cost;
2546 if (vect_print_dump_info (REPORT_COST))
2547 fprintf (vect_dump, "vect_model_reduction_cost: inside_cost = %d, "
2548 "outside_cost = %d .", STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info),
2549 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info));
2555 /* Function vect_model_induction_cost.
2557 Models cost for induction operations. */
2560 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
2562 /* loop cost for vec_loop. */
2563 STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info)
2564 = ncopies * vect_get_cost (vector_stmt);
2565 /* prologue cost for vec_init and vec_step. */
2566 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info)
2567 = 2 * vect_get_cost (scalar_to_vec);
2569 if (vect_print_dump_info (REPORT_COST))
2570 fprintf (vect_dump, "vect_model_induction_cost: inside_cost = %d, "
2571 "outside_cost = %d .", STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info),
2572 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info));
2576 /* Function get_initial_def_for_induction
2579 STMT - a stmt that performs an induction operation in the loop.
2580 IV_PHI - the initial value of the induction variable
2583 Return a vector variable, initialized with the first VF values of
2584 the induction variable. E.g., for an iv with IV_PHI='X' and
2585 evolution S, for a vector of 4 units, we want to return:
2586 [X, X + S, X + 2*S, X + 3*S]. */
2589 get_initial_def_for_induction (gimple iv_phi)
2591 stmt_vec_info stmt_vinfo = vinfo_for_stmt (iv_phi);
2592 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
2593 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2594 tree scalar_type = TREE_TYPE (gimple_phi_result (iv_phi));
2597 edge pe = loop_preheader_edge (loop);
2598 struct loop *iv_loop;
2600 tree vec, vec_init, vec_step, t;
2604 gimple init_stmt, induction_phi, new_stmt;
2605 tree induc_def, vec_def, vec_dest;
2606 tree init_expr, step_expr;
2607 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2612 stmt_vec_info phi_info = vinfo_for_stmt (iv_phi);
2613 bool nested_in_vect_loop = false;
2614 gimple_seq stmts = NULL;
2615 imm_use_iterator imm_iter;
2616 use_operand_p use_p;
2620 gimple_stmt_iterator si;
2621 basic_block bb = gimple_bb (iv_phi);
2624 vectype = get_vectype_for_scalar_type (scalar_type);
2625 gcc_assert (vectype);
2626 nunits = TYPE_VECTOR_SUBPARTS (vectype);
2627 ncopies = vf / nunits;
2629 gcc_assert (phi_info);
2630 gcc_assert (ncopies >= 1);
2632 /* Find the first insertion point in the BB. */
2633 si = gsi_after_labels (bb);
2635 if (INTEGRAL_TYPE_P (scalar_type))
2636 step_expr = build_int_cst (scalar_type, 0);
2637 else if (POINTER_TYPE_P (scalar_type))
2638 step_expr = size_zero_node;
2640 step_expr = build_real (scalar_type, dconst0);
2642 /* Is phi in an inner-loop, while vectorizing an enclosing outer-loop? */
2643 if (nested_in_vect_loop_p (loop, iv_phi))
2645 nested_in_vect_loop = true;
2646 iv_loop = loop->inner;
2650 gcc_assert (iv_loop == (gimple_bb (iv_phi))->loop_father);
2652 latch_e = loop_latch_edge (iv_loop);
2653 loop_arg = PHI_ARG_DEF_FROM_EDGE (iv_phi, latch_e);
2655 access_fn = analyze_scalar_evolution (iv_loop, PHI_RESULT (iv_phi));
2656 gcc_assert (access_fn);
2657 ok = vect_is_simple_iv_evolution (iv_loop->num, access_fn,
2658 &init_expr, &step_expr);
2660 pe = loop_preheader_edge (iv_loop);
2662 /* Create the vector that holds the initial_value of the induction. */
2663 if (nested_in_vect_loop)
2665 /* iv_loop is nested in the loop to be vectorized. init_expr had already
2666 been created during vectorization of previous stmts. We obtain it
2667 from the STMT_VINFO_VEC_STMT of the defining stmt. */
2668 tree iv_def = PHI_ARG_DEF_FROM_EDGE (iv_phi,
2669 loop_preheader_edge (iv_loop));
2670 vec_init = vect_get_vec_def_for_operand (iv_def, iv_phi, NULL);
2674 /* iv_loop is the loop to be vectorized. Create:
2675 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
2676 new_var = vect_get_new_vect_var (scalar_type, vect_scalar_var, "var_");
2677 add_referenced_var (new_var);
2679 new_name = force_gimple_operand (init_expr, &stmts, false, new_var);
2682 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
2683 gcc_assert (!new_bb);
2687 t = tree_cons (NULL_TREE, init_expr, t);
2688 for (i = 1; i < nunits; i++)
2690 /* Create: new_name_i = new_name + step_expr */
2691 enum tree_code code = POINTER_TYPE_P (scalar_type)
2692 ? POINTER_PLUS_EXPR : PLUS_EXPR;
2693 init_stmt = gimple_build_assign_with_ops (code, new_var,
2694 new_name, step_expr);
2695 new_name = make_ssa_name (new_var, init_stmt);
2696 gimple_assign_set_lhs (init_stmt, new_name);
2698 new_bb = gsi_insert_on_edge_immediate (pe, init_stmt);
2699 gcc_assert (!new_bb);
2701 if (vect_print_dump_info (REPORT_DETAILS))
2703 fprintf (vect_dump, "created new init_stmt: ");
2704 print_gimple_stmt (vect_dump, init_stmt, 0, TDF_SLIM);
2706 t = tree_cons (NULL_TREE, new_name, t);
2708 /* Create a vector from [new_name_0, new_name_1, ..., new_name_nunits-1] */
2709 vec = build_constructor_from_list (vectype, nreverse (t));
2710 vec_init = vect_init_vector (iv_phi, vec, vectype, NULL);
2714 /* Create the vector that holds the step of the induction. */
2715 if (nested_in_vect_loop)
2716 /* iv_loop is nested in the loop to be vectorized. Generate:
2717 vec_step = [S, S, S, S] */
2718 new_name = step_expr;
2721 /* iv_loop is the loop to be vectorized. Generate:
2722 vec_step = [VF*S, VF*S, VF*S, VF*S] */
2723 expr = build_int_cst (TREE_TYPE (step_expr), vf);
2724 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
2729 for (i = 0; i < nunits; i++)
2730 t = tree_cons (NULL_TREE, unshare_expr (new_name), t);
2731 gcc_assert (CONSTANT_CLASS_P (new_name));
2732 stepvectype = get_vectype_for_scalar_type (TREE_TYPE (new_name));
2733 gcc_assert (stepvectype);
2734 vec = build_vector (stepvectype, t);
2735 vec_step = vect_init_vector (iv_phi, vec, stepvectype, NULL);
2738 /* Create the following def-use cycle:
2743 vec_iv = PHI <vec_init, vec_loop>
2747 vec_loop = vec_iv + vec_step; */
2749 /* Create the induction-phi that defines the induction-operand. */
2750 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
2751 add_referenced_var (vec_dest);
2752 induction_phi = create_phi_node (vec_dest, iv_loop->header);
2753 set_vinfo_for_stmt (induction_phi,
2754 new_stmt_vec_info (induction_phi, loop_vinfo, NULL));
2755 induc_def = PHI_RESULT (induction_phi);
2757 /* Create the iv update inside the loop */
2758 new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, vec_dest,
2759 induc_def, vec_step);
2760 vec_def = make_ssa_name (vec_dest, new_stmt);
2761 gimple_assign_set_lhs (new_stmt, vec_def);
2762 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
2763 set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo,
2766 /* Set the arguments of the phi node: */
2767 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
2768 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
2772 /* In case that vectorization factor (VF) is bigger than the number
2773 of elements that we can fit in a vectype (nunits), we have to generate
2774 more than one vector stmt - i.e - we need to "unroll" the
2775 vector stmt by a factor VF/nunits. For more details see documentation
2776 in vectorizable_operation. */
2780 stmt_vec_info prev_stmt_vinfo;
2781 /* FORNOW. This restriction should be relaxed. */
2782 gcc_assert (!nested_in_vect_loop);
2784 /* Create the vector that holds the step of the induction. */
2785 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
2786 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
2789 for (i = 0; i < nunits; i++)
2790 t = tree_cons (NULL_TREE, unshare_expr (new_name), t);
2791 gcc_assert (CONSTANT_CLASS_P (new_name));
2792 vec = build_vector (stepvectype, t);
2793 vec_step = vect_init_vector (iv_phi, vec, stepvectype, NULL);
2795 vec_def = induc_def;
2796 prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
2797 for (i = 1; i < ncopies; i++)
2799 /* vec_i = vec_prev + vec_step */
2800 new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, vec_dest,
2802 vec_def = make_ssa_name (vec_dest, new_stmt);
2803 gimple_assign_set_lhs (new_stmt, vec_def);
2805 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
2806 set_vinfo_for_stmt (new_stmt,
2807 new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
2808 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
2809 prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
2813 if (nested_in_vect_loop)
2815 /* Find the loop-closed exit-phi of the induction, and record
2816 the final vector of induction results: */
2818 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
2820 if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (USE_STMT (use_p))))
2822 exit_phi = USE_STMT (use_p);
2828 stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
2829 /* FORNOW. Currently not supporting the case that an inner-loop induction
2830 is not used in the outer-loop (i.e. only outside the outer-loop). */
2831 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
2832 && !STMT_VINFO_LIVE_P (stmt_vinfo));
2834 STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
2835 if (vect_print_dump_info (REPORT_DETAILS))
2837 fprintf (vect_dump, "vector of inductions after inner-loop:");
2838 print_gimple_stmt (vect_dump, new_stmt, 0, TDF_SLIM);
2844 if (vect_print_dump_info (REPORT_DETAILS))
2846 fprintf (vect_dump, "transform induction: created def-use cycle: ");
2847 print_gimple_stmt (vect_dump, induction_phi, 0, TDF_SLIM);
2848 fprintf (vect_dump, "\n");
2849 print_gimple_stmt (vect_dump, SSA_NAME_DEF_STMT (vec_def), 0, TDF_SLIM);
2852 STMT_VINFO_VEC_STMT (phi_info) = induction_phi;
2857 /* Function get_initial_def_for_reduction
2860 STMT - a stmt that performs a reduction operation in the loop.
2861 INIT_VAL - the initial value of the reduction variable
2864 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
2865 of the reduction (used for adjusting the epilog - see below).
2866 Return a vector variable, initialized according to the operation that STMT
2867 performs. This vector will be used as the initial value of the
2868 vector of partial results.
2870 Option1 (adjust in epilog): Initialize the vector as follows:
2871 add/bit or/xor: [0,0,...,0,0]
2872 mult/bit and: [1,1,...,1,1]
2873 min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
2874 and when necessary (e.g. add/mult case) let the caller know
2875 that it needs to adjust the result by init_val.
2877 Option2: Initialize the vector as follows:
2878 add/bit or/xor: [init_val,0,0,...,0]
2879 mult/bit and: [init_val,1,1,...,1]
2880 min/max/cond_expr: [init_val,init_val,...,init_val]
2881 and no adjustments are needed.
2883 For example, for the following code:
2889 STMT is 's = s + a[i]', and the reduction variable is 's'.
2890 For a vector of 4 units, we want to return either [0,0,0,init_val],
2891 or [0,0,0,0] and let the caller know that it needs to adjust
2892 the result at the end by 'init_val'.
2894 FORNOW, we are using the 'adjust in epilog' scheme, because this way the
2895 initialization vector is simpler (same element in all entries), if
2896 ADJUSTMENT_DEF is not NULL, and Option2 otherwise.
2898 A cost model should help decide between these two schemes. */
2901 get_initial_def_for_reduction (gimple stmt, tree init_val,
2902 tree *adjustment_def)
2904 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
2905 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
2906 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2907 tree scalar_type = TREE_TYPE (init_val);
2908 tree vectype = get_vectype_for_scalar_type (scalar_type);
2910 enum tree_code code = gimple_assign_rhs_code (stmt);
2915 bool nested_in_vect_loop = false;
2917 REAL_VALUE_TYPE real_init_val = dconst0;
2918 int int_init_val = 0;
2919 gimple def_stmt = NULL;
2921 gcc_assert (vectype);
2922 nunits = TYPE_VECTOR_SUBPARTS (vectype);
2924 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
2925 || SCALAR_FLOAT_TYPE_P (scalar_type));
2927 if (nested_in_vect_loop_p (loop, stmt))
2928 nested_in_vect_loop = true;
2930 gcc_assert (loop == (gimple_bb (stmt))->loop_father);
2932 /* In case of double reduction we only create a vector variable to be put
2933 in the reduction phi node. The actual statement creation is done in
2934 vect_create_epilog_for_reduction. */
2935 if (adjustment_def && nested_in_vect_loop
2936 && TREE_CODE (init_val) == SSA_NAME
2937 && (def_stmt = SSA_NAME_DEF_STMT (init_val))
2938 && gimple_code (def_stmt) == GIMPLE_PHI
2939 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2940 && vinfo_for_stmt (def_stmt)
2941 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2942 == vect_double_reduction_def)
2944 *adjustment_def = NULL;
2945 return vect_create_destination_var (init_val, vectype);
2948 if (TREE_CONSTANT (init_val))
2950 if (SCALAR_FLOAT_TYPE_P (scalar_type))
2951 init_value = build_real (scalar_type, TREE_REAL_CST (init_val));
2953 init_value = build_int_cst (scalar_type, TREE_INT_CST_LOW (init_val));
2956 init_value = init_val;
2960 case WIDEN_SUM_EXPR:
2968 /* ADJUSMENT_DEF is NULL when called from
2969 vect_create_epilog_for_reduction to vectorize double reduction. */
2972 if (nested_in_vect_loop)
2973 *adjustment_def = vect_get_vec_def_for_operand (init_val, stmt,
2976 *adjustment_def = init_val;
2979 if (code == MULT_EXPR)
2981 real_init_val = dconst1;
2985 if (code == BIT_AND_EXPR)
2988 if (SCALAR_FLOAT_TYPE_P (scalar_type))
2989 def_for_init = build_real (scalar_type, real_init_val);
2991 def_for_init = build_int_cst (scalar_type, int_init_val);
2993 /* Create a vector of '0' or '1' except the first element. */
2994 for (i = nunits - 2; i >= 0; --i)
2995 t = tree_cons (NULL_TREE, def_for_init, t);
2997 /* Option1: the first element is '0' or '1' as well. */
3000 t = tree_cons (NULL_TREE, def_for_init, t);
3001 init_def = build_vector (vectype, t);
3005 /* Option2: the first element is INIT_VAL. */
3006 t = tree_cons (NULL_TREE, init_value, t);
3007 if (TREE_CONSTANT (init_val))
3008 init_def = build_vector (vectype, t);
3010 init_def = build_constructor_from_list (vectype, t);
3019 *adjustment_def = NULL_TREE;
3020 init_def = vect_get_vec_def_for_operand (init_val, stmt, NULL);
3024 for (i = nunits - 1; i >= 0; --i)
3025 t = tree_cons (NULL_TREE, init_value, t);
3027 if (TREE_CONSTANT (init_val))
3028 init_def = build_vector (vectype, t);
3030 init_def = build_constructor_from_list (vectype, t);
3042 /* Function vect_create_epilog_for_reduction
3044 Create code at the loop-epilog to finalize the result of a reduction
3047 VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector
3048 reduction statements.
3049 STMT is the scalar reduction stmt that is being vectorized.
3050 NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
3051 number of elements that we can fit in a vectype (nunits). In this case
3052 we have to generate more than one vector stmt - i.e - we need to "unroll"
3053 the vector stmt by a factor VF/nunits. For more details see documentation
3054 in vectorizable_operation.
3055 REDUC_CODE is the tree-code for the epilog reduction.
3056 REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
3058 REDUC_INDEX is the index of the operand in the right hand side of the
3059 statement that is defined by REDUCTION_PHI.
3060 DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
3061 SLP_NODE is an SLP node containing a group of reduction statements. The
3062 first one in this group is STMT.
3065 1. Creates the reduction def-use cycles: sets the arguments for
3067 The loop-entry argument is the vectorized initial-value of the reduction.
3068 The loop-latch argument is taken from VECT_DEFS - the vector of partial
3070 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
3071 by applying the operation specified by REDUC_CODE if available, or by
3072 other means (whole-vector shifts or a scalar loop).
3073 The function also creates a new phi node at the loop exit to preserve
3074 loop-closed form, as illustrated below.
3076 The flow at the entry to this function:
3079 vec_def = phi <null, null> # REDUCTION_PHI
3080 VECT_DEF = vector_stmt # vectorized form of STMT
3081 s_loop = scalar_stmt # (scalar) STMT
3083 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
3087 The above is transformed by this function into:
3090 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
3091 VECT_DEF = vector_stmt # vectorized form of STMT
3092 s_loop = scalar_stmt # (scalar) STMT
3094 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
3095 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
3096 v_out2 = reduce <v_out1>
3097 s_out3 = extract_field <v_out2, 0>
3098 s_out4 = adjust_result <s_out3>
3104 vect_create_epilog_for_reduction (VEC (tree, heap) *vect_defs, gimple stmt,
3105 int ncopies, enum tree_code reduc_code,
3106 VEC (gimple, heap) *reduction_phis,
3107 int reduc_index, bool double_reduc,
3110 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3111 stmt_vec_info prev_phi_info;
3113 enum machine_mode mode;
3114 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3115 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
3116 basic_block exit_bb;
3119 gimple new_phi = NULL, phi;
3120 gimple_stmt_iterator exit_gsi;
3122 tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
3123 gimple epilog_stmt = NULL;
3124 enum tree_code code = gimple_assign_rhs_code (stmt);
3126 tree bitsize, bitpos;
3127 tree adjustment_def = NULL;
3128 tree vec_initial_def = NULL;
3129 tree reduction_op, expr, def;
3130 tree orig_name, scalar_result;
3131 imm_use_iterator imm_iter, phi_imm_iter;
3132 use_operand_p use_p, phi_use_p;
3133 bool extract_scalar_result = false;
3134 gimple use_stmt, orig_stmt, reduction_phi = NULL;
3135 bool nested_in_vect_loop = false;
3136 VEC (gimple, heap) *new_phis = NULL;
3137 enum vect_def_type dt = vect_unknown_def_type;
3139 VEC (tree, heap) *scalar_results = NULL;
3140 unsigned int group_size = 1, k, ratio;
3141 VEC (tree, heap) *vec_initial_defs = NULL;
3142 VEC (gimple, heap) *phis;
3145 group_size = VEC_length (gimple, SLP_TREE_SCALAR_STMTS (slp_node));
3147 if (nested_in_vect_loop_p (loop, stmt))
3151 nested_in_vect_loop = true;
3152 gcc_assert (!slp_node);
3155 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
3157 case GIMPLE_SINGLE_RHS:
3158 gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt))
3160 reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), reduc_index);
3162 case GIMPLE_UNARY_RHS:
3163 reduction_op = gimple_assign_rhs1 (stmt);
3165 case GIMPLE_BINARY_RHS:
3166 reduction_op = reduc_index ?
3167 gimple_assign_rhs2 (stmt) : gimple_assign_rhs1 (stmt);
3173 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
3174 gcc_assert (vectype);
3175 mode = TYPE_MODE (vectype);
3177 /* 1. Create the reduction def-use cycle:
3178 Set the arguments of REDUCTION_PHIS, i.e., transform
3181 vec_def = phi <null, null> # REDUCTION_PHI
3182 VECT_DEF = vector_stmt # vectorized form of STMT
3188 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
3189 VECT_DEF = vector_stmt # vectorized form of STMT
3192 (in case of SLP, do it for all the phis). */
3194 /* Get the loop-entry arguments. */
3196 vect_get_slp_defs (slp_node, &vec_initial_defs, NULL, reduc_index);
3199 vec_initial_defs = VEC_alloc (tree, heap, 1);
3200 /* For the case of reduction, vect_get_vec_def_for_operand returns
3201 the scalar def before the loop, that defines the initial value
3202 of the reduction variable. */
3203 vec_initial_def = vect_get_vec_def_for_operand (reduction_op, stmt,
3205 VEC_quick_push (tree, vec_initial_defs, vec_initial_def);
3208 /* Set phi nodes arguments. */
3209 FOR_EACH_VEC_ELT (gimple, reduction_phis, i, phi)
3211 tree vec_init_def = VEC_index (tree, vec_initial_defs, i);
3212 tree def = VEC_index (tree, vect_defs, i);
3213 for (j = 0; j < ncopies; j++)
3215 /* Set the loop-entry arg of the reduction-phi. */
3216 add_phi_arg (phi, vec_init_def, loop_preheader_edge (loop),
3219 /* Set the loop-latch arg for the reduction-phi. */
3221 def = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type, def);
3223 add_phi_arg (phi, def, loop_latch_edge (loop), UNKNOWN_LOCATION);
3225 if (vect_print_dump_info (REPORT_DETAILS))
3227 fprintf (vect_dump, "transform reduction: created def-use"
3229 print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
3230 fprintf (vect_dump, "\n");
3231 print_gimple_stmt (vect_dump, SSA_NAME_DEF_STMT (def), 0,
3235 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
3239 VEC_free (tree, heap, vec_initial_defs);
3241 /* 2. Create epilog code.
3242 The reduction epilog code operates across the elements of the vector
3243 of partial results computed by the vectorized loop.
3244 The reduction epilog code consists of:
3246 step 1: compute the scalar result in a vector (v_out2)
3247 step 2: extract the scalar result (s_out3) from the vector (v_out2)
3248 step 3: adjust the scalar result (s_out3) if needed.
3250 Step 1 can be accomplished using one the following three schemes:
3251 (scheme 1) using reduc_code, if available.
3252 (scheme 2) using whole-vector shifts, if available.
3253 (scheme 3) using a scalar loop. In this case steps 1+2 above are
3256 The overall epilog code looks like this:
3258 s_out0 = phi <s_loop> # original EXIT_PHI
3259 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
3260 v_out2 = reduce <v_out1> # step 1
3261 s_out3 = extract_field <v_out2, 0> # step 2
3262 s_out4 = adjust_result <s_out3> # step 3
3264 (step 3 is optional, and steps 1 and 2 may be combined).
3265 Lastly, the uses of s_out0 are replaced by s_out4. */
3268 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
3269 v_out1 = phi <VECT_DEF>
3270 Store them in NEW_PHIS. */
3272 exit_bb = single_exit (loop)->dest;
3273 prev_phi_info = NULL;
3274 new_phis = VEC_alloc (gimple, heap, VEC_length (tree, vect_defs));
3275 FOR_EACH_VEC_ELT (tree, vect_defs, i, def)
3277 for (j = 0; j < ncopies; j++)
3279 phi = create_phi_node (SSA_NAME_VAR (def), exit_bb);
3280 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo, NULL));
3282 VEC_quick_push (gimple, new_phis, phi);
3285 def = vect_get_vec_def_for_stmt_copy (dt, def);
3286 STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
3289 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
3290 prev_phi_info = vinfo_for_stmt (phi);
3294 /* The epilogue is created for the outer-loop, i.e., for the loop being
3299 exit_bb = single_exit (loop)->dest;
3302 exit_gsi = gsi_after_labels (exit_bb);
3304 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
3305 (i.e. when reduc_code is not available) and in the final adjustment
3306 code (if needed). Also get the original scalar reduction variable as
3307 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
3308 represents a reduction pattern), the tree-code and scalar-def are
3309 taken from the original stmt that the pattern-stmt (STMT) replaces.
3310 Otherwise (it is a regular reduction) - the tree-code and scalar-def
3311 are taken from STMT. */
3313 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3316 /* Regular reduction */
3321 /* Reduction pattern */
3322 stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
3323 gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
3324 gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
3327 code = gimple_assign_rhs_code (orig_stmt);
3328 /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
3329 partial results are added and not subtracted. */
3330 if (code == MINUS_EXPR)
3333 scalar_dest = gimple_assign_lhs (orig_stmt);
3334 scalar_type = TREE_TYPE (scalar_dest);
3335 scalar_results = VEC_alloc (tree, heap, group_size);
3336 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
3337 bitsize = TYPE_SIZE (scalar_type);
3339 /* In case this is a reduction in an inner-loop while vectorizing an outer
3340 loop - we don't need to extract a single scalar result at the end of the
3341 inner-loop (unless it is double reduction, i.e., the use of reduction is
3342 outside the outer-loop). The final vector of partial results will be used
3343 in the vectorized outer-loop, or reduced to a scalar result at the end of
3345 if (nested_in_vect_loop && !double_reduc)
3346 goto vect_finalize_reduction;
3348 /* 2.3 Create the reduction code, using one of the three schemes described
3349 above. In SLP we simply need to extract all the elements from the
3350 vector (without reducing them), so we use scalar shifts. */
3351 if (reduc_code != ERROR_MARK && !slp_node)
3355 /*** Case 1: Create:
3356 v_out2 = reduc_expr <v_out1> */
3358 if (vect_print_dump_info (REPORT_DETAILS))
3359 fprintf (vect_dump, "Reduce using direct vector reduction.");
3361 vec_dest = vect_create_destination_var (scalar_dest, vectype);
3362 new_phi = VEC_index (gimple, new_phis, 0);
3363 tmp = build1 (reduc_code, vectype, PHI_RESULT (new_phi));
3364 epilog_stmt = gimple_build_assign (vec_dest, tmp);
3365 new_temp = make_ssa_name (vec_dest, epilog_stmt);
3366 gimple_assign_set_lhs (epilog_stmt, new_temp);
3367 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3369 extract_scalar_result = true;
3373 enum tree_code shift_code = ERROR_MARK;
3374 bool have_whole_vector_shift = true;
3376 int element_bitsize = tree_low_cst (bitsize, 1);
3377 int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
3380 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3381 shift_code = VEC_RSHIFT_EXPR;
3383 have_whole_vector_shift = false;
3385 /* Regardless of whether we have a whole vector shift, if we're
3386 emulating the operation via tree-vect-generic, we don't want
3387 to use it. Only the first round of the reduction is likely
3388 to still be profitable via emulation. */
3389 /* ??? It might be better to emit a reduction tree code here, so that
3390 tree-vect-generic can expand the first round via bit tricks. */
3391 if (!VECTOR_MODE_P (mode))
3392 have_whole_vector_shift = false;
3395 optab optab = optab_for_tree_code (code, vectype, optab_default);
3396 if (optab_handler (optab, mode) == CODE_FOR_nothing)
3397 have_whole_vector_shift = false;
3400 if (have_whole_vector_shift && !slp_node)
3402 /*** Case 2: Create:
3403 for (offset = VS/2; offset >= element_size; offset/=2)
3405 Create: va' = vec_shift <va, offset>
3406 Create: va = vop <va, va'>
3409 if (vect_print_dump_info (REPORT_DETAILS))
3410 fprintf (vect_dump, "Reduce using vector shifts");
3412 vec_dest = vect_create_destination_var (scalar_dest, vectype);
3413 new_phi = VEC_index (gimple, new_phis, 0);
3414 new_temp = PHI_RESULT (new_phi);
3415 for (bit_offset = vec_size_in_bits/2;
3416 bit_offset >= element_bitsize;
3419 tree bitpos = size_int (bit_offset);
3421 epilog_stmt = gimple_build_assign_with_ops (shift_code,
3422 vec_dest, new_temp, bitpos);
3423 new_name = make_ssa_name (vec_dest, epilog_stmt);
3424 gimple_assign_set_lhs (epilog_stmt, new_name);
3425 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3427 epilog_stmt = gimple_build_assign_with_ops (code, vec_dest,
3428 new_name, new_temp);
3429 new_temp = make_ssa_name (vec_dest, epilog_stmt);
3430 gimple_assign_set_lhs (epilog_stmt, new_temp);
3431 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3434 extract_scalar_result = true;
3440 /*** Case 3: Create:
3441 s = extract_field <v_out2, 0>
3442 for (offset = element_size;
3443 offset < vector_size;
3444 offset += element_size;)
3446 Create: s' = extract_field <v_out2, offset>
3447 Create: s = op <s, s'> // For non SLP cases
3450 if (vect_print_dump_info (REPORT_DETAILS))
3451 fprintf (vect_dump, "Reduce using scalar code. ");
3453 vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
3454 FOR_EACH_VEC_ELT (gimple, new_phis, i, new_phi)
3456 vec_temp = PHI_RESULT (new_phi);
3457 rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
3459 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
3460 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
3461 gimple_assign_set_lhs (epilog_stmt, new_temp);
3462 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3464 /* In SLP we don't need to apply reduction operation, so we just
3465 collect s' values in SCALAR_RESULTS. */
3467 VEC_safe_push (tree, heap, scalar_results, new_temp);
3469 for (bit_offset = element_bitsize;
3470 bit_offset < vec_size_in_bits;
3471 bit_offset += element_bitsize)
3473 tree bitpos = bitsize_int (bit_offset);
3474 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
3477 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
3478 new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
3479 gimple_assign_set_lhs (epilog_stmt, new_name);
3480 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3484 /* In SLP we don't need to apply reduction operation, so
3485 we just collect s' values in SCALAR_RESULTS. */
3486 new_temp = new_name;
3487 VEC_safe_push (tree, heap, scalar_results, new_name);
3491 epilog_stmt = gimple_build_assign_with_ops (code,
3492 new_scalar_dest, new_name, new_temp);
3493 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
3494 gimple_assign_set_lhs (epilog_stmt, new_temp);
3495 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3500 /* The only case where we need to reduce scalar results in SLP, is
3501 unrolling. If the size of SCALAR_RESULTS is greater than
3502 GROUP_SIZE, we reduce them combining elements modulo
3506 tree res, first_res, new_res;
3509 /* Reduce multiple scalar results in case of SLP unrolling. */
3510 for (j = group_size; VEC_iterate (tree, scalar_results, j, res);
3513 first_res = VEC_index (tree, scalar_results, j % group_size);
3514 new_stmt = gimple_build_assign_with_ops (code,
3515 new_scalar_dest, first_res, res);
3516 new_res = make_ssa_name (new_scalar_dest, new_stmt);
3517 gimple_assign_set_lhs (new_stmt, new_res);
3518 gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
3519 VEC_replace (tree, scalar_results, j % group_size, new_res);
3523 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
3524 VEC_safe_push (tree, heap, scalar_results, new_temp);
3526 extract_scalar_result = false;
3530 /* 2.4 Extract the final scalar result. Create:
3531 s_out3 = extract_field <v_out2, bitpos> */
3533 if (extract_scalar_result)
3537 if (vect_print_dump_info (REPORT_DETAILS))
3538 fprintf (vect_dump, "extract scalar result");
3540 if (BYTES_BIG_ENDIAN)
3541 bitpos = size_binop (MULT_EXPR,
3542 bitsize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1),
3543 TYPE_SIZE (scalar_type));
3545 bitpos = bitsize_zero_node;
3547 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp, bitsize, bitpos);
3548 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
3549 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
3550 gimple_assign_set_lhs (epilog_stmt, new_temp);
3551 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3552 VEC_safe_push (tree, heap, scalar_results, new_temp);
3555 vect_finalize_reduction:
3560 /* 2.5 Adjust the final result by the initial value of the reduction
3561 variable. (When such adjustment is not needed, then
3562 'adjustment_def' is zero). For example, if code is PLUS we create:
3563 new_temp = loop_exit_def + adjustment_def */
3567 gcc_assert (!slp_node);
3568 if (nested_in_vect_loop)
3570 new_phi = VEC_index (gimple, new_phis, 0);
3571 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
3572 expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
3573 new_dest = vect_create_destination_var (scalar_dest, vectype);
3577 new_temp = VEC_index (tree, scalar_results, 0);
3578 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
3579 expr = build2 (code, scalar_type, new_temp, adjustment_def);
3580 new_dest = vect_create_destination_var (scalar_dest, scalar_type);
3583 epilog_stmt = gimple_build_assign (new_dest, expr);
3584 new_temp = make_ssa_name (new_dest, epilog_stmt);
3585 gimple_assign_set_lhs (epilog_stmt, new_temp);
3586 SSA_NAME_DEF_STMT (new_temp) = epilog_stmt;
3587 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3588 if (nested_in_vect_loop)
3590 set_vinfo_for_stmt (epilog_stmt,
3591 new_stmt_vec_info (epilog_stmt, loop_vinfo,
3593 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
3594 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
3597 VEC_quick_push (tree, scalar_results, new_temp);
3599 VEC_replace (tree, scalar_results, 0, new_temp);
3602 VEC_replace (tree, scalar_results, 0, new_temp);
3604 VEC_replace (gimple, new_phis, 0, epilog_stmt);
3607 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
3608 phis with new adjusted scalar results, i.e., replace use <s_out0>
3613 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
3614 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
3615 v_out2 = reduce <v_out1>
3616 s_out3 = extract_field <v_out2, 0>
3617 s_out4 = adjust_result <s_out3>
3624 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
3625 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
3626 v_out2 = reduce <v_out1>
3627 s_out3 = extract_field <v_out2, 0>
3628 s_out4 = adjust_result <s_out3>
3632 /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
3633 case that GROUP_SIZE is greater than vectorization factor). Therefore, we
3634 need to match SCALAR_RESULTS with corresponding statements. The first
3635 (GROUP_SIZE / number of new vector stmts) scalar results correspond to
3636 the first vector stmt, etc.
3637 (RATIO is equal to (GROUP_SIZE / number of new vector stmts)). */
3638 if (group_size > VEC_length (gimple, new_phis))
3640 ratio = group_size / VEC_length (gimple, new_phis);
3641 gcc_assert (!(group_size % VEC_length (gimple, new_phis)));
3646 for (k = 0; k < group_size; k++)
3650 epilog_stmt = VEC_index (gimple, new_phis, k / ratio);
3651 reduction_phi = VEC_index (gimple, reduction_phis, k / ratio);
3656 gimple current_stmt = VEC_index (gimple,
3657 SLP_TREE_SCALAR_STMTS (slp_node), k);
3659 orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (current_stmt));
3660 /* SLP statements can't participate in patterns. */
3661 gcc_assert (!orig_stmt);
3662 scalar_dest = gimple_assign_lhs (current_stmt);
3665 phis = VEC_alloc (gimple, heap, 3);
3666 /* Find the loop-closed-use at the loop exit of the original scalar
3667 result. (The reduction result is expected to have two immediate uses -
3668 one at the latch block, and one at the loop exit). */
3669 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
3670 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
3671 VEC_safe_push (gimple, heap, phis, USE_STMT (use_p));
3673 /* We expect to have found an exit_phi because of loop-closed-ssa
3675 gcc_assert (!VEC_empty (gimple, phis));
3677 FOR_EACH_VEC_ELT (gimple, phis, i, exit_phi)
3681 stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
3684 /* FORNOW. Currently not supporting the case that an inner-loop
3685 reduction is not used in the outer-loop (but only outside the
3686 outer-loop), unless it is double reduction. */
3687 gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
3688 && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
3691 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt;
3693 || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
3694 != vect_double_reduction_def)
3697 /* Handle double reduction:
3699 stmt1: s1 = phi <s0, s2> - double reduction phi (outer loop)
3700 stmt2: s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
3701 stmt3: s4 = use (s3) - (regular) reduc stmt (inner loop)
3702 stmt4: s2 = phi <s4> - double reduction stmt (outer loop)
3704 At that point the regular reduction (stmt2 and stmt3) is
3705 already vectorized, as well as the exit phi node, stmt4.
3706 Here we vectorize the phi node of double reduction, stmt1, and
3707 update all relevant statements. */
3709 /* Go through all the uses of s2 to find double reduction phi
3710 node, i.e., stmt1 above. */
3711 orig_name = PHI_RESULT (exit_phi);
3712 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
3714 stmt_vec_info use_stmt_vinfo = vinfo_for_stmt (use_stmt);
3715 stmt_vec_info new_phi_vinfo;
3716 tree vect_phi_init, preheader_arg, vect_phi_res, init_def;
3717 basic_block bb = gimple_bb (use_stmt);
3720 /* Check that USE_STMT is really double reduction phi
3722 if (gimple_code (use_stmt) != GIMPLE_PHI
3723 || gimple_phi_num_args (use_stmt) != 2
3725 || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
3726 != vect_double_reduction_def
3727 || bb->loop_father != outer_loop)
3730 /* Create vector phi node for double reduction:
3731 vs1 = phi <vs0, vs2>
3732 vs1 was created previously in this function by a call to
3733 vect_get_vec_def_for_operand and is stored in
3735 vs2 is defined by EPILOG_STMT, the vectorized EXIT_PHI;
3736 vs0 is created here. */
3738 /* Create vector phi node. */
3739 vect_phi = create_phi_node (vec_initial_def, bb);
3740 new_phi_vinfo = new_stmt_vec_info (vect_phi,
3741 loop_vec_info_for_loop (outer_loop), NULL);
3742 set_vinfo_for_stmt (vect_phi, new_phi_vinfo);
3744 /* Create vs0 - initial def of the double reduction phi. */
3745 preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
3746 loop_preheader_edge (outer_loop));
3747 init_def = get_initial_def_for_reduction (stmt,
3748 preheader_arg, NULL);
3749 vect_phi_init = vect_init_vector (use_stmt, init_def,
3752 /* Update phi node arguments with vs0 and vs2. */
3753 add_phi_arg (vect_phi, vect_phi_init,
3754 loop_preheader_edge (outer_loop),
3756 add_phi_arg (vect_phi, PHI_RESULT (epilog_stmt),
3757 loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
3758 if (vect_print_dump_info (REPORT_DETAILS))
3760 fprintf (vect_dump, "created double reduction phi "
3762 print_gimple_stmt (vect_dump, vect_phi, 0, TDF_SLIM);
3765 vect_phi_res = PHI_RESULT (vect_phi);
3767 /* Replace the use, i.e., set the correct vs1 in the regular
3768 reduction phi node. FORNOW, NCOPIES is always 1, so the
3769 loop is redundant. */
3770 use = reduction_phi;
3771 for (j = 0; j < ncopies; j++)
3773 edge pr_edge = loop_preheader_edge (loop);
3774 SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res);
3775 use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
3781 VEC_free (gimple, heap, phis);
3782 if (nested_in_vect_loop)
3790 phis = VEC_alloc (gimple, heap, 3);
3791 /* Find the loop-closed-use at the loop exit of the original scalar
3792 result. (The reduction result is expected to have two immediate uses,
3793 one at the latch block, and one at the loop exit). For double
3794 reductions we are looking for exit phis of the outer loop. */
3795 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
3797 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
3798 VEC_safe_push (gimple, heap, phis, USE_STMT (use_p));
3801 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
3803 tree phi_res = PHI_RESULT (USE_STMT (use_p));
3805 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
3807 if (!flow_bb_inside_loop_p (loop,
3808 gimple_bb (USE_STMT (phi_use_p))))
3809 VEC_safe_push (gimple, heap, phis,
3810 USE_STMT (phi_use_p));
3816 FOR_EACH_VEC_ELT (gimple, phis, i, exit_phi)
3818 /* Replace the uses: */
3819 orig_name = PHI_RESULT (exit_phi);
3820 scalar_result = VEC_index (tree, scalar_results, k);
3821 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
3822 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
3823 SET_USE (use_p, scalar_result);
3826 VEC_free (gimple, heap, phis);
3829 VEC_free (tree, heap, scalar_results);
3830 VEC_free (gimple, heap, new_phis);
3834 /* Function vectorizable_reduction.
3836 Check if STMT performs a reduction operation that can be vectorized.
3837 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
3838 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
3839 Return FALSE if not a vectorizable STMT, TRUE otherwise.
3841 This function also handles reduction idioms (patterns) that have been
3842 recognized in advance during vect_pattern_recog. In this case, STMT may be
3844 X = pattern_expr (arg0, arg1, ..., X)
3845 and it's STMT_VINFO_RELATED_STMT points to the last stmt in the original
3846 sequence that had been detected and replaced by the pattern-stmt (STMT).
3848 In some cases of reduction patterns, the type of the reduction variable X is
3849 different than the type of the other arguments of STMT.
3850 In such cases, the vectype that is used when transforming STMT into a vector
3851 stmt is different than the vectype that is used to determine the
3852 vectorization factor, because it consists of a different number of elements
3853 than the actual number of elements that are being operated upon in parallel.
3855 For example, consider an accumulation of shorts into an int accumulator.
3856 On some targets it's possible to vectorize this pattern operating on 8
3857 shorts at a time (hence, the vectype for purposes of determining the
3858 vectorization factor should be V8HI); on the other hand, the vectype that
3859 is used to create the vector form is actually V4SI (the type of the result).
3861 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
3862 indicates what is the actual level of parallelism (V8HI in the example), so
3863 that the right vectorization factor would be derived. This vectype
3864 corresponds to the type of arguments to the reduction stmt, and should *NOT*
3865 be used to create the vectorized stmt. The right vectype for the vectorized
3866 stmt is obtained from the type of the result X:
3867 get_vectype_for_scalar_type (TREE_TYPE (X))
3869 This means that, contrary to "regular" reductions (or "regular" stmts in
3870 general), the following equation:
3871 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
3872 does *NOT* necessarily hold for reduction patterns. */
3875 vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
3876 gimple *vec_stmt, slp_tree slp_node)
3880 tree loop_vec_def0 = NULL_TREE, loop_vec_def1 = NULL_TREE;
3881 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3882 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
3883 tree vectype_in = NULL_TREE;
3884 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3885 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3886 enum tree_code code, orig_code, epilog_reduc_code;
3887 enum machine_mode vec_mode;
3889 optab optab, reduc_optab;
3890 tree new_temp = NULL_TREE;
3893 enum vect_def_type dt;
3894 gimple new_phi = NULL;
3898 stmt_vec_info orig_stmt_info;
3899 tree expr = NULL_TREE;
3903 stmt_vec_info prev_stmt_info, prev_phi_info;
3904 bool single_defuse_cycle = false;
3905 tree reduc_def = NULL_TREE;
3906 gimple new_stmt = NULL;
3909 bool nested_cycle = false, found_nested_cycle_def = false;
3910 gimple reduc_def_stmt = NULL;
3911 /* The default is that the reduction variable is the last in statement. */
3912 int reduc_index = 2;
3913 bool double_reduc = false, dummy;
3915 struct loop * def_stmt_loop, *outer_loop = NULL;
3917 gimple def_arg_stmt;
3918 VEC (tree, heap) *vec_oprnds0 = NULL, *vec_oprnds1 = NULL, *vect_defs = NULL;
3919 VEC (gimple, heap) *phis = NULL;
3923 if (nested_in_vect_loop_p (loop, stmt))
3927 nested_cycle = true;
3930 /* 1. Is vectorizable reduction? */
3931 /* Not supportable if the reduction variable is used in the loop. */
3932 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer)
3935 /* Reductions that are not used even in an enclosing outer-loop,
3936 are expected to be "live" (used out of the loop). */
3937 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
3938 && !STMT_VINFO_LIVE_P (stmt_info))
3941 /* Make sure it was already recognized as a reduction computation. */
3942 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
3943 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
3946 /* 2. Has this been recognized as a reduction pattern?
3948 Check if STMT represents a pattern that has been recognized
3949 in earlier analysis stages. For stmts that represent a pattern,
3950 the STMT_VINFO_RELATED_STMT field records the last stmt in
3951 the original sequence that constitutes the pattern. */
3953 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3956 orig_stmt_info = vinfo_for_stmt (orig_stmt);
3957 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt);
3958 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
3959 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
3962 /* 3. Check the operands of the operation. The first operands are defined
3963 inside the loop body. The last operand is the reduction variable,
3964 which is defined by the loop-header-phi. */
3966 gcc_assert (is_gimple_assign (stmt));
3969 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
3971 case GIMPLE_SINGLE_RHS:
3972 op_type = TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt));
3973 if (op_type == ternary_op)
3975 tree rhs = gimple_assign_rhs1 (stmt);
3976 ops[0] = TREE_OPERAND (rhs, 0);
3977 ops[1] = TREE_OPERAND (rhs, 1);
3978 ops[2] = TREE_OPERAND (rhs, 2);
3979 code = TREE_CODE (rhs);
3985 case GIMPLE_BINARY_RHS:
3986 code = gimple_assign_rhs_code (stmt);
3987 op_type = TREE_CODE_LENGTH (code);
3988 gcc_assert (op_type == binary_op);
3989 ops[0] = gimple_assign_rhs1 (stmt);
3990 ops[1] = gimple_assign_rhs2 (stmt);
3993 case GIMPLE_UNARY_RHS:
4000 scalar_dest = gimple_assign_lhs (stmt);
4001 scalar_type = TREE_TYPE (scalar_dest);
4002 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
4003 && !SCALAR_FLOAT_TYPE_P (scalar_type))
4006 /* All uses but the last are expected to be defined in the loop.
4007 The last use is the reduction variable. In case of nested cycle this
4008 assumption is not true: we use reduc_index to record the index of the
4009 reduction variable. */
4010 for (i = 0; i < op_type-1; i++)
4014 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
4015 if (i == 0 && code == COND_EXPR)
4018 is_simple_use = vect_is_simple_use_1 (ops[i], loop_vinfo, NULL,
4019 &def_stmt, &def, &dt, &tem);
4022 gcc_assert (is_simple_use);
4023 if (dt != vect_internal_def
4024 && dt != vect_external_def
4025 && dt != vect_constant_def
4026 && dt != vect_induction_def
4027 && !(dt == vect_nested_cycle && nested_cycle))
4030 if (dt == vect_nested_cycle)
4032 found_nested_cycle_def = true;
4033 reduc_def_stmt = def_stmt;
4038 is_simple_use = vect_is_simple_use (ops[i], loop_vinfo, NULL, &def_stmt,
4040 gcc_assert (is_simple_use);
4041 gcc_assert (dt == vect_reduction_def
4042 || dt == vect_nested_cycle
4043 || ((dt == vect_internal_def || dt == vect_external_def
4044 || dt == vect_constant_def || dt == vect_induction_def)
4045 && nested_cycle && found_nested_cycle_def));
4046 if (!found_nested_cycle_def)
4047 reduc_def_stmt = def_stmt;
4049 gcc_assert (gimple_code (reduc_def_stmt) == GIMPLE_PHI);
4051 gcc_assert (orig_stmt == vect_is_simple_reduction (loop_vinfo,
4056 gcc_assert (stmt == vect_is_simple_reduction (loop_vinfo, reduc_def_stmt,
4057 !nested_cycle, &dummy));
4059 if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt)))
4065 ncopies = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4066 / TYPE_VECTOR_SUBPARTS (vectype_in));
4068 gcc_assert (ncopies >= 1);
4070 vec_mode = TYPE_MODE (vectype_in);
4072 if (code == COND_EXPR)
4074 if (!vectorizable_condition (stmt, gsi, NULL, ops[reduc_index], 0))
4076 if (vect_print_dump_info (REPORT_DETAILS))
4077 fprintf (vect_dump, "unsupported condition in reduction");
4084 /* 4. Supportable by target? */
4086 /* 4.1. check support for the operation in the loop */
4087 optab = optab_for_tree_code (code, vectype_in, optab_default);
4090 if (vect_print_dump_info (REPORT_DETAILS))
4091 fprintf (vect_dump, "no optab.");
4096 if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
4098 if (vect_print_dump_info (REPORT_DETAILS))
4099 fprintf (vect_dump, "op not supported by target.");
4101 if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
4102 || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4103 < vect_min_worthwhile_factor (code))
4106 if (vect_print_dump_info (REPORT_DETAILS))
4107 fprintf (vect_dump, "proceeding using word mode.");
4110 /* Worthwhile without SIMD support? */
4111 if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
4112 && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4113 < vect_min_worthwhile_factor (code))
4115 if (vect_print_dump_info (REPORT_DETAILS))
4116 fprintf (vect_dump, "not worthwhile without SIMD support.");
4122 /* 4.2. Check support for the epilog operation.
4124 If STMT represents a reduction pattern, then the type of the
4125 reduction variable may be different than the type of the rest
4126 of the arguments. For example, consider the case of accumulation
4127 of shorts into an int accumulator; The original code:
4128 S1: int_a = (int) short_a;
4129 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
4132 STMT: int_acc = widen_sum <short_a, int_acc>
4135 1. The tree-code that is used to create the vector operation in the
4136 epilog code (that reduces the partial results) is not the
4137 tree-code of STMT, but is rather the tree-code of the original
4138 stmt from the pattern that STMT is replacing. I.e, in the example
4139 above we want to use 'widen_sum' in the loop, but 'plus' in the
4141 2. The type (mode) we use to check available target support
4142 for the vector operation to be created in the *epilog*, is
4143 determined by the type of the reduction variable (in the example
4144 above we'd check this: optab_handler (plus_optab, vect_int_mode])).
4145 However the type (mode) we use to check available target support
4146 for the vector operation to be created *inside the loop*, is
4147 determined by the type of the other arguments to STMT (in the
4148 example we'd check this: optab_handler (widen_sum_optab,
4151 This is contrary to "regular" reductions, in which the types of all
4152 the arguments are the same as the type of the reduction variable.
4153 For "regular" reductions we can therefore use the same vector type
4154 (and also the same tree-code) when generating the epilog code and
4155 when generating the code inside the loop. */
4159 /* This is a reduction pattern: get the vectype from the type of the
4160 reduction variable, and get the tree-code from orig_stmt. */
4161 orig_code = gimple_assign_rhs_code (orig_stmt);
4162 gcc_assert (vectype_out);
4163 vec_mode = TYPE_MODE (vectype_out);
4167 /* Regular reduction: use the same vectype and tree-code as used for
4168 the vector code inside the loop can be used for the epilog code. */
4174 def_bb = gimple_bb (reduc_def_stmt);
4175 def_stmt_loop = def_bb->loop_father;
4176 def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
4177 loop_preheader_edge (def_stmt_loop));
4178 if (TREE_CODE (def_arg) == SSA_NAME
4179 && (def_arg_stmt = SSA_NAME_DEF_STMT (def_arg))
4180 && gimple_code (def_arg_stmt) == GIMPLE_PHI
4181 && flow_bb_inside_loop_p (outer_loop, gimple_bb (def_arg_stmt))
4182 && vinfo_for_stmt (def_arg_stmt)
4183 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_arg_stmt))
4184 == vect_double_reduction_def)
4185 double_reduc = true;
4188 epilog_reduc_code = ERROR_MARK;
4189 if (reduction_code_for_scalar_code (orig_code, &epilog_reduc_code))
4191 reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype_out,
4195 if (vect_print_dump_info (REPORT_DETAILS))
4196 fprintf (vect_dump, "no optab for reduction.");
4198 epilog_reduc_code = ERROR_MARK;
4202 && optab_handler (reduc_optab, vec_mode) == CODE_FOR_nothing)
4204 if (vect_print_dump_info (REPORT_DETAILS))
4205 fprintf (vect_dump, "reduc op not supported by target.");
4207 epilog_reduc_code = ERROR_MARK;
4212 if (!nested_cycle || double_reduc)
4214 if (vect_print_dump_info (REPORT_DETAILS))
4215 fprintf (vect_dump, "no reduc code for scalar code.");
4221 if (double_reduc && ncopies > 1)
4223 if (vect_print_dump_info (REPORT_DETAILS))
4224 fprintf (vect_dump, "multiple types in double reduction");
4229 if (!vec_stmt) /* transformation not required. */
4231 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
4232 if (!vect_model_reduction_cost (stmt_info, epilog_reduc_code, ncopies))
4239 if (vect_print_dump_info (REPORT_DETAILS))
4240 fprintf (vect_dump, "transform reduction.");
4242 /* FORNOW: Multiple types are not supported for condition. */
4243 if (code == COND_EXPR)
4244 gcc_assert (ncopies == 1);
4246 /* Create the destination vector */
4247 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
4249 /* In case the vectorization factor (VF) is bigger than the number
4250 of elements that we can fit in a vectype (nunits), we have to generate
4251 more than one vector stmt - i.e - we need to "unroll" the
4252 vector stmt by a factor VF/nunits. For more details see documentation
4253 in vectorizable_operation. */
4255 /* If the reduction is used in an outer loop we need to generate
4256 VF intermediate results, like so (e.g. for ncopies=2):
4261 (i.e. we generate VF results in 2 registers).
4262 In this case we have a separate def-use cycle for each copy, and therefore
4263 for each copy we get the vector def for the reduction variable from the
4264 respective phi node created for this copy.
4266 Otherwise (the reduction is unused in the loop nest), we can combine
4267 together intermediate results, like so (e.g. for ncopies=2):
4271 (i.e. we generate VF/2 results in a single register).
4272 In this case for each copy we get the vector def for the reduction variable
4273 from the vectorized reduction operation generated in the previous iteration.
4276 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope)
4278 single_defuse_cycle = true;
4282 epilog_copies = ncopies;
4284 prev_stmt_info = NULL;
4285 prev_phi_info = NULL;
4288 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
4289 gcc_assert (TYPE_VECTOR_SUBPARTS (vectype_out)
4290 == TYPE_VECTOR_SUBPARTS (vectype_in));
4295 vec_oprnds0 = VEC_alloc (tree, heap, 1);
4296 if (op_type == ternary_op)
4297 vec_oprnds1 = VEC_alloc (tree, heap, 1);
4300 phis = VEC_alloc (gimple, heap, vec_num);
4301 vect_defs = VEC_alloc (tree, heap, vec_num);
4303 VEC_quick_push (tree, vect_defs, NULL_TREE);
4305 for (j = 0; j < ncopies; j++)
4307 if (j == 0 || !single_defuse_cycle)
4309 for (i = 0; i < vec_num; i++)
4311 /* Create the reduction-phi that defines the reduction
4313 new_phi = create_phi_node (vec_dest, loop->header);
4314 set_vinfo_for_stmt (new_phi,
4315 new_stmt_vec_info (new_phi, loop_vinfo,
4317 if (j == 0 || slp_node)
4318 VEC_quick_push (gimple, phis, new_phi);
4322 if (code == COND_EXPR)
4324 gcc_assert (!slp_node);
4325 vectorizable_condition (stmt, gsi, vec_stmt,
4326 PHI_RESULT (VEC_index (gimple, phis, 0)),
4328 /* Multiple types are not supported for condition. */
4336 vect_get_slp_defs (slp_node, &vec_oprnds0, &vec_oprnds1, -1);
4339 loop_vec_def0 = vect_get_vec_def_for_operand (ops[!reduc_index],
4341 VEC_quick_push (tree, vec_oprnds0, loop_vec_def0);
4342 if (op_type == ternary_op)
4344 if (reduc_index == 0)
4345 loop_vec_def1 = vect_get_vec_def_for_operand (ops[2], stmt,
4348 loop_vec_def1 = vect_get_vec_def_for_operand (ops[1], stmt,
4351 VEC_quick_push (tree, vec_oprnds1, loop_vec_def1);
4359 enum vect_def_type dt = vect_unknown_def_type; /* Dummy */
4360 loop_vec_def0 = vect_get_vec_def_for_stmt_copy (dt, loop_vec_def0);
4361 VEC_replace (tree, vec_oprnds0, 0, loop_vec_def0);
4362 if (op_type == ternary_op)
4364 loop_vec_def1 = vect_get_vec_def_for_stmt_copy (dt,
4366 VEC_replace (tree, vec_oprnds1, 0, loop_vec_def1);
4370 if (single_defuse_cycle)
4371 reduc_def = gimple_assign_lhs (new_stmt);
4373 STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
4376 FOR_EACH_VEC_ELT (tree, vec_oprnds0, i, def0)
4379 reduc_def = PHI_RESULT (VEC_index (gimple, phis, i));
4382 if (!single_defuse_cycle || j == 0)
4383 reduc_def = PHI_RESULT (new_phi);
4386 def1 = ((op_type == ternary_op)
4387 ? VEC_index (tree, vec_oprnds1, i) : NULL);
4388 if (op_type == binary_op)
4390 if (reduc_index == 0)
4391 expr = build2 (code, vectype_out, reduc_def, def0);
4393 expr = build2 (code, vectype_out, def0, reduc_def);
4397 if (reduc_index == 0)
4398 expr = build3 (code, vectype_out, reduc_def, def0, def1);
4401 if (reduc_index == 1)
4402 expr = build3 (code, vectype_out, def0, reduc_def, def1);
4404 expr = build3 (code, vectype_out, def0, def1, reduc_def);
4408 new_stmt = gimple_build_assign (vec_dest, expr);
4409 new_temp = make_ssa_name (vec_dest, new_stmt);
4410 gimple_assign_set_lhs (new_stmt, new_temp);
4411 vect_finish_stmt_generation (stmt, new_stmt, gsi);
4414 VEC_quick_push (gimple, SLP_TREE_VEC_STMTS (slp_node), new_stmt);
4415 VEC_quick_push (tree, vect_defs, new_temp);
4418 VEC_replace (tree, vect_defs, 0, new_temp);
4425 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
4427 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
4429 prev_stmt_info = vinfo_for_stmt (new_stmt);
4430 prev_phi_info = vinfo_for_stmt (new_phi);
4433 /* Finalize the reduction-phi (set its arguments) and create the
4434 epilog reduction code. */
4435 if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
4437 new_temp = gimple_assign_lhs (*vec_stmt);
4438 VEC_replace (tree, vect_defs, 0, new_temp);
4441 vect_create_epilog_for_reduction (vect_defs, stmt, epilog_copies,
4442 epilog_reduc_code, phis, reduc_index,
4443 double_reduc, slp_node);
4445 VEC_free (gimple, heap, phis);
4446 VEC_free (tree, heap, vec_oprnds0);
4448 VEC_free (tree, heap, vec_oprnds1);
4453 /* Function vect_min_worthwhile_factor.
4455 For a loop where we could vectorize the operation indicated by CODE,
4456 return the minimum vectorization factor that makes it worthwhile
4457 to use generic vectors. */
4459 vect_min_worthwhile_factor (enum tree_code code)
4480 /* Function vectorizable_induction
4482 Check if PHI performs an induction computation that can be vectorized.
4483 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
4484 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
4485 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
4488 vectorizable_induction (gimple phi, gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
4491 stmt_vec_info stmt_info = vinfo_for_stmt (phi);
4492 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4493 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4494 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4495 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
4496 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
4499 gcc_assert (ncopies >= 1);
4500 /* FORNOW. This restriction should be relaxed. */
4501 if (nested_in_vect_loop_p (loop, phi) && ncopies > 1)
4503 if (vect_print_dump_info (REPORT_DETAILS))
4504 fprintf (vect_dump, "multiple types in nested loop.");
4508 if (!STMT_VINFO_RELEVANT_P (stmt_info))
4511 /* FORNOW: SLP not supported. */
4512 if (STMT_SLP_TYPE (stmt_info))
4515 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def);
4517 if (gimple_code (phi) != GIMPLE_PHI)
4520 if (!vec_stmt) /* transformation not required. */
4522 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
4523 if (vect_print_dump_info (REPORT_DETAILS))
4524 fprintf (vect_dump, "=== vectorizable_induction ===");
4525 vect_model_induction_cost (stmt_info, ncopies);
4531 if (vect_print_dump_info (REPORT_DETAILS))
4532 fprintf (vect_dump, "transform induction phi.");
4534 vec_def = get_initial_def_for_induction (phi);
4535 *vec_stmt = SSA_NAME_DEF_STMT (vec_def);
4539 /* Function vectorizable_live_operation.
4541 STMT computes a value that is used outside the loop. Check if
4542 it can be supported. */
4545 vectorizable_live_operation (gimple stmt,
4546 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
4547 gimple *vec_stmt ATTRIBUTE_UNUSED)
4549 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4550 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4551 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4557 enum vect_def_type dt;
4558 enum tree_code code;
4559 enum gimple_rhs_class rhs_class;
4561 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
4563 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
4566 if (!is_gimple_assign (stmt))
4569 if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
4572 /* FORNOW. CHECKME. */
4573 if (nested_in_vect_loop_p (loop, stmt))
4576 code = gimple_assign_rhs_code (stmt);
4577 op_type = TREE_CODE_LENGTH (code);
4578 rhs_class = get_gimple_rhs_class (code);
4579 gcc_assert (rhs_class != GIMPLE_UNARY_RHS || op_type == unary_op);
4580 gcc_assert (rhs_class != GIMPLE_BINARY_RHS || op_type == binary_op);
4582 /* FORNOW: support only if all uses are invariant. This means
4583 that the scalar operations can remain in place, unvectorized.
4584 The original last scalar value that they compute will be used. */
4586 for (i = 0; i < op_type; i++)
4588 if (rhs_class == GIMPLE_SINGLE_RHS)
4589 op = TREE_OPERAND (gimple_op (stmt, 1), i);
4591 op = gimple_op (stmt, i + 1);
4593 && !vect_is_simple_use (op, loop_vinfo, NULL, &def_stmt, &def, &dt))
4595 if (vect_print_dump_info (REPORT_DETAILS))
4596 fprintf (vect_dump, "use not simple.");
4600 if (dt != vect_external_def && dt != vect_constant_def)
4604 /* No transformation is required for the cases we currently support. */
4608 /* Kill any debug uses outside LOOP of SSA names defined in STMT. */
4611 vect_loop_kill_debug_uses (struct loop *loop, gimple stmt)
4613 ssa_op_iter op_iter;
4614 imm_use_iterator imm_iter;
4615 def_operand_p def_p;
4618 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt, op_iter, SSA_OP_DEF)
4620 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
4624 if (!is_gimple_debug (ustmt))
4627 bb = gimple_bb (ustmt);
4629 if (!flow_bb_inside_loop_p (loop, bb))
4631 if (gimple_debug_bind_p (ustmt))
4633 if (vect_print_dump_info (REPORT_DETAILS))
4634 fprintf (vect_dump, "killing debug use");
4636 gimple_debug_bind_reset_value (ustmt);
4637 update_stmt (ustmt);
4646 /* Function vect_transform_loop.
4648 The analysis phase has determined that the loop is vectorizable.
4649 Vectorize the loop - created vectorized stmts to replace the scalar
4650 stmts in the loop, and update the loop exit condition. */
4653 vect_transform_loop (loop_vec_info loop_vinfo)
4655 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4656 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
4657 int nbbs = loop->num_nodes;
4658 gimple_stmt_iterator si;
4661 int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
4663 bool slp_scheduled = false;
4664 unsigned int nunits;
4665 tree cond_expr = NULL_TREE;
4666 gimple_seq cond_expr_stmt_list = NULL;
4667 bool do_peeling_for_loop_bound;
4669 if (vect_print_dump_info (REPORT_DETAILS))
4670 fprintf (vect_dump, "=== vec_transform_loop ===");
4672 /* Peel the loop if there are data refs with unknown alignment.
4673 Only one data ref with unknown store is allowed. */
4675 if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
4676 vect_do_peeling_for_alignment (loop_vinfo);
4678 do_peeling_for_loop_bound
4679 = (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
4680 || (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
4681 && LOOP_VINFO_INT_NITERS (loop_vinfo) % vectorization_factor != 0));
4683 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
4684 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
4685 vect_loop_versioning (loop_vinfo,
4686 !do_peeling_for_loop_bound,
4687 &cond_expr, &cond_expr_stmt_list);
4689 /* If the loop has a symbolic number of iterations 'n' (i.e. it's not a
4690 compile time constant), or it is a constant that doesn't divide by the
4691 vectorization factor, then an epilog loop needs to be created.
4692 We therefore duplicate the loop: the original loop will be vectorized,
4693 and will compute the first (n/VF) iterations. The second copy of the loop
4694 will remain scalar and will compute the remaining (n%VF) iterations.
4695 (VF is the vectorization factor). */
4697 if (do_peeling_for_loop_bound)
4698 vect_do_peeling_for_loop_bound (loop_vinfo, &ratio,
4699 cond_expr, cond_expr_stmt_list);
4701 ratio = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
4702 LOOP_VINFO_INT_NITERS (loop_vinfo) / vectorization_factor);
4704 /* 1) Make sure the loop header has exactly two entries
4705 2) Make sure we have a preheader basic block. */
4707 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
4709 split_edge (loop_preheader_edge (loop));
4711 /* FORNOW: the vectorizer supports only loops which body consist
4712 of one basic block (header + empty latch). When the vectorizer will
4713 support more involved loop forms, the order by which the BBs are
4714 traversed need to be reconsidered. */
4716 for (i = 0; i < nbbs; i++)
4718 basic_block bb = bbs[i];
4719 stmt_vec_info stmt_info;
4722 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
4724 phi = gsi_stmt (si);
4725 if (vect_print_dump_info (REPORT_DETAILS))
4727 fprintf (vect_dump, "------>vectorizing phi: ");
4728 print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
4730 stmt_info = vinfo_for_stmt (phi);
4734 if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
4735 vect_loop_kill_debug_uses (loop, phi);
4737 if (!STMT_VINFO_RELEVANT_P (stmt_info)
4738 && !STMT_VINFO_LIVE_P (stmt_info))
4741 if ((TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
4742 != (unsigned HOST_WIDE_INT) vectorization_factor)
4743 && vect_print_dump_info (REPORT_DETAILS))
4744 fprintf (vect_dump, "multiple-types.");
4746 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
4748 if (vect_print_dump_info (REPORT_DETAILS))
4749 fprintf (vect_dump, "transform phi.");
4750 vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
4754 for (si = gsi_start_bb (bb); !gsi_end_p (si);)
4756 gimple stmt = gsi_stmt (si);
4759 if (vect_print_dump_info (REPORT_DETAILS))
4761 fprintf (vect_dump, "------>vectorizing statement: ");
4762 print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
4765 stmt_info = vinfo_for_stmt (stmt);
4767 /* vector stmts created in the outer-loop during vectorization of
4768 stmts in an inner-loop may not have a stmt_info, and do not
4769 need to be vectorized. */
4776 if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
4777 vect_loop_kill_debug_uses (loop, stmt);
4779 if (!STMT_VINFO_RELEVANT_P (stmt_info)
4780 && !STMT_VINFO_LIVE_P (stmt_info))
4786 gcc_assert (STMT_VINFO_VECTYPE (stmt_info));
4788 (unsigned int) TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
4789 if (!STMT_SLP_TYPE (stmt_info)
4790 && nunits != (unsigned int) vectorization_factor
4791 && vect_print_dump_info (REPORT_DETAILS))
4792 /* For SLP VF is set according to unrolling factor, and not to
4793 vector size, hence for SLP this print is not valid. */
4794 fprintf (vect_dump, "multiple-types.");
4796 /* SLP. Schedule all the SLP instances when the first SLP stmt is
4798 if (STMT_SLP_TYPE (stmt_info))
4802 slp_scheduled = true;
4804 if (vect_print_dump_info (REPORT_DETAILS))
4805 fprintf (vect_dump, "=== scheduling SLP instances ===");
4807 vect_schedule_slp (loop_vinfo, NULL);
4810 /* Hybrid SLP stmts must be vectorized in addition to SLP. */
4811 if (!vinfo_for_stmt (stmt) || PURE_SLP_STMT (stmt_info))
4818 /* -------- vectorize statement ------------ */
4819 if (vect_print_dump_info (REPORT_DETAILS))
4820 fprintf (vect_dump, "transform statement.");
4822 strided_store = false;
4823 is_store = vect_transform_stmt (stmt, &si, &strided_store, NULL, NULL);
4826 if (STMT_VINFO_STRIDED_ACCESS (stmt_info))
4828 /* Interleaving. If IS_STORE is TRUE, the vectorization of the
4829 interleaving chain was completed - free all the stores in
4831 vect_remove_stores (DR_GROUP_FIRST_DR (stmt_info));
4832 gsi_remove (&si, true);
4837 /* Free the attached stmt_vec_info and remove the stmt. */
4838 free_stmt_vec_info (stmt);
4839 gsi_remove (&si, true);
4847 slpeel_make_loop_iterate_ntimes (loop, ratio);
4849 /* The memory tags and pointers in vectorized statements need to
4850 have their SSA forms updated. FIXME, why can't this be delayed
4851 until all the loops have been transformed? */
4852 update_ssa (TODO_update_ssa);
4854 if (vect_print_dump_info (REPORT_VECTORIZED_LOCATIONS))
4855 fprintf (vect_dump, "LOOP VECTORIZED.");
4856 if (loop->inner && vect_print_dump_info (REPORT_VECTORIZED_LOCATIONS))
4857 fprintf (vect_dump, "OUTER LOOP VECTORIZED.");