2 Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010
3 Free Software Foundation, Inc.
4 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
5 Ira Rosen <irar@il.ibm.com>
7 This file is part of GCC.
9 GCC is free software; you can redistribute it and/or modify it under
10 the terms of the GNU General Public License as published by the Free
11 Software Foundation; either version 3, or (at your option) any later
14 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
15 WARRANTY; without even the implied warranty of MERCHANTABILITY or
16 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
19 You should have received a copy of the GNU General Public License
20 along with GCC; see the file COPYING3. If not see
21 <http://www.gnu.org/licenses/>. */
25 #include "coretypes.h"
29 #include "basic-block.h"
30 #include "tree-pretty-print.h"
31 #include "gimple-pretty-print.h"
32 #include "tree-flow.h"
33 #include "tree-dump.h"
35 #include "cfglayout.h"
40 #include "diagnostic-core.h"
41 #include "tree-chrec.h"
42 #include "tree-scalar-evolution.h"
43 #include "tree-vectorizer.h"
46 /* Loop Vectorization Pass.
48 This pass tries to vectorize loops.
50 For example, the vectorizer transforms the following simple loop:
52 short a[N]; short b[N]; short c[N]; int i;
58 as if it were manually vectorized by rewriting the source code into:
60 typedef int __attribute__((mode(V8HI))) v8hi;
61 short a[N]; short b[N]; short c[N]; int i;
62 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
65 for (i=0; i<N/8; i++){
72 The main entry to this pass is vectorize_loops(), in which
73 the vectorizer applies a set of analyses on a given set of loops,
74 followed by the actual vectorization transformation for the loops that
75 had successfully passed the analysis phase.
76 Throughout this pass we make a distinction between two types of
77 data: scalars (which are represented by SSA_NAMES), and memory references
78 ("data-refs"). These two types of data require different handling both
79 during analysis and transformation. The types of data-refs that the
80 vectorizer currently supports are ARRAY_REFS whose base is an array DECL
81 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
82 accesses are required to have a simple (consecutive) access pattern.
86 The driver for the analysis phase is vect_analyze_loop().
87 It applies a set of analyses, some of which rely on the scalar evolution
88 analyzer (scev) developed by Sebastian Pop.
90 During the analysis phase the vectorizer records some information
91 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
92 loop, as well as general information about the loop as a whole, which is
93 recorded in a "loop_vec_info" struct attached to each loop.
97 The loop transformation phase scans all the stmts in the loop, and
98 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
99 the loop that needs to be vectorized. It inserts the vector code sequence
100 just before the scalar stmt S, and records a pointer to the vector code
101 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
102 attached to S). This pointer will be used for the vectorization of following
103 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
104 otherwise, we rely on dead code elimination for removing it.
106 For example, say stmt S1 was vectorized into stmt VS1:
109 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
112 To vectorize stmt S2, the vectorizer first finds the stmt that defines
113 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
114 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
115 resulting sequence would be:
118 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
120 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
122 Operands that are not SSA_NAMEs, are data-refs that appear in
123 load/store operations (like 'x[i]' in S1), and are handled differently.
127 Currently the only target specific information that is used is the
128 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
129 Targets that can support different sizes of vectors, for now will need
130 to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
131 flexibility will be added in the future.
133 Since we only vectorize operations whose vector form can be
134 expressed using existing tree codes, to verify that an operation is
135 supported, the vectorizer checks the relevant optab at the relevant
136 machine_mode (e.g, optab_handler (add_optab, V8HImode)). If
137 the value found is CODE_FOR_nothing, then there's no target support, and
138 we can't vectorize the stmt.
140 For additional information on this project see:
141 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
144 /* Function vect_determine_vectorization_factor
146 Determine the vectorization factor (VF). VF is the number of data elements
147 that are operated upon in parallel in a single iteration of the vectorized
148 loop. For example, when vectorizing a loop that operates on 4byte elements,
149 on a target with vector size (VS) 16byte, the VF is set to 4, since 4
150 elements can fit in a single vector register.
152 We currently support vectorization of loops in which all types operated upon
153 are of the same size. Therefore this function currently sets VF according to
154 the size of the types operated upon, and fails if there are multiple sizes
157 VF is also the factor by which the loop iterations are strip-mined, e.g.:
164 for (i=0; i<N; i+=VF){
165 a[i:VF] = b[i:VF] + c[i:VF];
/* Walk every phi and statement of the loop described by LOOP_VINFO,
   compute a vector type for each relevant one, and record the largest
   number of vector lanes (subparts) seen as the loop's vectorization
   factor.  NOTE(review): this listing is an excerpt -- intermediate
   source lines are missing between the numbered lines.  */
170 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
172 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
173 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
174 int nbbs = loop->num_nodes;
175 gimple_stmt_iterator si;
176 unsigned int vectorization_factor = 0;
181 stmt_vec_info stmt_info;
184 gimple stmt, pattern_stmt = NULL, pattern_def_stmt = NULL;
185 bool analyze_pattern_stmt = false, pattern_def = false;
187 if (vect_print_dump_info (REPORT_DETAILS))
188 fprintf (vect_dump, "=== vect_determine_vectorization_factor ===");
/* Iterate over all basic blocks of the loop.  */
190 for (i = 0; i < nbbs; i++)
192 basic_block bb = bbs[i];
/* First: the loop-header phis.  Only relevant phis contribute a
   vector type; the vectype is derived from the phi result's type.  */
194 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
197 stmt_info = vinfo_for_stmt (phi);
198 if (vect_print_dump_info (REPORT_DETAILS))
200 fprintf (vect_dump, "==> examining phi: ");
201 print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
204 gcc_assert (stmt_info);
206 if (STMT_VINFO_RELEVANT_P (stmt_info))
208 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
209 scalar_type = TREE_TYPE (PHI_RESULT (phi));
211 if (vect_print_dump_info (REPORT_DETAILS))
213 fprintf (vect_dump, "get vectype for scalar type: ");
214 print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
217 vectype = get_vectype_for_scalar_type (scalar_type);
/* No target vector type for this scalar type => loop not vectorizable.  */
220 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
223 "not vectorized: unsupported data-type ");
224 print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
228 STMT_VINFO_VECTYPE (stmt_info) = vectype;
230 if (vect_print_dump_info (REPORT_DETAILS))
232 fprintf (vect_dump, "vectype: ");
233 print_generic_expr (vect_dump, vectype, TDF_SLIM);
236 nunits = TYPE_VECTOR_SUBPARTS (vectype);
237 if (vect_print_dump_info (REPORT_DETAILS))
238 fprintf (vect_dump, "nunits = %d", nunits);
/* Keep the maximum lane count seen so far.  */
240 if (!vectorization_factor
241 || (nunits > vectorization_factor))
242 vectorization_factor = nunits;
/* Second: the ordinary statements.  The iterator is advanced manually
   (no gsi_next in the for-header) because pattern statements may be
   analyzed in place of the current statement.  */
246 for (si = gsi_start_bb (bb); !gsi_end_p (si) || analyze_pattern_stmt;)
250 if (analyze_pattern_stmt)
253 analyze_pattern_stmt = false;
256 stmt = gsi_stmt (si);
258 stmt_info = vinfo_for_stmt (stmt);
260 if (vect_print_dump_info (REPORT_DETAILS))
262 fprintf (vect_dump, "==> examining statement: ");
263 print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
266 gcc_assert (stmt_info);
268 /* Skip stmts which do not need to be vectorized. */
269 if (!STMT_VINFO_RELEVANT_P (stmt_info)
270 && !STMT_VINFO_LIVE_P (stmt_info))
/* An irrelevant stmt may still be the original of a relevant
   pattern stmt; in that case analyze the pattern stmt instead.  */
272 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
273 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
274 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
275 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
278 stmt_info = vinfo_for_stmt (pattern_stmt);
279 if (vect_print_dump_info (REPORT_DETAILS))
281 fprintf (vect_dump, "==> examining pattern statement: ");
282 print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
287 if (vect_print_dump_info (REPORT_DETAILS))
288 fprintf (vect_dump, "skip.");
/* The current stmt is relevant but also has a relevant pattern
   stmt: remember to analyze the pattern stmt on the next pass.  */
293 else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
294 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
295 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
296 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
297 analyze_pattern_stmt = true;
299 /* If a pattern statement has a def stmt, analyze it too. */
300 if (is_pattern_stmt_p (stmt_info)
301 && (pattern_def_stmt = STMT_VINFO_PATTERN_DEF_STMT (stmt_info))
302 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_def_stmt))
303 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_def_stmt))))
309 if (vect_print_dump_info (REPORT_DETAILS))
311 fprintf (vect_dump, "==> examining pattern def stmt: ");
312 print_gimple_stmt (vect_dump, pattern_def_stmt, 0,
317 stmt = pattern_def_stmt;
318 stmt_info = vinfo_for_stmt (stmt);
/* A stmt without an lhs (e.g. a call used for side effects) cannot
   be given a vectype here -- reject the loop.  */
322 if (gimple_get_lhs (stmt) == NULL_TREE)
324 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
326 fprintf (vect_dump, "not vectorized: irregular stmt.");
327 print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
/* Pre-existing vector code in the loop is not re-vectorized.  */
332 if (VECTOR_MODE_P (TYPE_MODE (gimple_expr_type (stmt))))
334 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
336 fprintf (vect_dump, "not vectorized: vector stmt in loop:");
337 print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
342 if (STMT_VINFO_VECTYPE (stmt_info))
344 /* The only case when a vectype had been already set is for stmts
345 that contain a dataref, or for "pattern-stmts" (stmts
346 generated by the vectorizer to represent/replace a certain
348 gcc_assert (STMT_VINFO_DATA_REF (stmt_info)
349 || is_pattern_stmt_p (stmt_info)
351 vectype = STMT_VINFO_VECTYPE (stmt_info);
355 gcc_assert (!STMT_VINFO_DATA_REF (stmt_info));
356 scalar_type = TREE_TYPE (gimple_get_lhs (stmt));
357 if (vect_print_dump_info (REPORT_DETAILS))
359 fprintf (vect_dump, "get vectype for scalar type: ");
360 print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
362 vectype = get_vectype_for_scalar_type (scalar_type);
365 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
368 "not vectorized: unsupported data-type ");
369 print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
374 STMT_VINFO_VECTYPE (stmt_info) = vectype;
377 /* The vectorization factor is according to the smallest
378 scalar type (or the largest vector size, but we only
379 support one vector size per loop). */
380 scalar_type = vect_get_smallest_scalar_type (stmt, &dummy,
382 if (vect_print_dump_info (REPORT_DETAILS))
384 fprintf (vect_dump, "get vectype for scalar type: ");
385 print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
387 vf_vectype = get_vectype_for_scalar_type (scalar_type);
390 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
393 "not vectorized: unsupported data-type ");
394 print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
/* Only one vector size per loop is supported: the stmt's own
   vectype and the VF vectype must have the same mode size.  */
399 if ((GET_MODE_SIZE (TYPE_MODE (vectype))
400 != GET_MODE_SIZE (TYPE_MODE (vf_vectype))))
402 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
405 "not vectorized: different sized vector "
406 "types in statement, ");
407 print_generic_expr (vect_dump, vectype, TDF_SLIM);
408 fprintf (vect_dump, " and ");
409 print_generic_expr (vect_dump, vf_vectype, TDF_SLIM);
414 if (vect_print_dump_info (REPORT_DETAILS))
416 fprintf (vect_dump, "vectype: ");
417 print_generic_expr (vect_dump, vf_vectype, TDF_SLIM);
420 nunits = TYPE_VECTOR_SUBPARTS (vf_vectype);
421 if (vect_print_dump_info (REPORT_DETAILS))
422 fprintf (vect_dump, "nunits = %d", nunits);
424 if (!vectorization_factor
425 || (nunits > vectorization_factor))
426 vectorization_factor = nunits;
/* Advance the iterator only when no pattern (def) stmt remains
   to be analyzed for the current statement.  */
428 if (!analyze_pattern_stmt && !pattern_def)
433 /* TODO: Analyze cost. Decide if worth while to vectorize. */
434 if (vect_print_dump_info (REPORT_DETAILS))
435 fprintf (vect_dump, "vectorization factor = %d", vectorization_factor);
/* A factor of 1 (or 0) means no parallelism was found -- fail.  */
436 if (vectorization_factor <= 1)
438 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
439 fprintf (vect_dump, "not vectorized: unsupported data-type");
442 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
448 /* Function vect_is_simple_iv_evolution.
450 FORNOW: A simple evolution of an induction variables in the loop is
451 considered a polynomial evolution with constant step. */
/* Return whether ACCESS_FN is a "simple" induction in loop LOOP_NB:
   a polynomial chrec of degree 1 with an INTEGER_CST step.  On success
   the initial value and step are presumably stored through INIT/STEP
   (out-parameters; exact contract not fully visible in this excerpt).  */
454 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
459 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
461 /* When there is no evolution in this loop, the evolution function
463 if (evolution_part == NULL_TREE)
466 /* When the evolution is a polynomial of degree >= 2
467 the evolution function is not "simple". */
468 if (tree_is_chrec (evolution_part))
471 step_expr = evolution_part;
/* Unshare the initial condition so callers may modify it freely.  */
472 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
474 if (vect_print_dump_info (REPORT_DETAILS))
476 fprintf (vect_dump, "step: ");
477 print_generic_expr (vect_dump, step_expr, TDF_SLIM);
478 fprintf (vect_dump, ", init: ");
479 print_generic_expr (vect_dump, init_expr, TDF_SLIM);
/* Only constant-step inductions are considered simple.  */
485 if (TREE_CODE (step_expr) != INTEGER_CST)
487 if (vect_print_dump_info (REPORT_DETAILS))
488 fprintf (vect_dump, "step unknown.");
495 /* Function vect_analyze_scalar_cycles_1.
497 Examine the cross iteration def-use cycles of scalar variables
498 in LOOP. LOOP_VINFO represents the loop that is now being
499 considered for vectorization (can be LOOP, or an outer-loop
/* Classify the cross-iteration scalar cycles of LOOP (header phis) as
   inductions, reductions, nested cycles, double reductions, or unknown,
   recording the classification in each phi's STMT_VINFO_DEF_TYPE.
   Two passes: inductions first (reduction detection relies on them),
   then the remaining phis from the worklist.  */
503 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
505 basic_block bb = loop->header;
507 VEC(gimple,heap) *worklist = VEC_alloc (gimple, heap, 64);
508 gimple_stmt_iterator gsi;
511 if (vect_print_dump_info (REPORT_DETAILS))
512 fprintf (vect_dump, "=== vect_analyze_scalar_cycles ===");
514 /* First - identify all inductions. Reduction detection assumes that all the
515 inductions have been identified, therefore, this order must not be
517 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
519 gimple phi = gsi_stmt (gsi);
520 tree access_fn = NULL;
521 tree def = PHI_RESULT (phi);
522 stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
524 if (vect_print_dump_info (REPORT_DETAILS))
526 fprintf (vect_dump, "Analyze phi: ");
527 print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
530 /* Skip virtual phi's. The data dependences that are associated with
531 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
532 if (!is_gimple_reg (SSA_NAME_VAR (def)))
/* Until proven otherwise, the def type is unknown.  */
535 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
537 /* Analyze the evolution function. */
538 access_fn = analyze_scalar_evolution (loop, def);
540 STRIP_NOPS (access_fn);
541 if (access_fn && vect_print_dump_info (REPORT_DETAILS))
543 fprintf (vect_dump, "Access function of PHI: ");
544 print_generic_expr (vect_dump, access_fn, TDF_SLIM);
/* Not a simple induction: defer to the second pass via the worklist.  */
548 || !vect_is_simple_iv_evolution (loop->num, access_fn, &dumy, &dumy))
550 VEC_safe_push (gimple, heap, worklist, phi);
554 if (vect_print_dump_info (REPORT_DETAILS))
555 fprintf (vect_dump, "Detected induction.");
556 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
560 /* Second - identify all reductions and nested cycles. */
561 while (VEC_length (gimple, worklist) > 0)
563 gimple phi = VEC_pop (gimple, worklist);
564 tree def = PHI_RESULT (phi);
565 stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
569 if (vect_print_dump_info (REPORT_DETAILS))
571 fprintf (vect_dump, "Analyze phi: ");
572 print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
575 gcc_assert (is_gimple_reg (SSA_NAME_VAR (def)));
576 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
/* A cycle in the inner loop of an outer-loop nest is "nested".  */
578 nested_cycle = (loop != LOOP_VINFO_LOOP (loop_vinfo));
579 reduc_stmt = vect_force_simple_reduction (loop_vinfo, phi, !nested_cycle,
585 if (vect_print_dump_info (REPORT_DETAILS))
586 fprintf (vect_dump, "Detected double reduction.");
/* Mark both the phi and the reduction stmt.  */
588 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
589 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
590 vect_double_reduction_def;
596 if (vect_print_dump_info (REPORT_DETAILS))
597 fprintf (vect_dump, "Detected vectorizable nested cycle.");
599 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
600 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
605 if (vect_print_dump_info (REPORT_DETAILS))
606 fprintf (vect_dump, "Detected reduction.");
608 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
609 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
611 /* Store the reduction cycles for possible vectorization in
613 VEC_safe_push (gimple, heap,
614 LOOP_VINFO_REDUCTIONS (loop_vinfo),
620 if (vect_print_dump_info (REPORT_DETAILS))
621 fprintf (vect_dump, "Unknown def-use cycle pattern.");
624 VEC_free (gimple, heap, worklist);
628 /* Function vect_analyze_scalar_cycles.
630 Examine the cross iteration def-use cycles of scalar variables, by
631 analyzing the loop-header PHIs of scalar variables. Classify each
632 cycle as one of the following: invariant, induction, reduction, unknown.
633 We do that for the loop represented by LOOP_VINFO, and also to its
634 inner-loop, if exists.
635 Examples for scalar cycles:
/* Driver: analyze the scalar cycles of the loop in LOOP_VINFO, and of
   its inner loop if one exists (outer-loop vectorization case).  */
650 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
652 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
654 vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
656 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
657 Reductions in such inner-loop therefore have different properties than
658 the reductions in the nest that gets vectorized:
659 1. When vectorized, they are executed in the same order as in the original
660 scalar loop, so we can't change the order of computation when
662 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
663 current checks are too strict. */
666 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
669 /* Function vect_get_loop_niters.
671 Determine how many iterations the loop is executed.
672 If an expression that represents the number of iterations
673 can be constructed, place it in NUMBER_OF_ITERATIONS.
674 Return the loop exit condition. */
/* Compute the iteration count of LOOP.  If a usable expression is found
   it is stored in *NUMBER_OF_ITERATIONS; the loop's exit condition is
   returned in either case.  */
677 vect_get_loop_niters (struct loop *loop, tree *number_of_iterations)
681 if (vect_print_dump_info (REPORT_DETAILS))
682 fprintf (vect_dump, "=== get_loop_niters ===");
684 niters = number_of_exit_cond_executions (loop);
/* Record the count only when scev could actually determine it.  */
686 if (niters != NULL_TREE
687 && niters != chrec_dont_know)
689 *number_of_iterations = niters;
691 if (vect_print_dump_info (REPORT_DETAILS))
693 fprintf (vect_dump, "==> get_loop_niters:" );
694 print_generic_expr (vect_dump, *number_of_iterations, TDF_SLIM);
698 return get_loop_exit_condition (loop);
702 /* Function bb_in_loop_p
704 Used as predicate for dfs order traversal of the loop bbs. */
/* DFS predicate: true iff BB belongs to the loop passed via DATA.  */
707 bb_in_loop_p (const_basic_block bb, const void *data)
709 const struct loop *const loop = (const struct loop *)data;
710 if (flow_bb_inside_loop_p (loop, bb))
716 /* Function new_loop_vec_info.
718 Create and initialize a new loop_vec_info struct for LOOP, as well as
719 stmt_vec_info structs for all the stmts in LOOP. */
/* Allocate and initialize a loop_vec_info for LOOP, creating (or, for
   BBs of an already-analyzed inner loop, re-pointing) stmt_vec_info
   structs for every phi and statement in the loop body.  */
722 new_loop_vec_info (struct loop *loop)
726 gimple_stmt_iterator si;
727 unsigned int i, nbbs;
729 res = (loop_vec_info) xcalloc (1, sizeof (struct _loop_vec_info));
730 LOOP_VINFO_LOOP (res) = loop;
732 bbs = get_loop_body (loop);
734 /* Create/Update stmt_info for all stmts in the loop. */
735 for (i = 0; i < loop->num_nodes; i++)
737 basic_block bb = bbs[i];
739 /* BBs in a nested inner-loop will have been already processed (because
740 we will have called vect_analyze_loop_form for any nested inner-loop).
741 Therefore, for stmts in an inner-loop we just want to update the
742 STMT_VINFO_LOOP_VINFO field of their stmt_info to point to the new
743 loop_info of the outer-loop we are currently considering to vectorize
744 (instead of the loop_info of the inner-loop).
745 For stmts in other BBs we need to create a stmt_info from scratch. */
746 if (bb->loop_father != loop)
749 gcc_assert (loop->inner && bb->loop_father == loop->inner);
750 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
752 gimple phi = gsi_stmt (si);
753 stmt_vec_info stmt_info = vinfo_for_stmt (phi);
754 loop_vec_info inner_loop_vinfo =
755 STMT_VINFO_LOOP_VINFO (stmt_info);
756 gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
757 STMT_VINFO_LOOP_VINFO (stmt_info) = res;
759 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
761 gimple stmt = gsi_stmt (si);
762 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
763 loop_vec_info inner_loop_vinfo =
764 STMT_VINFO_LOOP_VINFO (stmt_info);
765 gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
766 STMT_VINFO_LOOP_VINFO (stmt_info) = res;
771 /* bb in current nest. */
772 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
774 gimple phi = gsi_stmt (si);
775 gimple_set_uid (phi, 0);
776 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, res, NULL));
779 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
781 gimple stmt = gsi_stmt (si);
782 gimple_set_uid (stmt, 0);
783 set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, res, NULL));
788 /* CHECKME: We want to visit all BBs before their successors (except for
789 latch blocks, for which this assertion wouldn't hold). In the simple
790 case of the loop forms we allow, a dfs order of the BBs would the same
791 as reversed postorder traversal, so we are safe. */
/* Re-enumerate the loop body in DFS order into a fresh array; the
   array from get_loop_body is replaced by this one.  */
794 bbs = XCNEWVEC (basic_block, loop->num_nodes);
795 nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
796 bbs, loop->num_nodes, loop);
797 gcc_assert (nbbs == loop->num_nodes);
/* Initialize every loop_vec_info field to its default.  */
799 LOOP_VINFO_BBS (res) = bbs;
800 LOOP_VINFO_NITERS (res) = NULL;
801 LOOP_VINFO_NITERS_UNCHANGED (res) = NULL;
802 LOOP_VINFO_COST_MODEL_MIN_ITERS (res) = 0;
803 LOOP_VINFO_VECTORIZABLE_P (res) = 0;
804 LOOP_PEELING_FOR_ALIGNMENT (res) = 0;
805 LOOP_VINFO_VECT_FACTOR (res) = 0;
806 LOOP_VINFO_LOOP_NEST (res) = VEC_alloc (loop_p, heap, 3);
807 LOOP_VINFO_DATAREFS (res) = VEC_alloc (data_reference_p, heap, 10);
808 LOOP_VINFO_DDRS (res) = VEC_alloc (ddr_p, heap, 10 * 10);
809 LOOP_VINFO_UNALIGNED_DR (res) = NULL;
810 LOOP_VINFO_MAY_MISALIGN_STMTS (res) =
811 VEC_alloc (gimple, heap,
812 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS));
813 LOOP_VINFO_MAY_ALIAS_DDRS (res) =
814 VEC_alloc (ddr_p, heap,
815 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
816 LOOP_VINFO_STRIDED_STORES (res) = VEC_alloc (gimple, heap, 10);
817 LOOP_VINFO_REDUCTIONS (res) = VEC_alloc (gimple, heap, 10);
818 LOOP_VINFO_REDUCTION_CHAINS (res) = VEC_alloc (gimple, heap, 10);
819 LOOP_VINFO_SLP_INSTANCES (res) = VEC_alloc (slp_instance, heap, 10);
820 LOOP_VINFO_SLP_UNROLLING_FACTOR (res) = 1;
821 LOOP_VINFO_PEELING_HTAB (res) = NULL;
822 LOOP_VINFO_PEELING_FOR_GAPS (res) = false;
828 /* Function destroy_loop_vec_info.
830 Free LOOP_VINFO struct, as well as all the stmt_vec_info structs of all the
831 stmts in the loop. */
/* Release LOOP_VINFO and (when CLEAN_STMTS) the stmt_vec_info of every
   stmt in the loop.  NOTE(review): in this excerpt two similar
   free-sequences appear (around lines 854-859 and 881-886); the missing
   intermediate lines presumably put them on distinct early-exit vs.
   normal paths -- confirm against the full source.  */
834 destroy_loop_vec_info (loop_vec_info loop_vinfo, bool clean_stmts)
839 gimple_stmt_iterator si;
841 VEC (slp_instance, heap) *slp_instances;
842 slp_instance instance;
847 loop = LOOP_VINFO_LOOP (loop_vinfo);
849 bbs = LOOP_VINFO_BBS (loop_vinfo);
850 nbbs = loop->num_nodes;
854 free (LOOP_VINFO_BBS (loop_vinfo));
855 free_data_refs (LOOP_VINFO_DATAREFS (loop_vinfo));
856 free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
857 VEC_free (loop_p, heap, LOOP_VINFO_LOOP_NEST (loop_vinfo));
858 VEC_free (gimple, heap, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));
859 VEC_free (ddr_p, heap, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo));
/* Free the per-stmt info of every phi and stmt in the loop body.  */
866 for (j = 0; j < nbbs; j++)
868 basic_block bb = bbs[j];
869 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
870 free_stmt_vec_info (gsi_stmt (si));
872 for (si = gsi_start_bb (bb); !gsi_end_p (si); )
874 gimple stmt = gsi_stmt (si);
875 /* Free stmt_vec_info. */
876 free_stmt_vec_info (stmt);
881 free (LOOP_VINFO_BBS (loop_vinfo));
882 free_data_refs (LOOP_VINFO_DATAREFS (loop_vinfo));
883 free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
884 VEC_free (loop_p, heap, LOOP_VINFO_LOOP_NEST (loop_vinfo));
885 VEC_free (gimple, heap, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));
886 VEC_free (ddr_p, heap, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo));
887 slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
888 FOR_EACH_VEC_ELT (slp_instance, slp_instances, j, instance)
889 vect_free_slp_instance (instance);
891 VEC_free (slp_instance, heap, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
892 VEC_free (gimple, heap, LOOP_VINFO_STRIDED_STORES (loop_vinfo));
893 VEC_free (gimple, heap, LOOP_VINFO_REDUCTIONS (loop_vinfo));
894 VEC_free (gimple, heap, LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo));
896 if (LOOP_VINFO_PEELING_HTAB (loop_vinfo))
897 htab_delete (LOOP_VINFO_PEELING_HTAB (loop_vinfo));
904 /* Function vect_analyze_loop_1.
906 Apply a set of analyses on LOOP, and create a loop_vec_info struct
907 for it. The different analyses will record information in the
908 loop_vec_info struct. This is a subset of the analyses applied in
909 vect_analyze_loop, to be applied on an inner-loop nested in the loop
910 that is now considered for (outer-loop) vectorization. */
/* Analyze the form of an inner LOOP nested inside a loop that is being
   considered for outer-loop vectorization; a subset of the analyses
   done by vect_analyze_loop.  Returns the inner loop's loop_vec_info
   (NULL on failure, per the visible error path).  */
913 vect_analyze_loop_1 (struct loop *loop)
915 loop_vec_info loop_vinfo;
917 if (vect_print_dump_info (REPORT_DETAILS))
918 fprintf (vect_dump, "===== analyze_loop_nest_1 =====");
920 /* Check the CFG characteristics of the loop (nesting, entry/exit, etc. */
922 loop_vinfo = vect_analyze_loop_form (loop);
925 if (vect_print_dump_info (REPORT_DETAILS))
926 fprintf (vect_dump, "bad inner-loop form.");
934 /* Function vect_analyze_loop_form.
936 Verify that certain CFG restrictions hold, including:
937 - the loop has a pre-header
938 - the loop has a single entry and exit
939 - the loop exit condition is simple enough, and the number of iterations
940 can be analyzed (a countable loop). */
/* Verify the CFG restrictions on LOOP (pre-header, single entry/exit,
   countable iteration count) and, on success, return a freshly created
   loop_vec_info with NITERS recorded.  For a doubly-nested loop the
   inner loop is analyzed recursively; its temporary loop_vec_info is
   destroyed before returning.  */
943 vect_analyze_loop_form (struct loop *loop)
945 loop_vec_info loop_vinfo;
947 tree number_of_iterations = NULL;
948 loop_vec_info inner_loop_vinfo = NULL;
950 if (vect_print_dump_info (REPORT_DETAILS))
951 fprintf (vect_dump, "=== vect_analyze_loop_form ===");
953 /* Different restrictions apply when we are considering an inner-most loop,
954 vs. an outer (nested) loop.
955 (FORNOW. May want to relax some of these restrictions in the future). */
959 /* Inner-most loop. We currently require that the number of BBs is
960 exactly 2 (the header and latch). Vectorizable inner-most loops
971 if (loop->num_nodes != 2)
973 if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
974 fprintf (vect_dump, "not vectorized: control flow in loop.");
978 if (empty_block_p (loop->header))
980 if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
981 fprintf (vect_dump, "not vectorized: empty loop.");
987 struct loop *innerloop = loop->inner;
990 /* Nested loop. We currently require that the loop is doubly-nested,
991 contains a single inner loop, and the number of BBs is exactly 5.
992 Vectorizable outer-loops look like this:
1004 The inner-loop has the properties expected of inner-most loops
1005 as described above. */
1007 if ((loop->inner)->inner || (loop->inner)->next)
1009 if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1010 fprintf (vect_dump, "not vectorized: multiple nested loops.");
1014 /* Analyze the inner-loop. */
1015 inner_loop_vinfo = vect_analyze_loop_1 (loop->inner);
1016 if (!inner_loop_vinfo)
1018 if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1019 fprintf (vect_dump, "not vectorized: Bad inner loop.");
/* The inner loop's trip count must not vary across outer iterations.  */
1023 if (!expr_invariant_in_loop_p (loop,
1024 LOOP_VINFO_NITERS (inner_loop_vinfo)))
1026 if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1028 "not vectorized: inner-loop count not invariant.");
1029 destroy_loop_vec_info (inner_loop_vinfo, true);
1033 if (loop->num_nodes != 5)
1035 if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1036 fprintf (vect_dump, "not vectorized: control flow in loop.");
1037 destroy_loop_vec_info (inner_loop_vinfo, true);
/* Pick the inner-loop header's entry edge that does not come from
   its latch.  */
1041 gcc_assert (EDGE_COUNT (innerloop->header->preds) == 2);
1042 entryedge = EDGE_PRED (innerloop->header, 0);
1043 if (EDGE_PRED (innerloop->header, 0)->src == innerloop->latch)
1044 entryedge = EDGE_PRED (innerloop->header, 1);
1046 if (entryedge->src != loop->header
1047 || !single_exit (innerloop)
1048 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1050 if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1051 fprintf (vect_dump, "not vectorized: unsupported outerloop form.");
1052 destroy_loop_vec_info (inner_loop_vinfo, true);
1056 if (vect_print_dump_info (REPORT_DETAILS))
1057 fprintf (vect_dump, "Considering outer-loop vectorization.");
/* Common checks: single exit, exactly two header predecessors.  */
1060 if (!single_exit (loop)
1061 || EDGE_COUNT (loop->header->preds) != 2)
1063 if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1065 if (!single_exit (loop))
1066 fprintf (vect_dump, "not vectorized: multiple exits.");
1067 else if (EDGE_COUNT (loop->header->preds) != 2)
1068 fprintf (vect_dump, "not vectorized: too many incoming edges.");
1070 if (inner_loop_vinfo)
1071 destroy_loop_vec_info (inner_loop_vinfo, true);
1075 /* We assume that the loop exit condition is at the end of the loop. i.e,
1076 that the loop is represented as a do-while (with a proper if-guard
1077 before the loop if needed), where the loop header contains all the
1078 executable statements, and the latch is empty. */
1079 if (!empty_block_p (loop->latch)
1080 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1082 if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1083 fprintf (vect_dump, "not vectorized: unexpected loop form.");
1084 if (inner_loop_vinfo)
1085 destroy_loop_vec_info (inner_loop_vinfo, true);
1089 /* Make sure there exists a single-predecessor exit bb: */
1090 if (!single_pred_p (single_exit (loop)->dest))
1092 edge e = single_exit (loop);
1093 if (!(e->flags & EDGE_ABNORMAL))
1095 split_loop_exit_edge (e);
1096 if (vect_print_dump_info (REPORT_DETAILS))
1097 fprintf (vect_dump, "split exit edge.");
1101 if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1102 fprintf (vect_dump, "not vectorized: abnormal loop exit edge.");
1103 if (inner_loop_vinfo)
1104 destroy_loop_vec_info (inner_loop_vinfo, true);
/* The loop must be countable: a simple exit condition with a
   computable, finite iteration count.  */
1109 loop_cond = vect_get_loop_niters (loop, &number_of_iterations);
1112 if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1113 fprintf (vect_dump, "not vectorized: complicated exit condition.");
1114 if (inner_loop_vinfo)
1115 destroy_loop_vec_info (inner_loop_vinfo, true);
1119 if (!number_of_iterations)
1121 if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1123 "not vectorized: number of iterations cannot be computed.");
1124 if (inner_loop_vinfo)
1125 destroy_loop_vec_info (inner_loop_vinfo, true);
1129 if (chrec_contains_undetermined (number_of_iterations))
1131 if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1132 fprintf (vect_dump, "Infinite number of iterations.");
1133 if (inner_loop_vinfo)
1134 destroy_loop_vec_info (inner_loop_vinfo, true);
/* A symbolic (compile-time-unknown) count is acceptable; a known
   count of zero is not.  */
1138 if (!NITERS_KNOWN_P (number_of_iterations))
1140 if (vect_print_dump_info (REPORT_DETAILS))
1142 fprintf (vect_dump, "Symbolic number of iterations is ");
1143 print_generic_expr (vect_dump, number_of_iterations, TDF_DETAILS);
1146 else if (TREE_INT_CST_LOW (number_of_iterations) == 0)
1148 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1149 fprintf (vect_dump, "not vectorized: number of iterations = 0.");
1150 if (inner_loop_vinfo)
1151 destroy_loop_vec_info (inner_loop_vinfo, false);
/* All checks passed: build the loop_vec_info and record NITERS.  */
1155 loop_vinfo = new_loop_vec_info (loop);
1156 LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1157 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1159 STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond)) = loop_exit_ctrl_vec_info_type;
1161 /* CHECKME: May want to keep it around it in the future. */
1162 if (inner_loop_vinfo)
1163 destroy_loop_vec_info (inner_loop_vinfo, false);
1165 gcc_assert (!loop->aux);
1166 loop->aux = loop_vinfo;
1171 /* Get cost by calling cost target builtin. */
1174 vect_get_cost (enum vect_cost_for_stmt type_of_cost)
1176 tree dummy_type = NULL;
1179 return targetm.vectorize.builtin_vectorization_cost (type_of_cost,
1184 /* Function vect_analyze_loop_operations.
1186 Scan the loop stmts and make sure they are all vectorizable. */
/* Returns true iff every relevant stmt and phi in LOOP_VINFO can be
   vectorized and the cost model considers vectorization worthwhile.
   SLP indicates whether SLP instances were detected in the loop.
   NOTE(review): lines are elided in this chunk (braces and some early
   returns are not visible) -- verify against the full file.  */
1189 vect_analyze_loop_operations (loop_vec_info loop_vinfo, bool slp)
1191 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1192 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1193 int nbbs = loop->num_nodes;
1194 gimple_stmt_iterator si;
1195 unsigned int vectorization_factor = 0;
1198 stmt_vec_info stmt_info;
1199 bool need_to_vectorize = false;
1200 int min_profitable_iters;
1201 int min_scalar_loop_bound;
1203 bool only_slp_in_loop = true, ok;
1205 if (vect_print_dump_info (REPORT_DETAILS))
1206 fprintf (vect_dump, "=== vect_analyze_loop_operations ===");
/* The vectorization factor must have been computed by an earlier phase.  */
1208 gcc_assert (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
1209 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
/* First pass over all stmts: decide whether the whole loop is "pure SLP",
   i.e. every relevant stmt is covered by an SLP instance.  */
1212 /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1213 vectorization factor of the loop is the unrolling factor required by
1214 the SLP instances. If that unrolling factor is 1, we say, that we
1215 perform pure SLP on loop - cross iteration parallelism is not
1217 for (i = 0; i < nbbs; i++)
1219 basic_block bb = bbs[i];
1220 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1222 gimple stmt = gsi_stmt (si);
1223 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1224 gcc_assert (stmt_info);
1225 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1226 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1227 && !PURE_SLP_STMT (stmt_info))
1228 /* STMT needs both SLP and loop-based vectorization. */
1229 only_slp_in_loop = false;
/* Pure SLP: use the SLP unrolling factor as the VF.  Hybrid SLP: the VF
   must be a multiple of both the loop VF and the SLP unrolling factor.  */
1233 if (only_slp_in_loop)
1234 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1236 vectorization_factor = least_common_multiple (vectorization_factor,
1237 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1239 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1240 if (vect_print_dump_info (REPORT_DETAILS))
1241 fprintf (vect_dump, "Updating vectorization factor to %d ",
1242 vectorization_factor);
/* Second pass: check each phi, then each stmt, for vectorizability.  */
1245 for (i = 0; i < nbbs; i++)
1247 basic_block bb = bbs[i];
1249 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1251 phi = gsi_stmt (si);
1254 stmt_info = vinfo_for_stmt (phi);
1255 if (vect_print_dump_info (REPORT_DETAILS))
1257 fprintf (vect_dump, "examining phi: ");
1258 print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
1261 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1262 (i.e., a phi in the tail of the outer-loop). */
1263 if (! is_loop_header_bb_p (bb))
1265 /* FORNOW: we currently don't support the case that these phis
1266 are not used in the outerloop (unless it is double reduction,
1267 i.e., this phi is vect_reduction_def), cause this case
1268 requires to actually do something here. */
1269 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
1270 || STMT_VINFO_LIVE_P (stmt_info))
1271 && STMT_VINFO_DEF_TYPE (stmt_info)
1272 != vect_double_reduction_def
1274 if (vect_print_dump_info (REPORT_DETAILS))
1276 "Unsupported loop-closed phi in outer-loop.");
1280 /* If PHI is used in the outer loop, we check that its operand
1281 is defined in the inner loop. */
1282 if (STMT_VINFO_RELEVANT_P (stmt_info))
/* Only single-argument phis whose SSA operand is defined by a stmt
   marked as used-in-outer(-by-reduction) are supported here.  */
1287 if (gimple_phi_num_args (phi) != 1)
1290 phi_op = PHI_ARG_DEF (phi, 0);
1291 if (TREE_CODE (phi_op) != SSA_NAME)
1294 op_def_stmt = SSA_NAME_DEF_STMT (phi_op);
1295 if (!op_def_stmt || !vinfo_for_stmt (op_def_stmt))
1298 if (STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1299 != vect_used_in_outer
1300 && STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1301 != vect_used_in_outer_by_reduction)
1308 gcc_assert (stmt_info);
1310 if (STMT_VINFO_LIVE_P (stmt_info))
1312 /* FORNOW: not yet supported. */
1313 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1314 fprintf (vect_dump, "not vectorized: value used after loop.");
/* A header phi used inside the loop that is not an induction forms a
   scalar cycle we cannot vectorize.  */
1318 if (STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1319 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1321 /* A scalar-dependence cycle that we don't support. */
1322 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1323 fprintf (vect_dump, "not vectorized: scalar dependence cycle.");
1327 if (STMT_VINFO_RELEVANT_P (stmt_info))
1329 need_to_vectorize = true;
1330 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
1331 ok = vectorizable_induction (phi, NULL, NULL);
1336 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1339 "not vectorized: relevant phi not supported: ");
1340 print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
/* Check the non-phi stmts; vect_analyze_stmt also updates
   NEED_TO_VECTORIZE.  */
1346 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1348 gimple stmt = gsi_stmt (si);
1349 if (!vect_analyze_stmt (stmt, &need_to_vectorize, NULL))
1354 /* All operations in the loop are either irrelevant (deal with loop
1355 control, or dead), or only used outside the loop and can be moved
1356 out of the loop (e.g. invariants, inductions). The loop can be
1357 optimized away by scalar optimizations. We're better off not
1358 touching this loop. */
1359 if (!need_to_vectorize)
1361 if (vect_print_dump_info (REPORT_DETAILS))
1363 "All the computation can be taken out of the loop.");
1364 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1366 "not vectorized: redundant loop. no profit to vectorize.");
1370 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1371 && vect_print_dump_info (REPORT_DETAILS))
1373 "vectorization_factor = %d, niters = " HOST_WIDE_INT_PRINT_DEC,
1374 vectorization_factor, LOOP_VINFO_INT_NITERS (loop_vinfo));
/* Fewer iterations than the VF means no full vector iteration exists.  */
1376 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1377 && (LOOP_VINFO_INT_NITERS (loop_vinfo) < vectorization_factor))
1379 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1380 fprintf (vect_dump, "not vectorized: iteration count too small.");
1381 if (vect_print_dump_info (REPORT_DETAILS))
1382 fprintf (vect_dump,"not vectorized: iteration count smaller than "
1383 "vectorization factor.");
1387 /* Analyze cost. Decide if worth while to vectorize. */
1389 /* Once VF is set, SLP costs should be updated since the number of created
1390 vector stmts depends on VF. */
1391 vect_update_slp_costs_according_to_vf (loop_vinfo);
1393 min_profitable_iters = vect_estimate_min_profitable_iters (loop_vinfo);
1394 LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo) = min_profitable_iters;
/* A negative estimate means the vector version can never beat the
   scalar loop.  */
1396 if (min_profitable_iters < 0)
1398 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1399 fprintf (vect_dump, "not vectorized: vectorization not profitable.");
1400 if (vect_print_dump_info (REPORT_DETAILS))
1401 fprintf (vect_dump, "not vectorized: vector version will never be "
/* User-specified lower bound on scalar iterations, scaled by the VF.  */
1406 min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
1407 * vectorization_factor) - 1);
1409 /* Use the cost model only if it is more conservative than user specified
1412 th = (unsigned) min_scalar_loop_bound;
1413 if (min_profitable_iters
1414 && (!min_scalar_loop_bound
1415 || min_profitable_iters > min_scalar_loop_bound))
1416 th = (unsigned) min_profitable_iters;
1418 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1419 && LOOP_VINFO_INT_NITERS (loop_vinfo) <= th)
1421 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1422 fprintf (vect_dump, "not vectorized: vectorization not "
1424 if (vect_print_dump_info (REPORT_DETAILS))
1425 fprintf (vect_dump, "not vectorized: iteration count smaller than "
1426 "user specified loop bound parameter or minimum "
1427 "profitable iterations (whichever is more conservative).");
/* An epilogue loop is needed when the iteration count is unknown, not a
   multiple of the VF, or when peeling for alignment is applied; check
   that the epilogue can actually be created.  */
1431 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1432 || LOOP_VINFO_INT_NITERS (loop_vinfo) % vectorization_factor != 0
1433 || LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
1435 if (vect_print_dump_info (REPORT_DETAILS))
1436 fprintf (vect_dump, "epilog loop required.");
1437 if (!vect_can_advance_ivs_p (loop_vinfo))
1439 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1441 "not vectorized: can't create epilog loop 1.");
1444 if (!slpeel_can_duplicate_loop_p (loop, single_exit (loop)))
1446 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1448 "not vectorized: can't create epilog loop 2.");
1457 /* Function vect_analyze_loop_2.
1459 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1460 for it. The different analyses will record information in the
1461 loop_vec_info struct. */
/* Returns true iff LOOP_VINFO passes every analysis phase and the loop
   can be vectorized.  The phases run in dependency order; each failure
   path dumps a reason.  NOTE(review): early-return lines are elided in
   this chunk.  */
1463 vect_analyze_loop_2 (loop_vec_info loop_vinfo)
1465 bool ok, slp = false;
1466 int max_vf = MAX_VECTORIZATION_FACTOR;
1469 /* Find all data references in the loop (which correspond to vdefs/vuses)
1470 and analyze their evolution in the loop. Also adjust the minimal
1471 vectorization factor according to the loads and stores.
1473 FORNOW: Handle only simple, array references, which
1474 alignment can be forced, and aligned pointer-references. */
1476 ok = vect_analyze_data_refs (loop_vinfo, NULL, &min_vf);
1479 if (vect_print_dump_info (REPORT_DETAILS))
1480 fprintf (vect_dump, "bad data references.");
1484 /* Classify all cross-iteration scalar data-flow cycles.
1485 Cross-iteration cycles caused by virtual phis are analyzed separately. */
1487 vect_analyze_scalar_cycles (loop_vinfo);
/* Recognize idioms (e.g. widening operations) before relevance marking.  */
1489 vect_pattern_recog (loop_vinfo);
1491 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
1493 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
1496 if (vect_print_dump_info (REPORT_DETAILS))
1497 fprintf (vect_dump, "unexpected pattern.");
1501 /* Analyze data dependences between the data-refs in the loop
1502 and adjust the maximum vectorization factor according to
1504 FORNOW: fail at the first data dependence that we encounter. */
1506 ok = vect_analyze_data_ref_dependences (loop_vinfo, NULL, &max_vf);
1510 if (vect_print_dump_info (REPORT_DETAILS))
1511 fprintf (vect_dump, "bad data dependence.");
1515 ok = vect_determine_vectorization_factor (loop_vinfo);
1518 if (vect_print_dump_info (REPORT_DETAILS))
1519 fprintf (vect_dump, "can't determine vectorization factor.");
/* Dependences capped MAX_VF below the chosen VF: vectorization is
   impossible at this VF.  */
1522 if (max_vf < LOOP_VINFO_VECT_FACTOR (loop_vinfo))
1524 if (vect_print_dump_info (REPORT_DETAILS))
1525 fprintf (vect_dump, "bad data dependence.");
1529 /* Analyze the alignment of the data-refs in the loop.
1530 Fail if a data reference is found that cannot be vectorized. */
1532 ok = vect_analyze_data_refs_alignment (loop_vinfo, NULL);
1535 if (vect_print_dump_info (REPORT_DETAILS))
1536 fprintf (vect_dump, "bad data alignment.");
1540 /* Analyze the access patterns of the data-refs in the loop (consecutive,
1541 complex, etc.). FORNOW: Only handle consecutive access pattern. */
1543 ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
1546 if (vect_print_dump_info (REPORT_DETAILS))
1547 fprintf (vect_dump, "bad data access.");
1551 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
1552 It is important to call pruning after vect_analyze_data_ref_accesses,
1553 since we use grouping information gathered by interleaving analysis. */
1554 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
1557 if (vect_print_dump_info (REPORT_DETAILS))
1558 fprintf (vect_dump, "too long list of versioning for alias "
1563 /* This pass will decide on using loop versioning and/or loop peeling in
1564 order to enhance the alignment of data references in the loop. */
1566 ok = vect_enhance_data_refs_alignment (loop_vinfo);
1569 if (vect_print_dump_info (REPORT_DETAILS))
1570 fprintf (vect_dump, "bad data alignment.");
1574 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
1575 ok = vect_analyze_slp (loop_vinfo, NULL);
1578 /* Decide which possible SLP instances to SLP. */
1579 slp = vect_make_slp_decision (loop_vinfo);
1581 /* Find stmts that need to be both vectorized and SLPed. */
1582 vect_detect_hybrid_slp (loop_vinfo);
1587 /* Scan all the operations in the loop and make sure they are
1590 ok = vect_analyze_loop_operations (loop_vinfo, slp);
1593 if (vect_print_dump_info (REPORT_DETAILS))
1594 fprintf (vect_dump, "bad operation or unsupported loop bound.");
1601 /* Function vect_analyze_loop.
1603 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1604 for it. The different analyses will record information in the
1605 loop_vec_info struct. */
/* Top-level per-loop analysis driver: checks loop form, then runs
   vect_analyze_loop_2, retrying with successively smaller target vector
   sizes until one succeeds or none remain.  NOTE(review): the enclosing
   retry-loop braces are elided in this chunk.  */
1607 vect_analyze_loop (struct loop *loop)
1609 loop_vec_info loop_vinfo;
1610 unsigned int vector_sizes;
1612 /* Autodetect first vector size we try. */
1613 current_vector_size = 0;
/* Bitmask of vector byte-sizes the target supports for autovectorization.  */
1614 vector_sizes = targetm.vectorize.autovectorize_vector_sizes ();
1616 if (vect_print_dump_info (REPORT_DETAILS))
1617 fprintf (vect_dump, "===== analyze_loop_nest =====");
/* If an enclosing outer loop was already vectorized, this inner loop was
   handled as part of it -- do not vectorize it again.  */
1619 if (loop_outer (loop)
1620 && loop_vec_info_for_loop (loop_outer (loop))
1621 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
1623 if (vect_print_dump_info (REPORT_DETAILS))
1624 fprintf (vect_dump, "outer-loop already vectorized.");
1630 /* Check the CFG characteristics of the loop (nesting, entry/exit). */
1631 loop_vinfo = vect_analyze_loop_form (loop);
1634 if (vect_print_dump_info (REPORT_DETAILS))
1635 fprintf (vect_dump, "bad loop form.");
1639 if (vect_analyze_loop_2 (loop_vinfo))
1641 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
/* Analysis failed at this vector size: discard and retry smaller.  */
1646 destroy_loop_vec_info (loop_vinfo, true);
1648 vector_sizes &= ~current_vector_size;
1649 if (vector_sizes == 0
1650 || current_vector_size == 0)
1653 /* Try the next biggest vector size. */
1654 current_vector_size = 1 << floor_log2 (vector_sizes);
1655 if (vect_print_dump_info (REPORT_DETAILS))
1656 fprintf (vect_dump, "***** Re-trying analysis with "
1657 "vector size %d\n", current_vector_size);
1662 /* Function reduction_code_for_scalar_code
1665 CODE - tree_code of a reduction operations.
1668 REDUC_CODE - the corresponding tree-code to be used to reduce the
1669 vector of partial results into a single scalar result (which
1670 will also reside in a vector) or ERROR_MARK if the operation is
1671 a supported reduction operation, but does not have such tree-code.
1673 Return FALSE if CODE currently cannot be vectorized as reduction. */
1676 reduction_code_for_scalar_code (enum tree_code code,
1677 enum tree_code *reduc_code)
/* NOTE(review): the case labels of the switch over CODE are not visible
   in this chunk; presumably MAX_EXPR/MIN_EXPR/PLUS_EXPR map to the three
   REDUC_* codes below, and the remaining supported codes fall through to
   ERROR_MARK -- confirm against the full file.  */
1682 *reduc_code = REDUC_MAX_EXPR;
1686 *reduc_code = REDUC_MIN_EXPR;
1690 *reduc_code = REDUC_PLUS_EXPR;
1698 *reduc_code = ERROR_MARK;
1707 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
1708 STMT is printed with a message MSG. */
1711 report_vect_op (gimple stmt, const char *msg)
1713 fprintf (vect_dump, "%s", msg);
1714 print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
1718 /* Detect SLP reduction of the form:
1728 PHI is the reduction phi node (#a1 = phi <a5, a0> above)
1729 FIRST_STMT is the first reduction stmt in the chain
1730 (a2 = operation (a1)).
1732 Return TRUE if a reduction chain was detected. */
/* NOTE(review): lines are elided in this chunk (braces, the chain-walking
   loop header, and several early returns are not visible) -- verify
   against the full file.  */
1735 vect_is_slp_reduction (loop_vec_info loop_info, gimple phi, gimple first_stmt)
1737 struct loop *loop = (gimple_bb (phi))->loop_father;
1738 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
1739 enum tree_code code;
1740 gimple current_stmt = NULL, loop_use_stmt = NULL, first, next_stmt;
1741 stmt_vec_info use_stmt_info, current_stmt_info;
1743 imm_use_iterator imm_iter;
1744 use_operand_p use_p;
1745 int nloop_uses, size = 0, n_out_of_loop_uses;
/* Reduction chains are only detected in the loop being vectorized.  */
1748 if (loop != vect_loop)
1751 lhs = PHI_RESULT (phi);
/* All stmts in the chain must use the same operation code as the first.  */
1752 code = gimple_assign_rhs_code (first_stmt);
1756 n_out_of_loop_uses = 0;
/* Walk the uses of LHS, classifying them as in-loop or out-of-loop.  */
1757 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
1759 gimple use_stmt = USE_STMT (use_p);
1760 if (is_gimple_debug (use_stmt))
1763 use_stmt = USE_STMT (use_p);
1765 /* Check if we got back to the reduction phi. */
1766 if (use_stmt == phi)
1768 loop_use_stmt = use_stmt;
1773 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
1775 if (vinfo_for_stmt (use_stmt)
1776 && !STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (use_stmt)))
1778 loop_use_stmt = use_stmt;
1783 n_out_of_loop_uses++;
1785 /* There are can be either a single use in the loop or two uses in
1787 if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
1794 /* We reached a statement with no loop uses. */
1795 if (nloop_uses == 0)
1798 /* This is a loop exit phi, and we haven't reached the reduction phi. */
1799 if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
/* Every link of the chain must be an assignment with the same code,
   inside the loop.  */
1802 if (!is_gimple_assign (loop_use_stmt)
1803 || code != gimple_assign_rhs_code (loop_use_stmt)
1804 || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
1807 /* Insert USE_STMT into reduction chain. */
1808 use_stmt_info = vinfo_for_stmt (loop_use_stmt);
1811 current_stmt_info = vinfo_for_stmt (current_stmt);
1812 GROUP_NEXT_ELEMENT (current_stmt_info) = loop_use_stmt;
1813 GROUP_FIRST_ELEMENT (use_stmt_info)
1814 = GROUP_FIRST_ELEMENT (current_stmt_info);
1817 GROUP_FIRST_ELEMENT (use_stmt_info) = loop_use_stmt;
/* Advance: follow the def of this link's lhs to the next chain link.  */
1819 lhs = gimple_assign_lhs (loop_use_stmt);
1820 current_stmt = loop_use_stmt;
/* The walk must close the cycle back at PHI and contain >= 2 stmts.  */
1824 if (!found || loop_use_stmt != phi || size < 2)
1827 /* Swap the operands, if needed, to make the reduction operand be the second
1829 lhs = PHI_RESULT (phi);
1830 next_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
1833 if (gimple_assign_rhs2 (next_stmt) == lhs)
1835 tree op = gimple_assign_rhs1 (next_stmt);
1836 gimple def_stmt = NULL;
1838 if (TREE_CODE (op) == SSA_NAME)
1839 def_stmt = SSA_NAME_DEF_STMT (op);
1841 /* Check that the other def is either defined in the loop
1842 ("vect_internal_def"), or it's an induction (defined by a
1843 loop-header phi-node). */
1845 && gimple_bb (def_stmt)
1846 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
1847 && (is_gimple_assign (def_stmt)
1848 || is_gimple_call (def_stmt)
1849 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
1850 == vect_induction_def
1851 || (gimple_code (def_stmt) == GIMPLE_PHI
1852 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
1853 == vect_internal_def
1854 && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
1856 lhs = gimple_assign_lhs (next_stmt);
1857 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
/* Reduction operand is first (rhs1): mirror-image check on rhs2, and
   swap the operands when the other def qualifies.  */
1865 tree op = gimple_assign_rhs2 (next_stmt);
1866 gimple def_stmt = NULL;
1868 if (TREE_CODE (op) == SSA_NAME)
1869 def_stmt = SSA_NAME_DEF_STMT (op);
1871 /* Check that the other def is either defined in the loop
1872 ("vect_internal_def"), or it's an induction (defined by a
1873 loop-header phi-node). */
1875 && gimple_bb (def_stmt)
1876 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
1877 && (is_gimple_assign (def_stmt)
1878 || is_gimple_call (def_stmt)
1879 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
1880 == vect_induction_def
1881 || (gimple_code (def_stmt) == GIMPLE_PHI
1882 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
1883 == vect_internal_def
1884 && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
1886 if (vect_print_dump_info (REPORT_DETAILS))
1888 fprintf (vect_dump, "swapping oprnds: ");
1889 print_gimple_stmt (vect_dump, next_stmt, 0, TDF_SLIM);
1892 swap_tree_operands (next_stmt,
1893 gimple_assign_rhs1_ptr (next_stmt),
1894 gimple_assign_rhs2_ptr (next_stmt));
1895 mark_symbols_for_renaming (next_stmt);
1901 lhs = gimple_assign_lhs (next_stmt);
1902 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
1905 /* Save the chain for further analysis in SLP detection. */
1906 first = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
1907 VEC_safe_push (gimple, heap, LOOP_VINFO_REDUCTION_CHAINS (loop_info), first);
1908 GROUP_SIZE (vinfo_for_stmt (first)) = size;
1914 /* Function vect_is_simple_reduction_1
1916 (1) Detect a cross-iteration def-use cycle that represents a simple
1917 reduction computation. We look for the following pattern:
1922 a2 = operation (a3, a1)
1925 1. operation is commutative and associative and it is safe to
1926 change the order of the computation (if CHECK_REDUCTION is true)
1927 2. no uses for a2 in the loop (a2 is used out of the loop)
1928 3. no uses of a1 in the loop besides the reduction operation
1929 4. no uses of a1 outside the loop.
1931 Conditions 1,4 are tested here.
1932 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
1934 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
1935 nested cycles, if CHECK_REDUCTION is false.
1937 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
1941 inner loop (def of a3)
1944 If MODIFY is true it tries also to rework the code in-place to enable
1945 detection of more reduction patterns. For the time being we rewrite
1946 "res -= RHS" into "rhs += -RHS" when it seems worthwhile.
/* NOTE(review): lines are elided in this chunk (braces and several early
   returns are not visible) -- verify against the full file.  */
1950 vect_is_simple_reduction_1 (loop_vec_info loop_info, gimple phi,
1951 bool check_reduction, bool *double_reduc,
1954 struct loop *loop = (gimple_bb (phi))->loop_father;
1955 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
1956 edge latch_e = loop_latch_edge (loop);
/* The latch value of PHI is the candidate reduction result.  */
1957 tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
1958 gimple def_stmt, def1 = NULL, def2 = NULL;
1959 enum tree_code orig_code, code;
1960 tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
1964 imm_use_iterator imm_iter;
1965 use_operand_p use_p;
1968 *double_reduc = false;
1970 /* If CHECK_REDUCTION is true, we assume inner-most loop vectorization,
1971 otherwise, we assume outer loop vectorization. */
1972 gcc_assert ((check_reduction && loop == vect_loop)
1973 || (!check_reduction && flow_loop_nested_p (vect_loop, loop)));
/* Condition 3/4: PHI's result must have exactly the reduction use inside
   the loop and no uses outside it.  */
1975 name = PHI_RESULT (phi);
1977 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
1979 gimple use_stmt = USE_STMT (use_p);
1980 if (is_gimple_debug (use_stmt))
1983 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
1985 if (vect_print_dump_info (REPORT_DETAILS))
1986 fprintf (vect_dump, "intermediate value used outside loop.");
1991 if (vinfo_for_stmt (use_stmt)
1992 && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
1996 if (vect_print_dump_info (REPORT_DETAILS))
1997 fprintf (vect_dump, "reduction used in loop.");
2002 if (TREE_CODE (loop_arg) != SSA_NAME)
2004 if (vect_print_dump_info (REPORT_DETAILS))
2006 fprintf (vect_dump, "reduction: not ssa_name: ");
2007 print_generic_expr (vect_dump, loop_arg, TDF_SLIM);
2012 def_stmt = SSA_NAME_DEF_STMT (loop_arg);
2015 if (vect_print_dump_info (REPORT_DETAILS))
2016 fprintf (vect_dump, "reduction: no def_stmt.");
2020 if (!is_gimple_assign (def_stmt) && gimple_code (def_stmt) != GIMPLE_PHI)
2022 if (vect_print_dump_info (REPORT_DETAILS))
2023 print_gimple_stmt (vect_dump, def_stmt, 0, TDF_SLIM);
2027 if (is_gimple_assign (def_stmt))
2029 name = gimple_assign_lhs (def_stmt);
2034 name = PHI_RESULT (def_stmt);
/* Condition 2: the reduction result must have no other in-loop uses.  */
2039 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2041 gimple use_stmt = USE_STMT (use_p);
2042 if (is_gimple_debug (use_stmt))
2044 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
2045 && vinfo_for_stmt (use_stmt)
2046 && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
2050 if (vect_print_dump_info (REPORT_DETAILS))
2051 fprintf (vect_dump, "reduction used in loop.");
2056 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
2057 defined in the inner loop. */
2060 op1 = PHI_ARG_DEF (def_stmt, 0);
2062 if (gimple_phi_num_args (def_stmt) != 1
2063 || TREE_CODE (op1) != SSA_NAME)
2065 if (vect_print_dump_info (REPORT_DETAILS))
2066 fprintf (vect_dump, "unsupported phi node definition.");
/* Case (3): double reduction -- the phi's single argument is defined by
   an assignment in the inner loop.  */
2071 def1 = SSA_NAME_DEF_STMT (op1);
2072 if (flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2074 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
2075 && is_gimple_assign (def1))
2077 if (vect_print_dump_info (REPORT_DETAILS))
2078 report_vect_op (def_stmt, "detected double reduction: ");
2080 *double_reduc = true;
2087 code = orig_code = gimple_assign_rhs_code (def_stmt);
2089 /* We can handle "res -= x[i]", which is non-associative by
2090 simply rewriting this into "res += -x[i]". Avoid changing
2091 gimple instruction for the first simple tests and only do this
2092 if we're allowed to change code at all. */
2093 if (code == MINUS_EXPR
2095 && (op1 = gimple_assign_rhs1 (def_stmt))
2096 && TREE_CODE (op1) == SSA_NAME
2097 && SSA_NAME_DEF_STMT (op1) == phi)
/* Condition 1: the operation must be reorderable.  */
2101 && (!commutative_tree_code (code) || !associative_tree_code (code)))
2103 if (vect_print_dump_info (REPORT_DETAILS))
2104 report_vect_op (def_stmt, "reduction: not commutative/associative: ");
2108 if (get_gimple_rhs_class (code) != GIMPLE_BINARY_RHS)
2110 if (code != COND_EXPR)
2112 if (vect_print_dump_info (REPORT_DETAILS))
2113 report_vect_op (def_stmt, "reduction: not binary operation: ");
/* COND_EXPR: op3/op4 hold the comparison operands, op1/op2 the values.  */
2118 op3 = gimple_assign_rhs1 (def_stmt);
2119 if (COMPARISON_CLASS_P (op3))
2121 op4 = TREE_OPERAND (op3, 1);
2122 op3 = TREE_OPERAND (op3, 0);
2125 op1 = gimple_assign_rhs2 (def_stmt);
2126 op2 = gimple_assign_rhs3 (def_stmt);
2128 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
2130 if (vect_print_dump_info (REPORT_DETAILS))
2131 report_vect_op (def_stmt, "reduction: uses not ssa_names: ");
2138 op1 = gimple_assign_rhs1 (def_stmt);
2139 op2 = gimple_assign_rhs2 (def_stmt);
2141 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
2143 if (vect_print_dump_info (REPORT_DETAILS))
2144 report_vect_op (def_stmt, "reduction: uses not ssa_names: ");
/* All SSA operands must have a type compatible with the lhs type --
   multi-type reductions are not handled here.  */
2150 type = TREE_TYPE (gimple_assign_lhs (def_stmt));
2151 if ((TREE_CODE (op1) == SSA_NAME
2152 && !types_compatible_p (type,TREE_TYPE (op1)))
2153 || (TREE_CODE (op2) == SSA_NAME
2154 && !types_compatible_p (type, TREE_TYPE (op2)))
2155 || (op3 && TREE_CODE (op3) == SSA_NAME
2156 && !types_compatible_p (type, TREE_TYPE (op3)))
2157 || (op4 && TREE_CODE (op4) == SSA_NAME
2158 && !types_compatible_p (type, TREE_TYPE (op4))))
2160 if (vect_print_dump_info (REPORT_DETAILS))
2162 fprintf (vect_dump, "reduction: multiple types: operation type: ");
2163 print_generic_expr (vect_dump, type, TDF_SLIM);
2164 fprintf (vect_dump, ", operands types: ");
2165 print_generic_expr (vect_dump, TREE_TYPE (op1), TDF_SLIM);
2166 fprintf (vect_dump, ",");
2167 print_generic_expr (vect_dump, TREE_TYPE (op2), TDF_SLIM);
2170 fprintf (vect_dump, ",");
2171 print_generic_expr (vect_dump, TREE_TYPE (op3), TDF_SLIM);
2176 fprintf (vect_dump, ",");
2177 print_generic_expr (vect_dump, TREE_TYPE (op4), TDF_SLIM);
2184 /* Check that it's ok to change the order of the computation.
2185 Generally, when vectorizing a reduction we change the order of the
2186 computation. This may change the behavior of the program in some
2187 cases, so we need to check that this is ok. One exception is when
2188 vectorizing an outer-loop: the inner-loop is executed sequentially,
2189 and therefore vectorizing reductions in the inner-loop during
2190 outer-loop vectorization is safe. */
2192 /* CHECKME: check for !flag_finite_math_only too? */
2193 if (SCALAR_FLOAT_TYPE_P (type) && !flag_associative_math
2196 /* Changing the order of operations changes the semantics. */
2197 if (vect_print_dump_info (REPORT_DETAILS))
2198 report_vect_op (def_stmt, "reduction: unsafe fp math optimization: ");
2201 else if (INTEGRAL_TYPE_P (type) && TYPE_OVERFLOW_TRAPS (type)
2204 /* Changing the order of operations changes the semantics. */
2205 if (vect_print_dump_info (REPORT_DETAILS))
2206 report_vect_op (def_stmt, "reduction: unsafe int math optimization: ");
2209 else if (SAT_FIXED_POINT_TYPE_P (type) && check_reduction)
2211 /* Changing the order of operations changes the semantics. */
2212 if (vect_print_dump_info (REPORT_DETAILS))
2213 report_vect_op (def_stmt,
2214 "reduction: unsafe fixed-point math optimization: ");
2218 /* If we detected "res -= x[i]" earlier, rewrite it into
2219 "res += -x[i]" now. If this turns out to be useless reassoc
2220 will clean it up again. */
2221 if (orig_code == MINUS_EXPR)
2223 tree rhs = gimple_assign_rhs2 (def_stmt);
2224 tree negrhs = make_ssa_name (SSA_NAME_VAR (rhs), NULL);
2225 gimple negate_stmt = gimple_build_assign_with_ops (NEGATE_EXPR, negrhs,
2227 gimple_stmt_iterator gsi = gsi_for_stmt (def_stmt);
2228 set_vinfo_for_stmt (negate_stmt, new_stmt_vec_info (negate_stmt,
2230 gsi_insert_before (&gsi, negate_stmt, GSI_NEW_STMT);
2231 gimple_assign_set_rhs2 (def_stmt, negrhs);
2232 gimple_assign_set_rhs_code (def_stmt, PLUS_EXPR);
2233 update_stmt (def_stmt);
2236 /* Reduction is safe. We're dealing with one of the following:
2237 1) integer arithmetic and no trapv
2238 2) floating point arithmetic, and special flags permit this optimization
2239 3) nested cycle (i.e., outer loop vectorization). */
2240 if (TREE_CODE (op1) == SSA_NAME)
2241 def1 = SSA_NAME_DEF_STMT (op1);
2243 if (TREE_CODE (op2) == SSA_NAME)
2244 def2 = SSA_NAME_DEF_STMT (op2);
2246 if (code != COND_EXPR
2247 && ((!def1 || gimple_nop_p (def1)) && (!def2 || gimple_nop_p (def2))))
2249 if (vect_print_dump_info (REPORT_DETAILS))
2250 report_vect_op (def_stmt, "reduction: no defs for operands: ");
2254 /* Check that one def is the reduction def, defined by PHI,
2255 the other def is either defined in the loop ("vect_internal_def"),
2256 or it's an induction (defined by a loop-header phi-node). */
2258 if (def2 && def2 == phi
2259 && (code == COND_EXPR
2260 || !def1 || gimple_nop_p (def1)
2261 || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1))
2262 && (is_gimple_assign (def1)
2263 || is_gimple_call (def1)
2264 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
2265 == vect_induction_def
2266 || (gimple_code (def1) == GIMPLE_PHI
2267 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
2268 == vect_internal_def
2269 && !is_loop_header_bb_p (gimple_bb (def1)))))))
2271 if (vect_print_dump_info (REPORT_DETAILS))
2272 report_vect_op (def_stmt, "detected reduction: ");
/* Mirror case: the reduction def is the first operand; swap so later
   code can assume it is the second.  */
2276 if (def1 && def1 == phi
2277 && (code == COND_EXPR
2278 || !def2 || gimple_nop_p (def2)
2279 || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2))
2280 && (is_gimple_assign (def2)
2281 || is_gimple_call (def2)
2282 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
2283 == vect_induction_def
2284 || (gimple_code (def2) == GIMPLE_PHI
2285 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
2286 == vect_internal_def
2287 && !is_loop_header_bb_p (gimple_bb (def2)))))))
2289 if (check_reduction)
2291 /* Swap operands (just for simplicity - so that the rest of the code
2292 can assume that the reduction variable is always the last (second)
2294 if (vect_print_dump_info (REPORT_DETAILS))
2295 report_vect_op (def_stmt,
2296 "detected reduction: need to swap operands: ");
2298 swap_tree_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
2299 gimple_assign_rhs2_ptr (def_stmt));
2303 if (vect_print_dump_info (REPORT_DETAILS))
2304 report_vect_op (def_stmt, "detected reduction: ");
2310 /* Try to find SLP reduction chain. */
2311 if (check_reduction && vect_is_slp_reduction (loop_info, phi, def_stmt))
2313 if (vect_print_dump_info (REPORT_DETAILS))
2314 report_vect_op (def_stmt, "reduction: detected reduction chain: ");
2319 if (vect_print_dump_info (REPORT_DETAILS))
2320 report_vect_op (def_stmt, "reduction: unknown pattern: ");
2325 /* Wrapper around vect_is_simple_reduction_1, that won't modify code
2326 in-place. Arguments as there. */
2329 vect_is_simple_reduction (loop_vec_info loop_info, gimple phi,
2330 bool check_reduction, bool *double_reduc)
2332 return vect_is_simple_reduction_1 (loop_info, phi, check_reduction,
2333 double_reduc, false);
2336 /* Wrapper around vect_is_simple_reduction_1, which will modify code
2337 in-place if it enables detection of more reductions. Arguments
2341 vect_force_simple_reduction (loop_vec_info loop_info, gimple phi,
2342 bool check_reduction, bool *double_reduc)
2344 return vect_is_simple_reduction_1 (loop_info, phi, check_reduction,
2345 double_reduc, true);
2348 /* Calculate the cost of one scalar iteration of the loop. */
/* Sums the target-provided scalar cost of every assign/call stmt that
   would be vectorized, weighting inner-loop stmts by an estimated trip
   count.  NOTE(review): the function name has a typo ("iteraion");
   renaming would require updating callers outside this chunk.  */
2350 vect_get_single_scalar_iteraion_cost (loop_vec_info loop_vinfo)
2352 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2353 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
2354 int nbbs = loop->num_nodes, factor, scalar_single_iter_cost = 0;
2355 int innerloop_iters, i, stmt_cost;
2357 /* Count statements in scalar loop. Using this as scalar cost for a single
2360 TODO: Add outer loop support.
2362 TODO: Consider assigning different costs to different scalar
2366 innerloop_iters = 1;
/* Hard-coded inner-loop trip-count guess for outer-loop vectorization.  */
2368 innerloop_iters = 50; /* FIXME */
2370 for (i = 0; i < nbbs; i++)
2372 gimple_stmt_iterator si;
2373 basic_block bb = bbs[i];
/* Inner-loop blocks execute INNERLOOP_ITERS times per outer iteration.  */
2375 if (bb->loop_father == loop->inner)
2376 factor = innerloop_iters;
2380 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
2382 gimple stmt = gsi_stmt (si);
2383 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2385 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
2388 /* Skip stmts that are not vectorized inside the loop. */
2390 && !STMT_VINFO_RELEVANT_P (stmt_info)
2391 && (!STMT_VINFO_LIVE_P (stmt_info)
2392 || STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def)
/* Memory references cost as scalar loads/stores; everything else as a
   plain scalar statement.  */
2395 if (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt)))
2397 if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))))
2398 stmt_cost = vect_get_cost (scalar_load);
2400 stmt_cost = vect_get_cost (scalar_store);
2403 stmt_cost = vect_get_cost (scalar_stmt);
2405 scalar_single_iter_cost += stmt_cost * factor;
2408 return scalar_single_iter_cost;
2411 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times.
   Returns the scalar cost of the peeled prologue plus epilogue
   iterations; *PEEL_ITERS_EPILOGUE is set as a side effect.  */
/* NOTE(review): extraction gaps — the return-type line, braces, the 'else'
   keyword, and the trailing '+ peel_guard_costs;' term of the return
   expression are not visible here; confirm against the full source.  */
2413 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
2414 int *peel_iters_epilogue,
2415 int scalar_single_iter_cost)
2417 int peel_guard_costs = 0;
2418 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
/* Unknown trip count: assume vf/2 epilogue iterations (worst case is
   vf-1) and charge guard branches for the peel loops.  */
2420 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2422 *peel_iters_epilogue = vf/2;
2423 if (vect_print_dump_info (REPORT_COST))
2424 fprintf (vect_dump, "cost model: "
2425 "epilogue peel iters set to vf/2 because "
2426 "loop iterations are unknown .");
2428 /* If peeled iterations are known but number of scalar loop
2429 iterations are unknown, count a taken branch per peeled loop. */
2430 peel_guard_costs = 2 * vect_get_cost (cond_branch_taken);
/* Known trip count: clamp the prologue to the trip count and compute
   the exact epilogue remainder.  */
2434 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
2435 peel_iters_prologue = niters < peel_iters_prologue ?
2436 niters : peel_iters_prologue;
2437 *peel_iters_epilogue = (niters - peel_iters_prologue) % vf;
2438 /* If we need to peel for gaps, but no peeling is required, we have to
2439 peel VF iterations. */
2440 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
2441 *peel_iters_epilogue = vf;
2444 return (peel_iters_prologue * scalar_single_iter_cost)
2445 + (*peel_iters_epilogue * scalar_single_iter_cost)
2449 /* Function vect_estimate_min_profitable_iters
2451 Return the number of iterations required for the vector version of the
2452 loop to be profitable relative to the cost of the scalar version of the
   loop.
2455 TODO: Take profile info into account before making vectorization
2456 decisions, if available. */
/* NOTE(review): this extract has dropped many interior lines (return type,
   braces, 'else' arms, early 'return 0;' paths, some assignments) — the
   embedded line numbers jump throughout; confirm against the full source
   before relying on the exact control flow shown here.  */
2459 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo)
2462 int min_profitable_iters;
2463 int peel_iters_prologue;
2464 int peel_iters_epilogue;
2465 int vec_inside_cost = 0;
2466 int vec_outside_cost = 0;
2467 int scalar_single_iter_cost = 0;
2468 int scalar_outside_cost = 0;
2469 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2470 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2471 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
2472 int nbbs = loop->num_nodes;
2473 int npeel = LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo);
2474 int peel_guard_costs = 0;
2475 int innerloop_iters = 0, factor;
2476 VEC (slp_instance, heap) *slp_instances;
2477 slp_instance instance;
2479 /* Cost model disabled. */
2480 if (!flag_vect_cost_model)
2482 if (vect_print_dump_info (REPORT_COST))
2483 fprintf (vect_dump, "cost model disabled.")
2487 /* Requires loop versioning tests to handle misalignment. */
/* Each runtime check adds to the vector-path outside cost; the count of
   checks is used directly as the cost (see FIXME below).  */
2488 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2490 /* FIXME: Make cost depend on complexity of individual check. */
2492 VEC_length (gimple, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));
2493 if (vect_print_dump_info (REPORT_COST))
2494 fprintf (vect_dump, "cost model: Adding cost of checks for loop "
2495 "versioning to treat misalignment.\n");
2498 /* Requires loop versioning with alias checks. */
2499 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2501 /* FIXME: Make cost depend on complexity of individual check. */
2503 VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo));
2504 if (vect_print_dump_info (REPORT_COST))
2505 fprintf (vect_dump, "cost model: Adding cost of checks for loop "
2506 "versioning aliasing.\n");
/* Any versioning at all costs one taken branch on the vector path.  */
2509 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2510 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2511 vec_outside_cost += vect_get_cost (cond_branch_taken);
2513 /* Count statements in scalar loop. Using this as scalar cost for a single
   iteration.
2516 TODO: Add outer loop support.
2518 TODO: Consider assigning different costs to different scalar
   statements.  */
2523 innerloop_iters = 50; /* FIXME */
/* Accumulate the per-statement vector costs computed during analysis.  */
2525 for (i = 0; i < nbbs; i++)
2527 gimple_stmt_iterator si;
2528 basic_block bb = bbs[i];
2530 if (bb->loop_father == loop->inner)
2531 factor = innerloop_iters;
2535 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
2537 gimple stmt = gsi_stmt (si);
2538 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2539 /* Skip stmts that are not vectorized inside the loop. */
2540 if (!STMT_VINFO_RELEVANT_P (stmt_info)
2541 && (!STMT_VINFO_LIVE_P (stmt_info)
2542 || STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def))
2544 vec_inside_cost += STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) * factor;
2545 /* FIXME: for stmts in the inner-loop in outer-loop vectorization,
2546 some of the "outside" costs are generated inside the outer-loop. */
2547 vec_outside_cost += STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info);
/* Note: calls the function with the historical "iteraion" typo — keep
   the spelling consistent with its definition.  */
2551 scalar_single_iter_cost = vect_get_single_scalar_iteraion_cost (loop_vinfo);
2553 /* Add additional cost for the peeled instructions in prologue and epilogue
   loop.
2556 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
2557 at compile-time - we assume it's vf/2 (the worst would be vf-1).
2559 TODO: Build an expression that represents peel_iters for prologue and
2560 epilogue to be used in a run-time test. */
2564 peel_iters_prologue = vf/2;
2565 if (vect_print_dump_info (REPORT_COST))
2566 fprintf (vect_dump, "cost model: "
2567 "prologue peel iters set to vf/2.");
2569 /* If peeling for alignment is unknown, loop bound of main loop becomes
   unknown.  */
2571 peel_iters_epilogue = vf/2;
2572 if (vect_print_dump_info (REPORT_COST))
2573 fprintf (vect_dump, "cost model: "
2574 "epilogue peel iters set to vf/2 because "
2575 "peeling for alignment is unknown .");
2577 /* If peeled iterations are unknown, count a taken branch and a not taken
2578 branch per peeled loop. Even if scalar loop iterations are known,
2579 vector iterations are not known since peeled prologue iterations are
2580 not known. Hence guards remain the same. */
2581 peel_guard_costs += 2 * (vect_get_cost (cond_branch_taken)
2582 + vect_get_cost (cond_branch_not_taken));
2583 vec_outside_cost += (peel_iters_prologue * scalar_single_iter_cost)
2584 + (peel_iters_epilogue * scalar_single_iter_cost)
/* Known peel count (npeel >= 0): delegate the exact computation.  */
2589 peel_iters_prologue = npeel;
2590 vec_outside_cost += vect_get_known_peeling_cost (loop_vinfo,
2591 peel_iters_prologue, &peel_iters_epilogue,
2592 scalar_single_iter_cost);
2595 /* FORNOW: The scalar outside cost is incremented in one of the
   following ways:
2598 1. The vectorizer checks for alignment and aliasing and generates
2599 a condition that allows dynamic vectorization. A cost model
2600 check is ANDED with the versioning condition. Hence scalar code
2601 path now has the added cost of the versioning check.
2603 if (cost > th & versioning_check)
2606 Hence run-time scalar is incremented by not-taken branch cost.
2608 2. The vectorizer then checks if a prologue is required. If the
2609 cost model check was not done before during versioning, it has to
2610 be done before the prologue check.
2613 prologue = scalar_iters
2618 if (prologue == num_iters)
2621 Hence the run-time scalar cost is incremented by a taken branch,
2622 plus a not-taken branch, plus a taken branch cost.
2624 3. The vectorizer then checks if an epilogue is required. If the
2625 cost model check was not done before during prologue check, it
2626 has to be done with the epilogue check.
2632 if (prologue == num_iters)
2635 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
2638 Hence the run-time scalar cost should be incremented by 2 taken
   branches.
2641 TODO: The back end may reorder the BBS's differently and reverse
2642 conditions/branch directions. Change the estimates below to
2643 something more reasonable. */
2645 /* If the number of iterations is known and we do not do versioning, we can
2646 decide whether to vectorize at compile time. Hence the scalar version
2647 do not carry cost model guard costs. */
2648 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2649 || LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2650 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2652 /* Cost model check occurs at versioning. */
2653 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2654 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2655 scalar_outside_cost += vect_get_cost (cond_branch_not_taken);
2658 /* Cost model check occurs at prologue generation. */
2659 if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2660 scalar_outside_cost += 2 * vect_get_cost (cond_branch_taken)
2661 + vect_get_cost (cond_branch_not_taken);
2662 /* Cost model check occurs at epilogue generation. */
2664 scalar_outside_cost += 2 * vect_get_cost (cond_branch_taken);
2668 /* Add SLP costs. */
2669 slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
2670 FOR_EACH_VEC_ELT (slp_instance, slp_instances, i, instance)
2672 vec_outside_cost += SLP_INSTANCE_OUTSIDE_OF_LOOP_COST (instance);
2673 vec_inside_cost += SLP_INSTANCE_INSIDE_OF_LOOP_COST (instance);
2676 /* Calculate number of iterations required to make the vector version
2677 profitable, relative to the loop bodies only. The following condition
   must hold true:
2679 SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
   where
2681 SIC = scalar iteration cost, VIC = vector iteration cost,
2682 VOC = vector outside cost, VF = vectorization factor,
2683 PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
2684 SOC = scalar outside cost for run time cost model check. */
2686 if ((scalar_single_iter_cost * vf) > vec_inside_cost)
2688 if (vec_outside_cost <= 0)
2689 min_profitable_iters = 1;
2692 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost) * vf
2693 - vec_inside_cost * peel_iters_prologue
2694 - vec_inside_cost * peel_iters_epilogue)
2695 / ((scalar_single_iter_cost * vf)
/* Integer division truncates; bump up by one if the computed value
   does not actually satisfy the profitability inequality.  */
2698 if ((scalar_single_iter_cost * vf * min_profitable_iters)
2699 <= ((vec_inside_cost * min_profitable_iters)
2700 + ((vec_outside_cost - scalar_outside_cost) * vf)))
2701 min_profitable_iters++;
2704 /* vector version will never be profitable. */
2707 if (vect_print_dump_info (REPORT_COST))
2708 fprintf (vect_dump, "cost model: the vector iteration cost = %d "
2709 "divided by the scalar iteration cost = %d "
2710 "is greater or equal to the vectorization factor = %d.",
2711 vec_inside_cost, scalar_single_iter_cost, vf);
2715 if (vect_print_dump_info (REPORT_COST))
2717 fprintf (vect_dump, "Cost model analysis: \n");
2718 fprintf (vect_dump, " Vector inside of loop cost: %d\n",
2720 fprintf (vect_dump, " Vector outside of loop cost: %d\n",
2722 fprintf (vect_dump, " Scalar iteration cost: %d\n",
2723 scalar_single_iter_cost);
2724 fprintf (vect_dump, " Scalar outside cost: %d\n", scalar_outside_cost);
2725 fprintf (vect_dump, " prologue iterations: %d\n",
2726 peel_iters_prologue);
2727 fprintf (vect_dump, " epilogue iterations: %d\n",
2728 peel_iters_epilogue);
2729 fprintf (vect_dump, " Calculated minimum iters for profitability: %d\n",
2730 min_profitable_iters);
/* Never run fewer than VF iterations of the vector loop.  */
2733 min_profitable_iters =
2734 min_profitable_iters < vf ? vf : min_profitable_iters;
2736 /* Because the condition we create is:
2737 if (niters <= min_profitable_iters)
2738 then skip the vectorized loop. */
2739 min_profitable_iters--;
2741 if (vect_print_dump_info (REPORT_COST))
2742 fprintf (vect_dump, " Profitability threshold = %d\n",
2743 min_profitable_iters);
2745 return min_profitable_iters;
2749 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
2750 functions. Design better to avoid maintenance issues. */
2752 /* Function vect_model_reduction_cost.
2754 Models cost for a reduction operation, including the vector ops
2755 generated within the strip-mine loop, the initial definition before
2756 the loop, and the epilogue code that must be generated. */
/* NOTE(review): extraction gaps — the return type, some declarations
   (ncopies, outer_cost, vectype, optab, bitsize), braces, 'break's and
   'else' arms are not visible in this extract; confirm against the full
   source.  Presumably returns bool (false on unsupported data-type).  */
2759 vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
2763 enum tree_code code;
2766 gimple stmt, orig_stmt;
2768 enum machine_mode mode;
2769 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2770 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2773 /* Cost of reduction op inside loop. */
2774 STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info)
2775 += ncopies * vect_get_cost (vector_stmt);
2777 stmt = STMT_VINFO_STMT (stmt_info);
/* Pick out the reduction operand according to the RHS shape; for a
   ternary RHS (e.g. COND_EXPR) the reduction value is the last operand.  */
2779 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
2781 case GIMPLE_SINGLE_RHS:
2782 gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt)) == ternary_op);
2783 reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), 2);
2785 case GIMPLE_UNARY_RHS:
2786 reduction_op = gimple_assign_rhs1 (stmt);
2788 case GIMPLE_BINARY_RHS:
2789 reduction_op = gimple_assign_rhs2 (stmt);
2791 case GIMPLE_TERNARY_RHS:
2792 reduction_op = gimple_assign_rhs3 (stmt);
2798 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
/* No vector type for this scalar type: report and bail out.  */
2801 if (vect_print_dump_info (REPORT_COST))
2803 fprintf (vect_dump, "unsupported data-type ");
2804 print_generic_expr (vect_dump, TREE_TYPE (reduction_op), TDF_SLIM);
2809 mode = TYPE_MODE (vectype);
/* For pattern stmts, cost the original stmt the pattern replaced.  */
2810 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
2813 orig_stmt = STMT_VINFO_STMT (stmt_info);
2815 code = gimple_assign_rhs_code (orig_stmt);
2817 /* Add in cost for initial definition. */
2818 outer_cost += vect_get_cost (scalar_to_vec);
2820 /* Determine cost of epilogue code.
2822 We have a reduction operator that will reduce the vector in one statement.
2823 Also requires scalar extract. */
2825 if (!nested_in_vect_loop_p (loop, orig_stmt))
2827 if (reduc_code != ERROR_MARK)
2828 outer_cost += vect_get_cost (vector_stmt)
2829 + vect_get_cost (vec_to_scalar);
/* No direct reduction instruction: fall back to shifts or extracts.  */
2832 int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
2834 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
2835 int element_bitsize = tree_low_cst (bitsize, 1);
2836 int nelements = vec_size_in_bits / element_bitsize;
2838 optab = optab_for_tree_code (code, vectype, optab_default);
2840 /* We have a whole vector shift available. */
2841 if (VECTOR_MODE_P (mode)
2842 && optab_handler (optab, mode) != CODE_FOR_nothing
2843 && optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
2844 /* Final reduction via vector shifts and the reduction operator. Also
2845 requires scalar extract. */
2846 outer_cost += ((exact_log2(nelements) * 2)
2847 * vect_get_cost (vector_stmt)
2848 + vect_get_cost (vec_to_scalar));
2850 /* Use extracts and reduction op for final reduction. For N elements,
2851 we have N extracts and N-1 reduction ops. */
2852 outer_cost += ((nelements + nelements - 1)
2853 * vect_get_cost (vector_stmt));
2857 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info) = outer_cost;
2859 if (vect_print_dump_info (REPORT_COST))
2860 fprintf (vect_dump, "vect_model_reduction_cost: inside_cost = %d, "
2861 "outside_cost = %d .", STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info),
2862 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info));
2868 /* Function vect_model_induction_cost.
2870 Models cost for induction operations.
   STMT_INFO - stmt-info of the induction; NCOPIES - number of vector
   copies needed (VF/nunits).  Sets the inside- and outside-of-loop cost
   fields of STMT_INFO as a side effect; no return value used.  */
/* NOTE(review): return-type line and braces appear missing from this
   extract — confirm against the full source file.  */
2873 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
2875 /* loop cost for vec_loop: one vector add per copy per iteration. */
2876 STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info)
2877 = ncopies * vect_get_cost (vector_stmt);
2878 /* prologue cost for vec_init and vec_step: two scalar-to-vector
   broadcasts before the loop. */
2879 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info)
2880 = 2 * vect_get_cost (scalar_to_vec);
2882 if (vect_print_dump_info (REPORT_COST))
2883 fprintf (vect_dump, "vect_model_induction_cost: inside_cost = %d, "
2884 "outside_cost = %d .", STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info),
2885 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info));
2889 /* Function get_initial_def_for_induction
   Input:
2892 STMT - a stmt that performs an induction operation in the loop.
2893 IV_PHI - the initial value of the induction variable
   Output:
2896 Return a vector variable, initialized with the first VF values of
2897 the induction variable. E.g., for an iv with IV_PHI='X' and
2898 evolution S, for a vector of 4 units, we want to return:
2899 [X, X + S, X + 2*S, X + 3*S]. */
/* NOTE(review): extraction gaps — the return type, several declarations
   (new_var, new_name, vectype, resvectype, nunits, ncopies, scalar_type,
   latch_e, loop_arg, access_fn, exit_phi, new_bb, ok, i, expr,
   stepvectype), braces, 'else' arms and the final return statement are
   not visible here; confirm against the full source before editing.  */
2902 get_initial_def_for_induction (gimple iv_phi)
2904 stmt_vec_info stmt_vinfo = vinfo_for_stmt (iv_phi);
2905 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
2906 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2910 edge pe = loop_preheader_edge (loop);
2911 struct loop *iv_loop;
2913 tree vec, vec_init, vec_step, t;
2917 gimple init_stmt, induction_phi, new_stmt;
2918 tree induc_def, vec_def, vec_dest;
2919 tree init_expr, step_expr;
2920 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2925 stmt_vec_info phi_info = vinfo_for_stmt (iv_phi);
2926 bool nested_in_vect_loop = false;
2927 gimple_seq stmts = NULL;
2928 imm_use_iterator imm_iter;
2929 use_operand_p use_p;
2933 gimple_stmt_iterator si;
2934 basic_block bb = gimple_bb (iv_phi);
2938 /* Is phi in an inner-loop, while vectorizing an enclosing outer-loop? */
2939 if (nested_in_vect_loop_p (loop, iv_phi))
2941 nested_in_vect_loop = true;
2942 iv_loop = loop->inner;
2946 gcc_assert (iv_loop == (gimple_bb (iv_phi))->loop_father);
2948 latch_e = loop_latch_edge (iv_loop);
2949 loop_arg = PHI_ARG_DEF_FROM_EDGE (iv_phi, latch_e);
/* Recover the IV's initial value and step from scalar evolution.  */
2951 access_fn = analyze_scalar_evolution (iv_loop, PHI_RESULT (iv_phi));
2952 gcc_assert (access_fn);
2953 STRIP_NOPS (access_fn);
2954 ok = vect_is_simple_iv_evolution (iv_loop->num, access_fn,
2955 &init_expr, &step_expr);
2957 pe = loop_preheader_edge (iv_loop);
2959 scalar_type = TREE_TYPE (init_expr);
2960 vectype = get_vectype_for_scalar_type (scalar_type);
2961 resvectype = get_vectype_for_scalar_type (TREE_TYPE (PHI_RESULT (iv_phi)));
2962 gcc_assert (vectype);
2963 nunits = TYPE_VECTOR_SUBPARTS (vectype);
2964 ncopies = vf / nunits;
2966 gcc_assert (phi_info);
2967 gcc_assert (ncopies >= 1);
2969 /* Find the first insertion point in the BB. */
2970 si = gsi_after_labels (bb);
2972 /* Create the vector that holds the initial_value of the induction. */
2973 if (nested_in_vect_loop)
2975 /* iv_loop is nested in the loop to be vectorized. init_expr had already
2976 been created during vectorization of previous stmts. We obtain it
2977 from the STMT_VINFO_VEC_STMT of the defining stmt. */
2978 tree iv_def = PHI_ARG_DEF_FROM_EDGE (iv_phi,
2979 loop_preheader_edge (iv_loop));
2980 vec_init = vect_get_vec_def_for_operand (iv_def, iv_phi, NULL);
2984 /* iv_loop is the loop to be vectorized. Create:
2985 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
2986 new_var = vect_get_new_vect_var (scalar_type, vect_scalar_var, "var_");
2987 add_referenced_var (new_var);
2989 new_name = force_gimple_operand (init_expr, &stmts, false, new_var);
/* Any statements needed to compute init_expr go on the preheader edge;
   the insertion must not split the edge (assert below).  */
2992 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
2993 gcc_assert (!new_bb);
2997 t = tree_cons (NULL_TREE, new_name, t);
2998 for (i = 1; i < nunits; i++)
3000 /* Create: new_name_i = new_name + step_expr */
3001 enum tree_code code = POINTER_TYPE_P (scalar_type)
3002 ? POINTER_PLUS_EXPR : PLUS_EXPR;
3003 init_stmt = gimple_build_assign_with_ops (code, new_var,
3004 new_name, step_expr);
3005 new_name = make_ssa_name (new_var, init_stmt);
3006 gimple_assign_set_lhs (init_stmt, new_name);
3008 new_bb = gsi_insert_on_edge_immediate (pe, init_stmt);
3009 gcc_assert (!new_bb);
3011 if (vect_print_dump_info (REPORT_DETAILS))
3013 fprintf (vect_dump, "created new init_stmt: ");
3014 print_gimple_stmt (vect_dump, init_stmt, 0, TDF_SLIM);
3016 t = tree_cons (NULL_TREE, new_name, t);
3018 /* Create a vector from [new_name_0, new_name_1, ..., new_name_nunits-1] */
3019 vec = build_constructor_from_list (vectype, nreverse (t));
3020 vec_init = vect_init_vector (iv_phi, vec, vectype, NULL);
3024 /* Create the vector that holds the step of the induction. */
3025 if (nested_in_vect_loop)
3026 /* iv_loop is nested in the loop to be vectorized. Generate:
3027 vec_step = [S, S, S, S] */
3028 new_name = step_expr;
3031 /* iv_loop is the loop to be vectorized. Generate:
3032 vec_step = [VF*S, VF*S, VF*S, VF*S] */
3033 expr = build_int_cst (TREE_TYPE (step_expr), vf);
3034 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
3038 t = unshare_expr (new_name);
3039 gcc_assert (CONSTANT_CLASS_P (new_name));
3040 stepvectype = get_vectype_for_scalar_type (TREE_TYPE (new_name));
3041 gcc_assert (stepvectype);
3042 vec = build_vector_from_val (stepvectype, t);
3043 vec_step = vect_init_vector (iv_phi, vec, stepvectype, NULL);
3046 /* Create the following def-use cycle:
3051 vec_iv = PHI <vec_init, vec_loop>
3055 vec_loop = vec_iv + vec_step; */
3057 /* Create the induction-phi that defines the induction-operand. */
3058 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
3059 add_referenced_var (vec_dest);
3060 induction_phi = create_phi_node (vec_dest, iv_loop->header);
3061 set_vinfo_for_stmt (induction_phi,
3062 new_stmt_vec_info (induction_phi, loop_vinfo, NULL));
3063 induc_def = PHI_RESULT (induction_phi);
3065 /* Create the iv update inside the loop */
3066 new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, vec_dest,
3067 induc_def, vec_step);
3068 vec_def = make_ssa_name (vec_dest, new_stmt);
3069 gimple_assign_set_lhs (new_stmt, vec_def);
3070 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3071 set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo,
3074 /* Set the arguments of the phi node: */
3075 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
3076 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
3080 /* In case that vectorization factor (VF) is bigger than the number
3081 of elements that we can fit in a vectype (nunits), we have to generate
3082 more than one vector stmt - i.e - we need to "unroll" the
3083 vector stmt by a factor VF/nunits. For more details see documentation
3084 in vectorizable_operation. */
3088 stmt_vec_info prev_stmt_vinfo;
3089 /* FORNOW. This restriction should be relaxed. */
3090 gcc_assert (!nested_in_vect_loop);
3092 /* Create the vector that holds the step of the induction:
   here the step between copies is nunits*S, not VF*S. */
3093 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
3094 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
3096 t = unshare_expr (new_name);
3097 gcc_assert (CONSTANT_CLASS_P (new_name));
3098 vec = build_vector_from_val (stepvectype, t);
3099 vec_step = vect_init_vector (iv_phi, vec, stepvectype, NULL);
3101 vec_def = induc_def;
3102 prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
3103 for (i = 1; i < ncopies; i++)
3105 /* vec_i = vec_prev + vec_step */
3106 new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, vec_dest,
3108 vec_def = make_ssa_name (vec_dest, new_stmt);
3109 gimple_assign_set_lhs (new_stmt, vec_def);
3111 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
/* If the PHI result type differs from the working vectype, insert a
   VIEW_CONVERT_EXPR so the copy has the expected result type.  */
3112 if (!useless_type_conversion_p (resvectype, vectype))
3114 new_stmt = gimple_build_assign_with_ops
3116 vect_get_new_vect_var (resvectype, vect_simple_var,
3118 build1 (VIEW_CONVERT_EXPR, resvectype,
3119 gimple_assign_lhs (new_stmt)), NULL_TREE);
3120 gimple_assign_set_lhs (new_stmt,
3122 (gimple_assign_lhs (new_stmt), new_stmt));
3123 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3125 set_vinfo_for_stmt (new_stmt,
3126 new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
3127 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
3128 prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
3132 if (nested_in_vect_loop)
3134 /* Find the loop-closed exit-phi of the induction, and record
3135 the final vector of induction results: */
3137 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
3139 if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (USE_STMT (use_p))))
3141 exit_phi = USE_STMT (use_p);
3147 stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
3148 /* FORNOW. Currently not supporting the case that an inner-loop induction
3149 is not used in the outer-loop (i.e. only outside the outer-loop). */
3150 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
3151 && !STMT_VINFO_LIVE_P (stmt_vinfo));
3153 STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
3154 if (vect_print_dump_info (REPORT_DETAILS))
3156 fprintf (vect_dump, "vector of inductions after inner-loop:");
3157 print_gimple_stmt (vect_dump, new_stmt, 0, TDF_SLIM);
3163 if (vect_print_dump_info (REPORT_DETAILS))
3165 fprintf (vect_dump, "transform induction: created def-use cycle: ");
3166 print_gimple_stmt (vect_dump, induction_phi, 0, TDF_SLIM);
3167 fprintf (vect_dump, "\n");
3168 print_gimple_stmt (vect_dump, SSA_NAME_DEF_STMT (vec_def), 0, TDF_SLIM);
3171 STMT_VINFO_VEC_STMT (phi_info) = induction_phi;
3172 if (!useless_type_conversion_p (resvectype, vectype))
3174 new_stmt = gimple_build_assign_with_ops
3176 vect_get_new_vect_var (resvectype, vect_simple_var, "vec_iv_"),
3177 build1 (VIEW_CONVERT_EXPR, resvectype, induc_def), NULL_TREE);
3178 induc_def = make_ssa_name (gimple_assign_lhs (new_stmt), new_stmt);
3179 gimple_assign_set_lhs (new_stmt, induc_def);
3180 si = gsi_start_bb (bb);
3181 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3182 set_vinfo_for_stmt (new_stmt,
3183 new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
3184 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_stmt))
3185 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (induction_phi));
3192 /* Function get_initial_def_for_reduction
   Input:
3195 STMT - a stmt that performs a reduction operation in the loop.
3196 INIT_VAL - the initial value of the reduction variable
   Output:
3199 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
3200 of the reduction (used for adjusting the epilog - see below).
3201 Return a vector variable, initialized according to the operation that STMT
3202 performs. This vector will be used as the initial value of the
3203 vector of partial results.
3205 Option1 (adjust in epilog): Initialize the vector as follows:
3206 add/bit or/xor: [0,0,...,0,0]
3207 mult/bit and: [1,1,...,1,1]
3208 min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
3209 and when necessary (e.g. add/mult case) let the caller know
3210 that it needs to adjust the result by init_val.
3212 Option2: Initialize the vector as follows:
3213 add/bit or/xor: [init_val,0,0,...,0]
3214 mult/bit and: [init_val,1,1,...,1]
3215 min/max/cond_expr: [init_val,init_val,...,init_val]
3216 and no adjustments are needed.
3218 For example, for the following code:
3224 STMT is 's = s + a[i]', and the reduction variable is 's'.
3225 For a vector of 4 units, we want to return either [0,0,0,init_val],
3226 or [0,0,0,0] and let the caller know that it needs to adjust
3227 the result at the end by 'init_val'.
3229 FORNOW, we are using the 'adjust in epilog' scheme, because this way the
3230 initialization vector is simpler (same element in all entries), if
3231 ADJUSTMENT_DEF is not NULL, and Option2 otherwise.
3233 A cost model should help decide between these two schemes. */
/* NOTE(review): extraction gaps — the return type, declarations of
   nunits, init_def, init_value, def_for_init, t and i, braces, the
   switch's case labels other than WIDEN_SUM_EXPR, 'break's and 'else'
   arms are not visible in this extract; confirm against the full
   source.  */
3236 get_initial_def_for_reduction (gimple stmt, tree init_val,
3237 tree *adjustment_def)
3239 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
3240 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
3241 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3242 tree scalar_type = TREE_TYPE (init_val);
3243 tree vectype = get_vectype_for_scalar_type (scalar_type);
3245 enum tree_code code = gimple_assign_rhs_code (stmt);
3250 bool nested_in_vect_loop = false;
3252 REAL_VALUE_TYPE real_init_val = dconst0;
3253 int int_init_val = 0;
3254 gimple def_stmt = NULL;
3256 gcc_assert (vectype);
3257 nunits = TYPE_VECTOR_SUBPARTS (vectype);
3259 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
3260 || SCALAR_FLOAT_TYPE_P (scalar_type));
3262 if (nested_in_vect_loop_p (loop, stmt))
3263 nested_in_vect_loop = true;
3265 gcc_assert (loop == (gimple_bb (stmt))->loop_father);
3267 /* In case of double reduction we only create a vector variable to be put
3268 in the reduction phi node. The actual statement creation is done in
3269 vect_create_epilog_for_reduction. */
3270 if (adjustment_def && nested_in_vect_loop
3271 && TREE_CODE (init_val) == SSA_NAME
3272 && (def_stmt = SSA_NAME_DEF_STMT (init_val))
3273 && gimple_code (def_stmt) == GIMPLE_PHI
3274 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3275 && vinfo_for_stmt (def_stmt)
3276 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
3277 == vect_double_reduction_def
3279 *adjustment_def = NULL;
3280 return vect_create_destination_var (init_val, vectype);
/* Fold a constant INIT_VAL into a literal of the scalar type so it can
   be used to build a constant initialization vector.  */
3283 if (TREE_CONSTANT (init_val))
3285 if (SCALAR_FLOAT_TYPE_P (scalar_type))
3286 init_value = build_real (scalar_type, TREE_REAL_CST (init_val));
3288 init_value = build_int_cst (scalar_type, TREE_INT_CST_LOW (init_val));
3291 init_value = init_val;
3295 case WIDEN_SUM_EXPR:
3303 /* ADJUSTMENT_DEF is NULL when called from
3304 vect_create_epilog_for_reduction to vectorize double reduction. */
3307 if (nested_in_vect_loop)
3308 *adjustment_def = vect_get_vec_def_for_operand (init_val, stmt,
3311 *adjustment_def = init_val;
/* Neutral element: 1 for MULT, all-ones for BIT_AND, else 0.  */
3314 if (code == MULT_EXPR)
3316 real_init_val = dconst1;
3320 if (code == BIT_AND_EXPR)
3323 if (SCALAR_FLOAT_TYPE_P (scalar_type))
3324 def_for_init = build_real (scalar_type, real_init_val);
3326 def_for_init = build_int_cst (scalar_type, int_init_val);
3328 /* Create a vector of '0' or '1' except the first element. */
3329 for (i = nunits - 2; i >= 0; --i)
3330 t = tree_cons (NULL_TREE, def_for_init, t);
3332 /* Option1: the first element is '0' or '1' as well. */
3335 t = tree_cons (NULL_TREE, def_for_init, t);
3336 init_def = build_vector (vectype, t);
3340 /* Option2: the first element is INIT_VAL. */
3341 t = tree_cons (NULL_TREE, init_value, t);
3342 if (TREE_CONSTANT (init_val))
3343 init_def = build_vector (vectype, t);
3345 init_def = build_constructor_from_list (vectype, t);
/* min/max/cond_expr path: splat INIT_VAL into every lane; no epilog
   adjustment needed.  */
3354 *adjustment_def = NULL_TREE;
3355 init_def = vect_get_vec_def_for_operand (init_val, stmt, NULL);
3359 init_def = build_vector_from_val (vectype, init_value);
3370 /* Function vect_create_epilog_for_reduction
3372 Create code at the loop-epilog to finalize the result of a reduction
3375 VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector
3376 reduction statements.
3377 STMT is the scalar reduction stmt that is being vectorized.
3378 NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
3379 number of elements that we can fit in a vectype (nunits). In this case
3380 we have to generate more than one vector stmt - i.e - we need to "unroll"
3381 the vector stmt by a factor VF/nunits. For more details see documentation
3382 in vectorizable_operation.
3383 REDUC_CODE is the tree-code for the epilog reduction.
3384 REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
3386 REDUC_INDEX is the index of the operand in the right hand side of the
3387 statement that is defined by REDUCTION_PHI.
3388 DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
3389 SLP_NODE is an SLP node containing a group of reduction statements. The
3390 first one in this group is STMT.
3393 1. Creates the reduction def-use cycles: sets the arguments for
3395 The loop-entry argument is the vectorized initial-value of the reduction.
3396 The loop-latch argument is taken from VECT_DEFS - the vector of partial
3398 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
3399 by applying the operation specified by REDUC_CODE if available, or by
3400 other means (whole-vector shifts or a scalar loop).
3401 The function also creates a new phi node at the loop exit to preserve
3402 loop-closed form, as illustrated below.
3404 The flow at the entry to this function:
3407 vec_def = phi <null, null> # REDUCTION_PHI
3408 VECT_DEF = vector_stmt # vectorized form of STMT
3409 s_loop = scalar_stmt # (scalar) STMT
3411 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
3415 The above is transformed by this function into:
3418 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
3419 VECT_DEF = vector_stmt # vectorized form of STMT
3420 s_loop = scalar_stmt # (scalar) STMT
3422 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
3423 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
3424 v_out2 = reduce <v_out1>
3425 s_out3 = extract_field <v_out2, 0>
3426 s_out4 = adjust_result <s_out3>
3432 vect_create_epilog_for_reduction (VEC (tree, heap) *vect_defs, gimple stmt,
3433                                   int ncopies, enum tree_code reduc_code,
3434                                   VEC (gimple, heap) *reduction_phis,
3435                                   int reduc_index, bool double_reduc,
3438   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3439   stmt_vec_info prev_phi_info;
3441   enum machine_mode mode;
3442   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3443   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
3444   basic_block exit_bb;
3447   gimple new_phi = NULL, phi;
3448   gimple_stmt_iterator exit_gsi;
3450   tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
3451   gimple epilog_stmt = NULL;
3452   enum tree_code code = gimple_assign_rhs_code (stmt);
3454   tree bitsize, bitpos;
3455   tree adjustment_def = NULL;
3456   tree vec_initial_def = NULL;
3457   tree reduction_op, expr, def;
3458   tree orig_name, scalar_result;
3459   imm_use_iterator imm_iter, phi_imm_iter;
3460   use_operand_p use_p, phi_use_p;
3461   bool extract_scalar_result = false;
3462   gimple use_stmt, orig_stmt, reduction_phi = NULL;
3463   bool nested_in_vect_loop = false;
3464   VEC (gimple, heap) *new_phis = NULL;
3465   enum vect_def_type dt = vect_unknown_def_type;
3467   VEC (tree, heap) *scalar_results = NULL;
3468   unsigned int group_size = 1, k, ratio;
3469   VEC (tree, heap) *vec_initial_defs = NULL;
3470   VEC (gimple, heap) *phis;
3471   bool slp_reduc = false;
3472   tree new_phi_result;
3475     group_size = VEC_length (gimple, SLP_TREE_SCALAR_STMTS (slp_node));
3477   if (nested_in_vect_loop_p (loop, stmt))
3481       nested_in_vect_loop = true;
3482       gcc_assert (!slp_node);
  /* Pick out the operand of STMT that participates in the reduction,
     according to STMT's gimple RHS class; for binary/ternary forms
     REDUC_INDEX selects which operand is the reduction variable.  */
3485   switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
3487     case GIMPLE_SINGLE_RHS:
3488       gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt))
3490       reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), reduc_index);
3492     case GIMPLE_UNARY_RHS:
3493       reduction_op = gimple_assign_rhs1 (stmt);
3495     case GIMPLE_BINARY_RHS:
3496       reduction_op = reduc_index ?
3497                      gimple_assign_rhs2 (stmt) : gimple_assign_rhs1 (stmt);
3499     case GIMPLE_TERNARY_RHS:
3500       reduction_op = gimple_op (stmt, reduc_index + 1);
3506   vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
3507   gcc_assert (vectype);
3508   mode = TYPE_MODE (vectype);
3510   /* 1. Create the reduction def-use cycle:
3511      Set the arguments of REDUCTION_PHIS, i.e., transform
3514         vec_def = phi <null, null>              # REDUCTION_PHI
3515         VECT_DEF = vector_stmt                  # vectorized form of STMT
3521         vec_def = phi <vec_init, VECT_DEF>      # REDUCTION_PHI
3522         VECT_DEF = vector_stmt                  # vectorized form of STMT
3525      (in case of SLP, do it for all the phis). */
3527   /* Get the loop-entry arguments. */
3529     vect_get_vec_defs (reduction_op, NULL_TREE, stmt, &vec_initial_defs,
3530                        NULL, slp_node, reduc_index);
3533       vec_initial_defs = VEC_alloc (tree, heap, 1);
3534      /* For the case of reduction, vect_get_vec_def_for_operand returns
3535         the scalar def before the loop, that defines the initial value
3536         of the reduction variable.  */
3537       vec_initial_def = vect_get_vec_def_for_operand (reduction_op, stmt,
3539       VEC_quick_push (tree, vec_initial_defs, vec_initial_def);
3542   /* Set phi nodes arguments.  */
3543   FOR_EACH_VEC_ELT (gimple, reduction_phis, i, phi)
3545       tree vec_init_def = VEC_index (tree, vec_initial_defs, i);
3546       tree def = VEC_index (tree, vect_defs, i);
3547       for (j = 0; j < ncopies; j++)
3549           /* Set the loop-entry arg of the reduction-phi.  */
3550           add_phi_arg (phi, vec_init_def, loop_preheader_edge (loop),
3553           /* Set the loop-latch arg for the reduction-phi.  */
3555             def = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type, def);
3557           add_phi_arg (phi, def, loop_latch_edge (loop), UNKNOWN_LOCATION);
3559           if (vect_print_dump_info (REPORT_DETAILS))
3561               fprintf (vect_dump, "transform reduction: created def-use"
3563               print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
3564               fprintf (vect_dump, "\n");
3565               print_gimple_stmt (vect_dump, SSA_NAME_DEF_STMT (def), 0,
3569           phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
3573   VEC_free (tree, heap, vec_initial_defs);
3575   /* 2. Create epilog code.
3576         The reduction epilog code operates across the elements of the vector
3577         of partial results computed by the vectorized loop.
3578         The reduction epilog code consists of:
3580         step 1: compute the scalar result in a vector (v_out2)
3581         step 2: extract the scalar result (s_out3) from the vector (v_out2)
3582         step 3: adjust the scalar result (s_out3) if needed.
3584         Step 1 can be accomplished using one of the following three schemes:
3585                 (scheme 1) using reduc_code, if available.
3586                 (scheme 2) using whole-vector shifts, if available.
3587                 (scheme 3) using a scalar loop. In this case steps 1+2 above are
3590           The overall epilog code looks like this:
3592           s_out0 = phi <s_loop>             # original EXIT_PHI
3593           v_out1 = phi <VECT_DEF>           # NEW_EXIT_PHI
3594           v_out2 = reduce <v_out1>          # step 1
3595           s_out3 = extract_field <v_out2, 0>    # step 2
3596           s_out4 = adjust_result <s_out3>       # step 3
3598           (step 3 is optional, and steps 1 and 2 may be combined).
3599           Lastly, the uses of s_out0 are replaced by s_out4.  */
3602   /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
3603         v_out1 = phi <VECT_DEF>
3604         Store them in NEW_PHIS.  */
3606   exit_bb = single_exit (loop)->dest;
3607   prev_phi_info = NULL;
3608   new_phis = VEC_alloc (gimple, heap, VEC_length (tree, vect_defs));
3609   FOR_EACH_VEC_ELT (tree, vect_defs, i, def)
3611       for (j = 0; j < ncopies; j++)
3613           phi = create_phi_node (SSA_NAME_VAR (def), exit_bb);
3614           set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo, NULL));
3616             VEC_quick_push (gimple, new_phis, phi);
3619               def = vect_get_vec_def_for_stmt_copy (dt, def);
3620               STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
3623           SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
3624           prev_phi_info = vinfo_for_stmt (phi);
3628   /* The epilogue is created for the outer-loop, i.e., for the loop being
3633       exit_bb = single_exit (loop)->dest;
3636   exit_gsi = gsi_after_labels (exit_bb);
3638   /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
3639          (i.e. when reduc_code is not available) and in the final adjustment
3640          code (if needed).  Also get the original scalar reduction variable as
3641          defined in the loop.  In case STMT is a "pattern-stmt" (i.e. - it
3642          represents a reduction pattern), the tree-code and scalar-def are
3643          taken from the original stmt that the pattern-stmt (STMT) replaces.
3644          Otherwise (it is a regular reduction) - the tree-code and scalar-def
3645          are taken from STMT.  */
3647   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3650       /* Regular reduction  */
3655       /* Reduction pattern  */
3656       stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
3657       gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
3658       gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
3661   code = gimple_assign_rhs_code (orig_stmt);
3662   /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
3663      partial results are added and not subtracted.  */
3664   if (code == MINUS_EXPR)
3667   scalar_dest = gimple_assign_lhs (orig_stmt);
3668   scalar_type = TREE_TYPE (scalar_dest);
3669   scalar_results = VEC_alloc (tree, heap, group_size);
3670   new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
3671   bitsize = TYPE_SIZE (scalar_type);
3673   /* In case this is a reduction in an inner-loop while vectorizing an outer
3674      loop - we don't need to extract a single scalar result at the end of the
3675      inner-loop (unless it is double reduction, i.e., the use of reduction is
3676      outside the outer-loop).  The final vector of partial results will be used
3677      in the vectorized outer-loop, or reduced to a scalar result at the end of
3679   if (nested_in_vect_loop && !double_reduc)
3680     goto vect_finalize_reduction;
3682   /* SLP reduction without reduction chain, e.g.,
3686      b2 = operation (b1)  */
3687   slp_reduc = (slp_node && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
3689   /* In case of reduction chain, e.g.,
3692      a3 = operation (a2),
3694      we may end up with more than one vector result.  Here we reduce them to
3696   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
3698       tree first_vect = PHI_RESULT (VEC_index (gimple, new_phis, 0));
3700       gimple new_vec_stmt = NULL;
3702       vec_dest = vect_create_destination_var (scalar_dest, vectype);
3703       for (k = 1; k < VEC_length (gimple, new_phis); k++)
3705           gimple next_phi = VEC_index (gimple, new_phis, k);
3706           tree second_vect = PHI_RESULT (next_phi);
3708           tmp = build2 (code, vectype,  first_vect, second_vect);
3709           new_vec_stmt = gimple_build_assign (vec_dest, tmp);
3710           first_vect = make_ssa_name (vec_dest, new_vec_stmt);
3711           gimple_assign_set_lhs (new_vec_stmt, first_vect);
3712           gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
3715       new_phi_result = first_vect;
3718           VEC_truncate (gimple, new_phis, 0);
3719           VEC_safe_push (gimple, heap, new_phis, new_vec_stmt);
3723     new_phi_result = PHI_RESULT (VEC_index (gimple, new_phis, 0));
3725   /* 2.3 Create the reduction code, using one of the three schemes described
3726          above. In SLP we simply need to extract all the elements from the 
3727          vector (without reducing them), so we use scalar shifts.  */
3728   if (reduc_code != ERROR_MARK && !slp_reduc)
3732       /*** Case 1:  Create:
3733            v_out2 = reduc_expr <v_out1>  */
3735       if (vect_print_dump_info (REPORT_DETAILS))
3736         fprintf (vect_dump, "Reduce using direct vector reduction.");
3738       vec_dest = vect_create_destination_var (scalar_dest, vectype);
3739       tmp = build1 (reduc_code, vectype, new_phi_result);
3740       epilog_stmt = gimple_build_assign (vec_dest, tmp);
3741       new_temp = make_ssa_name (vec_dest, epilog_stmt);
3742       gimple_assign_set_lhs (epilog_stmt, new_temp);
3743       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3745       extract_scalar_result = true;
  /* Schemes 2/3: probe the target for a whole-vector shift and for the
     reduction operation itself; fall back to scalar code otherwise.  */
3749       enum tree_code shift_code = ERROR_MARK;
3750       bool have_whole_vector_shift = true;
3752       int element_bitsize = tree_low_cst (bitsize, 1);
3753       int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
3756       if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3757         shift_code = VEC_RSHIFT_EXPR;
3759         have_whole_vector_shift = false;
3761       /* Regardless of whether we have a whole vector shift, if we're
3762          emulating the operation via tree-vect-generic, we don't want
3763          to use it.  Only the first round of the reduction is likely
3764          to still be profitable via emulation.  */
3765       /* ??? It might be better to emit a reduction tree code here, so that
3766          tree-vect-generic can expand the first round via bit tricks.  */
3767       if (!VECTOR_MODE_P (mode))
3768         have_whole_vector_shift = false;
3771           optab optab = optab_for_tree_code (code, vectype, optab_default);
3772           if (optab_handler (optab, mode) == CODE_FOR_nothing)
3773             have_whole_vector_shift = false;
3776       if (have_whole_vector_shift && !slp_reduc)
3778           /*** Case 2: Create:
3779              for (offset = VS/2; offset >= element_size; offset/=2)
3781                  Create:  va' = vec_shift <va, offset>
3782                  Create:  va = vop <va, va'>
3785           if (vect_print_dump_info (REPORT_DETAILS))
3786             fprintf (vect_dump, "Reduce using vector shifts");
3788           vec_dest = vect_create_destination_var (scalar_dest, vectype);
3789           new_temp = new_phi_result;
3790           for (bit_offset = vec_size_in_bits/2;
3791                bit_offset >= element_bitsize;
3794               tree bitpos = size_int (bit_offset);
3796               epilog_stmt = gimple_build_assign_with_ops (shift_code,
3797                                                vec_dest, new_temp, bitpos);
3798               new_name = make_ssa_name (vec_dest, epilog_stmt);
3799               gimple_assign_set_lhs (epilog_stmt, new_name);
3800               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3802               epilog_stmt = gimple_build_assign_with_ops (code, vec_dest,
3803                                                           new_name, new_temp);
3804               new_temp = make_ssa_name (vec_dest, epilog_stmt);
3805               gimple_assign_set_lhs (epilog_stmt, new_temp);
3806               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3809           extract_scalar_result = true;
3815           /*** Case 3: Create:
3816              s = extract_field <v_out2, 0>
3817              for (offset = element_size;
3818                   offset < vector_size;
3819                   offset += element_size;)
3821                  Create:  s' = extract_field <v_out2, offset>
3822                  Create:  s = op <s, s'>  // For non SLP cases
3825           if (vect_print_dump_info (REPORT_DETAILS))
3826             fprintf (vect_dump, "Reduce using scalar code. ");
3828           vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
3829           FOR_EACH_VEC_ELT (gimple, new_phis, i, new_phi)
3831               if (gimple_code (new_phi) == GIMPLE_PHI)
3832                 vec_temp = PHI_RESULT (new_phi);
3834                 vec_temp = gimple_assign_lhs (new_phi);
3835               rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
3837               epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
3838               new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
3839               gimple_assign_set_lhs (epilog_stmt, new_temp);
3840               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3842               /* In SLP we don't need to apply reduction operation, so we just
3843                  collect s' values in SCALAR_RESULTS.  */
3845                 VEC_safe_push (tree, heap, scalar_results, new_temp);
3847               for (bit_offset = element_bitsize;
3848                    bit_offset < vec_size_in_bits;
3849                    bit_offset += element_bitsize)
3851                   tree bitpos = bitsize_int (bit_offset);
3852                   tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
3855                   epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
3856                   new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
3857                   gimple_assign_set_lhs (epilog_stmt, new_name);
3858                   gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3862                       /* In SLP we don't need to apply reduction operation, so 
3863                          we just collect s' values in SCALAR_RESULTS.  */
3864                       new_temp = new_name;
3865                       VEC_safe_push (tree, heap, scalar_results, new_name);
3869                       epilog_stmt = gimple_build_assign_with_ops (code,
3870                                     new_scalar_dest, new_name, new_temp);
3871                       new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
3872                       gimple_assign_set_lhs (epilog_stmt, new_temp);
3873                       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3878           /* The only case where we need to reduce scalar results in SLP, is
3879              unrolling.  If the size of SCALAR_RESULTS is greater than
3880              GROUP_SIZE, we reduce them combining elements modulo 
3884               tree res, first_res, new_res;
3887               /* Reduce multiple scalar results in case of SLP unrolling.  */
3888               for (j = group_size; VEC_iterate (tree, scalar_results, j, res);
3891                   first_res = VEC_index (tree, scalar_results, j % group_size);
3892                   new_stmt = gimple_build_assign_with_ops (code,
3893                                               new_scalar_dest, first_res, res);
3894                   new_res = make_ssa_name (new_scalar_dest, new_stmt);
3895                   gimple_assign_set_lhs (new_stmt, new_res);
3896                   gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
3897                   VEC_replace (tree, scalar_results, j % group_size, new_res);
3901             /* Not SLP - we have one scalar to keep in SCALAR_RESULTS.  */
3902             VEC_safe_push (tree, heap, scalar_results, new_temp);
3904           extract_scalar_result = false;
3908   /* 2.4  Extract the final scalar result.  Create:
3909           s_out3 = extract_field <v_out2, bitpos>  */
3911   if (extract_scalar_result)
3915       if (vect_print_dump_info (REPORT_DETAILS))
3916         fprintf (vect_dump, "extract scalar result");
3918       if (BYTES_BIG_ENDIAN)
3919         bitpos = size_binop (MULT_EXPR,
3920                              bitsize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1),
3921                              TYPE_SIZE (scalar_type));
3923         bitpos = bitsize_zero_node;
3925       rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp, bitsize, bitpos);
3926       epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
3927       new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
3928       gimple_assign_set_lhs (epilog_stmt, new_temp);
3929       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3930       VEC_safe_push (tree, heap, scalar_results, new_temp);
  /* Entered directly (via goto above) for nested inner-loop reductions
     that do not need a scalar extraction.  */
3933 vect_finalize_reduction:
3938   /* 2.5 Adjust the final result by the initial value of the reduction
3939          variable. (When such adjustment is not needed, then
3940          'adjustment_def' is zero).  For example, if code is PLUS we create:
3941          new_temp = loop_exit_def + adjustment_def  */
3945       gcc_assert (!slp_reduc);
3946       if (nested_in_vect_loop)
3948           new_phi = VEC_index (gimple, new_phis, 0);
3949           gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
3950           expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
3951           new_dest = vect_create_destination_var (scalar_dest, vectype);
3955           new_temp = VEC_index (tree, scalar_results, 0);
3956           gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
3957           expr = build2 (code, scalar_type, new_temp, adjustment_def);
3958           new_dest = vect_create_destination_var (scalar_dest, scalar_type);
3961       epilog_stmt = gimple_build_assign (new_dest, expr);
3962       new_temp = make_ssa_name (new_dest, epilog_stmt);
3963       gimple_assign_set_lhs (epilog_stmt, new_temp);
3964       SSA_NAME_DEF_STMT (new_temp) = epilog_stmt;
3965       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3966       if (nested_in_vect_loop)
3968           set_vinfo_for_stmt (epilog_stmt,
3969                               new_stmt_vec_info (epilog_stmt, loop_vinfo,
3971           STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
3972                 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
3975             VEC_quick_push (tree, scalar_results, new_temp);
3977             VEC_replace (tree, scalar_results, 0, new_temp);
3980         VEC_replace (tree, scalar_results, 0, new_temp);
3982       VEC_replace (gimple, new_phis, 0, epilog_stmt);
3985   /* 2.6  Handle the loop-exit phis.  Replace the uses of scalar loop-exit
3986           phis with new adjusted scalar results, i.e., replace use <s_out0>
3991           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
3992           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
3993           v_out2 = reduce <v_out1>
3994           s_out3 = extract_field <v_out2, 0>
3995           s_out4 = adjust_result <s_out3>
4002           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
4003           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
4004           v_out2 = reduce <v_out1>
4005           s_out3 = extract_field <v_out2, 0>
4006           s_out4 = adjust_result <s_out3>
4011   /* In SLP reduction chain we reduce vector results into one vector if
4012      necessary, hence we set here GROUP_SIZE to 1.  SCALAR_DEST is the LHS of
4013      the last stmt in the reduction chain, since we are looking for the loop
4015   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4017       scalar_dest = gimple_assign_lhs (VEC_index (gimple,
4018                                        SLP_TREE_SCALAR_STMTS (slp_node),
4023   /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
4024      case that GROUP_SIZE is greater than vectorization factor).  Therefore, we
4025      need to match SCALAR_RESULTS with corresponding statements.  The first
4026      (GROUP_SIZE / number of new vector stmts) scalar results correspond to
4027      the first vector stmt, etc.
4028      (RATIO is equal to (GROUP_SIZE / number of new vector stmts)).  */
4029   if (group_size > VEC_length (gimple, new_phis))
4031       ratio = group_size / VEC_length (gimple, new_phis);
4032       gcc_assert (!(group_size % VEC_length (gimple, new_phis)));
4037   for (k = 0; k < group_size; k++)
4041           epilog_stmt = VEC_index (gimple, new_phis, k / ratio);
4042           reduction_phi = VEC_index (gimple, reduction_phis, k / ratio);
4047           gimple current_stmt = VEC_index (gimple,
4048                                        SLP_TREE_SCALAR_STMTS (slp_node), k);
4050           orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (current_stmt));
4051           /* SLP statements can't participate in patterns.  */
4052           gcc_assert (!orig_stmt);
4053           scalar_dest = gimple_assign_lhs (current_stmt);
4056       phis = VEC_alloc (gimple, heap, 3);
4057       /* Find the loop-closed-use at the loop exit of the original scalar
4058          result.  (The reduction result is expected to have two immediate uses -
4059          one at the latch block, and one at the loop exit).  */
4060       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
4061         if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
4062           VEC_safe_push (gimple, heap, phis, USE_STMT (use_p));
4064       /* We expect to have found an exit_phi because of loop-closed-ssa
4066       gcc_assert (!VEC_empty (gimple, phis));
4068       FOR_EACH_VEC_ELT (gimple, phis, i, exit_phi)
4072               stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
4075               /* FORNOW. Currently not supporting the case that an inner-loop
4076                  reduction is not used in the outer-loop (but only outside the
4077                  outer-loop), unless it is double reduction.  */
4078               gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
4079                            && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
4082               STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt;
4084                   || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
4085                       != vect_double_reduction_def)
4088               /* Handle double reduction:
4090                  stmt1: s1 = phi <s0, s2>  - double reduction phi (outer loop)
4091                  stmt2:   s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
4092                  stmt3:   s4 = use (s3)     - (regular) reduc stmt (inner loop)
4093                  stmt4: s2 = phi <s4>      - double reduction stmt (outer loop)
4095                  At that point the regular reduction (stmt2 and stmt3) is
4096                  already vectorized, as well as the exit phi node, stmt4.
4097                  Here we vectorize the phi node of double reduction, stmt1, and
4098                  update all relevant statements.  */
4100               /* Go through all the uses of s2 to find double reduction phi
4101                  node, i.e., stmt1 above.  */
4102               orig_name = PHI_RESULT (exit_phi);
4103               FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
4105                   stmt_vec_info use_stmt_vinfo = vinfo_for_stmt (use_stmt);
4106                   stmt_vec_info new_phi_vinfo;
4107                   tree vect_phi_init, preheader_arg, vect_phi_res, init_def;
4108                   basic_block bb = gimple_bb (use_stmt);
4111                   /* Check that USE_STMT is really double reduction phi
4113                   if (gimple_code (use_stmt) != GIMPLE_PHI
4114                       || gimple_phi_num_args (use_stmt) != 2
4116                       || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
4117                           != vect_double_reduction_def
4118                       || bb->loop_father != outer_loop)
4121                   /* Create vector phi node for double reduction:
4122                      vs1 = phi <vs0, vs2>
4123                      vs1 was created previously in this function by a call to
4124                      vect_get_vec_def_for_operand and is stored in
4126                      vs2 is defined by EPILOG_STMT, the vectorized EXIT_PHI;
4127                      vs0 is created here.  */
4129                   /* Create vector phi node.  */
4130                   vect_phi = create_phi_node (vec_initial_def, bb);
4131                   new_phi_vinfo = new_stmt_vec_info (vect_phi,
4132                                     loop_vec_info_for_loop (outer_loop), NULL);
4133                   set_vinfo_for_stmt (vect_phi, new_phi_vinfo);
4135                   /* Create vs0 - initial def of the double reduction phi.  */
4136                   preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
4137                                              loop_preheader_edge (outer_loop));
4138                   init_def = get_initial_def_for_reduction (stmt,
4139                                                           preheader_arg, NULL);
4140                   vect_phi_init = vect_init_vector (use_stmt, init_def,
4143                   /* Update phi node arguments with vs0 and vs2.  */
4144                   add_phi_arg (vect_phi, vect_phi_init,
4145                                loop_preheader_edge (outer_loop),
4147                   add_phi_arg (vect_phi, PHI_RESULT (epilog_stmt),
4148                                loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
4149                   if (vect_print_dump_info (REPORT_DETAILS))
4151                       fprintf (vect_dump, "created double reduction phi "
4153                       print_gimple_stmt (vect_dump, vect_phi, 0, TDF_SLIM);
4156                   vect_phi_res = PHI_RESULT (vect_phi);
4158                   /* Replace the use, i.e., set the correct vs1 in the regular
4159                      reduction phi node.  FORNOW, NCOPIES is always 1, so the
4160                      loop is redundant.  */
4161                   use = reduction_phi;
4162                   for (j = 0; j < ncopies; j++)
4164                       edge pr_edge = loop_preheader_edge (loop);
4165                       SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res);
4166                       use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
4172       VEC_free (gimple, heap, phis);
4173       if (nested_in_vect_loop)
4181       phis = VEC_alloc (gimple, heap, 3);
4182       /* Find the loop-closed-use at the loop exit of the original scalar
4183          result.  (The reduction result is expected to have two immediate uses,
4184          one at the latch block, and one at the loop exit).  For double
4185          reductions we are looking for exit phis of the outer loop.  */
4186       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
4188           if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
4189             VEC_safe_push (gimple, heap, phis, USE_STMT (use_p));
4192               if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
4194                   tree phi_res = PHI_RESULT (USE_STMT (use_p));
4196                   FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
4198                       if (!flow_bb_inside_loop_p (loop,
4199                                              gimple_bb (USE_STMT (phi_use_p))))
4200                         VEC_safe_push (gimple, heap, phis,
4201                                        USE_STMT (phi_use_p));
4207       FOR_EACH_VEC_ELT (gimple, phis, i, exit_phi)
4209           /* Replace the uses:  */
4210           orig_name = PHI_RESULT (exit_phi);
4211           scalar_result = VEC_index (tree, scalar_results, k);
4212           FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
4213             FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
4214               SET_USE (use_p, scalar_result);
4217       VEC_free (gimple, heap, phis);
4220   VEC_free (tree, heap, scalar_results);
4221   VEC_free (gimple, heap, new_phis);
4225 /* Function vectorizable_reduction.
4227 Check if STMT performs a reduction operation that can be vectorized.
4228 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
4229 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
4230 Return FALSE if not a vectorizable STMT, TRUE otherwise.
4232 This function also handles reduction idioms (patterns) that have been
4233 recognized in advance during vect_pattern_recog. In this case, STMT may be
4235 X = pattern_expr (arg0, arg1, ..., X)
4236 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
4237 sequence that had been detected and replaced by the pattern-stmt (STMT).
4239 In some cases of reduction patterns, the type of the reduction variable X is
4240 different than the type of the other arguments of STMT.
4241 In such cases, the vectype that is used when transforming STMT into a vector
4242 stmt is different than the vectype that is used to determine the
4243 vectorization factor, because it consists of a different number of elements
4244 than the actual number of elements that are being operated upon in parallel.
4246 For example, consider an accumulation of shorts into an int accumulator.
4247 On some targets it's possible to vectorize this pattern operating on 8
4248 shorts at a time (hence, the vectype for purposes of determining the
4249 vectorization factor should be V8HI); on the other hand, the vectype that
4250 is used to create the vector form is actually V4SI (the type of the result).
4252 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
4253 indicates what is the actual level of parallelism (V8HI in the example), so
4254 that the right vectorization factor would be derived. This vectype
4255 corresponds to the type of arguments to the reduction stmt, and should *NOT*
4256 be used to create the vectorized stmt. The right vectype for the vectorized
4257 stmt is obtained from the type of the result X:
4258 get_vectype_for_scalar_type (TREE_TYPE (X))
4260 This means that, contrary to "regular" reductions (or "regular" stmts in
4261 general), the following equation:
4262 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
4263 does *NOT* necessarily hold for reduction patterns. */
4266 vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
4267 gimple *vec_stmt, slp_tree slp_node)
4271 tree loop_vec_def0 = NULL_TREE, loop_vec_def1 = NULL_TREE;
4272 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4273 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
4274 tree vectype_in = NULL_TREE;
4275 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4276 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4277 enum tree_code code, orig_code, epilog_reduc_code;
4278 enum machine_mode vec_mode;
4280 optab optab, reduc_optab;
4281 tree new_temp = NULL_TREE;
4284 enum vect_def_type dt;
4285 gimple new_phi = NULL;
4289 stmt_vec_info orig_stmt_info;
4290 tree expr = NULL_TREE;
4294 stmt_vec_info prev_stmt_info, prev_phi_info;
4295 bool single_defuse_cycle = false;
4296 tree reduc_def = NULL_TREE;
4297 gimple new_stmt = NULL;
4300 bool nested_cycle = false, found_nested_cycle_def = false;
4301 gimple reduc_def_stmt = NULL;
4302 /* The default is that the reduction variable is the last in statement. */
4303 int reduc_index = 2;
4304 bool double_reduc = false, dummy;
4306 struct loop * def_stmt_loop, *outer_loop = NULL;
4308 gimple def_arg_stmt;
4309 VEC (tree, heap) *vec_oprnds0 = NULL, *vec_oprnds1 = NULL, *vect_defs = NULL;
4310 VEC (gimple, heap) *phis = NULL;
4312 tree def0, def1, tem, op0, op1 = NULL_TREE;
4314 /* In case of reduction chain we switch to the first stmt in the chain, but
4315 we don't update STMT_INFO, since only the last stmt is marked as reduction
4316 and has reduction properties. */
4317 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4318 stmt = GROUP_FIRST_ELEMENT (stmt_info);
4320 if (nested_in_vect_loop_p (loop, stmt))
4324 nested_cycle = true;
4327 /* 1. Is vectorizable reduction? */
4328 /* Not supportable if the reduction variable is used in the loop, unless
4329 it's a reduction chain. */
4330 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
4331 && !GROUP_FIRST_ELEMENT (stmt_info))
4334 /* Reductions that are not used even in an enclosing outer-loop,
4335 are expected to be "live" (used out of the loop). */
4336 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
4337 && !STMT_VINFO_LIVE_P (stmt_info))
4340 /* Make sure it was already recognized as a reduction computation. */
4341 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
4342 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
4345 /* 2. Has this been recognized as a reduction pattern?
4347 Check if STMT represents a pattern that has been recognized
4348 in earlier analysis stages. For stmts that represent a pattern,
4349 the STMT_VINFO_RELATED_STMT field records the last stmt in
4350 the original sequence that constitutes the pattern. */
4352 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
4355 orig_stmt_info = vinfo_for_stmt (orig_stmt);
4356 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt);
4357 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
4358 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
4361 /* 3. Check the operands of the operation. The first operands are defined
4362 inside the loop body. The last operand is the reduction variable,
4363 which is defined by the loop-header-phi. */
4365 gcc_assert (is_gimple_assign (stmt));
4368 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
4370 case GIMPLE_SINGLE_RHS:
4371 op_type = TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt));
4372 if (op_type == ternary_op)
4374 tree rhs = gimple_assign_rhs1 (stmt);
4375 ops[0] = TREE_OPERAND (rhs, 0);
4376 ops[1] = TREE_OPERAND (rhs, 1);
4377 ops[2] = TREE_OPERAND (rhs, 2);
4378 code = TREE_CODE (rhs);
4384 case GIMPLE_BINARY_RHS:
4385 code = gimple_assign_rhs_code (stmt);
4386 op_type = TREE_CODE_LENGTH (code);
4387 gcc_assert (op_type == binary_op);
4388 ops[0] = gimple_assign_rhs1 (stmt);
4389 ops[1] = gimple_assign_rhs2 (stmt);
4392 case GIMPLE_TERNARY_RHS:
4393 code = gimple_assign_rhs_code (stmt);
4394 op_type = TREE_CODE_LENGTH (code);
4395 gcc_assert (op_type == ternary_op);
4396 ops[0] = gimple_assign_rhs1 (stmt);
4397 ops[1] = gimple_assign_rhs2 (stmt);
4398 ops[2] = gimple_assign_rhs3 (stmt);
4401 case GIMPLE_UNARY_RHS:
4408 if (code == COND_EXPR && slp_node)
4411 scalar_dest = gimple_assign_lhs (stmt);
4412 scalar_type = TREE_TYPE (scalar_dest);
4413 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
4414 && !SCALAR_FLOAT_TYPE_P (scalar_type))
4417 /* Do not try to vectorize bit-precision reductions. */
4418 if ((TYPE_PRECISION (scalar_type)
4419 != GET_MODE_PRECISION (TYPE_MODE (scalar_type))))
4422 /* All uses but the last are expected to be defined in the loop.
4423 The last use is the reduction variable. In case of nested cycle this
4424 assumption is not true: we use reduc_index to record the index of the
4425 reduction variable. */
4426 for (i = 0; i < op_type-1; i++)
4428 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
4429 if (i == 0 && code == COND_EXPR)
4432 is_simple_use = vect_is_simple_use_1 (ops[i], loop_vinfo, NULL,
4433 &def_stmt, &def, &dt, &tem);
4436 gcc_assert (is_simple_use);
4438 if (dt != vect_internal_def
4439 && dt != vect_external_def
4440 && dt != vect_constant_def
4441 && dt != vect_induction_def
4442 && !(dt == vect_nested_cycle && nested_cycle))
4445 if (dt == vect_nested_cycle)
4447 found_nested_cycle_def = true;
4448 reduc_def_stmt = def_stmt;
4453 is_simple_use = vect_is_simple_use_1 (ops[i], loop_vinfo, NULL, &def_stmt,
4457 gcc_assert (is_simple_use);
4458 gcc_assert (dt == vect_reduction_def
4459 || dt == vect_nested_cycle
4460 || ((dt == vect_internal_def || dt == vect_external_def
4461 || dt == vect_constant_def || dt == vect_induction_def)
4462 && nested_cycle && found_nested_cycle_def));
4463 if (!found_nested_cycle_def)
4464 reduc_def_stmt = def_stmt;
4466 gcc_assert (gimple_code (reduc_def_stmt) == GIMPLE_PHI);
4468 gcc_assert (orig_stmt == vect_is_simple_reduction (loop_vinfo,
4474 gimple tmp = vect_is_simple_reduction (loop_vinfo, reduc_def_stmt,
4475 !nested_cycle, &dummy);
4476 /* We changed STMT to be the first stmt in reduction chain, hence we
4477 check that in this case the first element in the chain is STMT. */
4478 gcc_assert (stmt == tmp
4479 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == stmt);
4482 if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt)))
4485 if (slp_node || PURE_SLP_STMT (stmt_info))
4488 ncopies = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4489 / TYPE_VECTOR_SUBPARTS (vectype_in));
4491 gcc_assert (ncopies >= 1);
4493 vec_mode = TYPE_MODE (vectype_in);
4495 if (code == COND_EXPR)
4497 if (!vectorizable_condition (stmt, gsi, NULL, ops[reduc_index], 0, NULL))
4499 if (vect_print_dump_info (REPORT_DETAILS))
4500 fprintf (vect_dump, "unsupported condition in reduction");
4507 /* 4. Supportable by target? */
4509 /* 4.1. check support for the operation in the loop */
4510 optab = optab_for_tree_code (code, vectype_in, optab_default);
4513 if (vect_print_dump_info (REPORT_DETAILS))
4514 fprintf (vect_dump, "no optab.");
4519 if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
4521 if (vect_print_dump_info (REPORT_DETAILS))
4522 fprintf (vect_dump, "op not supported by target.");
4524 if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
4525 || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4526 < vect_min_worthwhile_factor (code))
4529 if (vect_print_dump_info (REPORT_DETAILS))
4530 fprintf (vect_dump, "proceeding using word mode.");
4533 /* Worthwhile without SIMD support? */
4534 if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
4535 && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4536 < vect_min_worthwhile_factor (code))
4538 if (vect_print_dump_info (REPORT_DETAILS))
4539 fprintf (vect_dump, "not worthwhile without SIMD support.");
4545 /* 4.2. Check support for the epilog operation.
4547 If STMT represents a reduction pattern, then the type of the
4548 reduction variable may be different than the type of the rest
4549 of the arguments. For example, consider the case of accumulation
4550 of shorts into an int accumulator; The original code:
4551 S1: int_a = (int) short_a;
4552 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
4555 STMT: int_acc = widen_sum <short_a, int_acc>
4558 1. The tree-code that is used to create the vector operation in the
4559 epilog code (that reduces the partial results) is not the
4560 tree-code of STMT, but is rather the tree-code of the original
4561 stmt from the pattern that STMT is replacing. I.e, in the example
4562 above we want to use 'widen_sum' in the loop, but 'plus' in the
4564 2. The type (mode) we use to check available target support
4565 for the vector operation to be created in the *epilog*, is
4566 determined by the type of the reduction variable (in the example
4567 above we'd check this: optab_handler (plus_optab, vect_int_mode])).
4568 However the type (mode) we use to check available target support
4569 for the vector operation to be created *inside the loop*, is
4570 determined by the type of the other arguments to STMT (in the
4571 example we'd check this: optab_handler (widen_sum_optab,
4574 This is contrary to "regular" reductions, in which the types of all
4575 the arguments are the same as the type of the reduction variable.
4576 For "regular" reductions we can therefore use the same vector type
4577 (and also the same tree-code) when generating the epilog code and
4578 when generating the code inside the loop. */
4582 /* This is a reduction pattern: get the vectype from the type of the
4583 reduction variable, and get the tree-code from orig_stmt. */
4584 orig_code = gimple_assign_rhs_code (orig_stmt);
4585 gcc_assert (vectype_out);
4586 vec_mode = TYPE_MODE (vectype_out);
4590 /* Regular reduction: use the same vectype and tree-code as used for
4591 the vector code inside the loop can be used for the epilog code. */
4597 def_bb = gimple_bb (reduc_def_stmt);
4598 def_stmt_loop = def_bb->loop_father;
4599 def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
4600 loop_preheader_edge (def_stmt_loop));
4601 if (TREE_CODE (def_arg) == SSA_NAME
4602 && (def_arg_stmt = SSA_NAME_DEF_STMT (def_arg))
4603 && gimple_code (def_arg_stmt) == GIMPLE_PHI
4604 && flow_bb_inside_loop_p (outer_loop, gimple_bb (def_arg_stmt))
4605 && vinfo_for_stmt (def_arg_stmt)
4606 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_arg_stmt))
4607 == vect_double_reduction_def)
4608 double_reduc = true;
4611 epilog_reduc_code = ERROR_MARK;
4612 if (reduction_code_for_scalar_code (orig_code, &epilog_reduc_code))
4614 reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype_out,
4618 if (vect_print_dump_info (REPORT_DETAILS))
4619 fprintf (vect_dump, "no optab for reduction.");
4621 epilog_reduc_code = ERROR_MARK;
4625 && optab_handler (reduc_optab, vec_mode) == CODE_FOR_nothing)
4627 if (vect_print_dump_info (REPORT_DETAILS))
4628 fprintf (vect_dump, "reduc op not supported by target.");
4630 epilog_reduc_code = ERROR_MARK;
4635 if (!nested_cycle || double_reduc)
4637 if (vect_print_dump_info (REPORT_DETAILS))
4638 fprintf (vect_dump, "no reduc code for scalar code.");
4644 if (double_reduc && ncopies > 1)
4646 if (vect_print_dump_info (REPORT_DETAILS))
4647 fprintf (vect_dump, "multiple types in double reduction");
4652 /* In case of widenning multiplication by a constant, we update the type
4653 of the constant to be the type of the other operand. We check that the
4654 constant fits the type in the pattern recognition pass. */
4655 if (code == DOT_PROD_EXPR
4656 && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
4658 if (TREE_CODE (ops[0]) == INTEGER_CST)
4659 ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
4660 else if (TREE_CODE (ops[1]) == INTEGER_CST)
4661 ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
4664 if (vect_print_dump_info (REPORT_DETAILS))
4665 fprintf (vect_dump, "invalid types in dot-prod");
4671 if (!vec_stmt) /* transformation not required. */
4673 if (!vect_model_reduction_cost (stmt_info, epilog_reduc_code, ncopies))
4675 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
4681 if (vect_print_dump_info (REPORT_DETAILS))
4682 fprintf (vect_dump, "transform reduction.");
4684 /* FORNOW: Multiple types are not supported for condition. */
4685 if (code == COND_EXPR)
4686 gcc_assert (ncopies == 1);
4688 /* Create the destination vector */
4689 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
4691 /* In case the vectorization factor (VF) is bigger than the number
4692 of elements that we can fit in a vectype (nunits), we have to generate
4693 more than one vector stmt - i.e - we need to "unroll" the
4694 vector stmt by a factor VF/nunits. For more details see documentation
4695 in vectorizable_operation. */
4697 /* If the reduction is used in an outer loop we need to generate
4698 VF intermediate results, like so (e.g. for ncopies=2):
4703 (i.e. we generate VF results in 2 registers).
4704 In this case we have a separate def-use cycle for each copy, and therefore
4705 for each copy we get the vector def for the reduction variable from the
4706 respective phi node created for this copy.
4708 Otherwise (the reduction is unused in the loop nest), we can combine
4709 together intermediate results, like so (e.g. for ncopies=2):
4713 (i.e. we generate VF/2 results in a single register).
4714 In this case for each copy we get the vector def for the reduction variable
4715 from the vectorized reduction operation generated in the previous iteration.
4718 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope)
4720 single_defuse_cycle = true;
4724 epilog_copies = ncopies;
4726 prev_stmt_info = NULL;
4727 prev_phi_info = NULL;
4730 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
4731 gcc_assert (TYPE_VECTOR_SUBPARTS (vectype_out)
4732 == TYPE_VECTOR_SUBPARTS (vectype_in));
4737 vec_oprnds0 = VEC_alloc (tree, heap, 1);
4738 if (op_type == ternary_op)
4739 vec_oprnds1 = VEC_alloc (tree, heap, 1);
4742 phis = VEC_alloc (gimple, heap, vec_num);
4743 vect_defs = VEC_alloc (tree, heap, vec_num);
4745 VEC_quick_push (tree, vect_defs, NULL_TREE);
4747 for (j = 0; j < ncopies; j++)
4749 if (j == 0 || !single_defuse_cycle)
4751 for (i = 0; i < vec_num; i++)
4753 /* Create the reduction-phi that defines the reduction
4755 new_phi = create_phi_node (vec_dest, loop->header);
4756 set_vinfo_for_stmt (new_phi,
4757 new_stmt_vec_info (new_phi, loop_vinfo,
4759 if (j == 0 || slp_node)
4760 VEC_quick_push (gimple, phis, new_phi);
4764 if (code == COND_EXPR)
4766 gcc_assert (!slp_node);
4767 vectorizable_condition (stmt, gsi, vec_stmt,
4768 PHI_RESULT (VEC_index (gimple, phis, 0)),
4770 /* Multiple types are not supported for condition. */
4777 op0 = ops[!reduc_index];
4778 if (op_type == ternary_op)
4780 if (reduc_index == 0)
4787 vect_get_vec_defs (op0, op1, stmt, &vec_oprnds0, &vec_oprnds1,
4791 loop_vec_def0 = vect_get_vec_def_for_operand (ops[!reduc_index],
4793 VEC_quick_push (tree, vec_oprnds0, loop_vec_def0);
4794 if (op_type == ternary_op)
4796 loop_vec_def1 = vect_get_vec_def_for_operand (op1, stmt,
4798 VEC_quick_push (tree, vec_oprnds1, loop_vec_def1);
4806 enum vect_def_type dt;
4810 vect_is_simple_use (ops[!reduc_index], loop_vinfo, NULL,
4811 &dummy_stmt, &dummy, &dt);
4812 loop_vec_def0 = vect_get_vec_def_for_stmt_copy (dt,
4814 VEC_replace (tree, vec_oprnds0, 0, loop_vec_def0);
4815 if (op_type == ternary_op)
4817 vect_is_simple_use (op1, loop_vinfo, NULL, &dummy_stmt,
4819 loop_vec_def1 = vect_get_vec_def_for_stmt_copy (dt,
4821 VEC_replace (tree, vec_oprnds1, 0, loop_vec_def1);
4825 if (single_defuse_cycle)
4826 reduc_def = gimple_assign_lhs (new_stmt);
4828 STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
4831 FOR_EACH_VEC_ELT (tree, vec_oprnds0, i, def0)
4834 reduc_def = PHI_RESULT (VEC_index (gimple, phis, i));
4837 if (!single_defuse_cycle || j == 0)
4838 reduc_def = PHI_RESULT (new_phi);
4841 def1 = ((op_type == ternary_op)
4842 ? VEC_index (tree, vec_oprnds1, i) : NULL);
4843 if (op_type == binary_op)
4845 if (reduc_index == 0)
4846 expr = build2 (code, vectype_out, reduc_def, def0);
4848 expr = build2 (code, vectype_out, def0, reduc_def);
4852 if (reduc_index == 0)
4853 expr = build3 (code, vectype_out, reduc_def, def0, def1);
4856 if (reduc_index == 1)
4857 expr = build3 (code, vectype_out, def0, reduc_def, def1);
4859 expr = build3 (code, vectype_out, def0, def1, reduc_def);
4863 new_stmt = gimple_build_assign (vec_dest, expr);
4864 new_temp = make_ssa_name (vec_dest, new_stmt);
4865 gimple_assign_set_lhs (new_stmt, new_temp);
4866 vect_finish_stmt_generation (stmt, new_stmt, gsi);
4870 VEC_quick_push (gimple, SLP_TREE_VEC_STMTS (slp_node), new_stmt);
4871 VEC_quick_push (tree, vect_defs, new_temp);
4874 VEC_replace (tree, vect_defs, 0, new_temp);
4881 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
4883 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
4885 prev_stmt_info = vinfo_for_stmt (new_stmt);
4886 prev_phi_info = vinfo_for_stmt (new_phi);
4889 /* Finalize the reduction-phi (set its arguments) and create the
4890 epilog reduction code. */
4891 if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
4893 new_temp = gimple_assign_lhs (*vec_stmt);
4894 VEC_replace (tree, vect_defs, 0, new_temp);
4897 vect_create_epilog_for_reduction (vect_defs, stmt, epilog_copies,
4898 epilog_reduc_code, phis, reduc_index,
4899 double_reduc, slp_node);
4901 VEC_free (gimple, heap, phis);
4902 VEC_free (tree, heap, vec_oprnds0);
4904 VEC_free (tree, heap, vec_oprnds1);
4909 /* Function vect_min_worthwhile_factor.
4911 For a loop where we could vectorize the operation indicated by CODE,
4912 return the minimum vectorization factor that makes it worthwhile
4913 to use generic vectors. */
/* NOTE(review): the return-type line and the entire body of this function
   are elided in this excerpt; only the signature is visible.  Callers in
   this file compare LOOP_VINFO_VECT_FACTOR against its result, so it
   presumably returns an int threshold keyed on CODE -- confirm against
   the full source.  */
4915 vect_min_worthwhile_factor (enum tree_code code)
4936 /* Function vectorizable_induction
4938 Check if PHI performs an induction computation that can be vectorized.
4939 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
4940 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
4941 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
/* NOTE(review): the return type, the trailing parameters, and several
   statements (the early returns, local declarations, and closing braces)
   are elided in this excerpt; comments below annotate only what is
   visible.  */
4944 vectorizable_induction (gimple phi, gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
/* Per-statement vectorization info attached to PHI by the analysis phase.  */
4947 stmt_vec_info stmt_info = vinfo_for_stmt (phi);
4948 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4949 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4950 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
/* NCOPIES = how many vector stmts are needed to cover one vectorization
   factor's worth of scalar iterations (VF / elements-per-vector).  */
4951 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
4952 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
4955 gcc_assert (ncopies >= 1);
4956 /* FORNOW. This restriction should be relaxed. */
/* Multiple vector copies of an induction inside a nested loop are not
   supported yet; the bail-out return after the dump is elided here.  */
4957 if (nested_in_vect_loop_p (loop, phi) && ncopies > 1)
4959 if (vect_print_dump_info (REPORT_DETAILS))
4960 fprintf (vect_dump, "multiple types in nested loop.");
/* A PHI that is not relevant to the vectorized computation needs no
   vector code (return elided).  */
4964 if (!STMT_VINFO_RELEVANT_P (stmt_info))
4967 /* FORNOW: SLP not supported. */
4968 if (STMT_SLP_TYPE (stmt_info))
4971 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def);
4973 if (gimple_code (phi) != GIMPLE_PHI)
/* Analysis phase: record the stmt type and model its cost, then return
   (success return elided).  */
4976 if (!vec_stmt) /* transformation not required. */
4978 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
4979 if (vect_print_dump_info (REPORT_DETAILS))
4980 fprintf (vect_dump, "=== vectorizable_induction ===");
4981 vect_model_induction_cost (stmt_info, ncopies);
/* Transformation phase: build the initial vector def for the induction
   and hand back the stmt that defines it via *VEC_STMT.  */
4987 if (vect_print_dump_info (REPORT_DETAILS))
4988 fprintf (vect_dump, "transform induction phi.");
4990 vec_def = get_initial_def_for_induction (phi);
4991 *vec_stmt = SSA_NAME_DEF_STMT (vec_def);
4995 /* Function vectorizable_live_operation.
4997 STMT computes a value that is used outside the loop. Check if
4998 it can be supported. */
/* NOTE(review): the return type, some local declarations, and the early
   returns after each guard are elided in this excerpt.  The visible logic
   only *checks* supportability -- no vector code is emitted for live
   operations here.  */
5001 vectorizable_live_operation (gimple stmt,
5002 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
5003 gimple *vec_stmt ATTRIBUTE_UNUSED)
5005 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5006 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5007 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5013 enum vect_def_type dt;
5014 enum tree_code code;
5015 enum gimple_rhs_class rhs_class;
/* Only stmts marked live (value used outside the loop) reach here.  */
5017 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
/* Reductions that are live are handled by the reduction-epilog code,
   not here (return elided).  */
5019 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
5022 if (!is_gimple_assign (stmt))
5025 if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
5028 /* FORNOW. CHECKME. */
5029 if (nested_in_vect_loop_p (loop, stmt))
/* Derive the operand count from the rhs code and sanity-check it against
   the GIMPLE rhs classification.  */
5032 code = gimple_assign_rhs_code (stmt);
5033 op_type = TREE_CODE_LENGTH (code);
5034 rhs_class = get_gimple_rhs_class (code);
5035 gcc_assert (rhs_class != GIMPLE_UNARY_RHS || op_type == unary_op);
5036 gcc_assert (rhs_class != GIMPLE_BINARY_RHS || op_type == binary_op);
5038 /* FORNOW: support only if all uses are invariant. This means
5039 that the scalar operations can remain in place, unvectorized.
5040 The original last scalar value that they compute will be used. */
5042 for (i = 0; i < op_type; i++)
/* For a single-tree rhs the operands are sub-operands of that tree;
   otherwise they are separate gimple operands (1-based).  */
5044 if (rhs_class == GIMPLE_SINGLE_RHS)
5045 op = TREE_OPERAND (gimple_op (stmt, 1), i);
5047 op = gimple_op (stmt, i + 1);
5049 && !vect_is_simple_use (op, loop_vinfo, NULL, &def_stmt, &def, &dt))
5051 if (vect_print_dump_info (REPORT_DETAILS))
5052 fprintf (vect_dump, "use not simple.");
/* Every operand must be loop-invariant (defined outside the loop or
   constant); otherwise unsupported (return elided).  */
5056 if (dt != vect_external_def && dt != vect_constant_def)
5060 /* No transformation is required for the cases we currently support. */
5064 /* Kill any debug uses outside LOOP of SSA names defined in STMT. */
/* NOTE(review): the return type (presumably static void), some local
   declarations (USTMT, BB), and a few statements/braces are elided in
   this excerpt.  Purpose: once STMT is vectorized/removed, debug binds
   outside the loop that still reference its SSA defs would be stale, so
   their bound values are reset.  */
5067 vect_loop_kill_debug_uses (struct loop *loop, gimple stmt)
5069 ssa_op_iter op_iter;
5070 imm_use_iterator imm_iter;
5071 def_operand_p def_p;
/* Walk every SSA name defined by STMT (works for PHIs too).  */
5074 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt, op_iter, SSA_OP_DEF)
/* ... and every statement that immediately uses that name.  */
5076 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
/* Non-debug uses are not touched here (the action taken for them is
   elided -- presumably skip/continue; confirm against full source).  */
5080 if (!is_gimple_debug (ustmt))
5083 bb = gimple_bb (ustmt);
/* Only debug uses OUTSIDE the loop are killed; uses inside the loop
   are left alone.  */
5085 if (!flow_bb_inside_loop_p (loop, bb))
5087 if (gimple_debug_bind_p (ustmt))
5089 if (vect_print_dump_info (REPORT_DETAILS))
5090 fprintf (vect_dump, "killing debug use");
/* Reset the bound value so the debugger sees "optimized out" rather
   than a stale/wrong value, then re-fold operand caches.  */
5092 gimple_debug_bind_reset_value (ustmt);
5093 update_stmt (ustmt);
5102 /* Function vect_transform_loop.
5104 The analysis phase has determined that the loop is vectorizable.
5105 Vectorize the loop - created vectorized stmts to replace the scalar
5106 stmts in the loop, and update the loop exit condition. */
5109 vect_transform_loop (loop_vec_info loop_vinfo)
5111 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5112 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
5113 int nbbs = loop->num_nodes;
5114 gimple_stmt_iterator si;
5117 int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
5119 bool slp_scheduled = false;
5120 unsigned int nunits;
5121 tree cond_expr = NULL_TREE;
5122 gimple_seq cond_expr_stmt_list = NULL;
5123 bool do_peeling_for_loop_bound;
5124 gimple stmt, pattern_stmt, pattern_def_stmt;
5125 bool transform_pattern_stmt = false, pattern_def = false;
5127 if (vect_print_dump_info (REPORT_DETAILS))
5128 fprintf (vect_dump, "=== vec_transform_loop ===");
5130 /* Peel the loop if there are data refs with unknown alignment.
5131 Only one data ref with unknown store is allowed. */
5133 if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
5134 vect_do_peeling_for_alignment (loop_vinfo);
5136 do_peeling_for_loop_bound
5137 = (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
5138 || (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
5139 && LOOP_VINFO_INT_NITERS (loop_vinfo) % vectorization_factor != 0)
5140 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo));
5142 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
5143 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
5144 vect_loop_versioning (loop_vinfo,
5145 !do_peeling_for_loop_bound,
5146 &cond_expr, &cond_expr_stmt_list);
5148 /* If the loop has a symbolic number of iterations 'n' (i.e. it's not a
5149 compile time constant), or it is a constant that doesn't divide by the
5150 vectorization factor, then an epilog loop needs to be created.
5151 We therefore duplicate the loop: the original loop will be vectorized,
5152 and will compute the first (n/VF) iterations. The second copy of the loop
5153 will remain scalar and will compute the remaining (n%VF) iterations.
5154 (VF is the vectorization factor). */
5156 if (do_peeling_for_loop_bound)
5157 vect_do_peeling_for_loop_bound (loop_vinfo, &ratio,
5158 cond_expr, cond_expr_stmt_list);
5160 ratio = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
5161 LOOP_VINFO_INT_NITERS (loop_vinfo) / vectorization_factor);
5163 /* 1) Make sure the loop header has exactly two entries
5164 2) Make sure we have a preheader basic block. */
5166 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
5168 split_edge (loop_preheader_edge (loop));
5170 /* FORNOW: the vectorizer supports only loops which body consist
5171 of one basic block (header + empty latch). When the vectorizer will
5172 support more involved loop forms, the order by which the BBs are
5173 traversed need to be reconsidered. */
5175 for (i = 0; i < nbbs; i++)
5177 basic_block bb = bbs[i];
5178 stmt_vec_info stmt_info;
5181 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
5183 phi = gsi_stmt (si);
5184 if (vect_print_dump_info (REPORT_DETAILS))
5186 fprintf (vect_dump, "------>vectorizing phi: ");
5187 print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
5189 stmt_info = vinfo_for_stmt (phi);
5193 if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
5194 vect_loop_kill_debug_uses (loop, phi);
5196 if (!STMT_VINFO_RELEVANT_P (stmt_info)
5197 && !STMT_VINFO_LIVE_P (stmt_info))
5200 if ((TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
5201 != (unsigned HOST_WIDE_INT) vectorization_factor)
5202 && vect_print_dump_info (REPORT_DETAILS))
5203 fprintf (vect_dump, "multiple-types.");
5205 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
5207 if (vect_print_dump_info (REPORT_DETAILS))
5208 fprintf (vect_dump, "transform phi.");
5209 vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
5213 pattern_stmt = NULL;
5214 for (si = gsi_start_bb (bb); !gsi_end_p (si) || transform_pattern_stmt;)
5218 if (transform_pattern_stmt)
5220 stmt = pattern_stmt;
5221 transform_pattern_stmt = false;
5224 stmt = gsi_stmt (si);
5226 if (vect_print_dump_info (REPORT_DETAILS))
5228 fprintf (vect_dump, "------>vectorizing statement: ");
5229 print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
5232 stmt_info = vinfo_for_stmt (stmt);
5234 /* vector stmts created in the outer-loop during vectorization of
5235 stmts in an inner-loop may not have a stmt_info, and do not
5236 need to be vectorized. */
5243 if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
5244 vect_loop_kill_debug_uses (loop, stmt);
5246 if (!STMT_VINFO_RELEVANT_P (stmt_info)
5247 && !STMT_VINFO_LIVE_P (stmt_info))
5249 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
5250 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
5251 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
5252 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
5254 stmt = pattern_stmt;
5255 stmt_info = vinfo_for_stmt (stmt);
5263 else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
5264 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
5265 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
5266 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
5267 transform_pattern_stmt = true;
5269 /* If pattern statement has a def stmt, vectorize it too. */
5270 if (is_pattern_stmt_p (stmt_info)
5271 && (pattern_def_stmt = STMT_VINFO_PATTERN_DEF_STMT (stmt_info))
5272 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_def_stmt))
5273 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_def_stmt))))
5276 pattern_def = false;
5279 if (vect_print_dump_info (REPORT_DETAILS))
5281 fprintf (vect_dump, "==> vectorizing pattern def"
5283 print_gimple_stmt (vect_dump, pattern_def_stmt, 0,
5288 stmt = pattern_def_stmt;
5289 stmt_info = vinfo_for_stmt (stmt);
5293 gcc_assert (STMT_VINFO_VECTYPE (stmt_info));
5294 nunits = (unsigned int) TYPE_VECTOR_SUBPARTS (
5295 STMT_VINFO_VECTYPE (stmt_info));
5296 if (!STMT_SLP_TYPE (stmt_info)
5297 && nunits != (unsigned int) vectorization_factor
5298 && vect_print_dump_info (REPORT_DETAILS))
5299 /* For SLP VF is set according to unrolling factor, and not to
5300 vector size, hence for SLP this print is not valid. */
5301 fprintf (vect_dump, "multiple-types.");
5303 /* SLP. Schedule all the SLP instances when the first SLP stmt is
5305 if (STMT_SLP_TYPE (stmt_info))
5309 slp_scheduled = true;
5311 if (vect_print_dump_info (REPORT_DETAILS))
5312 fprintf (vect_dump, "=== scheduling SLP instances ===");
5314 vect_schedule_slp (loop_vinfo, NULL);
5317 /* Hybrid SLP stmts must be vectorized in addition to SLP. */
5318 if (!vinfo_for_stmt (stmt) || PURE_SLP_STMT (stmt_info))
5320 if (!transform_pattern_stmt && !pattern_def)
5326 /* -------- vectorize statement ------------ */
5327 if (vect_print_dump_info (REPORT_DETAILS))
5328 fprintf (vect_dump, "transform statement.");
5330 strided_store = false;
5331 is_store = vect_transform_stmt (stmt, &si, &strided_store, NULL, NULL);
5334 if (STMT_VINFO_STRIDED_ACCESS (stmt_info))
5336 /* Interleaving. If IS_STORE is TRUE, the vectorization of the
5337 interleaving chain was completed - free all the stores in
5340 vect_remove_stores (GROUP_FIRST_ELEMENT (stmt_info));
5345 /* Free the attached stmt_vec_info and remove the stmt. */
5346 free_stmt_vec_info (gsi_stmt (si));
5347 gsi_remove (&si, true);
5352 if (!transform_pattern_stmt && !pattern_def)
5357 slpeel_make_loop_iterate_ntimes (loop, ratio);
5359 /* The memory tags and pointers in vectorized statements need to
5360 have their SSA forms updated. FIXME, why can't this be delayed
5361 until all the loops have been transformed? */
5362 update_ssa (TODO_update_ssa);
5364 if (vect_print_dump_info (REPORT_VECTORIZED_LOCATIONS))
5365 fprintf (vect_dump, "LOOP VECTORIZED.");
5366 if (loop->inner && vect_print_dump_info (REPORT_VECTORIZED_LOCATIONS))
5367 fprintf (vect_dump, "OUTER LOOP VECTORIZED.");