2 Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010
3 Free Software Foundation, Inc.
4 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
5 Ira Rosen <irar@il.ibm.com>
7 This file is part of GCC.
9 GCC is free software; you can redistribute it and/or modify it under
10 the terms of the GNU General Public License as published by the Free
11 Software Foundation; either version 3, or (at your option) any later
14 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
15 WARRANTY; without even the implied warranty of MERCHANTABILITY or
16 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
19 You should have received a copy of the GNU General Public License
20 along with GCC; see the file COPYING3. If not see
21 <http://www.gnu.org/licenses/>. */
25 #include "coretypes.h"
29 #include "basic-block.h"
30 #include "tree-pretty-print.h"
31 #include "gimple-pretty-print.h"
32 #include "tree-flow.h"
33 #include "tree-dump.h"
35 #include "cfglayout.h"
40 #include "diagnostic-core.h"
41 #include "tree-chrec.h"
42 #include "tree-scalar-evolution.h"
43 #include "tree-vectorizer.h"
46 /* Loop Vectorization Pass.
48 This pass tries to vectorize loops.
50 For example, the vectorizer transforms the following simple loop:
52 short a[N]; short b[N]; short c[N]; int i;
58 as if it was manually vectorized by rewriting the source code into:
60 typedef int __attribute__((mode(V8HI))) v8hi;
61 short a[N]; short b[N]; short c[N]; int i;
62 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
65 for (i=0; i<N/8; i++){
72 The main entry to this pass is vectorize_loops(), in which
73 the vectorizer applies a set of analyses on a given set of loops,
74 followed by the actual vectorization transformation for the loops that
75 had successfully passed the analysis phase.
76 Throughout this pass we make a distinction between two types of
77 data: scalars (which are represented by SSA_NAMES), and memory references
78 ("data-refs"). These two types of data require different handling both
79 during analysis and transformation. The types of data-refs that the
80 vectorizer currently supports are ARRAY_REFS which base is an array DECL
81 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
82 accesses are required to have a simple (consecutive) access pattern.
86 The driver for the analysis phase is vect_analyze_loop().
87 It applies a set of analyses, some of which rely on the scalar evolution
88 analyzer (scev) developed by Sebastian Pop.
90 During the analysis phase the vectorizer records some information
91 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
92 loop, as well as general information about the loop as a whole, which is
93 recorded in a "loop_vec_info" struct attached to each loop.
97 The loop transformation phase scans all the stmts in the loop, and
98 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
99 the loop that needs to be vectorized. It inserts the vector code sequence
100 just before the scalar stmt S, and records a pointer to the vector code
101 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
102 attached to S). This pointer will be used for the vectorization of following
103 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
104 otherwise, we rely on dead code elimination for removing it.
106 For example, say stmt S1 was vectorized into stmt VS1:
109 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
112 To vectorize stmt S2, the vectorizer first finds the stmt that defines
113 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
114 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
115 resulting sequence would be:
118 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
120 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
122 Operands that are not SSA_NAMEs, are data-refs that appear in
123 load/store operations (like 'x[i]' in S1), and are handled differently.
127 Currently the only target specific information that is used is the
128 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
129 Targets that can support different sizes of vectors, for now will need
130 to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
131 flexibility will be added in the future.
133 Since we only vectorize operations which vector form can be
134 expressed using existing tree codes, to verify that an operation is
135 supported, the vectorizer checks the relevant optab at the relevant
136 machine_mode (e.g, optab_handler (add_optab, V8HImode)). If
137 the value found is CODE_FOR_nothing, then there's no target support, and
138 we can't vectorize the stmt.
140 For additional information on this project see:
141 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
144 /* Function vect_determine_vectorization_factor
146 Determine the vectorization factor (VF). VF is the number of data elements
147 that are operated upon in parallel in a single iteration of the vectorized
148 loop. For example, when vectorizing a loop that operates on 4byte elements,
149 on a target with vector size (VS) 16byte, the VF is set to 4, since 4
150 elements can fit in a single vector register.
152 We currently support vectorization of loops in which all types operated upon
153 are of the same size. Therefore this function currently sets VF according to
154 the size of the types operated upon, and fails if there are multiple sizes
157 VF is also the factor by which the loop iterations are strip-mined, e.g.:
164 for (i=0; i<N; i+=VF){
165 a[i:VF] = b[i:VF] + c[i:VF];
/* NOTE(review): this excerpt is non-contiguous -- the embedded original line
   numbers jump (e.g. 176 -> 181), so the return type, several local
   declarations (i, phi, scalar_type, vectype, vf_vectype, nunits, dummy),
   closing braces and the "return false/true" exits are not visible here.
   Restore the missing lines from upstream tree-vect-loop.c before editing
   any logic in this function.  */
170 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
172 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
173 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
174 int nbbs = loop->num_nodes;
175 gimple_stmt_iterator si;
176 unsigned int vectorization_factor = 0;
181 stmt_vec_info stmt_info;
185 if (vect_print_dump_info (REPORT_DETAILS))
186 fprintf (vect_dump, "=== vect_determine_vectorization_factor ===");
188 for (i = 0; i < nbbs; i++)
190 basic_block bb = bbs[i];
/* First walk: loop-header PHIs.  For each relevant PHI compute the vectype
   of its result and fold TYPE_VECTOR_SUBPARTS into the running VF.  */
192 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
195 stmt_info = vinfo_for_stmt (phi);
196 if (vect_print_dump_info (REPORT_DETAILS))
198 fprintf (vect_dump, "==> examining phi: ");
199 print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
202 gcc_assert (stmt_info);
204 if (STMT_VINFO_RELEVANT_P (stmt_info))
206 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
207 scalar_type = TREE_TYPE (PHI_RESULT (phi));
209 if (vect_print_dump_info (REPORT_DETAILS))
211 fprintf (vect_dump, "get vectype for scalar type: ");
212 print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
215 vectype = get_vectype_for_scalar_type (scalar_type);
/* A NULL vectype means the target has no vector mode for this scalar
   type -- the (elided) branch bails out of vectorization.  */
218 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
221 "not vectorized: unsupported data-type ");
222 print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
226 STMT_VINFO_VECTYPE (stmt_info) = vectype;
228 if (vect_print_dump_info (REPORT_DETAILS))
230 fprintf (vect_dump, "vectype: ");
231 print_generic_expr (vect_dump, vectype, TDF_SLIM);
234 nunits = TYPE_VECTOR_SUBPARTS (vectype);
235 if (vect_print_dump_info (REPORT_DETAILS))
236 fprintf (vect_dump, "nunits = %d", nunits);
/* Keep the maximum subparts count seen so far as the VF.  */
238 if (!vectorization_factor
239 || (nunits > vectorization_factor))
240 vectorization_factor = nunits;
/* Second walk: ordinary statements of the same BB.  */
244 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
247 gimple stmt = gsi_stmt (si);
248 stmt_info = vinfo_for_stmt (stmt);
250 if (vect_print_dump_info (REPORT_DETAILS))
252 fprintf (vect_dump, "==> examining statement: ");
253 print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
256 gcc_assert (stmt_info);
258 /* skip stmts which do not need to be vectorized. */
259 if (!STMT_VINFO_RELEVANT_P (stmt_info)
260 && !STMT_VINFO_LIVE_P (stmt_info))
262 if (vect_print_dump_info (REPORT_DETAILS))
263 fprintf (vect_dump, "skip.");
/* Statements with no lhs (e.g. calls used for side effects) cannot be
   vectorized by this pass.  */
267 if (gimple_get_lhs (stmt) == NULL_TREE)
269 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
271 fprintf (vect_dump, "not vectorized: irregular stmt.");
272 print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
/* Pre-existing vector code in the loop is also rejected.  */
277 if (VECTOR_MODE_P (TYPE_MODE (gimple_expr_type (stmt))))
279 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
281 fprintf (vect_dump, "not vectorized: vector stmt in loop:");
282 print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
287 if (STMT_VINFO_VECTYPE (stmt_info))
289 /* The only case when a vectype had been already set is for stmts
290 that contain a dataref, or for "pattern-stmts" (stmts generated
291 by the vectorizer to represent/replace a certain idiom). */
292 gcc_assert (STMT_VINFO_DATA_REF (stmt_info)
293 || is_pattern_stmt_p (stmt_info));
294 vectype = STMT_VINFO_VECTYPE (stmt_info);
/* Otherwise (elided "else" arm) derive the vectype from the lhs type.  */
298 gcc_assert (!STMT_VINFO_DATA_REF (stmt_info)
299 && !is_pattern_stmt_p (stmt_info));
301 scalar_type = TREE_TYPE (gimple_get_lhs (stmt));
302 if (vect_print_dump_info (REPORT_DETAILS))
304 fprintf (vect_dump, "get vectype for scalar type: ");
305 print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
307 vectype = get_vectype_for_scalar_type (scalar_type);
310 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
313 "not vectorized: unsupported data-type ");
314 print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
319 STMT_VINFO_VECTYPE (stmt_info) = vectype;
322 /* The vectorization factor is according to the smallest
323 scalar type (or the largest vector size, but we only
324 support one vector size per loop). */
325 scalar_type = vect_get_smallest_scalar_type (stmt, &dummy,
327 if (vect_print_dump_info (REPORT_DETAILS))
329 fprintf (vect_dump, "get vectype for scalar type: ");
330 print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
332 vf_vectype = get_vectype_for_scalar_type (scalar_type);
335 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
338 "not vectorized: unsupported data-type ");
339 print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
/* Reject mixing vector sizes within one statement: only one vector
   size per loop is supported (see comment above).  */
344 if ((GET_MODE_SIZE (TYPE_MODE (vectype))
345 != GET_MODE_SIZE (TYPE_MODE (vf_vectype))))
347 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
350 "not vectorized: different sized vector "
351 "types in statement, ");
352 print_generic_expr (vect_dump, vectype, TDF_SLIM);
353 fprintf (vect_dump, " and ");
354 print_generic_expr (vect_dump, vf_vectype, TDF_SLIM);
359 if (vect_print_dump_info (REPORT_DETAILS))
361 fprintf (vect_dump, "vectype: ");
362 print_generic_expr (vect_dump, vf_vectype, TDF_SLIM);
365 nunits = TYPE_VECTOR_SUBPARTS (vf_vectype);
366 if (vect_print_dump_info (REPORT_DETAILS))
367 fprintf (vect_dump, "nunits = %d", nunits);
369 if (!vectorization_factor
370 || (nunits > vectorization_factor))
371 vectorization_factor = nunits;
375 /* TODO: Analyze cost. Decide if worth while to vectorize. */
376 if (vect_print_dump_info (REPORT_DETAILS))
377 fprintf (vect_dump, "vectorization factor = %d", vectorization_factor);
/* VF <= 1 means no vectorizable type was found; the (elided) branch
   reports failure.  */
378 if (vectorization_factor <= 1)
380 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
381 fprintf (vect_dump, "not vectorized: unsupported data-type");
/* Publish the computed VF for the rest of the analysis/transform.  */
384 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
390 /* Function vect_is_simple_iv_evolution.
392 FORNOW: A simple evolution of an induction variables in the loop is
393 considered a polynomial evolution with constant step. */
/* NOTE(review): excerpt is non-contiguous (embedded line numbers jump);
   the return type, the STEP out-parameter, local declarations of
   step_expr/init_expr, braces and return statements are elided here.
   Restore from upstream before changing logic.  */
396 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
401 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
403 /* When there is no evolution in this loop, the evolution function
405 if (evolution_part == NULL_TREE)
408 /* When the evolution is a polynomial of degree >= 2
409 the evolution function is not "simple". */
410 if (tree_is_chrec (evolution_part))
413 step_expr = evolution_part;
/* Unshare the initial condition so later gimplification does not mutate
   the scev cache's tree.  */
414 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
416 if (vect_print_dump_info (REPORT_DETAILS))
418 fprintf (vect_dump, "step: ");
419 print_generic_expr (vect_dump, step_expr, TDF_SLIM);
420 fprintf (vect_dump, ", init: ");
421 print_generic_expr (vect_dump, init_expr, TDF_SLIM);
/* Only constant-step IVs count as "simple" -- non-INTEGER_CST steps are
   rejected by the (elided) branch below.  */
427 if (TREE_CODE (step_expr) != INTEGER_CST)
429 if (vect_print_dump_info (REPORT_DETAILS))
430 fprintf (vect_dump, "step unknown.");
437 /* Function vect_analyze_scalar_cycles_1.
439 Examine the cross iteration def-use cycles of scalar variables
440 in LOOP. LOOP_VINFO represents the loop that is now being
441 considered for vectorization (can be LOOP, or an outer-loop
/* NOTE(review): excerpt is non-contiguous (embedded line numbers jump);
   declarations of dumy/nested_cycle/reduc_stmt, several if/else arms and
   closing braces are elided.  Restore from upstream before editing.  */
445 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
447 basic_block bb = loop->header;
449 VEC(gimple,heap) *worklist = VEC_alloc (gimple, heap, 64);
450 gimple_stmt_iterator gsi;
453 if (vect_print_dump_info (REPORT_DETAILS))
454 fprintf (vect_dump, "=== vect_analyze_scalar_cycles ===");
456 /* First - identify all inductions. Reduction detection assumes that all the
457 inductions have been identified, therefore, this order must not be
459 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
461 gimple phi = gsi_stmt (gsi);
462 tree access_fn = NULL;
463 tree def = PHI_RESULT (phi);
464 stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
466 if (vect_print_dump_info (REPORT_DETAILS))
468 fprintf (vect_dump, "Analyze phi: ");
469 print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
472 /* Skip virtual phi's. The data dependences that are associated with
473 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
474 if (!is_gimple_reg (SSA_NAME_VAR (def)))
/* Until classified, a PHI's def-type is conservatively unknown.  */
477 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
479 /* Analyze the evolution function. */
480 access_fn = analyze_scalar_evolution (loop, def);
482 STRIP_NOPS (access_fn);
483 if (access_fn && vect_print_dump_info (REPORT_DETAILS))
485 fprintf (vect_dump, "Access function of PHI: ");
486 print_generic_expr (vect_dump, access_fn, TDF_SLIM);
/* PHIs that are not simple IVs are deferred to the reduction pass via
   the worklist; simple IVs are marked as inductions right away.  */
490 || !vect_is_simple_iv_evolution (loop->num, access_fn, &dumy, &dumy))
492 VEC_safe_push (gimple, heap, worklist, phi);
496 if (vect_print_dump_info (REPORT_DETAILS))
497 fprintf (vect_dump, "Detected induction.")
498 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
502 /* Second - identify all reductions and nested cycles. */
503 while (VEC_length (gimple, worklist) > 0)
505 gimple phi = VEC_pop (gimple, worklist);
506 tree def = PHI_RESULT (phi);
507 stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
511 if (vect_print_dump_info (REPORT_DETAILS))
513 fprintf (vect_dump, "Analyze phi: ");
514 print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
517 gcc_assert (is_gimple_reg (SSA_NAME_VAR (def)));
518 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
/* A cycle in LOOP while vectorizing LOOP_VINFO_LOOP's nest means LOOP
   is the inner loop of an outer-loop vectorization.  */
520 nested_cycle = (loop != LOOP_VINFO_LOOP (loop_vinfo));
521 reduc_stmt = vect_force_simple_reduction (loop_vinfo, phi, !nested_cycle,
527 if (vect_print_dump_info (REPORT_DETAILS))
528 fprintf (vect_dump, "Detected double reduction.");
530 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
531 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
532 vect_double_reduction_def;
538 if (vect_print_dump_info (REPORT_DETAILS))
539 fprintf (vect_dump, "Detected vectorizable nested cycle.");
541 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
542 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
547 if (vect_print_dump_info (REPORT_DETAILS))
548 fprintf (vect_dump, "Detected reduction.");
550 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
551 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
553 /* Store the reduction cycles for possible vectorization in
555 VEC_safe_push (gimple, heap,
556 LOOP_VINFO_REDUCTIONS (loop_vinfo),
/* Fall-through case (elided branch): cycle is neither induction,
   reduction, nested cycle nor double reduction.  */
562 if (vect_print_dump_info (REPORT_DETAILS))
563 fprintf (vect_dump, "Unknown def-use cycle pattern.");
566 VEC_free (gimple, heap, worklist);
570 /* Function vect_analyze_scalar_cycles.
572 Examine the cross iteration def-use cycles of scalar variables, by
573 analyzing the loop-header PHIs of scalar variables. Classify each
574 cycle as one of the following: invariant, induction, reduction, unknown.
575 We do that for the loop represented by LOOP_VINFO, and also to its
576 inner-loop, if exists.
577 Examples for scalar cycles:
/* NOTE(review): excerpt is non-contiguous; the return type, the examples
   referred to above, and the guard around the inner-loop call (presumably
   "if (loop->inner)" -- TODO confirm against upstream) are elided.  */
592 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
594 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
/* Classify the cycles of the loop being vectorized itself...  */
596 vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
598 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
599 Reductions in such inner-loop therefore have different properties than
600 the reductions in the nest that gets vectorized:
601 1. When vectorized, they are executed in the same order as in the original
602 scalar loop, so we can't change the order of computation when
604 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
605 current checks are too strict. */
/* ...and of the nested inner loop, when one exists.  */
608 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
611 /* Function vect_get_loop_niters.
613 Determine how many iterations the loop is executed.
614 If an expression that represents the number of iterations
615 can be constructed, place it in NUMBER_OF_ITERATIONS.
616 Return the loop exit condition. */
/* NOTE(review): excerpt is non-contiguous; the return type (the function
   returns the exit condition stmt), the declaration of `niters`, and some
   braces are elided.  */
619 vect_get_loop_niters (struct loop *loop, tree *number_of_iterations)
623 if (vect_print_dump_info (REPORT_DETAILS))
624 fprintf (vect_dump, "=== get_loop_niters ===");
/* Ask the scev machinery for the exit-condition execution count.  */
626 niters = number_of_exit_cond_executions (loop);
/* Only publish the count when scev produced a usable expression
   (NULL/chrec_dont_know means it could not be determined).  */
628 if (niters != NULL_TREE
629 && niters != chrec_dont_know)
631 *number_of_iterations = niters;
633 if (vect_print_dump_info (REPORT_DETAILS))
635 fprintf (vect_dump, "==> get_loop_niters:" );
636 print_generic_expr (vect_dump, *number_of_iterations, TDF_SLIM);
640 return get_loop_exit_condition (loop);
644 /* Function bb_in_loop_p
646 Used as predicate for dfs order traversal of the loop bbs. */
/* NOTE(review): return type and the true/false return lines are elided in
   this excerpt; the predicate tests membership of BB in the loop passed
   through the opaque DATA pointer.  */
649 bb_in_loop_p (const_basic_block bb, const void *data)
651 const struct loop *const loop = (const struct loop *)data;
652 if (flow_bb_inside_loop_p (loop, bb))
658 /* Function new_loop_vec_info.
660 Create and initialize a new loop_vec_info struct for LOOP, as well as
661 stmt_vec_info structs for all the stmts in LOOP. */
/* NOTE(review): excerpt is non-contiguous (embedded line numbers jump);
   the return type, declarations of `res`/`bbs`, some braces, the free of
   the first get_loop_body array, and the final `return res;` are elided.
   Restore from upstream before editing allocation logic.  */
664 new_loop_vec_info (struct loop *loop)
668 gimple_stmt_iterator si;
669 unsigned int i, nbbs;
/* xcalloc zero-fills, so every field not set below starts out 0/NULL.  */
671 res = (loop_vec_info) xcalloc (1, sizeof (struct _loop_vec_info));
672 LOOP_VINFO_LOOP (res) = loop;
674 bbs = get_loop_body (loop);
676 /* Create/Update stmt_info for all stmts in the loop. */
677 for (i = 0; i < loop->num_nodes; i++)
679 basic_block bb = bbs[i];
681 /* BBs in a nested inner-loop will have been already processed (because
682 we will have called vect_analyze_loop_form for any nested inner-loop).
683 Therefore, for stmts in an inner-loop we just want to update the
684 STMT_VINFO_LOOP_VINFO field of their stmt_info to point to the new
685 loop_info of the outer-loop we are currently considering to vectorize
686 (instead of the loop_info of the inner-loop).
687 For stmts in other BBs we need to create a stmt_info from scratch. */
688 if (bb->loop_father != loop)
691 gcc_assert (loop->inner && bb->loop_father == loop->inner);
692 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
694 gimple phi = gsi_stmt (si);
695 stmt_vec_info stmt_info = vinfo_for_stmt (phi);
696 loop_vec_info inner_loop_vinfo =
697 STMT_VINFO_LOOP_VINFO (stmt_info);
698 gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
699 STMT_VINFO_LOOP_VINFO (stmt_info) = res;
701 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
703 gimple stmt = gsi_stmt (si);
704 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
705 loop_vec_info inner_loop_vinfo =
706 STMT_VINFO_LOOP_VINFO (stmt_info);
707 gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
708 STMT_VINFO_LOOP_VINFO (stmt_info) = res;
713 /* bb in current nest. */
714 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
716 gimple phi = gsi_stmt (si);
717 gimple_set_uid (phi, 0);
718 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, res, NULL));
721 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
723 gimple stmt = gsi_stmt (si);
724 gimple_set_uid (stmt, 0);
725 set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, res, NULL));
730 /* CHECKME: We want to visit all BBs before their successors (except for
731 latch blocks, for which this assertion wouldn't hold). In the simple
732 case of the loop forms we allow, a dfs order of the BBs would the same
733 as reversed postorder traversal, so we are safe. */
/* Re-enumerate the loop body in DFS order into a fresh array; this is the
   array stored into LOOP_VINFO_BBS (the get_loop_body array above is
   presumably freed in an elided line -- TODO confirm).  */
736 bbs = XCNEWVEC (basic_block, loop->num_nodes);
737 nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
738 bbs, loop->num_nodes, loop);
739 gcc_assert (nbbs == loop->num_nodes);
/* Initialize the remaining loop_vec_info fields to their defaults.  */
741 LOOP_VINFO_BBS (res) = bbs;
742 LOOP_VINFO_NITERS (res) = NULL;
743 LOOP_VINFO_NITERS_UNCHANGED (res) = NULL;
744 LOOP_VINFO_COST_MODEL_MIN_ITERS (res) = 0;
745 LOOP_VINFO_VECTORIZABLE_P (res) = 0;
746 LOOP_PEELING_FOR_ALIGNMENT (res) = 0;
747 LOOP_VINFO_VECT_FACTOR (res) = 0;
748 LOOP_VINFO_LOOP_NEST (res) = VEC_alloc (loop_p, heap, 3);
749 LOOP_VINFO_DATAREFS (res) = VEC_alloc (data_reference_p, heap, 10);
750 LOOP_VINFO_DDRS (res) = VEC_alloc (ddr_p, heap, 10 * 10);
751 LOOP_VINFO_UNALIGNED_DR (res) = NULL;
752 LOOP_VINFO_MAY_MISALIGN_STMTS (res) =
753 VEC_alloc (gimple, heap,
754 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS));
755 LOOP_VINFO_MAY_ALIAS_DDRS (res) =
756 VEC_alloc (ddr_p, heap,
757 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
758 LOOP_VINFO_STRIDED_STORES (res) = VEC_alloc (gimple, heap, 10);
759 LOOP_VINFO_REDUCTIONS (res) = VEC_alloc (gimple, heap, 10);
760 LOOP_VINFO_REDUCTION_CHAINS (res) = VEC_alloc (gimple, heap, 10);
761 LOOP_VINFO_SLP_INSTANCES (res) = VEC_alloc (slp_instance, heap, 10);
762 LOOP_VINFO_SLP_UNROLLING_FACTOR (res) = 1;
763 LOOP_VINFO_PEELING_HTAB (res) = NULL;
769 /* Function destroy_loop_vec_info.
771 Free LOOP_VINFO struct, as well as all the stmt_vec_info structs of all the
772 stmts in the loop. */
/* NOTE(review): excerpt is non-contiguous.  Upstream this function has an
   early-exit path (when !CLEAN_STMTS) that frees the vectors and returns
   before the per-stmt loop -- which explains why the free sequence at
   lines 795-800 appears again at 843-848.  The guards, declarations of
   loop/bbs/nbbs/j, and the final free (loop_vinfo) are elided; restore
   from upstream before editing teardown order.  */
775 destroy_loop_vec_info (loop_vec_info loop_vinfo, bool clean_stmts)
780 gimple_stmt_iterator si;
782 VEC (slp_instance, heap) *slp_instances;
783 slp_instance instance;
788 loop = LOOP_VINFO_LOOP (loop_vinfo);
790 bbs = LOOP_VINFO_BBS (loop_vinfo);
791 nbbs = loop->num_nodes;
795 free (LOOP_VINFO_BBS (loop_vinfo));
796 free_data_refs (LOOP_VINFO_DATAREFS (loop_vinfo));
797 free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
798 VEC_free (loop_p, heap, LOOP_VINFO_LOOP_NEST (loop_vinfo));
799 VEC_free (gimple, heap, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));
800 VEC_free (ddr_p, heap, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo));
/* Release the stmt_vec_info of every PHI and statement in the loop.  */
807 for (j = 0; j < nbbs; j++)
809 basic_block bb = bbs[j];
810 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
811 free_stmt_vec_info (gsi_stmt (si));
/* No gsi_next in the for-header: the iterator is advanced (or the stmt
   removed) inside the body.  */
813 for (si = gsi_start_bb (bb); !gsi_end_p (si); )
815 gimple stmt = gsi_stmt (si);
816 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
820 /* Check if this is a "pattern stmt" (introduced by the
821 vectorizer during the pattern recognition pass). */
822 bool remove_stmt_p = false;
823 gimple orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
826 stmt_vec_info orig_stmt_info = vinfo_for_stmt (orig_stmt);
828 && STMT_VINFO_IN_PATTERN_P (orig_stmt_info))
829 remove_stmt_p = true;
832 /* Free stmt_vec_info. */
833 free_stmt_vec_info (stmt);
835 /* Remove dead "pattern stmts". */
837 gsi_remove (&si, true);
843 free (LOOP_VINFO_BBS (loop_vinfo));
844 free_data_refs (LOOP_VINFO_DATAREFS (loop_vinfo));
845 free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
846 VEC_free (loop_p, heap, LOOP_VINFO_LOOP_NEST (loop_vinfo));
847 VEC_free (gimple, heap, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));
848 VEC_free (ddr_p, heap, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo));
849 slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
850 FOR_EACH_VEC_ELT (slp_instance, slp_instances, j, instance)
851 vect_free_slp_instance (instance);
853 VEC_free (slp_instance, heap, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
854 VEC_free (gimple, heap, LOOP_VINFO_STRIDED_STORES (loop_vinfo));
855 VEC_free (gimple, heap, LOOP_VINFO_REDUCTIONS (loop_vinfo));
856 VEC_free (gimple, heap, LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo));
858 if (LOOP_VINFO_PEELING_HTAB (loop_vinfo))
859 htab_delete (LOOP_VINFO_PEELING_HTAB (loop_vinfo));
866 /* Function vect_analyze_loop_1.
868 Apply a set of analyses on LOOP, and create a loop_vec_info struct
869 for it. The different analyses will record information in the
870 loop_vec_info struct. This is a subset of the analyses applied in
871 vect_analyze_loop, to be applied on an inner-loop nested in the loop
872 that is now considered for (outer-loop) vectorization. */
/* NOTE(review): the return type, the failure `return NULL;` and the final
   `return loop_vinfo;` are elided from this excerpt.  */
875 vect_analyze_loop_1 (struct loop *loop)
877 loop_vec_info loop_vinfo;
879 if (vect_print_dump_info (REPORT_DETAILS))
880 fprintf (vect_dump, "===== analyze_loop_nest_1 =====");
882 /* Check the CFG characteristics of the loop (nesting, entry/exit, etc. */
884 loop_vinfo = vect_analyze_loop_form (loop);
/* A NULL loop_vinfo means the inner loop's CFG shape is unsupported.  */
887 if (vect_print_dump_info (REPORT_DETAILS))
888 fprintf (vect_dump, "bad inner-loop form.");
896 /* Function vect_analyze_loop_form.
898 Verify that certain CFG restrictions hold, including:
899 - the loop has a pre-header
900 - the loop has a single entry and exit
901 - the loop exit condition is simple enough, and the number of iterations
902 can be analyzed (a countable loop). */
/* NOTE(review): excerpt is non-contiguous (embedded line numbers jump);
   the return type, declarations of loop_cond/entryedge, the if/else
   selecting inner-most vs. nested loop handling, most `return NULL;`
   statements and many braces are elided.  Every rejection path below
   consists of a diagnostic dump, optional destroy_loop_vec_info of the
   inner loop's vinfo, and an (elided) NULL return.  Restore from
   upstream before editing.  */
905 vect_analyze_loop_form (struct loop *loop)
907 loop_vec_info loop_vinfo;
909 tree number_of_iterations = NULL;
910 loop_vec_info inner_loop_vinfo = NULL;
912 if (vect_print_dump_info (REPORT_DETAILS))
913 fprintf (vect_dump, "=== vect_analyze_loop_form ===");
915 /* Different restrictions apply when we are considering an inner-most loop,
916 vs. an outer (nested) loop.
917 (FORNOW. May want to relax some of these restrictions in the future). */
921 /* Inner-most loop. We currently require that the number of BBs is
922 exactly 2 (the header and latch). Vectorizable inner-most loops
933 if (loop->num_nodes != 2)
935 if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
936 fprintf (vect_dump, "not vectorized: control flow in loop.");
940 if (empty_block_p (loop->header))
942 if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
943 fprintf (vect_dump, "not vectorized: empty loop.");
949 struct loop *innerloop = loop->inner;
952 /* Nested loop. We currently require that the loop is doubly-nested,
953 contains a single inner loop, and the number of BBs is exactly 5.
954 Vectorizable outer-loops look like this:
966 The inner-loop has the properties expected of inner-most loops
967 as described above. */
969 if ((loop->inner)->inner || (loop->inner)->next)
971 if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
972 fprintf (vect_dump, "not vectorized: multiple nested loops.");
976 /* Analyze the inner-loop. */
977 inner_loop_vinfo = vect_analyze_loop_1 (loop->inner);
978 if (!inner_loop_vinfo)
980 if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
981 fprintf (vect_dump, "not vectorized: Bad inner loop.");
/* The inner-loop trip count must not vary across outer iterations.  */
985 if (!expr_invariant_in_loop_p (loop,
986 LOOP_VINFO_NITERS (inner_loop_vinfo)))
988 if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
990 "not vectorized: inner-loop count not invariant.");
991 destroy_loop_vec_info (inner_loop_vinfo, true);
995 if (loop->num_nodes != 5)
997 if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
998 fprintf (vect_dump, "not vectorized: control flow in loop.");
999 destroy_loop_vec_info (inner_loop_vinfo, true);
/* Pick the inner-loop header predecessor that is NOT the latch: that
   edge is the outer loop's entry into the inner loop.  */
1003 gcc_assert (EDGE_COUNT (innerloop->header->preds) == 2);
1004 entryedge = EDGE_PRED (innerloop->header, 0);
1005 if (EDGE_PRED (innerloop->header, 0)->src == innerloop->latch)
1006 entryedge = EDGE_PRED (innerloop->header, 1);
1008 if (entryedge->src != loop->header
1009 || !single_exit (innerloop)
1010 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1012 if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1013 fprintf (vect_dump, "not vectorized: unsupported outerloop form.");
1014 destroy_loop_vec_info (inner_loop_vinfo, true);
1018 if (vect_print_dump_info (REPORT_DETAILS))
1019 fprintf (vect_dump, "Considering outer-loop vectorization.");
/* Checks common to both inner-most and outer loops start here.  */
1022 if (!single_exit (loop)
1023 || EDGE_COUNT (loop->header->preds) != 2)
1025 if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1027 if (!single_exit (loop))
1028 fprintf (vect_dump, "not vectorized: multiple exits.");
1029 else if (EDGE_COUNT (loop->header->preds) != 2)
1030 fprintf (vect_dump, "not vectorized: too many incoming edges.");
1032 if (inner_loop_vinfo)
1033 destroy_loop_vec_info (inner_loop_vinfo, true);
1037 /* We assume that the loop exit condition is at the end of the loop. i.e,
1038 that the loop is represented as a do-while (with a proper if-guard
1039 before the loop if needed), where the loop header contains all the
1040 executable statements, and the latch is empty. */
1041 if (!empty_block_p (loop->latch)
1042 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1044 if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1045 fprintf (vect_dump, "not vectorized: unexpected loop form.");
1046 if (inner_loop_vinfo)
1047 destroy_loop_vec_info (inner_loop_vinfo, true);
1051 /* Make sure there exists a single-predecessor exit bb: */
1052 if (!single_pred_p (single_exit (loop)->dest))
1054 edge e = single_exit (loop);
1055 if (!(e->flags & EDGE_ABNORMAL))
1057 split_loop_exit_edge (e);
1058 if (vect_print_dump_info (REPORT_DETAILS))
1059 fprintf (vect_dump, "split exit edge.");
/* Abnormal exit edges cannot be split -- reject the loop.  */
1063 if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1064 fprintf (vect_dump, "not vectorized: abnormal loop exit edge.");
1065 if (inner_loop_vinfo)
1066 destroy_loop_vec_info (inner_loop_vinfo, true);
/* The loop must be countable: niters computable and non-zero.  */
1071 loop_cond = vect_get_loop_niters (loop, &number_of_iterations);
1074 if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1075 fprintf (vect_dump, "not vectorized: complicated exit condition.");
1076 if (inner_loop_vinfo)
1077 destroy_loop_vec_info (inner_loop_vinfo, true);
1081 if (!number_of_iterations)
1083 if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1085 "not vectorized: number of iterations cannot be computed.");
1086 if (inner_loop_vinfo)
1087 destroy_loop_vec_info (inner_loop_vinfo, true);
1091 if (chrec_contains_undetermined (number_of_iterations))
1093 if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1094 fprintf (vect_dump, "Infinite number of iterations.");
1095 if (inner_loop_vinfo)
1096 destroy_loop_vec_info (inner_loop_vinfo, true);
1100 if (!NITERS_KNOWN_P (number_of_iterations))
1102 if (vect_print_dump_info (REPORT_DETAILS))
1104 fprintf (vect_dump, "Symbolic number of iterations is ");
1105 print_generic_expr (vect_dump, number_of_iterations, TDF_DETAILS);
1108 else if (TREE_INT_CST_LOW (number_of_iterations) == 0)
1110 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1111 fprintf (vect_dump, "not vectorized: number of iterations = 0.");
1112 if (inner_loop_vinfo)
1113 destroy_loop_vec_info (inner_loop_vinfo, false);
/* All form checks passed: build the loop_vec_info for this loop.  */
1117 loop_vinfo = new_loop_vec_info (loop);
1118 LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1119 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1121 STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond)) = loop_exit_ctrl_vec_info_type;
1123 /* CHECKME: May want to keep it around it in the future. */
1124 if (inner_loop_vinfo)
1125 destroy_loop_vec_info (inner_loop_vinfo, false);
1127 gcc_assert (!loop->aux);
1128 loop->aux = loop_vinfo;
1133 /* Get cost by calling cost target builtin. */
/* NOTE(review): the return type (int) and the trailing arguments of the
   builtin_vectorization_cost call (presumably dummy_type and a misalign
   value -- TODO confirm) are elided from this excerpt.  */
1136 vect_get_cost (enum vect_cost_for_stmt type_of_cost)
1138 tree dummy_type = NULL;
1141 return targetm.vectorize.builtin_vectorization_cost (type_of_cost,
1146 /* Function vect_analyze_loop_operations.
1148 Scan the loop stmts and make sure they are all vectorizable. */
/* NOTE(review): also finalizes the vectorization factor (combining it with
   the SLP unrolling factor), runs the cost model, and rejects loops whose
   known iteration count is below the profitability threshold.  Several
   early-return branches are elided in this view of the file.  */
1151 vect_analyze_loop_operations (loop_vec_info loop_vinfo, bool slp)
1153 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1154 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1155 int nbbs = loop->num_nodes;
1156 gimple_stmt_iterator si;
1157 unsigned int vectorization_factor = 0;
1160 stmt_vec_info stmt_info;
1161 bool need_to_vectorize = false;
1162 int min_profitable_iters;
1163 int min_scalar_loop_bound;
1165 bool only_slp_in_loop = true, ok;
1167 if (vect_print_dump_info (REPORT_DETAILS))
1168 fprintf (vect_dump, "=== vect_analyze_loop_operations ===");
1170 gcc_assert (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
1171 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1174 /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1175 vectorization factor of the loop is the unrolling factor required by
1176 the SLP instances. If that unrolling factor is 1, we say, that we
1177 perform pure SLP on loop - cross iteration parallelism is not
1179 for (i = 0; i < nbbs; i++)
1181 basic_block bb = bbs[i];
1182 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1184 gimple stmt = gsi_stmt (si);
1185 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1186 gcc_assert (stmt_info);
1187 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1188 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1189 && !PURE_SLP_STMT (stmt_info))
1190 /* STMT needs both SLP and loop-based vectorization. */
1191 only_slp_in_loop = false;
/* If every relevant stmt is pure-SLP, the VF is the SLP unrolling factor;
   otherwise combine both requirements via the least common multiple.  */
1195 if (only_slp_in_loop)
1196 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1198 vectorization_factor = least_common_multiple (vectorization_factor,
1199 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1201 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1202 if (vect_print_dump_info (REPORT_DETAILS))
1203 fprintf (vect_dump, "Updating vectorization factor to %d ",
1204 vectorization_factor);
/* Main scan: for each basic block, examine the phi nodes first, then the
   ordinary statements.  */
1207 for (i = 0; i < nbbs; i++)
1209 basic_block bb = bbs[i];
1211 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1213 phi = gsi_stmt (si);
1216 stmt_info = vinfo_for_stmt (phi);
1217 if (vect_print_dump_info (REPORT_DETAILS))
1219 fprintf (vect_dump, "examining phi: ");
1220 print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
1223 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1224 (i.e., a phi in the tail of the outer-loop). */
1225 if (! is_loop_header_bb_p (bb))
1227 /* FORNOW: we currently don't support the case that these phis
1228 are not used in the outerloop (unless it is double reduction,
1229 i.e., this phi is vect_reduction_def), cause this case
1230 requires to actually do something here. */
1231 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
1232 || STMT_VINFO_LIVE_P (stmt_info))
1233 && STMT_VINFO_DEF_TYPE (stmt_info)
1234 != vect_double_reduction_def
1236 if (vect_print_dump_info (REPORT_DETAILS))
1238 "Unsupported loop-closed phi in outer-loop.");
1242 /* If PHI is used in the outer loop, we check that its operand
1243 is defined in the inner loop. */
1244 if (STMT_VINFO_RELEVANT_P (stmt_info))
1249 if (gimple_phi_num_args (phi) != 1)
1252 phi_op = PHI_ARG_DEF (phi, 0);
1253 if (TREE_CODE (phi_op) != SSA_NAME)
1256 op_def_stmt = SSA_NAME_DEF_STMT (phi_op);
1257 if (!op_def_stmt || !vinfo_for_stmt (op_def_stmt))
1260 if (STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1261 != vect_used_in_outer
1262 && STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1263 != vect_used_in_outer_by_reduction)
1270 gcc_assert (stmt_info);
1272 if (STMT_VINFO_LIVE_P (stmt_info))
1274 /* FORNOW: not yet supported. */
1275 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1276 fprintf (vect_dump, "not vectorized: value used after loop.");
1280 if (STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1281 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1283 /* A scalar-dependence cycle that we don't support. */
1284 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1285 fprintf (vect_dump, "not vectorized: scalar dependence cycle.");
1289 if (STMT_VINFO_RELEVANT_P (stmt_info))
1291 need_to_vectorize = true;
1292 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
1293 ok = vectorizable_induction (phi, NULL, NULL);
1298 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1301 "not vectorized: relevant phi not supported: ");
1302 print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
/* Non-phi statements: delegate per-stmt vectorizability checking.  */
1308 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1310 gimple stmt = gsi_stmt (si);
1311 if (!vect_analyze_stmt (stmt, &need_to_vectorize, NULL))
1316 /* All operations in the loop are either irrelevant (deal with loop
1317 control, or dead), or only used outside the loop and can be moved
1318 out of the loop (e.g. invariants, inductions). The loop can be
1319 optimized away by scalar optimizations. We're better off not
1320 touching this loop. */
1321 if (!need_to_vectorize)
1323 if (vect_print_dump_info (REPORT_DETAILS))
1325 "All the computation can be taken out of the loop.");
1326 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1328 "not vectorized: redundant loop. no profit to vectorize.");
1332 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1333 && vect_print_dump_info (REPORT_DETAILS))
1335 "vectorization_factor = %d, niters = " HOST_WIDE_INT_PRINT_DEC,
1336 vectorization_factor, LOOP_VINFO_INT_NITERS (loop_vinfo));
1338 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1339 && (LOOP_VINFO_INT_NITERS (loop_vinfo) < vectorization_factor))
1341 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1342 fprintf (vect_dump, "not vectorized: iteration count too small.");
1343 if (vect_print_dump_info (REPORT_DETAILS))
1344 fprintf (vect_dump,"not vectorized: iteration count smaller than "
1345 "vectorization factor.");
1349 /* Analyze cost. Decide if worth while to vectorize. */
1351 /* Once VF is set, SLP costs should be updated since the number of created
1352 vector stmts depends on VF. */
1353 vect_update_slp_costs_according_to_vf (loop_vinfo);
1355 min_profitable_iters = vect_estimate_min_profitable_iters (loop_vinfo);
1356 LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo) = min_profitable_iters;
/* A negative estimate means the vector version can never be profitable.  */
1358 if (min_profitable_iters < 0)
1360 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1361 fprintf (vect_dump, "not vectorized: vectorization not profitable.");
1362 if (vect_print_dump_info (REPORT_DETAILS))
1363 fprintf (vect_dump, "not vectorized: vector version will never be "
1368 min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
1369 * vectorization_factor) - 1);
1371 /* Use the cost model only if it is more conservative than user specified
/* TH becomes the larger (more conservative) of the cost-model threshold and
   the user-specified minimum loop bound.  */
1374 th = (unsigned) min_scalar_loop_bound;
1375 if (min_profitable_iters
1376 && (!min_scalar_loop_bound
1377 || min_profitable_iters > min_scalar_loop_bound))
1378 th = (unsigned) min_profitable_iters;
1380 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1381 && LOOP_VINFO_INT_NITERS (loop_vinfo) <= th)
1383 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1384 fprintf (vect_dump, "not vectorized: vectorization not "
1386 if (vect_print_dump_info (REPORT_DETAILS))
1387 fprintf (vect_dump, "not vectorized: iteration count smaller than "
1388 "user specified loop bound parameter or minimum "
1389 "profitable iterations (whichever is more conservative).");
/* An epilogue loop is needed when the iteration count is unknown, not a
   multiple of VF, or when peeling for alignment was decided.  Check that
   the required loop transformations are possible.  */
1393 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1394 || LOOP_VINFO_INT_NITERS (loop_vinfo) % vectorization_factor != 0
1395 || LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
1397 if (vect_print_dump_info (REPORT_DETAILS))
1398 fprintf (vect_dump, "epilog loop required.");
1399 if (!vect_can_advance_ivs_p (loop_vinfo))
1401 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1403 "not vectorized: can't create epilog loop 1.");
1406 if (!slpeel_can_duplicate_loop_p (loop, single_exit (loop)))
1408 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1410 "not vectorized: can't create epilog loop 2.");
1419 /* Function vect_analyze_loop_2.
1421 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1422 for it. The different analyses will record information in the
1423 loop_vec_info struct. */
/* NOTE(review): the analyses run in a fixed order because later passes
   consume information recorded by earlier ones (e.g. alias pruning uses
   grouping info from the access analysis).  The early-return statements
   after each failed check are elided in this view.  */
1425 vect_analyze_loop_2 (loop_vec_info loop_vinfo)
1427 bool ok, dummy, slp = false;
1428 int max_vf = MAX_VECTORIZATION_FACTOR;
1431 /* Find all data references in the loop (which correspond to vdefs/vuses)
1432 and analyze their evolution in the loop. Also adjust the minimal
1433 vectorization factor according to the loads and stores.
1435 FORNOW: Handle only simple, array references, which
1436 alignment can be forced, and aligned pointer-references. */
1438 ok = vect_analyze_data_refs (loop_vinfo, NULL, &min_vf);
1441 if (vect_print_dump_info (REPORT_DETAILS))
1442 fprintf (vect_dump, "bad data references.");
1446 /* Classify all cross-iteration scalar data-flow cycles.
1447 Cross-iteration cycles caused by virtual phis are analyzed separately. */
1449 vect_analyze_scalar_cycles (loop_vinfo);
1451 vect_pattern_recog (loop_vinfo);
1453 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
1455 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
1458 if (vect_print_dump_info (REPORT_DETAILS))
1459 fprintf (vect_dump, "unexpected pattern.");
1463 /* Analyze data dependences between the data-refs in the loop
1464 and adjust the maximum vectorization factor according to
1466 FORNOW: fail at the first data dependence that we encounter. */
1468 ok = vect_analyze_data_ref_dependences (loop_vinfo, NULL, &max_vf, &dummy);
1472 if (vect_print_dump_info (REPORT_DETAILS))
1473 fprintf (vect_dump, "bad data dependence.");
1477 ok = vect_determine_vectorization_factor (loop_vinfo);
1480 if (vect_print_dump_info (REPORT_DETAILS))
1481 fprintf (vect_dump, "can't determine vectorization factor.");
/* Dependence analysis may have capped MAX_VF below the chosen VF.  */
1484 if (max_vf < LOOP_VINFO_VECT_FACTOR (loop_vinfo))
1486 if (vect_print_dump_info (REPORT_DETAILS))
1487 fprintf (vect_dump, "bad data dependence.");
1491 /* Analyze the alignment of the data-refs in the loop.
1492 Fail if a data reference is found that cannot be vectorized. */
1494 ok = vect_analyze_data_refs_alignment (loop_vinfo, NULL);
1497 if (vect_print_dump_info (REPORT_DETAILS))
1498 fprintf (vect_dump, "bad data alignment.");
1502 /* Analyze the access patterns of the data-refs in the loop (consecutive,
1503 complex, etc.). FORNOW: Only handle consecutive access pattern. */
1505 ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
1508 if (vect_print_dump_info (REPORT_DETAILS))
1509 fprintf (vect_dump, "bad data access.");
1513 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
1514 It is important to call pruning after vect_analyze_data_ref_accesses,
1515 since we use grouping information gathered by interleaving analysis. */
1516 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
1519 if (vect_print_dump_info (REPORT_DETAILS))
1520 fprintf (vect_dump, "too long list of versioning for alias "
1525 /* This pass will decide on using loop versioning and/or loop peeling in
1526 order to enhance the alignment of data references in the loop. */
1528 ok = vect_enhance_data_refs_alignment (loop_vinfo);
1531 if (vect_print_dump_info (REPORT_DETAILS))
1532 fprintf (vect_dump, "bad data alignment.");
1536 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
1537 ok = vect_analyze_slp (loop_vinfo, NULL);
1540 /* Decide which possible SLP instances to SLP. */
1541 slp = vect_make_slp_decision (loop_vinfo);
1543 /* Find stmts that need to be both vectorized and SLPed. */
1544 vect_detect_hybrid_slp (loop_vinfo);
1549 /* Scan all the operations in the loop and make sure they are
1552 ok = vect_analyze_loop_operations (loop_vinfo, slp);
1555 if (vect_print_dump_info (REPORT_DETAILS))
1556 fprintf (vect_dump, "bad operation or unsupported loop bound.");
1563 /* Function vect_analyze_loop.
1565 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1566 for it. The different analyses will record information in the
1567 loop_vec_info struct. */
/* NOTE(review): top-level driver.  It retries the full analysis with
   successively smaller target vector sizes (from
   autovectorize_vector_sizes) until one succeeds or the candidate set is
   exhausted.  The enclosing retry-loop construct is partly elided here.  */
1569 vect_analyze_loop (struct loop *loop)
1571 loop_vec_info loop_vinfo;
1572 unsigned int vector_sizes;
1574 /* Autodetect first vector size we try. */
1575 current_vector_size = 0;
1576 vector_sizes = targetm.vectorize.autovectorize_vector_sizes ();
1578 if (vect_print_dump_info (REPORT_DETAILS))
1579 fprintf (vect_dump, "===== analyze_loop_nest =====");
/* Don't analyze an inner loop whose enclosing outer loop was already found
   vectorizable — outer-loop vectorization subsumes it.  */
1581 if (loop_outer (loop)
1582 && loop_vec_info_for_loop (loop_outer (loop))
1583 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
1585 if (vect_print_dump_info (REPORT_DETAILS))
1586 fprintf (vect_dump, "outer-loop already vectorized.");
1592 /* Check the CFG characteristics of the loop (nesting, entry/exit). */
1593 loop_vinfo = vect_analyze_loop_form (loop);
1596 if (vect_print_dump_info (REPORT_DETAILS))
1597 fprintf (vect_dump, "bad loop form.");
1601 if (vect_analyze_loop_2 (loop_vinfo))
1603 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
/* Analysis failed for this vector size: drop the loop_vec_info and try the
   next smaller candidate size, if any remain.  */
1608 destroy_loop_vec_info (loop_vinfo, true);
1610 vector_sizes &= ~current_vector_size;
1611 if (vector_sizes == 0
1612 || current_vector_size == 0)
1615 /* Try the next biggest vector size. */
1616 current_vector_size = 1 << floor_log2 (vector_sizes);
1617 if (vect_print_dump_info (REPORT_DETAILS))
1618 fprintf (vect_dump, "***** Re-trying analysis with "
1619 "vector size %d\n", current_vector_size);
1624 /* Function reduction_code_for_scalar_code
1627 CODE - tree_code of a reduction operations.
1630 REDUC_CODE - the corresponding tree-code to be used to reduce the
1631 vector of partial results into a single scalar result (which
1632 will also reside in a vector) or ERROR_MARK if the operation is
1633 a supported reduction operation, but does not have such tree-code.
1635 Return FALSE if CODE currently cannot be vectorized as reduction. */
/* NOTE(review): the switch cases and their labels are elided here; the
   visible assignments map max/min/plus-style scalar codes to the
   corresponding REDUC_*_EXPR, with ERROR_MARK as the fallback.  */
1638 reduction_code_for_scalar_code (enum tree_code code,
1639 enum tree_code *reduc_code)
1644 *reduc_code = REDUC_MAX_EXPR;
1648 *reduc_code = REDUC_MIN_EXPR;
1652 *reduc_code = REDUC_PLUS_EXPR;
1660 *reduc_code = ERROR_MARK;
1669 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
1670 STMT is printed with a message MSG. */
/* Prints MSG followed by the statement (TDF_SLIM form) to the vectorizer
   dump file.  Callers must have checked vect_print_dump_info first.  */
1673 report_vect_op (gimple stmt, const char *msg)
1675 fprintf (vect_dump, "%s", msg);
1676 print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
1680 /* Detect SLP reduction of the form:
1690 PHI is the reduction phi node (#a1 = phi <a5, a0> above)
1691 FIRST_STMT is the first reduction stmt in the chain
1692 (a2 = operation (a1)).
1694 Return TRUE if a reduction chain was detected. */
/* NOTE(review): walks forward from FIRST_STMT through single in-loop uses
   with the same rhs code, linking each stmt into a GROUP_* chain.  On
   success the chain head is recorded in LOOP_VINFO_REDUCTION_CHAINS and
   operands are swapped so the reduction operand is always second.  */
1697 vect_is_slp_reduction (loop_vec_info loop_info, gimple phi, gimple first_stmt)
1699 struct loop *loop = (gimple_bb (phi))->loop_father;
1700 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
1701 enum tree_code code;
1702 gimple current_stmt = NULL, use_stmt = NULL, first;
1703 stmt_vec_info use_stmt_info, current_stmt_info;
1705 imm_use_iterator imm_iter;
1706 use_operand_p use_p;
1707 int nloop_uses, size = 0;
/* Reduction chains are only detected for the loop being vectorized itself,
   not for loops nested inside it.  */
1710 if (loop != vect_loop)
1713 lhs = PHI_RESULT (phi);
1714 code = gimple_assign_rhs_code (first_stmt);
1718 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
1720 use_stmt = USE_STMT (use_p);
1721 if (is_gimple_debug (use_stmt))
1724 /* Check if we got back to the reduction phi. */
1725 if (gimple_code (use_stmt) == GIMPLE_PHI
1732 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
1733 && vinfo_for_stmt (use_stmt)
1734 && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt))
1735 && use_stmt != first_stmt)
1745 /* This is a loop exit phi, and we haven't reached the reduction phi. */
1746 if (gimple_code (use_stmt) == GIMPLE_PHI)
/* Every stmt in the chain must be an in-loop assignment with the same
   operation code as FIRST_STMT.  */
1749 if (!is_gimple_assign (use_stmt)
1750 || code != gimple_assign_rhs_code (use_stmt)
1751 || !flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
1754 /* Insert USE_STMT into reduction chain. */
1755 use_stmt_info = vinfo_for_stmt (use_stmt);
1758 current_stmt_info = vinfo_for_stmt (current_stmt);
1759 GROUP_NEXT_ELEMENT (current_stmt_info) = use_stmt;
1760 GROUP_FIRST_ELEMENT (use_stmt_info)
1761 = GROUP_FIRST_ELEMENT (current_stmt_info);
1764 GROUP_FIRST_ELEMENT (use_stmt_info) = use_stmt;
1766 lhs = gimple_assign_lhs (use_stmt);
1767 current_stmt = use_stmt;
/* A valid chain must close back at the reduction phi and contain at least
   two statements.  */
1771 if (!found || use_stmt != phi || size < 2)
1774 /* Save the chain for further analysis in SLP detection. */
1775 first = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
1776 VEC_safe_push (gimple, heap, LOOP_VINFO_REDUCTION_CHAINS (loop_info), first);
1777 GROUP_SIZE (vinfo_for_stmt (first)) = size;
1779 /* Swap the operands, if needed, to make the reduction operand be the second
1781 lhs = PHI_RESULT (phi);
1782 current_stmt = first;
1783 while (current_stmt)
1785 if (get_gimple_rhs_class (code) == GIMPLE_BINARY_RHS
1786 && gimple_assign_rhs2 (current_stmt) != lhs)
1788 if (vect_print_dump_info (REPORT_DETAILS))
1790 fprintf (vect_dump, "swapping oprnds: ");
1791 print_gimple_stmt (vect_dump, current_stmt, 0, TDF_SLIM);
1794 swap_tree_operands (current_stmt,
1795 gimple_assign_rhs1_ptr (current_stmt),
1796 gimple_assign_rhs2_ptr (current_stmt));
1797 mark_symbols_for_renaming (current_stmt);
1800 lhs = gimple_assign_lhs (current_stmt);
1801 current_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (current_stmt));
1808 /* Function vect_is_simple_reduction_1
1810 (1) Detect a cross-iteration def-use cycle that represents a simple
1811 reduction computation. We look for the following pattern:
1816 a2 = operation (a3, a1)
1819 1. operation is commutative and associative and it is safe to
1820 change the order of the computation (if CHECK_REDUCTION is true)
1821 2. no uses for a2 in the loop (a2 is used out of the loop)
1822 3. no uses of a1 in the loop besides the reduction operation
1823 4. no uses of a1 outside the loop.
1825 Conditions 1,4 are tested here.
1826 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
1828 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
1829 nested cycles, if CHECK_REDUCTION is false.
1831 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
1835 inner loop (def of a3)
1838 If MODIFY is true it tries also to rework the code in-place to enable
1839 detection of more reduction patterns. For the time being we rewrite
1840 "res -= RHS" into "rhs += -RHS" when it seems worthwhile.
/* NOTE(review): comment above says "rhs += -RHS"; the transformation below
   actually rewrites to "res += -RHS" — presumably a typo in the comment.
   Returns the reduction definition stmt on success (elided return stmts),
   or reports an unknown pattern.  */
1844 vect_is_simple_reduction_1 (loop_vec_info loop_info, gimple phi,
1845 bool check_reduction, bool *double_reduc,
1848 struct loop *loop = (gimple_bb (phi))->loop_father;
1849 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
1850 edge latch_e = loop_latch_edge (loop);
1851 tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
1852 gimple def_stmt, def1 = NULL, def2 = NULL;
1853 enum tree_code orig_code, code;
1854 tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
1858 imm_use_iterator imm_iter;
1859 use_operand_p use_p;
1862 *double_reduc = false;
1864 /* If CHECK_REDUCTION is true, we assume inner-most loop vectorization,
1865 otherwise, we assume outer loop vectorization. */
1866 gcc_assert ((check_reduction && loop == vect_loop)
1867 || (!check_reduction && flow_loop_nested_p (vect_loop, loop)));
1869 name = PHI_RESULT (phi);
/* Condition 4: the phi result must not be used outside the loop, and must
   have at most a bounded number of in-loop uses.  */
1871 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
1873 gimple use_stmt = USE_STMT (use_p);
1874 if (is_gimple_debug (use_stmt))
1877 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
1879 if (vect_print_dump_info (REPORT_DETAILS))
1880 fprintf (vect_dump, "intermediate value used outside loop.");
1885 if (vinfo_for_stmt (use_stmt)
1886 && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
1890 if (vect_print_dump_info (REPORT_DETAILS))
1891 fprintf (vect_dump, "reduction used in loop.");
1896 if (TREE_CODE (loop_arg) != SSA_NAME)
1898 if (vect_print_dump_info (REPORT_DETAILS))
1900 fprintf (vect_dump, "reduction: not ssa_name: ");
1901 print_generic_expr (vect_dump, loop_arg, TDF_SLIM);
1906 def_stmt = SSA_NAME_DEF_STMT (loop_arg);
1909 if (vect_print_dump_info (REPORT_DETAILS))
1910 fprintf (vect_dump, "reduction: no def_stmt.");
1914 if (!is_gimple_assign (def_stmt) && gimple_code (def_stmt) != GIMPLE_PHI)
1916 if (vect_print_dump_info (REPORT_DETAILS))
1917 print_gimple_stmt (vect_dump, def_stmt, 0, TDF_SLIM);
1921 if (is_gimple_assign (def_stmt))
1923 name = gimple_assign_lhs (def_stmt);
1928 name = PHI_RESULT (def_stmt);
/* Same use check for the value defined by the latch-edge definition.  */
1933 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
1935 gimple use_stmt = USE_STMT (use_p);
1936 if (is_gimple_debug (use_stmt))
1938 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
1939 && vinfo_for_stmt (use_stmt)
1940 && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
1944 if (vect_print_dump_info (REPORT_DETAILS))
1945 fprintf (vect_dump, "reduction used in loop.");
1950 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
1951 defined in the inner loop. */
1954 op1 = PHI_ARG_DEF (def_stmt, 0);
1956 if (gimple_phi_num_args (def_stmt) != 1
1957 || TREE_CODE (op1) != SSA_NAME)
1959 if (vect_print_dump_info (REPORT_DETAILS))
1960 fprintf (vect_dump, "unsupported phi node definition.");
1965 def1 = SSA_NAME_DEF_STMT (op1);
1966 if (flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
1968 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
1969 && is_gimple_assign (def1))
1971 if (vect_print_dump_info (REPORT_DETAILS))
1972 report_vect_op (def_stmt, "detected double reduction: ");
1974 *double_reduc = true;
1981 code = orig_code = gimple_assign_rhs_code (def_stmt);
1983 /* We can handle "res -= x[i]", which is non-associative by
1984 simply rewriting this into "res += -x[i]". Avoid changing
1985 gimple instruction for the first simple tests and only do this
1986 if we're allowed to change code at all. */
1987 if (code == MINUS_EXPR
1989 && (op1 = gimple_assign_rhs1 (def_stmt))
1990 && TREE_CODE (op1) == SSA_NAME
1991 && SSA_NAME_DEF_STMT (op1) == phi)
1995 && (!commutative_tree_code (code) || !associative_tree_code (code)))
1997 if (vect_print_dump_info (REPORT_DETAILS))
1998 report_vect_op (def_stmt, "reduction: not commutative/associative: ");
2002 if (get_gimple_rhs_class (code) != GIMPLE_BINARY_RHS)
2004 if (code != COND_EXPR)
2006 if (vect_print_dump_info (REPORT_DETAILS))
2007 report_vect_op (def_stmt, "reduction: not binary operation: ");
/* COND_EXPR reduction: extract the condition operands (op3/op4) and the
   two value operands (op1/op2) from the embedded COND_EXPR.  */
2012 op3 = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
2013 if (COMPARISON_CLASS_P (op3))
2015 op4 = TREE_OPERAND (op3, 1);
2016 op3 = TREE_OPERAND (op3, 0);
2019 op1 = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 1);
2020 op2 = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 2);
2022 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
2024 if (vect_print_dump_info (REPORT_DETAILS))
2025 report_vect_op (def_stmt, "reduction: uses not ssa_names: ");
2032 op1 = gimple_assign_rhs1 (def_stmt);
2033 op2 = gimple_assign_rhs2 (def_stmt);
2035 if (TREE_CODE (op1) != SSA_NAME || TREE_CODE (op2) != SSA_NAME)
2037 if (vect_print_dump_info (REPORT_DETAILS))
2038 report_vect_op (def_stmt, "reduction: uses not ssa_names: ");
/* All operands must have a type compatible with the operation's result;
   mixed-type reductions are rejected.  */
2044 type = TREE_TYPE (gimple_assign_lhs (def_stmt));
2045 if ((TREE_CODE (op1) == SSA_NAME
2046 && !types_compatible_p (type,TREE_TYPE (op1)))
2047 || (TREE_CODE (op2) == SSA_NAME
2048 && !types_compatible_p (type, TREE_TYPE (op2)))
2049 || (op3 && TREE_CODE (op3) == SSA_NAME
2050 && !types_compatible_p (type, TREE_TYPE (op3)))
2051 || (op4 && TREE_CODE (op4) == SSA_NAME
2052 && !types_compatible_p (type, TREE_TYPE (op4))))
2054 if (vect_print_dump_info (REPORT_DETAILS))
2056 fprintf (vect_dump, "reduction: multiple types: operation type: ");
2057 print_generic_expr (vect_dump, type, TDF_SLIM);
2058 fprintf (vect_dump, ", operands types: ");
2059 print_generic_expr (vect_dump, TREE_TYPE (op1), TDF_SLIM);
2060 fprintf (vect_dump, ",");
2061 print_generic_expr (vect_dump, TREE_TYPE (op2), TDF_SLIM);
2064 fprintf (vect_dump, ",");
2065 print_generic_expr (vect_dump, TREE_TYPE (op3), TDF_SLIM);
2070 fprintf (vect_dump, ",");
2071 print_generic_expr (vect_dump, TREE_TYPE (op4), TDF_SLIM);
2078 /* Check that it's ok to change the order of the computation.
2079 Generally, when vectorizing a reduction we change the order of the
2080 computation. This may change the behavior of the program in some
2081 cases, so we need to check that this is ok. One exception is when
2082 vectorizing an outer-loop: the inner-loop is executed sequentially,
2083 and therefore vectorizing reductions in the inner-loop during
2084 outer-loop vectorization is safe. */
2086 /* CHECKME: check for !flag_finite_math_only too? */
2087 if (SCALAR_FLOAT_TYPE_P (type) && !flag_associative_math
2090 /* Changing the order of operations changes the semantics. */
2091 if (vect_print_dump_info (REPORT_DETAILS))
2092 report_vect_op (def_stmt, "reduction: unsafe fp math optimization: ");
2095 else if (INTEGRAL_TYPE_P (type) && TYPE_OVERFLOW_TRAPS (type)
2098 /* Changing the order of operations changes the semantics. */
2099 if (vect_print_dump_info (REPORT_DETAILS))
2100 report_vect_op (def_stmt, "reduction: unsafe int math optimization: ");
2103 else if (SAT_FIXED_POINT_TYPE_P (type) && check_reduction)
2105 /* Changing the order of operations changes the semantics. */
2106 if (vect_print_dump_info (REPORT_DETAILS))
2107 report_vect_op (def_stmt,
2108 "reduction: unsafe fixed-point math optimization: ");
2112 /* If we detected "res -= x[i]" earlier, rewrite it into
2113 "res += -x[i]" now. If this turns out to be useless reassoc
2114 will clean it up again. */
2115 if (orig_code == MINUS_EXPR)
2117 tree rhs = gimple_assign_rhs2 (def_stmt);
2118 tree negrhs = make_ssa_name (SSA_NAME_VAR (rhs), NULL);
2119 gimple negate_stmt = gimple_build_assign_with_ops (NEGATE_EXPR, negrhs,
2121 gimple_stmt_iterator gsi = gsi_for_stmt (def_stmt);
2122 set_vinfo_for_stmt (negate_stmt, new_stmt_vec_info (negate_stmt,
2124 gsi_insert_before (&gsi, negate_stmt, GSI_NEW_STMT);
2125 gimple_assign_set_rhs2 (def_stmt, negrhs);
2126 gimple_assign_set_rhs_code (def_stmt, PLUS_EXPR);
2127 update_stmt (def_stmt);
2130 /* Reduction is safe. We're dealing with one of the following:
2131 1) integer arithmetic and no trapv
2132 2) floating point arithmetic, and special flags permit this optimization
2133 3) nested cycle (i.e., outer loop vectorization). */
2134 if (TREE_CODE (op1) == SSA_NAME)
2135 def1 = SSA_NAME_DEF_STMT (op1);
2137 if (TREE_CODE (op2) == SSA_NAME)
2138 def2 = SSA_NAME_DEF_STMT (op2);
2140 if (code != COND_EXPR
2141 && (!def1 || !def2 || gimple_nop_p (def1) || gimple_nop_p (def2)))
2143 if (vect_print_dump_info (REPORT_DETAILS))
2144 report_vect_op (def_stmt, "reduction: no defs for operands: ");
2148 /* Check that one def is the reduction def, defined by PHI,
2149 the other def is either defined in the loop ("vect_internal_def"),
2150 or it's an induction (defined by a loop-header phi-node). */
2152 if (def2 && def2 == phi
2153 && (code == COND_EXPR
2154 || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1))
2155 && (is_gimple_assign (def1)
2156 || is_gimple_call (def1)
2157 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
2158 == vect_induction_def
2159 || (gimple_code (def1) == GIMPLE_PHI
2160 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
2161 == vect_internal_def
2162 && !is_loop_header_bb_p (gimple_bb (def1)))))))
2164 if (vect_print_dump_info (REPORT_DETAILS))
2165 report_vect_op (def_stmt, "detected reduction: ");
/* Mirror case: PHI defines the first operand; the operands are swapped
   (when CHECK_REDUCTION) so downstream code can assume the reduction
   variable is the second operand.  */
2169 if (def1 && def1 == phi
2170 && (code == COND_EXPR
2171 || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2))
2172 && (is_gimple_assign (def2)
2173 || is_gimple_call (def2)
2174 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
2175 == vect_induction_def
2176 || (gimple_code (def2) == GIMPLE_PHI
2177 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
2178 == vect_internal_def
2179 && !is_loop_header_bb_p (gimple_bb (def2)))))))
2181 if (check_reduction)
2183 /* Swap operands (just for simplicity - so that the rest of the code
2184 can assume that the reduction variable is always the last (second)
2186 if (vect_print_dump_info (REPORT_DETAILS))
2187 report_vect_op (def_stmt,
2188 "detected reduction: need to swap operands: ");
2190 swap_tree_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
2191 gimple_assign_rhs2_ptr (def_stmt));
2195 if (vect_print_dump_info (REPORT_DETAILS))
2196 report_vect_op (def_stmt, "detected reduction: ");
2202 /* Try to find SLP reduction chain. */
2203 if (vect_is_slp_reduction (loop_info, phi, def_stmt))
2205 if (vect_print_dump_info (REPORT_DETAILS))
2206 report_vect_op (def_stmt, "reduction: detected reduction chain: ");
2211 if (vect_print_dump_info (REPORT_DETAILS))
2212 report_vect_op (def_stmt, "reduction: unknown pattern: ");
2217 /* Wrapper around vect_is_simple_reduction_1, that won't modify code
2218 in-place. Arguments as there. */
/* Passes MODIFY=false: pure detection, no gimple rewriting.  */
2221 vect_is_simple_reduction (loop_vec_info loop_info, gimple phi,
2222 bool check_reduction, bool *double_reduc)
2224 return vect_is_simple_reduction_1 (loop_info, phi, check_reduction,
2225 double_reduc, false);
2228 /* Wrapper around vect_is_simple_reduction_1, which will modify code
2229 in-place if it enables detection of more reductions. Arguments
/* Passes MODIFY=true: may rewrite "res -= RHS" into "res += -RHS".  */
2233 vect_force_simple_reduction (loop_vec_info loop_info, gimple phi,
2234 bool check_reduction, bool *double_reduc)
2236 return vect_is_simple_reduction_1 (loop_info, phi, check_reduction,
2237 double_reduc, true);
2240 /* Calculate the cost of one scalar iteration of the loop. */
/* NOTE(review): function name contains a typo ("iteraion") — it is part of
   the public interface, so renaming it would require updating all callers.
   Sums target-provided scalar costs over relevant assign/call stmts,
   weighting inner-loop blocks by a fixed FIXME factor of 50.  */
2242 vect_get_single_scalar_iteraion_cost (loop_vec_info loop_vinfo)
2244 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2245 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
2246 int nbbs = loop->num_nodes, factor, scalar_single_iter_cost = 0;
2247 int innerloop_iters, i, stmt_cost;
2249 /* Count statements in scalar loop. Using this as scalar cost for a single
2252 TODO: Add outer loop support.
2254 TODO: Consider assigning different costs to different scalar
2258 innerloop_iters = 1;
2260 innerloop_iters = 50; /* FIXME */
2262 for (i = 0; i < nbbs; i++)
2264 gimple_stmt_iterator si;
2265 basic_block bb = bbs[i];
2267 if (bb->loop_father == loop->inner)
2268 factor = innerloop_iters;
2272 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
2274 gimple stmt = gsi_stmt (si);
2275 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2277 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
2280 /* Skip stmts that are not vectorized inside the loop. */
2282 && !STMT_VINFO_RELEVANT_P (stmt_info)
2283 && (!STMT_VINFO_LIVE_P (stmt_info)
2284 || STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def)
/* Classify as load, store, or generic scalar stmt for the cost hook.  */
2287 if (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt)))
2289 if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))))
2290 stmt_cost = vect_get_cost (scalar_load);
2292 stmt_cost = vect_get_cost (scalar_store);
2295 stmt_cost = vect_get_cost (scalar_stmt);
2297 scalar_single_iter_cost += stmt_cost * factor;
2300 return scalar_single_iter_cost;
2303 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
/* NOTE(review): also computes *PEEL_ITERS_EPILOGUE.  When the scalar
   iteration count is unknown the epilogue is estimated as vf/2 and branch
   guard costs are added; the final return expression is truncated in this
   chunk (a peel_guard_costs term presumably follows).  */
2305 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
2306 int *peel_iters_epilogue,
2307 int scalar_single_iter_cost)
2309 int peel_guard_costs = 0;
2310 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2312 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2314 *peel_iters_epilogue = vf/2;
2315 if (vect_print_dump_info (REPORT_COST))
2316 fprintf (vect_dump, "cost model: "
2317 "epilogue peel iters set to vf/2 because "
2318 "loop iterations are unknown .");
2320 /* If peeled iterations are known but number of scalar loop
2321 iterations are unknown, count a taken branch per peeled loop. */
2322 peel_guard_costs = 2 * vect_get_cost (cond_branch_taken);
/* Known iteration count: clamp the prologue peel count to NITERS and the
   epilogue takes the remainder modulo VF.  */
2326 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
2327 peel_iters_prologue = niters < peel_iters_prologue ?
2328 niters : peel_iters_prologue;
2329 *peel_iters_epilogue = (niters - peel_iters_prologue) % vf;
2332 return (peel_iters_prologue * scalar_single_iter_cost)
2333 + (*peel_iters_epilogue * scalar_single_iter_cost)
2337 /* Function vect_estimate_min_profitable_iters
2339 Return the number of iterations required for the vector version of the
2340 loop to be profitable relative to the cost of the scalar version of the
2343 TODO: Take profile info into account before making vectorization
2344 decisions, if available. */
/* NOTE(review): the embedded line numbers in this listing are not
   contiguous — return type, braces, `else` branches and a few statement
   continuations are missing from this view.  The annotations below
   describe only what the visible lines establish.  */
2347 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo)
/* Accumulators for the cost comparison: cost of one vector iteration
   (vec_inside_cost), one-time vector setup/guard cost (vec_outside_cost),
   and the run-time guard cost charged to the scalar path
   (scalar_outside_cost).  */
2350 int min_profitable_iters;
2351 int peel_iters_prologue;
2352 int peel_iters_epilogue;
2353 int vec_inside_cost = 0;
2354 int vec_outside_cost = 0;
2355 int scalar_single_iter_cost = 0;
2356 int scalar_outside_cost = 0;
2357 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2358 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2359 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
2360 int nbbs = loop->num_nodes;
2361 int npeel = LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo);
2362 int peel_guard_costs = 0;
2363 int innerloop_iters = 0, factor;
2364 VEC (slp_instance, heap) *slp_instances;
2365 slp_instance instance;
2367 /* Cost model disabled. */
2368 if (!flag_vect_cost_model)
2370 if (vect_print_dump_info (REPORT_COST))
2371 fprintf (vect_dump, "cost model disabled.");
2375 /* Requires loop versioning tests to handle misalignment. */
2376 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2378 /* FIXME: Make cost depend on complexity of individual check. */
2380 VEC_length (gimple, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));
2381 if (vect_print_dump_info (REPORT_COST))
2382 fprintf (vect_dump, "cost model: Adding cost of checks for loop "
2383 "versioning to treat misalignment.\n");
2386 /* Requires loop versioning with alias checks. */
2387 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2389 /* FIXME: Make cost depend on complexity of individual check. */
2391 VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo));
2392 if (vect_print_dump_info (REPORT_COST))
2393 fprintf (vect_dump, "cost model: Adding cost of checks for loop "
2394 "versioning aliasing.\n");
/* Any form of versioning implies one taken branch on the vector path.  */
2397 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2398 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2399 vec_outside_cost += vect_get_cost (cond_branch_taken);
2401 /* Count statements in scalar loop. Using this as scalar cost for a single
2404 TODO: Add outer loop support.
2406 TODO: Consider assigning different costs to different scalar
2411 innerloop_iters = 50; /* FIXME */
/* Walk every stmt of every BB in the loop and accumulate the per-stmt
   vectorization costs.  Inner-loop stmts are weighted by FACTOR (the
   hard-coded innerloop_iters estimate above).  */
2413 for (i = 0; i < nbbs; i++)
2415 gimple_stmt_iterator si;
2416 basic_block bb = bbs[i];
2418 if (bb->loop_father == loop->inner)
2419 factor = innerloop_iters;
2423 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
2425 gimple stmt = gsi_stmt (si);
2426 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2427 /* Skip stmts that are not vectorized inside the loop. */
2428 if (!STMT_VINFO_RELEVANT_P (stmt_info)
2429 && (!STMT_VINFO_LIVE_P (stmt_info)
2430 || STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def))
2432 vec_inside_cost += STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) * factor;
2433 /* FIXME: for stmts in the inner-loop in outer-loop vectorization,
2434 some of the "outside" costs are generated inside the outer-loop. */
2435 vec_outside_cost += STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info);
2439 scalar_single_iter_cost = vect_get_single_scalar_iteraion_cost (loop_vinfo);
2441 /* Add additional cost for the peeled instructions in prologue and epilogue
2444 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
2445 at compile-time - we assume it's vf/2 (the worst would be vf-1).
2447 TODO: Build an expression that represents peel_iters for prologue and
2448 epilogue to be used in a run-time test. */
2452 peel_iters_prologue = vf/2;
2453 if (vect_print_dump_info (REPORT_COST))
2454 fprintf (vect_dump, "cost model: "
2455 "prologue peel iters set to vf/2.");
2457 /* If peeling for alignment is unknown, loop bound of main loop becomes
2459 peel_iters_epilogue = vf/2;
2460 if (vect_print_dump_info (REPORT_COST))
2461 fprintf (vect_dump, "cost model: "
2462 "epilogue peel iters set to vf/2 because "
2463 "peeling for alignment is unknown .");
2465 /* If peeled iterations are unknown, count a taken branch and a not taken
2466 branch per peeled loop. Even if scalar loop iterations are known,
2467 vector iterations are not known since peeled prologue iterations are
2468 not known. Hence guards remain the same. */
2469 peel_guard_costs += 2 * (vect_get_cost (cond_branch_taken)
2470 + vect_get_cost (cond_branch_not_taken));
2471 vec_outside_cost += (peel_iters_prologue * scalar_single_iter_cost)
2472 + (peel_iters_epilogue * scalar_single_iter_cost)
/* Peel count known (npeel >= 0): delegate the peeling cost computation,
   which also fills in peel_iters_epilogue.  */
2477 peel_iters_prologue = npeel;
2478 vec_outside_cost += vect_get_known_peeling_cost (loop_vinfo,
2479 peel_iters_prologue, &peel_iters_epilogue,
2480 scalar_single_iter_cost);
2483 /* FORNOW: The scalar outside cost is incremented in one of the
2486 1. The vectorizer checks for alignment and aliasing and generates
2487 a condition that allows dynamic vectorization. A cost model
2488 check is ANDED with the versioning condition. Hence scalar code
2489 path now has the added cost of the versioning check.
2491 if (cost > th & versioning_check)
2494 Hence run-time scalar is incremented by not-taken branch cost.
2496 2. The vectorizer then checks if a prologue is required. If the
2497 cost model check was not done before during versioning, it has to
2498 be done before the prologue check.
2501 prologue = scalar_iters
2506 if (prologue == num_iters)
2509 Hence the run-time scalar cost is incremented by a taken branch,
2510 plus a not-taken branch, plus a taken branch cost.
2512 3. The vectorizer then checks if an epilogue is required. If the
2513 cost model check was not done before during prologue check, it
2514 has to be done with the epilogue check.
2520 if (prologue == num_iters)
2523 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
2526 Hence the run-time scalar cost should be incremented by 2 taken
2529 TODO: The back end may reorder the BBS's differently and reverse
2530 conditions/branch directions. Change the estimates below to
2531 something more reasonable. */
2533 /* If the number of iterations is known and we do not do versioning, we can
2534 decide whether to vectorize at compile time. Hence the scalar version
2535 do not carry cost model guard costs. */
2536 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2537 || LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2538 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2540 /* Cost model check occurs at versioning. */
2541 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2542 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2543 scalar_outside_cost += vect_get_cost (cond_branch_not_taken);
2546 /* Cost model check occurs at prologue generation. */
2547 if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2548 scalar_outside_cost += 2 * vect_get_cost (cond_branch_taken)
2549 + vect_get_cost (cond_branch_not_taken);
2550 /* Cost model check occurs at epilogue generation. */
2552 scalar_outside_cost += 2 * vect_get_cost (cond_branch_taken);
2556 /* Add SLP costs. */
2557 slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
2558 FOR_EACH_VEC_ELT (slp_instance, slp_instances, i, instance)
2560 vec_outside_cost += SLP_INSTANCE_OUTSIDE_OF_LOOP_COST (instance);
2561 vec_inside_cost += SLP_INSTANCE_INSIDE_OF_LOOP_COST (instance);
2564 /* Calculate number of iterations required to make the vector version
2565 profitable, relative to the loop bodies only. The following condition
2567 SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
2569 SIC = scalar iteration cost, VIC = vector iteration cost,
2570 VOC = vector outside cost, VF = vectorization factor,
2571 PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
2572 SOC = scalar outside cost for run time cost model check. */
2574 if ((scalar_single_iter_cost * vf) > vec_inside_cost)
2576 if (vec_outside_cost <= 0)
2577 min_profitable_iters = 1;
/* Integer division truncates, so the next check bumps the result
   up by one when the truncated value does not yet satisfy the
   profitability inequality.  */
2580 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost) * vf
2581 - vec_inside_cost * peel_iters_prologue
2582 - vec_inside_cost * peel_iters_epilogue)
2583 / ((scalar_single_iter_cost * vf)
2586 if ((scalar_single_iter_cost * vf * min_profitable_iters)
2587 <= ((vec_inside_cost * min_profitable_iters)
2588 + ((vec_outside_cost - scalar_outside_cost) * vf)))
2589 min_profitable_iters++;
2592 /* vector version will never be profitable. */
2595 if (vect_print_dump_info (REPORT_COST))
2596 fprintf (vect_dump, "cost model: the vector iteration cost = %d "
2597 "divided by the scalar iteration cost = %d "
2598 "is greater or equal to the vectorization factor = %d.",
2599 vec_inside_cost, scalar_single_iter_cost, vf);
2603 if (vect_print_dump_info (REPORT_COST))
2605 fprintf (vect_dump, "Cost model analysis: \n");
2606 fprintf (vect_dump, " Vector inside of loop cost: %d\n",
2608 fprintf (vect_dump, " Vector outside of loop cost: %d\n",
2610 fprintf (vect_dump, " Scalar iteration cost: %d\n",
2611 scalar_single_iter_cost);
2612 fprintf (vect_dump, " Scalar outside cost: %d\n", scalar_outside_cost);
2613 fprintf (vect_dump, " prologue iterations: %d\n",
2614 peel_iters_prologue);
2615 fprintf (vect_dump, " epilogue iterations: %d\n",
2616 peel_iters_epilogue);
2617 fprintf (vect_dump, " Calculated minimum iters for profitability: %d\n",
2618 min_profitable_iters);
/* The vector loop must run at least VF iterations to be entered at all,
   so the threshold is never below VF.  */
2621 min_profitable_iters =
2622 min_profitable_iters < vf ? vf : min_profitable_iters;
2624 /* Because the condition we create is:
2625 if (niters <= min_profitable_iters)
2626 then skip the vectorized loop. */
2627 min_profitable_iters--;
2629 if (vect_print_dump_info (REPORT_COST))
2630 fprintf (vect_dump, " Profitability threshold = %d\n",
2631 min_profitable_iters);
2633 return min_profitable_iters;
2637 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
2638 functions. Design better to avoid maintenance issues. */
2640 /* Function vect_model_reduction_cost.
2642 Models cost for a reduction operation, including the vector ops
2643 generated within the strip-mine loop, the initial definition before
2644 the loop, and the epilogue code that must be generated. */
/* NOTE(review): non-contiguous embedded line numbers — return type,
   braces, `break`s/`default` of the switch and some statement
   continuations are missing from this listing.  */
2647 vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
2651 enum tree_code code;
2654 gimple stmt, orig_stmt;
2656 enum machine_mode mode;
2657 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2658 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2661 /* Cost of reduction op inside loop. */
2662 STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info)
2663 += ncopies * vect_get_cost (vector_stmt);
2665 stmt = STMT_VINFO_STMT (stmt_info);
/* Pick the reduction operand according to the shape of the stmt's RHS;
   for the single-RHS (COND_EXPR-like) case the operand is hard-coded at
   index 2 and the assert checks the expression really is ternary.  */
2667 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
2669 case GIMPLE_SINGLE_RHS:
2670 gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt)) == ternary_op);
2671 reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), 2);
2673 case GIMPLE_UNARY_RHS:
2674 reduction_op = gimple_assign_rhs1 (stmt);
2676 case GIMPLE_BINARY_RHS:
2677 reduction_op = gimple_assign_rhs2 (stmt);
2679 case GIMPLE_TERNARY_RHS:
2680 reduction_op = gimple_assign_rhs3 (stmt);
2686 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
2689 if (vect_print_dump_info (REPORT_COST))
2691 fprintf (vect_dump, "unsupported data-type ");
2692 print_generic_expr (vect_dump, TREE_TYPE (reduction_op), TDF_SLIM);
2697 mode = TYPE_MODE (vectype);
/* For pattern stmts, cost the original (related) stmt's operation.  */
2698 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
2701 orig_stmt = STMT_VINFO_STMT (stmt_info);
2703 code = gimple_assign_rhs_code (orig_stmt);
2705 /* Add in cost for initial definition. */
2706 outer_cost += vect_get_cost (scalar_to_vec);
2708 /* Determine cost of epilogue code.
2710 We have a reduction operator that will reduce the vector in one statement.
2711 Also requires scalar extract. */
2713 if (!nested_in_vect_loop_p (loop, orig_stmt))
2715 if (reduc_code != ERROR_MARK)
2716 outer_cost += vect_get_cost (vector_stmt)
2717 + vect_get_cost (vec_to_scalar);
/* No direct reduction insn: fall back to either log2(N) shift+op
   steps (if a whole-vector shift exists) or N extracts plus N-1
   scalar reduction ops.  */
2720 int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
2722 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
2723 int element_bitsize = tree_low_cst (bitsize, 1);
2724 int nelements = vec_size_in_bits / element_bitsize;
2726 optab = optab_for_tree_code (code, vectype, optab_default);
2728 /* We have a whole vector shift available. */
2729 if (VECTOR_MODE_P (mode)
2730 && optab_handler (optab, mode) != CODE_FOR_nothing
2731 && optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
2732 /* Final reduction via vector shifts and the reduction operator. Also
2733 requires scalar extract. */
2734 outer_cost += ((exact_log2(nelements) * 2)
2735 * vect_get_cost (vector_stmt)
2736 + vect_get_cost (vec_to_scalar));
2738 /* Use extracts and reduction op for final reduction. For N elements,
2739 we have N extracts and N-1 reduction ops. */
2740 outer_cost += ((nelements + nelements - 1)
2741 * vect_get_cost (vector_stmt));
2745 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info) = outer_cost;
2747 if (vect_print_dump_info (REPORT_COST))
2748 fprintf (vect_dump, "vect_model_reduction_cost: inside_cost = %d, "
2749 "outside_cost = %d .", STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info),
2750 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info));
2756 /* Function vect_model_induction_cost.
2758 Models cost for induction operations. */
/* Charges NCOPIES vector stmts inside the loop and two scalar-to-vector
   broadcasts (vec_init and vec_step) in the prologue.
   NOTE(review): return type and braces fall on lines dropped from this
   listing (non-contiguous embedded numbering).  */
2761 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
2763 /* loop cost for vec_loop. */
2764 STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info)
2765 = ncopies * vect_get_cost (vector_stmt);
2766 /* prologue cost for vec_init and vec_step. */
2767 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info)
2768 = 2 * vect_get_cost (scalar_to_vec);
2770 if (vect_print_dump_info (REPORT_COST))
2771 fprintf (vect_dump, "vect_model_induction_cost: inside_cost = %d, "
2772 "outside_cost = %d .", STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info),
2773 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info));
2777 /* Function get_initial_def_for_induction
2780 STMT - a stmt that performs an induction operation in the loop.
2781 IV_PHI - the initial value of the induction variable
2784 Return a vector variable, initialized with the first VF values of
2785 the induction variable. E.g., for an iv with IV_PHI='X' and
2786 evolution S, for a vector of 4 units, we want to return:
2787 [X, X + S, X + 2*S, X + 3*S]. */
/* NOTE(review): non-contiguous embedded line numbers — declarations,
   braces, `else` keywords and some call-argument continuations are
   missing from this listing; the comments below annotate only the
   visible lines.  */
2790 get_initial_def_for_induction (gimple iv_phi)
2792 stmt_vec_info stmt_vinfo = vinfo_for_stmt (iv_phi);
2793 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
2794 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2798 edge pe = loop_preheader_edge (loop);
2799 struct loop *iv_loop;
2801 tree vec, vec_init, vec_step, t;
2805 gimple init_stmt, induction_phi, new_stmt;
2806 tree induc_def, vec_def, vec_dest;
2807 tree init_expr, step_expr;
2808 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2813 stmt_vec_info phi_info = vinfo_for_stmt (iv_phi);
2814 bool nested_in_vect_loop = false;
2815 gimple_seq stmts = NULL;
2816 imm_use_iterator imm_iter;
2817 use_operand_p use_p;
2821 gimple_stmt_iterator si;
2822 basic_block bb = gimple_bb (iv_phi);
2826 /* Is phi in an inner-loop, while vectorizing an enclosing outer-loop? */
2827 if (nested_in_vect_loop_p (loop, iv_phi))
2829 nested_in_vect_loop = true;
2830 iv_loop = loop->inner;
2834 gcc_assert (iv_loop == (gimple_bb (iv_phi))->loop_father);
2836 latch_e = loop_latch_edge (iv_loop);
2837 loop_arg = PHI_ARG_DEF_FROM_EDGE (iv_phi, latch_e);
/* Recover the IV's initial value and step from its scalar evolution.  */
2839 access_fn = analyze_scalar_evolution (iv_loop, PHI_RESULT (iv_phi));
2840 gcc_assert (access_fn);
2841 STRIP_NOPS (access_fn);
2842 ok = vect_is_simple_iv_evolution (iv_loop->num, access_fn,
2843 &init_expr, &step_expr);
2845 pe = loop_preheader_edge (iv_loop);
2847 scalar_type = TREE_TYPE (init_expr);
2848 vectype = get_vectype_for_scalar_type (scalar_type);
2849 resvectype = get_vectype_for_scalar_type (TREE_TYPE (PHI_RESULT (iv_phi)));
2850 gcc_assert (vectype);
2851 nunits = TYPE_VECTOR_SUBPARTS (vectype);
2852 ncopies = vf / nunits;
2854 gcc_assert (phi_info);
2855 gcc_assert (ncopies >= 1);
2857 /* Find the first insertion point in the BB. */
2858 si = gsi_after_labels (bb);
2860 /* Create the vector that holds the initial_value of the induction. */
2861 if (nested_in_vect_loop)
2863 /* iv_loop is nested in the loop to be vectorized. init_expr had already
2864 been created during vectorization of previous stmts. We obtain it
2865 from the STMT_VINFO_VEC_STMT of the defining stmt. */
2866 tree iv_def = PHI_ARG_DEF_FROM_EDGE (iv_phi,
2867 loop_preheader_edge (iv_loop));
2868 vec_init = vect_get_vec_def_for_operand (iv_def, iv_phi, NULL);
2872 /* iv_loop is the loop to be vectorized. Create:
2873 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
2874 new_var = vect_get_new_vect_var (scalar_type, vect_scalar_var, "var_");
2875 add_referenced_var (new_var);
/* Force init_expr into a gimple operand; any stmts this produces are
   inserted on the preheader edge (which must not split a new BB).  */
2877 new_name = force_gimple_operand (init_expr, &stmts, false, new_var);
2880 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
2881 gcc_assert (!new_bb);
2885 t = tree_cons (NULL_TREE, new_name, t);
/* Build X+S, X+2*S, ... one element at a time, chaining additions on
   the preheader edge; pointer IVs use POINTER_PLUS_EXPR.  */
2886 for (i = 1; i < nunits; i++)
2888 /* Create: new_name_i = new_name + step_expr */
2889 enum tree_code code = POINTER_TYPE_P (scalar_type)
2890 ? POINTER_PLUS_EXPR : PLUS_EXPR;
2891 init_stmt = gimple_build_assign_with_ops (code, new_var,
2892 new_name, step_expr);
2893 new_name = make_ssa_name (new_var, init_stmt);
2894 gimple_assign_set_lhs (init_stmt, new_name);
2896 new_bb = gsi_insert_on_edge_immediate (pe, init_stmt);
2897 gcc_assert (!new_bb);
2899 if (vect_print_dump_info (REPORT_DETAILS))
2901 fprintf (vect_dump, "created new init_stmt: ");
2902 print_gimple_stmt (vect_dump, init_stmt, 0, TDF_SLIM);
2904 t = tree_cons (NULL_TREE, new_name, t);
2906 /* Create a vector from [new_name_0, new_name_1, ..., new_name_nunits-1] */
2907 vec = build_constructor_from_list (vectype, nreverse (t));
2908 vec_init = vect_init_vector (iv_phi, vec, vectype, NULL);
2912 /* Create the vector that holds the step of the induction. */
2913 if (nested_in_vect_loop)
2914 /* iv_loop is nested in the loop to be vectorized. Generate:
2915 vec_step = [S, S, S, S] */
2916 new_name = step_expr;
2919 /* iv_loop is the loop to be vectorized. Generate:
2920 vec_step = [VF*S, VF*S, VF*S, VF*S] */
2921 expr = build_int_cst (TREE_TYPE (step_expr), vf);
2922 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
2926 t = unshare_expr (new_name);
2927 gcc_assert (CONSTANT_CLASS_P (new_name));
2928 stepvectype = get_vectype_for_scalar_type (TREE_TYPE (new_name));
2929 gcc_assert (stepvectype);
2930 vec = build_vector_from_val (stepvectype, t);
2931 vec_step = vect_init_vector (iv_phi, vec, stepvectype, NULL);
2934 /* Create the following def-use cycle:
2939 vec_iv = PHI <vec_init, vec_loop>
2943 vec_loop = vec_iv + vec_step; */
2945 /* Create the induction-phi that defines the induction-operand. */
2946 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
2947 add_referenced_var (vec_dest);
2948 induction_phi = create_phi_node (vec_dest, iv_loop->header);
2949 set_vinfo_for_stmt (induction_phi,
2950 new_stmt_vec_info (induction_phi, loop_vinfo, NULL));
2951 induc_def = PHI_RESULT (induction_phi);
2953 /* Create the iv update inside the loop */
2954 new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, vec_dest,
2955 induc_def, vec_step);
2956 vec_def = make_ssa_name (vec_dest, new_stmt);
2957 gimple_assign_set_lhs (new_stmt, vec_def);
2958 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
2959 set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo,
2962 /* Set the arguments of the phi node: */
2963 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
2964 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
2968 /* In case that vectorization factor (VF) is bigger than the number
2969 of elements that we can fit in a vectype (nunits), we have to generate
2970 more than one vector stmt - i.e - we need to "unroll" the
2971 vector stmt by a factor VF/nunits. For more details see documentation
2972 in vectorizable_operation. */
2976 stmt_vec_info prev_stmt_vinfo;
2977 /* FORNOW. This restriction should be relaxed. */
2978 gcc_assert (!nested_in_vect_loop);
2980 /* Create the vector that holds the step of the induction. */
/* For the extra copies the step per copy is nunits*S, not VF*S.  */
2981 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
2982 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
2984 t = unshare_expr (new_name);
2985 gcc_assert (CONSTANT_CLASS_P (new_name));
2986 vec = build_vector_from_val (stepvectype, t);
2987 vec_step = vect_init_vector (iv_phi, vec, stepvectype, NULL);
2989 vec_def = induc_def;
2990 prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
/* Chain the remaining ncopies-1 vector defs, linking each to its
   predecessor via STMT_VINFO_RELATED_STMT.  */
2991 for (i = 1; i < ncopies; i++)
2993 /* vec_i = vec_prev + vec_step */
2994 new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, vec_dest,
2996 vec_def = make_ssa_name (vec_dest, new_stmt);
2997 gimple_assign_set_lhs (new_stmt, vec_def);
2999 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
/* If the phi's result type differs from vectype, reinterpret the
   value via VIEW_CONVERT_EXPR into resvectype.  */
3000 if (!useless_type_conversion_p (resvectype, vectype))
3002 new_stmt = gimple_build_assign_with_ops
3004 vect_get_new_vect_var (resvectype, vect_simple_var,
3006 build1 (VIEW_CONVERT_EXPR, resvectype,
3007 gimple_assign_lhs (new_stmt)), NULL_TREE);
3008 gimple_assign_set_lhs (new_stmt,
3010 (gimple_assign_lhs (new_stmt), new_stmt));
3011 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3013 set_vinfo_for_stmt (new_stmt,
3014 new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
3015 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
3016 prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
3020 if (nested_in_vect_loop)
3022 /* Find the loop-closed exit-phi of the induction, and record
3023 the final vector of induction results: */
3025 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
3027 if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (USE_STMT (use_p))))
3029 exit_phi = USE_STMT (use_p);
3035 stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
3036 /* FORNOW. Currently not supporting the case that an inner-loop induction
3037 is not used in the outer-loop (i.e. only outside the outer-loop). */
3038 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
3039 && !STMT_VINFO_LIVE_P (stmt_vinfo));
3041 STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
3042 if (vect_print_dump_info (REPORT_DETAILS))
3044 fprintf (vect_dump, "vector of inductions after inner-loop:");
3045 print_gimple_stmt (vect_dump, new_stmt, 0, TDF_SLIM);
3051 if (vect_print_dump_info (REPORT_DETAILS))
3053 fprintf (vect_dump, "transform induction: created def-use cycle: ");
3054 print_gimple_stmt (vect_dump, induction_phi, 0, TDF_SLIM);
3055 fprintf (vect_dump, "\n");
3056 print_gimple_stmt (vect_dump, SSA_NAME_DEF_STMT (vec_def), 0, TDF_SLIM);
3059 STMT_VINFO_VEC_STMT (phi_info) = induction_phi;
/* Same resvectype/vectype mismatch handling as in the unroll loop
   above, this time for the phi's own result.  */
3060 if (!useless_type_conversion_p (resvectype, vectype))
3062 new_stmt = gimple_build_assign_with_ops
3064 vect_get_new_vect_var (resvectype, vect_simple_var, "vec_iv_"),
3065 build1 (VIEW_CONVERT_EXPR, resvectype, induc_def), NULL_TREE);
3066 induc_def = make_ssa_name (gimple_assign_lhs (new_stmt), new_stmt);
3067 gimple_assign_set_lhs (new_stmt, induc_def);
3068 si = gsi_start_bb (bb);
3069 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3070 set_vinfo_for_stmt (new_stmt,
3071 new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
3072 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_stmt))
3073 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (induction_phi));
3080 /* Function get_initial_def_for_reduction
3083 STMT - a stmt that performs a reduction operation in the loop.
3084 INIT_VAL - the initial value of the reduction variable
3087 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
3088 of the reduction (used for adjusting the epilog - see below).
3089 Return a vector variable, initialized according to the operation that STMT
3090 performs. This vector will be used as the initial value of the
3091 vector of partial results.
3093 Option1 (adjust in epilog): Initialize the vector as follows:
3094 add/bit or/xor: [0,0,...,0,0]
3095 mult/bit and: [1,1,...,1,1]
3096 min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
3097 and when necessary (e.g. add/mult case) let the caller know
3098 that it needs to adjust the result by init_val.
3100 Option2: Initialize the vector as follows:
3101 add/bit or/xor: [init_val,0,0,...,0]
3102 mult/bit and: [init_val,1,1,...,1]
3103 min/max/cond_expr: [init_val,init_val,...,init_val]
3104 and no adjustments are needed.
3106 For example, for the following code:
3112 STMT is 's = s + a[i]', and the reduction variable is 's'.
3113 For a vector of 4 units, we want to return either [0,0,0,init_val],
3114 or [0,0,0,0] and let the caller know that it needs to adjust
3115 the result at the end by 'init_val'.
3117 FORNOW, we are using the 'adjust in epilog' scheme, because this way the
3118 initialization vector is simpler (same element in all entries), if
3119 ADJUSTMENT_DEF is not NULL, and Option2 otherwise.
3121 A cost model should help decide between these two schemes. */
/* NOTE(review): non-contiguous embedded line numbers — the return type,
   braces, several switch cases/`break`s and the function's final return
   are missing from this listing.  */
3124 get_initial_def_for_reduction (gimple stmt, tree init_val,
3125 tree *adjustment_def)
3127 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
3128 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
3129 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3130 tree scalar_type = TREE_TYPE (init_val);
3131 tree vectype = get_vectype_for_scalar_type (scalar_type);
3133 enum tree_code code = gimple_assign_rhs_code (stmt);
3138 bool nested_in_vect_loop = false;
3140 REAL_VALUE_TYPE real_init_val = dconst0;
3141 int int_init_val = 0;
3142 gimple def_stmt = NULL;
3144 gcc_assert (vectype);
3145 nunits = TYPE_VECTOR_SUBPARTS (vectype);
3147 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
3148 || SCALAR_FLOAT_TYPE_P (scalar_type));
3150 if (nested_in_vect_loop_p (loop, stmt))
3151 nested_in_vect_loop = true;
3153 gcc_assert (loop == (gimple_bb (stmt))->loop_father);
3155 /* In case of double reduction we only create a vector variable to be put
3156 in the reduction phi node. The actual statement creation is done in
3157 vect_create_epilog_for_reduction. */
3158 if (adjustment_def && nested_in_vect_loop
3159 && TREE_CODE (init_val) == SSA_NAME
3160 && (def_stmt = SSA_NAME_DEF_STMT (init_val))
3161 && gimple_code (def_stmt) == GIMPLE_PHI
3162 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3163 && vinfo_for_stmt (def_stmt)
3164 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
3165 == vect_double_reduction_def)
3167 *adjustment_def = NULL;
3168 return vect_create_destination_var (init_val, vectype);
/* Normalize the initial value: constants are rebuilt in scalar_type
   (float vs. integer), non-constants are used as-is.  */
3171 if (TREE_CONSTANT (init_val))
3173 if (SCALAR_FLOAT_TYPE_P (scalar_type))
3174 init_value = build_real (scalar_type, TREE_REAL_CST (init_val));
3176 init_value = build_int_cst (scalar_type, TREE_INT_CST_LOW (init_val));
3179 init_value = init_val;
3183 case WIDEN_SUM_EXPR:
3191 /* ADJUSTMENT_DEF is NULL when called from
3192 vect_create_epilog_for_reduction to vectorize double reduction. */
3195 if (nested_in_vect_loop)
3196 *adjustment_def = vect_get_vec_def_for_operand (init_val, stmt,
3199 *adjustment_def = init_val;
/* Identity element of the reduction: 1 for MULT_EXPR, all-ones for
   BIT_AND_EXPR, otherwise the 0 the variables were initialized to.  */
3202 if (code == MULT_EXPR)
3204 real_init_val = dconst1;
3208 if (code == BIT_AND_EXPR)
3211 if (SCALAR_FLOAT_TYPE_P (scalar_type))
3212 def_for_init = build_real (scalar_type, real_init_val);
3214 def_for_init = build_int_cst (scalar_type, int_init_val);
3216 /* Create a vector of '0' or '1' except the first element. */
3217 for (i = nunits - 2; i >= 0; --i)
3218 t = tree_cons (NULL_TREE, def_for_init, t);
3220 /* Option1: the first element is '0' or '1' as well. */
3223 t = tree_cons (NULL_TREE, def_for_init, t);
3224 init_def = build_vector (vectype, t);
3228 /* Option2: the first element is INIT_VAL. */
3229 t = tree_cons (NULL_TREE, init_value, t);
/* A constant element list can become a VECTOR_CST; otherwise a
   CONSTRUCTOR is needed.  */
3230 if (TREE_CONSTANT (init_val))
3231 init_def = build_vector (vectype, t);
3233 init_def = build_constructor_from_list (vectype, t);
3242 *adjustment_def = NULL_TREE;
3243 init_def = vect_get_vec_def_for_operand (init_val, stmt, NULL);
/* Splat path (presumably min/max/cond case — the switch labels for it
   fall on dropped lines; confirm against the full source).  */
3247 init_def = build_vector_from_val (vectype, init_value);
3258 /* Function vect_create_epilog_for_reduction
3260 Create code at the loop-epilog to finalize the result of a reduction
3263 VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector
3264 reduction statements.
3265 STMT is the scalar reduction stmt that is being vectorized.
3266 NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
3267 number of elements that we can fit in a vectype (nunits). In this case
3268 we have to generate more than one vector stmt - i.e - we need to "unroll"
3269 the vector stmt by a factor VF/nunits. For more details see documentation
3270 in vectorizable_operation.
3271 REDUC_CODE is the tree-code for the epilog reduction.
3272 REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
3274 REDUC_INDEX is the index of the operand in the right hand side of the
3275 statement that is defined by REDUCTION_PHI.
3276 DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
3277 SLP_NODE is an SLP node containing a group of reduction statements. The
3278 first one in this group is STMT.
3281 1. Creates the reduction def-use cycles: sets the arguments for
3283 The loop-entry argument is the vectorized initial-value of the reduction.
3284 The loop-latch argument is taken from VECT_DEFS - the vector of partial
3286 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
3287 by applying the operation specified by REDUC_CODE if available, or by
3288 other means (whole-vector shifts or a scalar loop).
3289 The function also creates a new phi node at the loop exit to preserve
3290 loop-closed form, as illustrated below.
3292 The flow at the entry to this function:
3295 vec_def = phi <null, null> # REDUCTION_PHI
3296 VECT_DEF = vector_stmt # vectorized form of STMT
3297 s_loop = scalar_stmt # (scalar) STMT
3299 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
3303 The above is transformed by this function into:
3306 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
3307 VECT_DEF = vector_stmt # vectorized form of STMT
3308 s_loop = scalar_stmt # (scalar) STMT
3310 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
3311 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
3312 v_out2 = reduce <v_out1>
3313 s_out3 = extract_field <v_out2, 0>
3314 s_out4 = adjust_result <s_out3>
3320 vect_create_epilog_for_reduction (VEC (tree, heap) *vect_defs, gimple stmt,
3321 int ncopies, enum tree_code reduc_code,
3322 VEC (gimple, heap) *reduction_phis,
3323 int reduc_index, bool double_reduc,
3326 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3327 stmt_vec_info prev_phi_info;
3329 enum machine_mode mode;
3330 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3331 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
3332 basic_block exit_bb;
3335 gimple new_phi = NULL, phi;
3336 gimple_stmt_iterator exit_gsi;
3338 tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
3339 gimple epilog_stmt = NULL;
3340 enum tree_code code = gimple_assign_rhs_code (stmt);
3342 tree bitsize, bitpos;
3343 tree adjustment_def = NULL;
3344 tree vec_initial_def = NULL;
3345 tree reduction_op, expr, def;
3346 tree orig_name, scalar_result;
3347 imm_use_iterator imm_iter, phi_imm_iter;
3348 use_operand_p use_p, phi_use_p;
3349 bool extract_scalar_result = false;
3350 gimple use_stmt, orig_stmt, reduction_phi = NULL;
3351 bool nested_in_vect_loop = false;
3352 VEC (gimple, heap) *new_phis = NULL;
3353 enum vect_def_type dt = vect_unknown_def_type;
3355 VEC (tree, heap) *scalar_results = NULL;
3356 unsigned int group_size = 1, k, ratio;
3357 VEC (tree, heap) *vec_initial_defs = NULL;
3358 VEC (gimple, heap) *phis;
3359 bool slp_reduc = false;
3360 tree new_phi_result;
3363 group_size = VEC_length (gimple, SLP_TREE_SCALAR_STMTS (slp_node));
3365 if (nested_in_vect_loop_p (loop, stmt))
3369 nested_in_vect_loop = true;
3370 gcc_assert (!slp_node);
3373 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
3375 case GIMPLE_SINGLE_RHS:
3376 gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt))
3378 reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), reduc_index);
3380 case GIMPLE_UNARY_RHS:
3381 reduction_op = gimple_assign_rhs1 (stmt);
3383 case GIMPLE_BINARY_RHS:
3384 reduction_op = reduc_index ?
3385 gimple_assign_rhs2 (stmt) : gimple_assign_rhs1 (stmt);
3387 case GIMPLE_TERNARY_RHS:
3388 reduction_op = gimple_op (stmt, reduc_index + 1);
3394 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
3395 gcc_assert (vectype);
3396 mode = TYPE_MODE (vectype);
3398 /* 1. Create the reduction def-use cycle:
3399 Set the arguments of REDUCTION_PHIS, i.e., transform
3402 vec_def = phi <null, null> # REDUCTION_PHI
3403 VECT_DEF = vector_stmt # vectorized form of STMT
3409 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
3410 VECT_DEF = vector_stmt # vectorized form of STMT
3413 (in case of SLP, do it for all the phis). */
3415 /* Get the loop-entry arguments. */
3417 vect_get_slp_defs (reduction_op, NULL_TREE, slp_node, &vec_initial_defs,
3421 vec_initial_defs = VEC_alloc (tree, heap, 1);
3422 /* For the case of reduction, vect_get_vec_def_for_operand returns
3423 the scalar def before the loop, that defines the initial value
3424 of the reduction variable. */
3425 vec_initial_def = vect_get_vec_def_for_operand (reduction_op, stmt,
3427 VEC_quick_push (tree, vec_initial_defs, vec_initial_def);
3430 /* Set phi nodes arguments. */
3431 FOR_EACH_VEC_ELT (gimple, reduction_phis, i, phi)
3433 tree vec_init_def = VEC_index (tree, vec_initial_defs, i);
3434 tree def = VEC_index (tree, vect_defs, i);
3435 for (j = 0; j < ncopies; j++)
3437 /* Set the loop-entry arg of the reduction-phi. */
3438 add_phi_arg (phi, vec_init_def, loop_preheader_edge (loop),
3441 /* Set the loop-latch arg for the reduction-phi. */
3443 def = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type, def);
3445 add_phi_arg (phi, def, loop_latch_edge (loop), UNKNOWN_LOCATION);
3447 if (vect_print_dump_info (REPORT_DETAILS))
3449 fprintf (vect_dump, "transform reduction: created def-use"
3451 print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
3452 fprintf (vect_dump, "\n");
3453 print_gimple_stmt (vect_dump, SSA_NAME_DEF_STMT (def), 0,
3457 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
3461 VEC_free (tree, heap, vec_initial_defs);
3463 /* 2. Create epilog code.
3464 The reduction epilog code operates across the elements of the vector
3465 of partial results computed by the vectorized loop.
3466 The reduction epilog code consists of:
3468 step 1: compute the scalar result in a vector (v_out2)
3469 step 2: extract the scalar result (s_out3) from the vector (v_out2)
3470 step 3: adjust the scalar result (s_out3) if needed.
3472 Step 1 can be accomplished using one of the following three schemes:
3473 (scheme 1) using reduc_code, if available.
3474 (scheme 2) using whole-vector shifts, if available.
3475 (scheme 3) using a scalar loop. In this case steps 1+2 above are
3478 The overall epilog code looks like this:
3480 s_out0 = phi <s_loop> # original EXIT_PHI
3481 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
3482 v_out2 = reduce <v_out1> # step 1
3483 s_out3 = extract_field <v_out2, 0> # step 2
3484 s_out4 = adjust_result <s_out3> # step 3
3486 (step 3 is optional, and steps 1 and 2 may be combined).
3487 Lastly, the uses of s_out0 are replaced by s_out4. */
3490 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
3491 v_out1 = phi <VECT_DEF>
3492 Store them in NEW_PHIS. */
3494 exit_bb = single_exit (loop)->dest;
3495 prev_phi_info = NULL;
3496 new_phis = VEC_alloc (gimple, heap, VEC_length (tree, vect_defs));
3497 FOR_EACH_VEC_ELT (tree, vect_defs, i, def)
3499 for (j = 0; j < ncopies; j++)
3501 phi = create_phi_node (SSA_NAME_VAR (def), exit_bb);
3502 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo, NULL));
3504 VEC_quick_push (gimple, new_phis, phi);
3507 def = vect_get_vec_def_for_stmt_copy (dt, def);
3508 STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
3511 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
3512 prev_phi_info = vinfo_for_stmt (phi);
3516 /* The epilogue is created for the outer-loop, i.e., for the loop being
3521 exit_bb = single_exit (loop)->dest;
3524 exit_gsi = gsi_after_labels (exit_bb);
3526 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
3527 (i.e. when reduc_code is not available) and in the final adjustment
3528 code (if needed). Also get the original scalar reduction variable as
3529 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
3530 represents a reduction pattern), the tree-code and scalar-def are
3531 taken from the original stmt that the pattern-stmt (STMT) replaces.
3532 Otherwise (it is a regular reduction) - the tree-code and scalar-def
3533 are taken from STMT. */
3535 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3538 /* Regular reduction */
3543 /* Reduction pattern */
3544 stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
3545 gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
3546 gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
3549 code = gimple_assign_rhs_code (orig_stmt);
3550 /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
3551 partial results are added and not subtracted. */
3552 if (code == MINUS_EXPR)
3555 scalar_dest = gimple_assign_lhs (orig_stmt);
3556 scalar_type = TREE_TYPE (scalar_dest);
3557 scalar_results = VEC_alloc (tree, heap, group_size);
3558 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
3559 bitsize = TYPE_SIZE (scalar_type);
3561 /* In case this is a reduction in an inner-loop while vectorizing an outer
3562 loop - we don't need to extract a single scalar result at the end of the
3563 inner-loop (unless it is double reduction, i.e., the use of reduction is
3564 outside the outer-loop). The final vector of partial results will be used
3565 in the vectorized outer-loop, or reduced to a scalar result at the end of
3567 if (nested_in_vect_loop && !double_reduc)
3568 goto vect_finalize_reduction;
3570 /* SLP reduction without reduction chain, e.g.,
3574 b2 = operation (b1) */
3575 slp_reduc = (slp_node && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
3577 /* In case of reduction chain, e.g.,
3580 a3 = operation (a2),
3582 we may end up with more than one vector result. Here we reduce them to
3584 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
3586 tree first_vect = PHI_RESULT (VEC_index (gimple, new_phis, 0));
3589 vec_dest = vect_create_destination_var (scalar_dest, vectype);
3590 for (k = 1; k < VEC_length (gimple, new_phis); k++)
3592 gimple next_phi = VEC_index (gimple, new_phis, k);
3593 tree second_vect = PHI_RESULT (next_phi);
3594 gimple new_vec_stmt;
3596 tmp = build2 (code, vectype, first_vect, second_vect);
3597 new_vec_stmt = gimple_build_assign (vec_dest, tmp);
3598 first_vect = make_ssa_name (vec_dest, new_vec_stmt);
3599 gimple_assign_set_lhs (new_vec_stmt, first_vect);
3600 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
3603 new_phi_result = first_vect;
3606 new_phi_result = PHI_RESULT (VEC_index (gimple, new_phis, 0));
3608 /* 2.3 Create the reduction code, using one of the three schemes described
3609 above. In SLP we simply need to extract all the elements from the
3610 vector (without reducing them), so we use scalar shifts. */
3611 if (reduc_code != ERROR_MARK && !slp_reduc)
3615 /*** Case 1: Create:
3616 v_out2 = reduc_expr <v_out1> */
3618 if (vect_print_dump_info (REPORT_DETAILS))
3619 fprintf (vect_dump, "Reduce using direct vector reduction.");
3621 vec_dest = vect_create_destination_var (scalar_dest, vectype);
3622 tmp = build1 (reduc_code, vectype, new_phi_result);
3623 epilog_stmt = gimple_build_assign (vec_dest, tmp);
3624 new_temp = make_ssa_name (vec_dest, epilog_stmt);
3625 gimple_assign_set_lhs (epilog_stmt, new_temp);
3626 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3628 extract_scalar_result = true;
3632 enum tree_code shift_code = ERROR_MARK;
3633 bool have_whole_vector_shift = true;
3635 int element_bitsize = tree_low_cst (bitsize, 1);
3636 int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
3639 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3640 shift_code = VEC_RSHIFT_EXPR;
3642 have_whole_vector_shift = false;
3644 /* Regardless of whether we have a whole vector shift, if we're
3645 emulating the operation via tree-vect-generic, we don't want
3646 to use it. Only the first round of the reduction is likely
3647 to still be profitable via emulation. */
3648 /* ??? It might be better to emit a reduction tree code here, so that
3649 tree-vect-generic can expand the first round via bit tricks. */
3650 if (!VECTOR_MODE_P (mode))
3651 have_whole_vector_shift = false;
3654 optab optab = optab_for_tree_code (code, vectype, optab_default);
3655 if (optab_handler (optab, mode) == CODE_FOR_nothing)
3656 have_whole_vector_shift = false;
3659 if (have_whole_vector_shift && !slp_reduc)
3661 /*** Case 2: Create:
3662 for (offset = VS/2; offset >= element_size; offset/=2)
3664 Create: va' = vec_shift <va, offset>
3665 Create: va = vop <va, va'>
3668 if (vect_print_dump_info (REPORT_DETAILS))
3669 fprintf (vect_dump, "Reduce using vector shifts");
3671 vec_dest = vect_create_destination_var (scalar_dest, vectype);
3672 new_temp = new_phi_result;
3673 for (bit_offset = vec_size_in_bits/2;
3674 bit_offset >= element_bitsize;
3677 tree bitpos = size_int (bit_offset);
3679 epilog_stmt = gimple_build_assign_with_ops (shift_code,
3680 vec_dest, new_temp, bitpos);
3681 new_name = make_ssa_name (vec_dest, epilog_stmt);
3682 gimple_assign_set_lhs (epilog_stmt, new_name);
3683 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3685 epilog_stmt = gimple_build_assign_with_ops (code, vec_dest,
3686 new_name, new_temp);
3687 new_temp = make_ssa_name (vec_dest, epilog_stmt);
3688 gimple_assign_set_lhs (epilog_stmt, new_temp);
3689 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3692 extract_scalar_result = true;
3698 /*** Case 3: Create:
3699 s = extract_field <v_out2, 0>
3700 for (offset = element_size;
3701 offset < vector_size;
3702 offset += element_size;)
3704 Create: s' = extract_field <v_out2, offset>
3705 Create: s = op <s, s'> // For non SLP cases
3708 if (vect_print_dump_info (REPORT_DETAILS))
3709 fprintf (vect_dump, "Reduce using scalar code. ");
3711 vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
3712 FOR_EACH_VEC_ELT (gimple, new_phis, i, new_phi)
3714 vec_temp = PHI_RESULT (new_phi);
3715 rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
3717 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
3718 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
3719 gimple_assign_set_lhs (epilog_stmt, new_temp);
3720 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3722 /* In SLP we don't need to apply reduction operation, so we just
3723 collect s' values in SCALAR_RESULTS. */
3725 VEC_safe_push (tree, heap, scalar_results, new_temp);
3727 for (bit_offset = element_bitsize;
3728 bit_offset < vec_size_in_bits;
3729 bit_offset += element_bitsize)
3731 tree bitpos = bitsize_int (bit_offset);
3732 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
3735 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
3736 new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
3737 gimple_assign_set_lhs (epilog_stmt, new_name);
3738 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3742 /* In SLP we don't need to apply reduction operation, so
3743 we just collect s' values in SCALAR_RESULTS. */
3744 new_temp = new_name;
3745 VEC_safe_push (tree, heap, scalar_results, new_name);
3749 epilog_stmt = gimple_build_assign_with_ops (code,
3750 new_scalar_dest, new_name, new_temp);
3751 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
3752 gimple_assign_set_lhs (epilog_stmt, new_temp);
3753 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3758 /* The only case where we need to reduce scalar results in SLP, is
3759 unrolling. If the size of SCALAR_RESULTS is greater than
3760 GROUP_SIZE, we reduce them combining elements modulo
3764 tree res, first_res, new_res;
3767 /* Reduce multiple scalar results in case of SLP unrolling. */
3768 for (j = group_size; VEC_iterate (tree, scalar_results, j, res);
3771 first_res = VEC_index (tree, scalar_results, j % group_size);
3772 new_stmt = gimple_build_assign_with_ops (code,
3773 new_scalar_dest, first_res, res);
3774 new_res = make_ssa_name (new_scalar_dest, new_stmt);
3775 gimple_assign_set_lhs (new_stmt, new_res);
3776 gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
3777 VEC_replace (tree, scalar_results, j % group_size, new_res);
3781 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
3782 VEC_safe_push (tree, heap, scalar_results, new_temp);
3784 extract_scalar_result = false;
3788 /* 2.4 Extract the final scalar result. Create:
3789 s_out3 = extract_field <v_out2, bitpos> */
3791 if (extract_scalar_result)
3795 if (vect_print_dump_info (REPORT_DETAILS))
3796 fprintf (vect_dump, "extract scalar result");
3798 if (BYTES_BIG_ENDIAN)
3799 bitpos = size_binop (MULT_EXPR,
3800 bitsize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1),
3801 TYPE_SIZE (scalar_type));
3803 bitpos = bitsize_zero_node;
3805 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp, bitsize, bitpos);
3806 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
3807 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
3808 gimple_assign_set_lhs (epilog_stmt, new_temp);
3809 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3810 VEC_safe_push (tree, heap, scalar_results, new_temp);
3813 vect_finalize_reduction:
3818 /* 2.5 Adjust the final result by the initial value of the reduction
3819 variable. (When such adjustment is not needed, then
3820 'adjustment_def' is zero). For example, if code is PLUS we create:
3821 new_temp = loop_exit_def + adjustment_def */
3825 gcc_assert (!slp_reduc);
3826 if (nested_in_vect_loop)
3828 new_phi = VEC_index (gimple, new_phis, 0);
3829 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
3830 expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
3831 new_dest = vect_create_destination_var (scalar_dest, vectype);
3835 new_temp = VEC_index (tree, scalar_results, 0);
3836 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
3837 expr = build2 (code, scalar_type, new_temp, adjustment_def);
3838 new_dest = vect_create_destination_var (scalar_dest, scalar_type);
3841 epilog_stmt = gimple_build_assign (new_dest, expr);
3842 new_temp = make_ssa_name (new_dest, epilog_stmt);
3843 gimple_assign_set_lhs (epilog_stmt, new_temp);
3844 SSA_NAME_DEF_STMT (new_temp) = epilog_stmt;
3845 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3846 if (nested_in_vect_loop)
3848 set_vinfo_for_stmt (epilog_stmt,
3849 new_stmt_vec_info (epilog_stmt, loop_vinfo,
3851 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
3852 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
3855 VEC_quick_push (tree, scalar_results, new_temp);
3857 VEC_replace (tree, scalar_results, 0, new_temp);
3860 VEC_replace (tree, scalar_results, 0, new_temp);
3862 VEC_replace (gimple, new_phis, 0, epilog_stmt);
3865 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
3866 phis with new adjusted scalar results, i.e., replace use <s_out0>
3871 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
3872 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
3873 v_out2 = reduce <v_out1>
3874 s_out3 = extract_field <v_out2, 0>
3875 s_out4 = adjust_result <s_out3>
3882 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
3883 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
3884 v_out2 = reduce <v_out1>
3885 s_out3 = extract_field <v_out2, 0>
3886 s_out4 = adjust_result <s_out3>
3891 /* In SLP reduction chain we reduce vector results into one vector if
3892 necessary, hence we set here GROUP_SIZE to 1. SCALAR_DEST is the LHS of
3893 the last stmt in the reduction chain, since we are looking for the loop
3895 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
3897 scalar_dest = gimple_assign_lhs (VEC_index (gimple,
3898 SLP_TREE_SCALAR_STMTS (slp_node),
3903 /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
3904 case that GROUP_SIZE is greater than vectorization factor). Therefore, we
3905 need to match SCALAR_RESULTS with corresponding statements. The first
3906 (GROUP_SIZE / number of new vector stmts) scalar results correspond to
3907 the first vector stmt, etc.
3908 (RATIO is equal to (GROUP_SIZE / number of new vector stmts)). */
3909 if (group_size > VEC_length (gimple, new_phis))
3911 ratio = group_size / VEC_length (gimple, new_phis);
3912 gcc_assert (!(group_size % VEC_length (gimple, new_phis)));
3917 for (k = 0; k < group_size; k++)
3921 epilog_stmt = VEC_index (gimple, new_phis, k / ratio);
3922 reduction_phi = VEC_index (gimple, reduction_phis, k / ratio);
3927 gimple current_stmt = VEC_index (gimple,
3928 SLP_TREE_SCALAR_STMTS (slp_node), k);
3930 orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (current_stmt));
3931 /* SLP statements can't participate in patterns. */
3932 gcc_assert (!orig_stmt);
3933 scalar_dest = gimple_assign_lhs (current_stmt);
3936 phis = VEC_alloc (gimple, heap, 3);
3937 /* Find the loop-closed-use at the loop exit of the original scalar
3938 result. (The reduction result is expected to have two immediate uses -
3939 one at the latch block, and one at the loop exit). */
3940 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
3941 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
3942 VEC_safe_push (gimple, heap, phis, USE_STMT (use_p));
3944 /* We expect to have found an exit_phi because of loop-closed-ssa
3946 gcc_assert (!VEC_empty (gimple, phis));
3948 FOR_EACH_VEC_ELT (gimple, phis, i, exit_phi)
3952 stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
3955 /* FORNOW. Currently not supporting the case that an inner-loop
3956 reduction is not used in the outer-loop (but only outside the
3957 outer-loop), unless it is double reduction. */
3958 gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
3959 && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
3962 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt;
3964 || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
3965 != vect_double_reduction_def)
3968 /* Handle double reduction:
3970 stmt1: s1 = phi <s0, s2> - double reduction phi (outer loop)
3971 stmt2: s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
3972 stmt3: s4 = use (s3) - (regular) reduc stmt (inner loop)
3973 stmt4: s2 = phi <s4> - double reduction stmt (outer loop)
3975 At that point the regular reduction (stmt2 and stmt3) is
3976 already vectorized, as well as the exit phi node, stmt4.
3977 Here we vectorize the phi node of double reduction, stmt1, and
3978 update all relevant statements. */
3980 /* Go through all the uses of s2 to find double reduction phi
3981 node, i.e., stmt1 above. */
3982 orig_name = PHI_RESULT (exit_phi);
3983 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
3985 stmt_vec_info use_stmt_vinfo = vinfo_for_stmt (use_stmt);
3986 stmt_vec_info new_phi_vinfo;
3987 tree vect_phi_init, preheader_arg, vect_phi_res, init_def;
3988 basic_block bb = gimple_bb (use_stmt);
3991 /* Check that USE_STMT is really double reduction phi
3993 if (gimple_code (use_stmt) != GIMPLE_PHI
3994 || gimple_phi_num_args (use_stmt) != 2
3996 || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
3997 != vect_double_reduction_def
3998 || bb->loop_father != outer_loop)
4001 /* Create vector phi node for double reduction:
4002 vs1 = phi <vs0, vs2>
4003 vs1 was created previously in this function by a call to
4004 vect_get_vec_def_for_operand and is stored in
4006 vs2 is defined by EPILOG_STMT, the vectorized EXIT_PHI;
4007 vs0 is created here. */
4009 /* Create vector phi node. */
4010 vect_phi = create_phi_node (vec_initial_def, bb);
4011 new_phi_vinfo = new_stmt_vec_info (vect_phi,
4012 loop_vec_info_for_loop (outer_loop), NULL);
4013 set_vinfo_for_stmt (vect_phi, new_phi_vinfo);
4015 /* Create vs0 - initial def of the double reduction phi. */
4016 preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
4017 loop_preheader_edge (outer_loop));
4018 init_def = get_initial_def_for_reduction (stmt,
4019 preheader_arg, NULL);
4020 vect_phi_init = vect_init_vector (use_stmt, init_def,
4023 /* Update phi node arguments with vs0 and vs2. */
4024 add_phi_arg (vect_phi, vect_phi_init,
4025 loop_preheader_edge (outer_loop),
4027 add_phi_arg (vect_phi, PHI_RESULT (epilog_stmt),
4028 loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
4029 if (vect_print_dump_info (REPORT_DETAILS))
4031 fprintf (vect_dump, "created double reduction phi "
4033 print_gimple_stmt (vect_dump, vect_phi, 0, TDF_SLIM);
4036 vect_phi_res = PHI_RESULT (vect_phi);
4038 /* Replace the use, i.e., set the correct vs1 in the regular
4039 reduction phi node. FORNOW, NCOPIES is always 1, so the
4040 loop is redundant. */
4041 use = reduction_phi;
4042 for (j = 0; j < ncopies; j++)
4044 edge pr_edge = loop_preheader_edge (loop);
4045 SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res);
4046 use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
4052 VEC_free (gimple, heap, phis);
4053 if (nested_in_vect_loop)
4061 phis = VEC_alloc (gimple, heap, 3);
4062 /* Find the loop-closed-use at the loop exit of the original scalar
4063 result. (The reduction result is expected to have two immediate uses,
4064 one at the latch block, and one at the loop exit). For double
4065 reductions we are looking for exit phis of the outer loop. */
4066 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
4068 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
4069 VEC_safe_push (gimple, heap, phis, USE_STMT (use_p));
4072 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
4074 tree phi_res = PHI_RESULT (USE_STMT (use_p));
4076 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
4078 if (!flow_bb_inside_loop_p (loop,
4079 gimple_bb (USE_STMT (phi_use_p))))
4080 VEC_safe_push (gimple, heap, phis,
4081 USE_STMT (phi_use_p));
4087 FOR_EACH_VEC_ELT (gimple, phis, i, exit_phi)
4089 /* Replace the uses: */
4090 orig_name = PHI_RESULT (exit_phi);
4091 scalar_result = VEC_index (tree, scalar_results, k);
4092 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
4093 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
4094 SET_USE (use_p, scalar_result);
4097 VEC_free (gimple, heap, phis);
4100 VEC_free (tree, heap, scalar_results);
4101 VEC_free (gimple, heap, new_phis);
4105 /* Function vectorizable_reduction.
4107 Check if STMT performs a reduction operation that can be vectorized.
4108 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
4109 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
4110 Return FALSE if not a vectorizable STMT, TRUE otherwise.
4112 This function also handles reduction idioms (patterns) that have been
4113 recognized in advance during vect_pattern_recog. In this case, STMT may be
4115 X = pattern_expr (arg0, arg1, ..., X)
4116 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
4117 sequence that had been detected and replaced by the pattern-stmt (STMT).
4119 In some cases of reduction patterns, the type of the reduction variable X is
4120 different than the type of the other arguments of STMT.
4121 In such cases, the vectype that is used when transforming STMT into a vector
4122 stmt is different than the vectype that is used to determine the
4123 vectorization factor, because it consists of a different number of elements
4124 than the actual number of elements that are being operated upon in parallel.
4126 For example, consider an accumulation of shorts into an int accumulator.
4127 On some targets it's possible to vectorize this pattern operating on 8
4128 shorts at a time (hence, the vectype for purposes of determining the
4129 vectorization factor should be V8HI); on the other hand, the vectype that
4130 is used to create the vector form is actually V4SI (the type of the result).
4132 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
4133 indicates what is the actual level of parallelism (V8HI in the example), so
4134 that the right vectorization factor would be derived. This vectype
4135 corresponds to the type of arguments to the reduction stmt, and should *NOT*
4136 be used to create the vectorized stmt. The right vectype for the vectorized
4137 stmt is obtained from the type of the result X:
4138 get_vectype_for_scalar_type (TREE_TYPE (X))
4140 This means that, contrary to "regular" reductions (or "regular" stmts in
4141 general), the following equation:
4142 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
4143 does *NOT* necessarily hold for reduction patterns. */
4146 vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
4147 gimple *vec_stmt, slp_tree slp_node)
4151 tree loop_vec_def0 = NULL_TREE, loop_vec_def1 = NULL_TREE;
4152 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4153 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
4154 tree vectype_in = NULL_TREE;
4155 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4156 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4157 enum tree_code code, orig_code, epilog_reduc_code;
4158 enum machine_mode vec_mode;
4160 optab optab, reduc_optab;
4161 tree new_temp = NULL_TREE;
4164 enum vect_def_type dt;
4165 gimple new_phi = NULL;
4169 stmt_vec_info orig_stmt_info;
4170 tree expr = NULL_TREE;
4174 stmt_vec_info prev_stmt_info, prev_phi_info;
4175 bool single_defuse_cycle = false;
4176 tree reduc_def = NULL_TREE;
4177 gimple new_stmt = NULL;
4180 bool nested_cycle = false, found_nested_cycle_def = false;
4181 gimple reduc_def_stmt = NULL;
4182 /* The default is that the reduction variable is the last in statement. */
4183 int reduc_index = 2;
4184 bool double_reduc = false, dummy;
4186 struct loop * def_stmt_loop, *outer_loop = NULL;
4188 gimple def_arg_stmt;
4189 VEC (tree, heap) *vec_oprnds0 = NULL, *vec_oprnds1 = NULL, *vect_defs = NULL;
4190 VEC (gimple, heap) *phis = NULL;
4192 tree def0, def1, tem;
4194 /* In case of reduction chain we switch to the first stmt in the chain, but
4195 we don't update STMT_INFO, since only the last stmt is marked as reduction
4196 and has reduction properties. */
4197 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4198 stmt = GROUP_FIRST_ELEMENT (stmt_info);
4200 if (nested_in_vect_loop_p (loop, stmt))
4204 nested_cycle = true;
4207 /* 1. Is vectorizable reduction? */
4208 /* Not supportable if the reduction variable is used in the loop, unless
4209 it's a reduction chain. */
4210 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
4211 && !GROUP_FIRST_ELEMENT (stmt_info))
4214 /* Reductions that are not used even in an enclosing outer-loop,
4215 are expected to be "live" (used out of the loop). */
4216 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
4217 && !STMT_VINFO_LIVE_P (stmt_info))
4220 /* Make sure it was already recognized as a reduction computation. */
4221 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
4222 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
4225 /* 2. Has this been recognized as a reduction pattern?
4227 Check if STMT represents a pattern that has been recognized
4228 in earlier analysis stages. For stmts that represent a pattern,
4229 the STMT_VINFO_RELATED_STMT field records the last stmt in
4230 the original sequence that constitutes the pattern. */
4232 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
4235 orig_stmt_info = vinfo_for_stmt (orig_stmt);
4236 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt);
4237 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
4238 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
4241 /* 3. Check the operands of the operation. The first operands are defined
4242 inside the loop body. The last operand is the reduction variable,
4243 which is defined by the loop-header-phi. */
4245 gcc_assert (is_gimple_assign (stmt));
4248 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
4250 case GIMPLE_SINGLE_RHS:
4251 op_type = TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt));
4252 if (op_type == ternary_op)
4254 tree rhs = gimple_assign_rhs1 (stmt);
4255 ops[0] = TREE_OPERAND (rhs, 0);
4256 ops[1] = TREE_OPERAND (rhs, 1);
4257 ops[2] = TREE_OPERAND (rhs, 2);
4258 code = TREE_CODE (rhs);
4264 case GIMPLE_BINARY_RHS:
4265 code = gimple_assign_rhs_code (stmt);
4266 op_type = TREE_CODE_LENGTH (code);
4267 gcc_assert (op_type == binary_op);
4268 ops[0] = gimple_assign_rhs1 (stmt);
4269 ops[1] = gimple_assign_rhs2 (stmt);
4272 case GIMPLE_TERNARY_RHS:
4273 code = gimple_assign_rhs_code (stmt);
4274 op_type = TREE_CODE_LENGTH (code);
4275 gcc_assert (op_type == ternary_op);
4276 ops[0] = gimple_assign_rhs1 (stmt);
4277 ops[1] = gimple_assign_rhs2 (stmt);
4278 ops[2] = gimple_assign_rhs3 (stmt);
4281 case GIMPLE_UNARY_RHS:
4288 scalar_dest = gimple_assign_lhs (stmt);
4289 scalar_type = TREE_TYPE (scalar_dest);
4290 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
4291 && !SCALAR_FLOAT_TYPE_P (scalar_type))
4294 /* All uses but the last are expected to be defined in the loop.
4295 The last use is the reduction variable. In case of nested cycle this
4296 assumption is not true: we use reduc_index to record the index of the
4297 reduction variable. */
4298 for (i = 0; i < op_type-1; i++)
4300 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
4301 if (i == 0 && code == COND_EXPR)
4304 is_simple_use = vect_is_simple_use_1 (ops[i], loop_vinfo, NULL,
4305 &def_stmt, &def, &dt, &tem);
4308 gcc_assert (is_simple_use);
4310 if (dt != vect_internal_def
4311 && dt != vect_external_def
4312 && dt != vect_constant_def
4313 && dt != vect_induction_def
4314 && !(dt == vect_nested_cycle && nested_cycle))
4317 if (dt == vect_nested_cycle)
4319 found_nested_cycle_def = true;
4320 reduc_def_stmt = def_stmt;
4325 is_simple_use = vect_is_simple_use_1 (ops[i], loop_vinfo, NULL, &def_stmt,
4329 gcc_assert (is_simple_use);
4330 gcc_assert (dt == vect_reduction_def
4331 || dt == vect_nested_cycle
4332 || ((dt == vect_internal_def || dt == vect_external_def
4333 || dt == vect_constant_def || dt == vect_induction_def)
4334 && nested_cycle && found_nested_cycle_def));
4335 if (!found_nested_cycle_def)
4336 reduc_def_stmt = def_stmt;
4338 gcc_assert (gimple_code (reduc_def_stmt) == GIMPLE_PHI);
4340 gcc_assert (orig_stmt == vect_is_simple_reduction (loop_vinfo,
4346 gimple tmp = vect_is_simple_reduction (loop_vinfo, reduc_def_stmt,
4347 !nested_cycle, &dummy);
4348 /* We changed STMT to be the first stmt in reduction chain, hence we
4349 check that in this case the first element in the chain is STMT. */
4350 gcc_assert (stmt == tmp
4351 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == stmt);
4354 if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt)))
4357 if (slp_node || PURE_SLP_STMT (stmt_info))
4360 ncopies = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4361 / TYPE_VECTOR_SUBPARTS (vectype_in));
4363 gcc_assert (ncopies >= 1);
4365 vec_mode = TYPE_MODE (vectype_in);
4367 if (code == COND_EXPR)
4369 if (!vectorizable_condition (stmt, gsi, NULL, ops[reduc_index], 0))
4371 if (vect_print_dump_info (REPORT_DETAILS))
4372 fprintf (vect_dump, "unsupported condition in reduction");
4379 /* 4. Supportable by target? */
4381 /* 4.1. check support for the operation in the loop */
4382 optab = optab_for_tree_code (code, vectype_in, optab_default);
4385 if (vect_print_dump_info (REPORT_DETAILS))
4386 fprintf (vect_dump, "no optab.");
4391 if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
4393 if (vect_print_dump_info (REPORT_DETAILS))
4394 fprintf (vect_dump, "op not supported by target.");
4396 if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
4397 || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4398 < vect_min_worthwhile_factor (code))
4401 if (vect_print_dump_info (REPORT_DETAILS))
4402 fprintf (vect_dump, "proceeding using word mode.");
4405 /* Worthwhile without SIMD support? */
4406 if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
4407 && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4408 < vect_min_worthwhile_factor (code))
4410 if (vect_print_dump_info (REPORT_DETAILS))
4411 fprintf (vect_dump, "not worthwhile without SIMD support.");
4417 /* 4.2. Check support for the epilog operation.
4419 If STMT represents a reduction pattern, then the type of the
4420 reduction variable may be different than the type of the rest
4421 of the arguments. For example, consider the case of accumulation
4422 of shorts into an int accumulator; The original code:
4423 S1: int_a = (int) short_a;
4424 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
4427 STMT: int_acc = widen_sum <short_a, int_acc>
4430 1. The tree-code that is used to create the vector operation in the
4431 epilog code (that reduces the partial results) is not the
4432 tree-code of STMT, but is rather the tree-code of the original
4433 stmt from the pattern that STMT is replacing. I.e, in the example
4434 above we want to use 'widen_sum' in the loop, but 'plus' in the
4436 2. The type (mode) we use to check available target support
4437 for the vector operation to be created in the *epilog*, is
4438 determined by the type of the reduction variable (in the example
4439 above we'd check this: optab_handler (plus_optab, vect_int_mode])).
4440 However the type (mode) we use to check available target support
4441 for the vector operation to be created *inside the loop*, is
4442 determined by the type of the other arguments to STMT (in the
4443 example we'd check this: optab_handler (widen_sum_optab,
4446 This is contrary to "regular" reductions, in which the types of all
4447 the arguments are the same as the type of the reduction variable.
4448 For "regular" reductions we can therefore use the same vector type
4449 (and also the same tree-code) when generating the epilog code and
4450 when generating the code inside the loop. */
4454 /* This is a reduction pattern: get the vectype from the type of the
4455 reduction variable, and get the tree-code from orig_stmt. */
4456 orig_code = gimple_assign_rhs_code (orig_stmt);
4457 gcc_assert (vectype_out);
4458 vec_mode = TYPE_MODE (vectype_out);
4462 /* Regular reduction: use the same vectype and tree-code as used for
4463 the vector code inside the loop can be used for the epilog code. */
4469 def_bb = gimple_bb (reduc_def_stmt);
4470 def_stmt_loop = def_bb->loop_father;
4471 def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
4472 loop_preheader_edge (def_stmt_loop));
4473 if (TREE_CODE (def_arg) == SSA_NAME
4474 && (def_arg_stmt = SSA_NAME_DEF_STMT (def_arg))
4475 && gimple_code (def_arg_stmt) == GIMPLE_PHI
4476 && flow_bb_inside_loop_p (outer_loop, gimple_bb (def_arg_stmt))
4477 && vinfo_for_stmt (def_arg_stmt)
4478 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_arg_stmt))
4479 == vect_double_reduction_def)
4480 double_reduc = true;
4483 epilog_reduc_code = ERROR_MARK;
4484 if (reduction_code_for_scalar_code (orig_code, &epilog_reduc_code))
4486 reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype_out,
4490 if (vect_print_dump_info (REPORT_DETAILS))
4491 fprintf (vect_dump, "no optab for reduction.");
4493 epilog_reduc_code = ERROR_MARK;
4497 && optab_handler (reduc_optab, vec_mode) == CODE_FOR_nothing)
4499 if (vect_print_dump_info (REPORT_DETAILS))
4500 fprintf (vect_dump, "reduc op not supported by target.");
4502 epilog_reduc_code = ERROR_MARK;
4507 if (!nested_cycle || double_reduc)
4509 if (vect_print_dump_info (REPORT_DETAILS))
4510 fprintf (vect_dump, "no reduc code for scalar code.");
4516 if (double_reduc && ncopies > 1)
4518 if (vect_print_dump_info (REPORT_DETAILS))
4519 fprintf (vect_dump, "multiple types in double reduction");
4524 if (!vec_stmt) /* transformation not required. */
4526 if (!vect_model_reduction_cost (stmt_info, epilog_reduc_code, ncopies))
4528 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
4534 if (vect_print_dump_info (REPORT_DETAILS))
4535 fprintf (vect_dump, "transform reduction.");
4537 /* FORNOW: Multiple types are not supported for condition. */
4538 if (code == COND_EXPR)
4539 gcc_assert (ncopies == 1);
4541 /* Create the destination vector */
4542 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
4544 /* In case the vectorization factor (VF) is bigger than the number
4545 of elements that we can fit in a vectype (nunits), we have to generate
4546 more than one vector stmt - i.e - we need to "unroll" the
4547 vector stmt by a factor VF/nunits. For more details see documentation
4548 in vectorizable_operation. */
4550 /* If the reduction is used in an outer loop we need to generate
4551 VF intermediate results, like so (e.g. for ncopies=2):
4556 (i.e. we generate VF results in 2 registers).
4557 In this case we have a separate def-use cycle for each copy, and therefore
4558 for each copy we get the vector def for the reduction variable from the
4559 respective phi node created for this copy.
4561 Otherwise (the reduction is unused in the loop nest), we can combine
4562 together intermediate results, like so (e.g. for ncopies=2):
4566 (i.e. we generate VF/2 results in a single register).
4567 In this case for each copy we get the vector def for the reduction variable
4568 from the vectorized reduction operation generated in the previous iteration.
4571 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope)
4573 single_defuse_cycle = true;
4577 epilog_copies = ncopies;
4579 prev_stmt_info = NULL;
4580 prev_phi_info = NULL;
4583 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
4584 gcc_assert (TYPE_VECTOR_SUBPARTS (vectype_out)
4585 == TYPE_VECTOR_SUBPARTS (vectype_in));
4590 vec_oprnds0 = VEC_alloc (tree, heap, 1);
4591 if (op_type == ternary_op)
4592 vec_oprnds1 = VEC_alloc (tree, heap, 1);
4595 phis = VEC_alloc (gimple, heap, vec_num);
4596 vect_defs = VEC_alloc (tree, heap, vec_num);
4598 VEC_quick_push (tree, vect_defs, NULL_TREE);
4600 for (j = 0; j < ncopies; j++)
4602 if (j == 0 || !single_defuse_cycle)
4604 for (i = 0; i < vec_num; i++)
4606 /* Create the reduction-phi that defines the reduction
4608 new_phi = create_phi_node (vec_dest, loop->header);
4609 set_vinfo_for_stmt (new_phi,
4610 new_stmt_vec_info (new_phi, loop_vinfo,
4612 if (j == 0 || slp_node)
4613 VEC_quick_push (gimple, phis, new_phi);
4617 if (code == COND_EXPR)
4619 gcc_assert (!slp_node);
4620 vectorizable_condition (stmt, gsi, vec_stmt,
4621 PHI_RESULT (VEC_index (gimple, phis, 0)),
4623 /* Multiple types are not supported for condition. */
4630 tree op0, op1 = NULL_TREE;
4632 op0 = ops[!reduc_index];
4633 if (op_type == ternary_op)
4635 if (reduc_index == 0)
4642 vect_get_slp_defs (op0, op1, slp_node, &vec_oprnds0, &vec_oprnds1,
4646 loop_vec_def0 = vect_get_vec_def_for_operand (ops[!reduc_index],
4648 VEC_quick_push (tree, vec_oprnds0, loop_vec_def0);
4649 if (op_type == ternary_op)
4651 loop_vec_def1 = vect_get_vec_def_for_operand (op1, stmt,
4653 VEC_quick_push (tree, vec_oprnds1, loop_vec_def1);
4661 enum vect_def_type dt = vect_unknown_def_type; /* Dummy */
4662 loop_vec_def0 = vect_get_vec_def_for_stmt_copy (dt, loop_vec_def0);
4663 VEC_replace (tree, vec_oprnds0, 0, loop_vec_def0);
4664 if (op_type == ternary_op)
4666 loop_vec_def1 = vect_get_vec_def_for_stmt_copy (dt,
4668 VEC_replace (tree, vec_oprnds1, 0, loop_vec_def1);
4672 if (single_defuse_cycle)
4673 reduc_def = gimple_assign_lhs (new_stmt);
4675 STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
4678 FOR_EACH_VEC_ELT (tree, vec_oprnds0, i, def0)
4681 reduc_def = PHI_RESULT (VEC_index (gimple, phis, i));
4684 if (!single_defuse_cycle || j == 0)
4685 reduc_def = PHI_RESULT (new_phi);
4688 def1 = ((op_type == ternary_op)
4689 ? VEC_index (tree, vec_oprnds1, i) : NULL);
4690 if (op_type == binary_op)
4692 if (reduc_index == 0)
4693 expr = build2 (code, vectype_out, reduc_def, def0);
4695 expr = build2 (code, vectype_out, def0, reduc_def);
4699 if (reduc_index == 0)
4700 expr = build3 (code, vectype_out, reduc_def, def0, def1);
4703 if (reduc_index == 1)
4704 expr = build3 (code, vectype_out, def0, reduc_def, def1);
4706 expr = build3 (code, vectype_out, def0, def1, reduc_def);
4710 new_stmt = gimple_build_assign (vec_dest, expr);
4711 new_temp = make_ssa_name (vec_dest, new_stmt);
4712 gimple_assign_set_lhs (new_stmt, new_temp);
4713 vect_finish_stmt_generation (stmt, new_stmt, gsi);
4717 VEC_quick_push (gimple, SLP_TREE_VEC_STMTS (slp_node), new_stmt);
4718 VEC_quick_push (tree, vect_defs, new_temp);
4721 VEC_replace (tree, vect_defs, 0, new_temp);
4728 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
4730 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
4732 prev_stmt_info = vinfo_for_stmt (new_stmt);
4733 prev_phi_info = vinfo_for_stmt (new_phi);
4736 /* Finalize the reduction-phi (set its arguments) and create the
4737 epilog reduction code. */
4738 if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
4740 new_temp = gimple_assign_lhs (*vec_stmt);
4741 VEC_replace (tree, vect_defs, 0, new_temp);
4744 vect_create_epilog_for_reduction (vect_defs, stmt, epilog_copies,
4745 epilog_reduc_code, phis, reduc_index,
4746 double_reduc, slp_node);
4748 VEC_free (gimple, heap, phis);
4749 VEC_free (tree, heap, vec_oprnds0);
4751 VEC_free (tree, heap, vec_oprnds1);
4756 /* Function vect_min_worthwhile_factor.
4758 For a loop where we could vectorize the operation indicated by CODE,
4759 return the minimum vectorization factor that makes it worthwhile
4760 to use generic vectors. */
/* NOTE(review): the function body is elided in this extract.  The callers
   in vectorizable_reduction above compare LOOP_VINFO_VECT_FACTOR against
   this function's result, so it is expected to return a small positive
   integer threshold per tree code -- confirm against the full source.  */
4762 vect_min_worthwhile_factor (enum tree_code code)
4783 /* Function vectorizable_induction
4785 Check if PHI performs an induction computation that can be vectorized.
4786 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
4787 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
4788 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
4791 vectorizable_induction (gimple phi, gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
/* Per-stmt vectorization info for PHI, and the loop being vectorized.  */
4794 stmt_vec_info stmt_info = vinfo_for_stmt (phi);
4795 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4796 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4797 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
/* NCOPIES = number of vector stmts needed to cover the vectorization
   factor: VF scalar results divided by NUNITS results per vector.  */
4798 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
4799 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
4802 gcc_assert (ncopies >= 1);
4803 /* FORNOW. This restriction should be relaxed. */
/* More than one vector copy of an induction inside a nested
   (outer-loop) vectorization is not supported yet.  */
4804 if (nested_in_vect_loop_p (loop, phi) && ncopies > 1)
4806 if (vect_print_dump_info (REPORT_DETAILS))
4807 fprintf (vect_dump, "multiple types in nested loop.");
/* Only PHIs the analysis marked as relevant are handled here.  */
4811 if (!STMT_VINFO_RELEVANT_P (stmt_info))
4814 /* FORNOW: SLP not supported. */
4815 if (STMT_SLP_TYPE (stmt_info))
4818 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def);
4820 if (gimple_code (phi) != GIMPLE_PHI)
/* Analysis phase: record the stmt kind and model its cost; no code is
   generated until VEC_STMT is supplied by the transform phase.  */
4823 if (!vec_stmt) /* transformation not required. */
4825 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
4826 if (vect_print_dump_info (REPORT_DETAILS))
4827 fprintf (vect_dump, "=== vectorizable_induction ===");
4828 vect_model_induction_cost (stmt_info, ncopies);
/* Transform phase.  */
4834 if (vect_print_dump_info (REPORT_DETAILS))
4835 fprintf (vect_dump, "transform induction phi.");
/* get_initial_def_for_induction builds the vectorized induction; the
   defining stmt of its SSA result is reported as the vectorized stmt.  */
4837 vec_def = get_initial_def_for_induction (phi);
4838 *vec_stmt = SSA_NAME_DEF_STMT (vec_def);
4842 /* Function vectorizable_live_operation.
4844 STMT computes a value that is used outside the loop. Check if
4845 it can be supported. */
4848 vectorizable_live_operation (gimple stmt,
4849 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
4850 gimple *vec_stmt ATTRIBUTE_UNUSED)
4852 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4853 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4854 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4860 enum vect_def_type dt;
4861 enum tree_code code;
4862 enum gimple_rhs_class rhs_class;
/* Caller guarantees STMT was marked live (used outside the loop).  */
4864 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
/* Live reductions are handled by the reduction machinery, not here.  */
4866 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
4869 if (!is_gimple_assign (stmt))
4872 if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
4875 /* FORNOW. CHECKME. */
4876 if (nested_in_vect_loop_p (loop, stmt))
/* Classify the RHS so we know how many operands to inspect, and
   sanity-check the classification against the operand count.  */
4879 code = gimple_assign_rhs_code (stmt);
4880 op_type = TREE_CODE_LENGTH (code);
4881 rhs_class = get_gimple_rhs_class (code);
4882 gcc_assert (rhs_class != GIMPLE_UNARY_RHS || op_type == unary_op);
4883 gcc_assert (rhs_class != GIMPLE_BINARY_RHS || op_type == binary_op);
4885 /* FORNOW: support only if all uses are invariant. This means
4886 that the scalar operations can remain in place, unvectorized.
4887 The original last scalar value that they compute will be used. */
4889 for (i = 0; i < op_type; i++)
/* For a GIMPLE_SINGLE_RHS the operands are nested inside the single
   RHS tree; otherwise they are direct gimple operands of STMT.  */
4891 if (rhs_class == GIMPLE_SINGLE_RHS)
4892 op = TREE_OPERAND (gimple_op (stmt, 1), i);
4894 op = gimple_op (stmt, i + 1);
4896 && !vect_is_simple_use (op, loop_vinfo, NULL, &def_stmt, &def, &dt))
4898 if (vect_print_dump_info (REPORT_DETAILS))
4899 fprintf (vect_dump, "use not simple.");
/* Reject operands defined inside the loop: only loop-invariant
   (external or constant) operands are supported for now.  */
4903 if (dt != vect_external_def && dt != vect_constant_def)
4907 /* No transformation is required for the cases we currently support. */
4911 /* Kill any debug uses outside LOOP of SSA names defined in STMT. */
4914 vect_loop_kill_debug_uses (struct loop *loop, gimple stmt)
4916 ssa_op_iter op_iter;
4917 imm_use_iterator imm_iter;
4918 def_operand_p def_p;
/* Walk every SSA name defined by STMT (the iterator handles both PHIs
   and ordinary stmts) ...  */
4921 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt, op_iter, SSA_OP_DEF)
/* ... and every immediate use of that name.  */
4923 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
/* Only debug stmts are of interest; real uses are left untouched.  */
4927 if (!is_gimple_debug (ustmt))
4930 bb = gimple_bb (ustmt);
/* Debug uses inside LOOP itself are left alone; only out-of-loop
   debug binds are reset.  */
4932 if (!flow_bb_inside_loop_p (loop, bb))
4934 if (gimple_debug_bind_p (ustmt))
4936 if (vect_print_dump_info (REPORT_DETAILS))
4937 fprintf (vect_dump, "killing debug use");
/* Drop the bound value so the debug stmt no longer references an SSA
   name the transformed loop may not compute -- NOTE(review): confirm
   exact semantics against gimple_debug_bind_reset_value.  */
4939 gimple_debug_bind_reset_value (ustmt);
4940 update_stmt (ustmt);
4949 /* Function vect_transform_loop.
4951 The analysis phase has determined that the loop is vectorizable.
4952 Vectorize the loop - created vectorized stmts to replace the scalar
4953 stmts in the loop, and update the loop exit condition. */
4956 vect_transform_loop (loop_vec_info loop_vinfo)
4958 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4959 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
4960 int nbbs = loop->num_nodes;
4961 gimple_stmt_iterator si;
4964 int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
4966 bool slp_scheduled = false;
4967 unsigned int nunits;
4968 tree cond_expr = NULL_TREE;
4969 gimple_seq cond_expr_stmt_list = NULL;
4970 bool do_peeling_for_loop_bound;
4972 if (vect_print_dump_info (REPORT_DETAILS))
4973 fprintf (vect_dump, "=== vec_transform_loop ===");
4975 /* Peel the loop if there are data refs with unknown alignment.
4976 Only one data ref with unknown store is allowed. */
4978 if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
4979 vect_do_peeling_for_alignment (loop_vinfo);
4981 do_peeling_for_loop_bound
4982 = (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
4983 || (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
4984 && LOOP_VINFO_INT_NITERS (loop_vinfo) % vectorization_factor != 0));
4986 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
4987 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
4988 vect_loop_versioning (loop_vinfo,
4989 !do_peeling_for_loop_bound,
4990 &cond_expr, &cond_expr_stmt_list);
4992 /* If the loop has a symbolic number of iterations 'n' (i.e. it's not a
4993 compile time constant), or it is a constant that doesn't divide by the
4994 vectorization factor, then an epilog loop needs to be created.
4995 We therefore duplicate the loop: the original loop will be vectorized,
4996 and will compute the first (n/VF) iterations. The second copy of the loop
4997 will remain scalar and will compute the remaining (n%VF) iterations.
4998 (VF is the vectorization factor). */
5000 if (do_peeling_for_loop_bound)
5001 vect_do_peeling_for_loop_bound (loop_vinfo, &ratio,
5002 cond_expr, cond_expr_stmt_list);
5004 ratio = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
5005 LOOP_VINFO_INT_NITERS (loop_vinfo) / vectorization_factor);
5007 /* 1) Make sure the loop header has exactly two entries
5008 2) Make sure we have a preheader basic block. */
5010 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
5012 split_edge (loop_preheader_edge (loop));
5014 /* FORNOW: the vectorizer supports only loops which body consist
5015 of one basic block (header + empty latch). When the vectorizer will
5016 support more involved loop forms, the order by which the BBs are
5017 traversed need to be reconsidered. */
5019 for (i = 0; i < nbbs; i++)
5021 basic_block bb = bbs[i];
5022 stmt_vec_info stmt_info;
5025 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
5027 phi = gsi_stmt (si);
5028 if (vect_print_dump_info (REPORT_DETAILS))
5030 fprintf (vect_dump, "------>vectorizing phi: ");
5031 print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
5033 stmt_info = vinfo_for_stmt (phi);
5037 if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
5038 vect_loop_kill_debug_uses (loop, phi);
5040 if (!STMT_VINFO_RELEVANT_P (stmt_info)
5041 && !STMT_VINFO_LIVE_P (stmt_info))
5044 if ((TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
5045 != (unsigned HOST_WIDE_INT) vectorization_factor)
5046 && vect_print_dump_info (REPORT_DETAILS))
5047 fprintf (vect_dump, "multiple-types.");
5049 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
5051 if (vect_print_dump_info (REPORT_DETAILS))
5052 fprintf (vect_dump, "transform phi.");
5053 vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
5057 for (si = gsi_start_bb (bb); !gsi_end_p (si);)
5059 gimple stmt = gsi_stmt (si);
5062 if (vect_print_dump_info (REPORT_DETAILS))
5064 fprintf (vect_dump, "------>vectorizing statement: ");
5065 print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
5068 stmt_info = vinfo_for_stmt (stmt);
5070 /* vector stmts created in the outer-loop during vectorization of
5071 stmts in an inner-loop may not have a stmt_info, and do not
5072 need to be vectorized. */
5079 if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
5080 vect_loop_kill_debug_uses (loop, stmt);
5082 if (!STMT_VINFO_RELEVANT_P (stmt_info)
5083 && !STMT_VINFO_LIVE_P (stmt_info))
5089 gcc_assert (STMT_VINFO_VECTYPE (stmt_info));
5091 (unsigned int) TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
5092 if (!STMT_SLP_TYPE (stmt_info)
5093 && nunits != (unsigned int) vectorization_factor
5094 && vect_print_dump_info (REPORT_DETAILS))
5095 /* For SLP VF is set according to unrolling factor, and not to
5096 vector size, hence for SLP this print is not valid. */
5097 fprintf (vect_dump, "multiple-types.");
5099 /* SLP. Schedule all the SLP instances when the first SLP stmt is
5101 if (STMT_SLP_TYPE (stmt_info))
5105 slp_scheduled = true;
5107 if (vect_print_dump_info (REPORT_DETAILS))
5108 fprintf (vect_dump, "=== scheduling SLP instances ===");
5110 vect_schedule_slp (loop_vinfo, NULL);
5113 /* Hybrid SLP stmts must be vectorized in addition to SLP. */
5114 if (!vinfo_for_stmt (stmt) || PURE_SLP_STMT (stmt_info))
5121 /* -------- vectorize statement ------------ */
5122 if (vect_print_dump_info (REPORT_DETAILS))
5123 fprintf (vect_dump, "transform statement.");
5125 strided_store = false;
5126 is_store = vect_transform_stmt (stmt, &si, &strided_store, NULL, NULL);
5129 if (STMT_VINFO_STRIDED_ACCESS (stmt_info))
5131 /* Interleaving. If IS_STORE is TRUE, the vectorization of the
5132 interleaving chain was completed - free all the stores in
5134 vect_remove_stores (GROUP_FIRST_ELEMENT (stmt_info));
5135 gsi_remove (&si, true);
5140 /* Free the attached stmt_vec_info and remove the stmt. */
5141 free_stmt_vec_info (stmt);
5142 gsi_remove (&si, true);
5150 slpeel_make_loop_iterate_ntimes (loop, ratio);
5152 /* The memory tags and pointers in vectorized statements need to
5153 have their SSA forms updated. FIXME, why can't this be delayed
5154 until all the loops have been transformed? */
5155 update_ssa (TODO_update_ssa);
5157 if (vect_print_dump_info (REPORT_VECTORIZED_LOCATIONS))
5158 fprintf (vect_dump, "LOOP VECTORIZED.");
5159 if (loop->inner && vect_print_dump_info (REPORT_VECTORIZED_LOCATIONS))
5160 fprintf (vect_dump, "OUTER LOOP VECTORIZED.");