/* Loop Vectorization
   Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010
   Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com> and
   Ira Rosen <irar@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */
#include "coretypes.h"
#include "basic-block.h"
#include "diagnostic.h"
#include "tree-flow.h"
#include "tree-dump.h"
#include "cfglayout.h"
#include "tree-chrec.h"
#include "tree-scalar-evolution.h"
#include "tree-vectorizer.h"
/* Loop Vectorization Pass.

   This pass tries to vectorize loops.

   For example, the vectorizer transforms the following simple loop:

        short a[N]; short b[N]; short c[N]; int i;

        for (i=0; i<N; i++){
          a[i] = b[i] + c[i];
        }

   as if it was manually vectorized by rewriting the source code into:

        typedef int __attribute__((mode(V8HI))) v8hi;
        short a[N]; short b[N]; short c[N]; int i;
        v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
        v8hi va, vb, vc;

        for (i=0; i<N/8; i++){
          vb = pa[i];
          vc = pb[i];
          va = vb + vc;
          pc[i] = va;
        }
   The main entry to this pass is vectorize_loops(), in which
   the vectorizer applies a set of analyses on a given set of loops,
   followed by the actual vectorization transformation for the loops that
   had successfully passed the analysis phase.
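   In outline (a sketch of the driver's shape for orientation, not a quote
   of its exact code):

        vectorize_loops ()
          {
            for each loop L in the function
              {
                loop_vinfo = vect_analyze_loop (L);
                if (loop_vinfo && LOOP_VINFO_VECTORIZABLE_P (loop_vinfo))
                  vect_transform_loop (loop_vinfo);
              }
          }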
   Throughout this pass we make a distinction between two types of
   data: scalars (which are represented by SSA_NAMES), and memory references
   ("data-refs").  These two types of data require different handling both
   during analysis and transformation.  The types of data-refs that the
   vectorizer currently supports are ARRAY_REFS whose base is an array DECL
   (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
   accesses are required to have a simple (consecutive) access pattern.
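   For example (an added illustration, not part of the original comment):

        a[i]   = x;        ARRAY_REF with consecutive access - supported
        *p++   = x;        INDIRECT_REF with consecutive access - supported
        a[2*i] = x;        strided access - not supported here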
   The driver for the analysis phase is vect_analyze_loop().
   It applies a set of analyses, some of which rely on the scalar evolution
   analyzer (scev) developed by Sebastian Pop.

   During the analysis phase the vectorizer records some information
   per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
   loop, as well as general information about the loop as a whole, which is
   recorded in a "loop_vec_info" struct attached to each loop.
   The loop transformation phase scans all the stmts in the loop, and
   creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
   the loop that needs to be vectorized.  It inserts the vector code sequence
   just before the scalar stmt S, and records a pointer to the vector code
   in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
   attached to S).  This pointer will be used for the vectorization of following
   stmts which use the def of stmt S.  Stmt S is removed if it writes to memory;
   otherwise, we rely on dead code elimination for removing it.
   For example, say stmt S1 was vectorized into stmt VS1:

   VS1: vb = px[i];
   S1:  b = x[i];       STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
   S2:  a = b;

   To vectorize stmt S2, the vectorizer first finds the stmt that defines
   the operand 'b' (S1), and gets the relevant vector def 'vb' from the
   vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)).  The
   resulting sequence would be:

   VS1: vb = px[i];
   S1:  b = x[i];       STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
   VS2: va = vb;
   S2:  a = b;          STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2

   Operands that are not SSA_NAMEs, are data-refs that appear in
   load/store operations (like 'x[i]' in S1), and are handled differently.
   Currently the only target specific information that is used is the
   size of the vector (in bytes) - "UNITS_PER_SIMD_WORD".  Targets that can
   support different vector sizes will, for now, need to specify one value
   for "UNITS_PER_SIMD_WORD".  More flexibility will be added in the future.

   Since we only vectorize operations whose vector form can be
   expressed using existing tree codes, to verify that an operation is
   supported, the vectorizer checks the relevant optab at the relevant
   machine_mode (e.g., optab_handler (add_optab, V8HImode)->insn_code).  If
   the value found is CODE_FOR_nothing, then there's no target support, and
   we can't vectorize the stmt.
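   Such a check looks roughly as follows (a sketch against the pre-4.6
   optab interface used above, not a quote of the checks done elsewhere in
   this file):

        optab op = optab_for_tree_code (PLUS_EXPR, vectype, optab_default);
        if (!op
            || optab_handler (op, TYPE_MODE (vectype))->insn_code
               == CODE_FOR_nothing)
          return false;    (no target support - the stmt is not vectorized)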
   For additional information on this project see:
   http://gcc.gnu.org/projects/tree-ssa/vectorization.html
*/
/* Function vect_determine_vectorization_factor

   Determine the vectorization factor (VF).  VF is the number of data elements
   that are operated upon in parallel in a single iteration of the vectorized
   loop.  For example, when vectorizing a loop that operates on 4-byte elements,
   on a target with vector size (VS) of 16 bytes, the VF is set to 4, since 4
   elements can fit in a single vector register.

   We currently support vectorization of loops in which all types operated upon
   are of the same size.  Therefore this function currently sets VF according to
   the size of the types operated upon, and fails if there are multiple sizes
   in the loop.

   VF is also the factor by which the loop iterations are strip-mined, e.g.:

        for (i=0; i<N; i+=VF){
          a[i:VF] = b[i:VF] + c[i:VF];
        }
*/
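/* For instance (an added illustration): on a target with 16-byte vectors,
   a loop operating on "short" (2-byte) elements gets vectype V8HI, so
   TYPE_VECTOR_SUBPARTS is 8 and the VF computed below is 8.  */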
static bool
vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
{
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
  int nbbs = loop->num_nodes;
  gimple_stmt_iterator si;
  unsigned int vectorization_factor = 0;
  tree scalar_type;
  tree vectype;
  tree vf_vectype;
  unsigned int nunits;
  stmt_vec_info stmt_info;
  int i;
  HOST_WIDE_INT dummy;
  if (vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump, "=== vect_determine_vectorization_factor ===");

  for (i = 0; i < nbbs; i++)
    {
      basic_block bb = bbs[i];

      for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
        {
          gimple phi = gsi_stmt (si);
          stmt_info = vinfo_for_stmt (phi);
          if (vect_print_dump_info (REPORT_DETAILS))
            {
              fprintf (vect_dump, "==> examining phi: ");
              print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
            }

          gcc_assert (stmt_info);

          if (STMT_VINFO_RELEVANT_P (stmt_info))
            {
              gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
              scalar_type = TREE_TYPE (PHI_RESULT (phi));

              if (vect_print_dump_info (REPORT_DETAILS))
                {
                  fprintf (vect_dump, "get vectype for scalar type: ");
                  print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
                }

              vectype = get_vectype_for_scalar_type (scalar_type);
              if (!vectype)
                {
                  if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
                    {
                      fprintf (vect_dump,
                               "not vectorized: unsupported data-type ");
                      print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
                    }
                  return false;
                }
              STMT_VINFO_VECTYPE (stmt_info) = vectype;

              if (vect_print_dump_info (REPORT_DETAILS))
                {
                  fprintf (vect_dump, "vectype: ");
                  print_generic_expr (vect_dump, vectype, TDF_SLIM);
                }

              nunits = TYPE_VECTOR_SUBPARTS (vectype);
              if (vect_print_dump_info (REPORT_DETAILS))
                fprintf (vect_dump, "nunits = %d", nunits);

              if (!vectorization_factor
                  || (nunits > vectorization_factor))
                vectorization_factor = nunits;
            }
        }
      for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
        {
          gimple stmt = gsi_stmt (si);
          stmt_info = vinfo_for_stmt (stmt);

          if (vect_print_dump_info (REPORT_DETAILS))
            {
              fprintf (vect_dump, "==> examining statement: ");
              print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
            }

          gcc_assert (stmt_info);

          /* Skip stmts which do not need to be vectorized.  */
          if (!STMT_VINFO_RELEVANT_P (stmt_info)
              && !STMT_VINFO_LIVE_P (stmt_info))
            {
              if (vect_print_dump_info (REPORT_DETAILS))
                fprintf (vect_dump, "skip.");
              continue;
            }

          if (gimple_get_lhs (stmt) == NULL_TREE)
            {
              if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
                {
                  fprintf (vect_dump, "not vectorized: irregular stmt.");
                  print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
                }
              return false;
            }

          if (VECTOR_MODE_P (TYPE_MODE (gimple_expr_type (stmt))))
            {
              if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
                {
                  fprintf (vect_dump, "not vectorized: vector stmt in loop:");
                  print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
                }
              return false;
            }

          if (STMT_VINFO_VECTYPE (stmt_info))
            {
              /* The only case when a vectype had been already set is for stmts
                 that contain a dataref, or for "pattern-stmts" (stmts generated
                 by the vectorizer to represent/replace a certain idiom).  */
              gcc_assert (STMT_VINFO_DATA_REF (stmt_info)
                          || is_pattern_stmt_p (stmt_info));
              vectype = STMT_VINFO_VECTYPE (stmt_info);
            }
          else
            {
              gcc_assert (!STMT_VINFO_DATA_REF (stmt_info)
                          && !is_pattern_stmt_p (stmt_info));

              scalar_type = TREE_TYPE (gimple_get_lhs (stmt));
              if (vect_print_dump_info (REPORT_DETAILS))
                {
                  fprintf (vect_dump, "get vectype for scalar type: ");
                  print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
                }
              vectype = get_vectype_for_scalar_type (scalar_type);
              if (!vectype)
                {
                  if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
                    {
                      fprintf (vect_dump,
                               "not vectorized: unsupported data-type ");
                      print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
                    }
                  return false;
                }
              STMT_VINFO_VECTYPE (stmt_info) = vectype;
            }

          /* The vectorization factor is according to the smallest
             scalar type (or the largest vector size, but we only
             support one vector size per loop).  */
          scalar_type = vect_get_smallest_scalar_type (stmt, &dummy, &dummy);
          if (vect_print_dump_info (REPORT_DETAILS))
            {
              fprintf (vect_dump, "get vectype for scalar type: ");
              print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
            }
          vf_vectype = get_vectype_for_scalar_type (scalar_type);
          if (!vf_vectype)
            {
              if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
                {
                  fprintf (vect_dump,
                           "not vectorized: unsupported data-type ");
                  print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
                }
              return false;
            }

          if ((GET_MODE_SIZE (TYPE_MODE (vectype))
               != GET_MODE_SIZE (TYPE_MODE (vf_vectype))))
            {
              if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
                {
                  fprintf (vect_dump,
                           "not vectorized: different sized vector "
                           "types in statement, ");
                  print_generic_expr (vect_dump, vectype, TDF_SLIM);
                  fprintf (vect_dump, " and ");
                  print_generic_expr (vect_dump, vf_vectype, TDF_SLIM);
                }
              return false;
            }

          if (vect_print_dump_info (REPORT_DETAILS))
            {
              fprintf (vect_dump, "vectype: ");
              print_generic_expr (vect_dump, vf_vectype, TDF_SLIM);
            }

          nunits = TYPE_VECTOR_SUBPARTS (vf_vectype);
          if (vect_print_dump_info (REPORT_DETAILS))
            fprintf (vect_dump, "nunits = %d", nunits);

          if (!vectorization_factor
              || (nunits > vectorization_factor))
            vectorization_factor = nunits;
        }
    }
  /* TODO: Analyze cost.  Decide if worth while to vectorize.  */
  if (vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump, "vectorization factor = %d", vectorization_factor);
  if (vectorization_factor <= 1)
    {
      if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
        fprintf (vect_dump, "not vectorized: unsupported data-type");
      return false;
    }
  LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;

  return true;
}
/* Function vect_is_simple_iv_evolution.

   FORNOW: A simple evolution of an induction variable in the loop is
   considered a polynomial evolution with constant step.  */
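/* For example (illustrative chrec notation, not from the original comment):
   an IV updated as i = i + 4 each iteration has access function
   {init, +, 4}_loop - a polynomial of degree 1 with constant step, hence
   "simple"; an evolution whose step itself evolves, such as
   {0, +, {0, +, 1}_1}_1, is not.  */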
static bool
vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree *init,
                             tree *step)
{
  tree init_expr;
  tree step_expr;
  tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);

  /* When there is no evolution in this loop, the evolution function
     is constant.  */
  if (evolution_part == NULL_TREE)
    return false;

  /* When the evolution is a polynomial of degree >= 2
     the evolution function is not "simple".  */
  if (tree_is_chrec (evolution_part))
    return false;

  step_expr = evolution_part;
  init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));

  if (vect_print_dump_info (REPORT_DETAILS))
    {
      fprintf (vect_dump, "step: ");
      print_generic_expr (vect_dump, step_expr, TDF_SLIM);
      fprintf (vect_dump, ", init: ");
      print_generic_expr (vect_dump, init_expr, TDF_SLIM);
    }

  *init = init_expr;
  *step = step_expr;

  if (TREE_CODE (step_expr) != INTEGER_CST)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "step unknown.");
      return false;
    }

  return true;
}
/* Function vect_analyze_scalar_cycles_1.

   Examine the cross iteration def-use cycles of scalar variables
   in LOOP.  LOOP_VINFO represents the loop that is now being
   considered for vectorization (can be LOOP, or an outer-loop
   enclosing LOOP).  */

static void
vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
{
  basic_block bb = loop->header;
  tree dumy;
  VEC(gimple,heap) *worklist = VEC_alloc (gimple, heap, 64);
  gimple_stmt_iterator gsi;
  bool double_reduc;

  if (vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump, "=== vect_analyze_scalar_cycles ===");
  /* First - identify all inductions.  Reduction detection assumes that all the
     inductions have been identified, therefore, this order must not be
     changed.  */
  for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
    {
      gimple phi = gsi_stmt (gsi);
      tree access_fn = NULL;
      tree def = PHI_RESULT (phi);
      stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);

      if (vect_print_dump_info (REPORT_DETAILS))
        {
          fprintf (vect_dump, "Analyze phi: ");
          print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
        }

      /* Skip virtual phi's.  The data dependences that are associated with
         virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.  */
      if (!is_gimple_reg (SSA_NAME_VAR (def)))
        continue;

      STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;

      /* Analyze the evolution function.  */
      access_fn = analyze_scalar_evolution (loop, def);
      if (access_fn && vect_print_dump_info (REPORT_DETAILS))
        {
          fprintf (vect_dump, "Access function of PHI: ");
          print_generic_expr (vect_dump, access_fn, TDF_SLIM);
        }

      if (!access_fn
          || !vect_is_simple_iv_evolution (loop->num, access_fn, &dumy, &dumy))
        {
          VEC_safe_push (gimple, heap, worklist, phi);
          continue;
        }

      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "Detected induction.");
      STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
    }
  /* Second - identify all reductions and nested cycles.  */
  while (VEC_length (gimple, worklist) > 0)
    {
      gimple phi = VEC_pop (gimple, worklist);
      tree def = PHI_RESULT (phi);
      stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
      gimple reduc_stmt;
      bool nested_cycle;

      if (vect_print_dump_info (REPORT_DETAILS))
        {
          fprintf (vect_dump, "Analyze phi: ");
          print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
        }

      gcc_assert (is_gimple_reg (SSA_NAME_VAR (def)));
      gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);

      nested_cycle = (loop != LOOP_VINFO_LOOP (loop_vinfo));
      reduc_stmt = vect_is_simple_reduction (loop_vinfo, phi, !nested_cycle,
                                             &double_reduc);
      if (reduc_stmt)
        {
          if (double_reduc)
            {
              if (vect_print_dump_info (REPORT_DETAILS))
                fprintf (vect_dump, "Detected double reduction.");

              STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
              STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
                vect_double_reduction_def;
            }
          else if (nested_cycle)
            {
              if (vect_print_dump_info (REPORT_DETAILS))
                fprintf (vect_dump, "Detected vectorizable nested cycle.");

              STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
              STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
                vect_nested_cycle;
            }
          else
            {
              if (vect_print_dump_info (REPORT_DETAILS))
                fprintf (vect_dump, "Detected reduction.");

              STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
              STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
                vect_reduction_def;
              /* Store the reduction cycles for possible vectorization in
                 loop-aware SLP.  */
              VEC_safe_push (gimple, heap,
                             LOOP_VINFO_REDUCTIONS (loop_vinfo),
                             reduc_stmt);
            }
        }
      else if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "Unknown def-use cycle pattern.");
    }

  VEC_free (gimple, heap, worklist);
}
/* Function vect_analyze_scalar_cycles.

   Examine the cross iteration def-use cycles of scalar variables, by
   analyzing the loop-header PHIs of scalar variables; Classify each
   cycle as one of the following: invariant, induction, reduction, unknown.
   We do that for the loop represented by LOOP_VINFO, and also for its
   inner-loop, if it exists.

   Examples for scalar cycles:

   Example1: reduction:

              loop1:
              for (i=0; i<N; i++)
                 sum += a[i];

   Example2: induction:

              loop2:
              for (i=0; i<N; i++)
                 a[i] = i;  */
static void
vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
{
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);

  vect_analyze_scalar_cycles_1 (loop_vinfo, loop);

  /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
     Reductions in such inner-loop therefore have different properties than
     the reductions in the nest that gets vectorized:
     1. When vectorized, they are executed in the same order as in the original
        scalar loop, so we can't change the order of computation when
        vectorizing them.
     2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
        current checks are too strict.  */

  if (loop->inner)
    vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
}
/* Function vect_get_loop_niters.

   Determine how many iterations the loop is executed.
   If an expression that represents the number of iterations
   can be constructed, place it in NUMBER_OF_ITERATIONS.
   Return the loop exit condition.  */
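/* For instance (an added illustration): for "for (i = 0; i < n; i++)"
   NUMBER_OF_ITERATIONS would be set to the expression 'n', and the
   returned statement is the GIMPLE_COND that controls the loop exit.  */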
static gimple
vect_get_loop_niters (struct loop *loop, tree *number_of_iterations)
{
  tree niters;

  if (vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump, "=== get_loop_niters ===");

  niters = number_of_exit_cond_executions (loop);

  if (niters != NULL_TREE
      && niters != chrec_dont_know)
    {
      *number_of_iterations = niters;

      if (vect_print_dump_info (REPORT_DETAILS))
        {
          fprintf (vect_dump, "==> get_loop_niters:" );
          print_generic_expr (vect_dump, *number_of_iterations, TDF_SLIM);
        }
    }

  return get_loop_exit_condition (loop);
}
/* Function bb_in_loop_p

   Used as predicate for dfs order traversal of the loop bbs.  */

static bool
bb_in_loop_p (const_basic_block bb, const void *data)
{
  const struct loop *const loop = (const struct loop *)data;
  if (flow_bb_inside_loop_p (loop, bb))
    return true;
  return false;
}
/* Function new_loop_vec_info.

   Create and initialize a new loop_vec_info struct for LOOP, as well as
   stmt_vec_info structs for all the stmts in LOOP.  */

static loop_vec_info
new_loop_vec_info (struct loop *loop)
{
  loop_vec_info res;
  basic_block *bbs;
  gimple_stmt_iterator si;
  unsigned int i, nbbs;

  res = (loop_vec_info) xcalloc (1, sizeof (struct _loop_vec_info));
  LOOP_VINFO_LOOP (res) = loop;

  bbs = get_loop_body (loop);
  /* Create/Update stmt_info for all stmts in the loop.  */
  for (i = 0; i < loop->num_nodes; i++)
    {
      basic_block bb = bbs[i];

      /* BBs in a nested inner-loop will have been already processed (because
         we will have called vect_analyze_loop_form for any nested inner-loop).
         Therefore, for stmts in an inner-loop we just want to update the
         STMT_VINFO_LOOP_VINFO field of their stmt_info to point to the new
         loop_info of the outer-loop we are currently considering to vectorize
         (instead of the loop_info of the inner-loop).
         For stmts in other BBs we need to create a stmt_info from scratch.  */
      if (bb->loop_father != loop)
        {
          /* Inner-loop bb.  */
          gcc_assert (loop->inner && bb->loop_father == loop->inner);
          for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
            {
              gimple phi = gsi_stmt (si);
              stmt_vec_info stmt_info = vinfo_for_stmt (phi);
              loop_vec_info inner_loop_vinfo =
                STMT_VINFO_LOOP_VINFO (stmt_info);
              gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
              STMT_VINFO_LOOP_VINFO (stmt_info) = res;
            }
          for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
            {
              gimple stmt = gsi_stmt (si);
              stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
              loop_vec_info inner_loop_vinfo =
                STMT_VINFO_LOOP_VINFO (stmt_info);
              gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
              STMT_VINFO_LOOP_VINFO (stmt_info) = res;
            }
        }
      else
        {
          /* bb in current nest.  */
          for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
            {
              gimple phi = gsi_stmt (si);
              gimple_set_uid (phi, 0);
              set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, res, NULL));
            }

          for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
            {
              gimple stmt = gsi_stmt (si);
              gimple_set_uid (stmt, 0);
              set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, res, NULL));
            }
        }
    }
  /* CHECKME: We want to visit all BBs before their successors (except for
     latch blocks, for which this assertion wouldn't hold).  In the simple
     case of the loop forms we allow, a dfs order of the BBs would be the same
     as reversed postorder traversal, so we are safe.  */

  free (bbs);
  bbs = XCNEWVEC (basic_block, loop->num_nodes);
  nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
                             bbs, loop->num_nodes, loop);
  gcc_assert (nbbs == loop->num_nodes);

  LOOP_VINFO_BBS (res) = bbs;
  LOOP_VINFO_NITERS (res) = NULL;
  LOOP_VINFO_NITERS_UNCHANGED (res) = NULL;
  LOOP_VINFO_COST_MODEL_MIN_ITERS (res) = 0;
  LOOP_VINFO_VECTORIZABLE_P (res) = 0;
  LOOP_PEELING_FOR_ALIGNMENT (res) = 0;
  LOOP_VINFO_VECT_FACTOR (res) = 0;
  LOOP_VINFO_DATAREFS (res) = VEC_alloc (data_reference_p, heap, 10);
  LOOP_VINFO_DDRS (res) = VEC_alloc (ddr_p, heap, 10 * 10);
  LOOP_VINFO_UNALIGNED_DR (res) = NULL;
  LOOP_VINFO_MAY_MISALIGN_STMTS (res) =
    VEC_alloc (gimple, heap,
               PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS));
  LOOP_VINFO_MAY_ALIAS_DDRS (res) =
    VEC_alloc (ddr_p, heap,
               PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
  LOOP_VINFO_STRIDED_STORES (res) = VEC_alloc (gimple, heap, 10);
  LOOP_VINFO_REDUCTIONS (res) = VEC_alloc (gimple, heap, 10);
  LOOP_VINFO_SLP_INSTANCES (res) = VEC_alloc (slp_instance, heap, 10);
  LOOP_VINFO_SLP_UNROLLING_FACTOR (res) = 1;

  return res;
}
/* Function destroy_loop_vec_info.

   Free LOOP_VINFO struct, as well as all the stmt_vec_info structs of all the
   stmts in the loop.  */

void
destroy_loop_vec_info (loop_vec_info loop_vinfo, bool clean_stmts)
{
  struct loop *loop;
  basic_block *bbs;
  int nbbs;
  gimple_stmt_iterator si;
  int j;
  VEC (slp_instance, heap) *slp_instances;
  slp_instance instance;

  if (!loop_vinfo)
    return;

  loop = LOOP_VINFO_LOOP (loop_vinfo);

  bbs = LOOP_VINFO_BBS (loop_vinfo);
  nbbs = loop->num_nodes;

  if (!clean_stmts)
    {
      free (LOOP_VINFO_BBS (loop_vinfo));
      free_data_refs (LOOP_VINFO_DATAREFS (loop_vinfo));
      free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
      VEC_free (gimple, heap, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));

      free (loop_vinfo);
      loop->aux = NULL;
      return;
    }
  for (j = 0; j < nbbs; j++)
    {
      basic_block bb = bbs[j];
      for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
        free_stmt_vec_info (gsi_stmt (si));

      for (si = gsi_start_bb (bb); !gsi_end_p (si); )
        {
          gimple stmt = gsi_stmt (si);
          stmt_vec_info stmt_info = vinfo_for_stmt (stmt);

          if (stmt_info)
            {
              /* Check if this is a "pattern stmt" (introduced by the
                 vectorizer during the pattern recognition pass).  */
              bool remove_stmt_p = false;
              gimple orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
              if (orig_stmt)
                {
                  stmt_vec_info orig_stmt_info = vinfo_for_stmt (orig_stmt);
                  if (orig_stmt_info
                      && STMT_VINFO_IN_PATTERN_P (orig_stmt_info))
                    remove_stmt_p = true;
                }

              /* Free stmt_vec_info.  */
              free_stmt_vec_info (stmt);

              /* Remove dead "pattern stmts".  */
              if (remove_stmt_p)
                {
                  gsi_remove (&si, true);
                  continue;
                }
            }
          gsi_next (&si);
        }
    }

  free (LOOP_VINFO_BBS (loop_vinfo));
  free_data_refs (LOOP_VINFO_DATAREFS (loop_vinfo));
  free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
  VEC_free (gimple, heap, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));
  VEC_free (ddr_p, heap, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo));
  slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
  for (j = 0; VEC_iterate (slp_instance, slp_instances, j, instance); j++)
    vect_free_slp_instance (instance);

  VEC_free (slp_instance, heap, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
  VEC_free (gimple, heap, LOOP_VINFO_STRIDED_STORES (loop_vinfo));
  VEC_free (gimple, heap, LOOP_VINFO_REDUCTIONS (loop_vinfo));

  free (loop_vinfo);
  loop->aux = NULL;
}
/* Function vect_analyze_loop_1.

   Apply a set of analyses on LOOP, and create a loop_vec_info struct
   for it.  The different analyses will record information in the
   loop_vec_info struct.  This is a subset of the analyses applied in
   vect_analyze_loop, to be applied on an inner-loop nested in the loop
   that is now considered for (outer-loop) vectorization.  */

static loop_vec_info
vect_analyze_loop_1 (struct loop *loop)
{
  loop_vec_info loop_vinfo;

  if (vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump, "===== analyze_loop_nest_1 =====");

  /* Check the CFG characteristics of the loop (nesting, entry/exit, etc.).  */

  loop_vinfo = vect_analyze_loop_form (loop);
  if (!loop_vinfo)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "bad inner-loop form.");
      return NULL;
    }

  return loop_vinfo;
}
/* Function vect_analyze_loop_form.

   Verify that certain CFG restrictions hold, including:
   - the loop has a pre-header
   - the loop has a single entry and exit
   - the loop exit condition is simple enough, and the number of iterations
     can be analyzed (a countable loop).  */
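/* For instance (an added illustration): "for (i = 0; i < n; i++)" is
   countable even if 'n' is unknown at compile time, whereas
   "while (p) p = p->next" is not, since no expression for its number
   of iterations can be constructed.  */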
static loop_vec_info
vect_analyze_loop_form (struct loop *loop)
{
  loop_vec_info loop_vinfo;
  gimple loop_cond;
  tree number_of_iterations = NULL;
  loop_vec_info inner_loop_vinfo = NULL;

  if (vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump, "=== vect_analyze_loop_form ===");

  /* Different restrictions apply when we are considering an inner-most loop,
     vs. an outer (nested) loop.
     (FORNOW. May want to relax some of these restrictions in the future).  */

  if (!loop->inner)
    {
      /* Inner-most loop.  We currently require that the number of BBs is
         exactly 2 (the header and latch).  Vectorizable inner-most loops
         look like this:

                        (pre-header)
                           |
                          header <--------+
                           | |            |
                           | +--> latch --+
                           |
                        (exit-bb)  */

      if (loop->num_nodes != 2)
        {
          if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
            fprintf (vect_dump, "not vectorized: control flow in loop.");
          return NULL;
        }

      if (empty_block_p (loop->header))
        {
          if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
            fprintf (vect_dump, "not vectorized: empty loop.");
          return NULL;
        }
    }
  else
    {
      struct loop *innerloop = loop->inner;
      edge entryedge;

      /* Nested loop.  We currently require that the loop is doubly-nested,
         contains a single inner loop, and the number of BBs is exactly 5.
         Vectorizable outer-loops look like this:

                        (pre-header)
                           |
                          header <---+
                           |         |
                          inner-loop |
                           |         |
                          tail ------+
                           |
                        (exit-bb)

         The inner-loop has the properties expected of inner-most loops
         as described above.  */

      if ((loop->inner)->inner || (loop->inner)->next)
        {
          if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
            fprintf (vect_dump, "not vectorized: multiple nested loops.");
          return NULL;
        }

      /* Analyze the inner-loop.  */
      inner_loop_vinfo = vect_analyze_loop_1 (loop->inner);
      if (!inner_loop_vinfo)
        {
          if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
            fprintf (vect_dump, "not vectorized: Bad inner loop.");
          return NULL;
        }

      if (!expr_invariant_in_loop_p (loop,
                                     LOOP_VINFO_NITERS (inner_loop_vinfo)))
        {
          if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
            fprintf (vect_dump,
                     "not vectorized: inner-loop count not invariant.");
          destroy_loop_vec_info (inner_loop_vinfo, true);
          return NULL;
        }

      if (loop->num_nodes != 5)
        {
          if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
            fprintf (vect_dump, "not vectorized: control flow in loop.");
          destroy_loop_vec_info (inner_loop_vinfo, true);
          return NULL;
        }

      gcc_assert (EDGE_COUNT (innerloop->header->preds) == 2);
      entryedge = EDGE_PRED (innerloop->header, 0);
      if (EDGE_PRED (innerloop->header, 0)->src == innerloop->latch)
        entryedge = EDGE_PRED (innerloop->header, 1);

      if (entryedge->src != loop->header
          || !single_exit (innerloop)
          || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
        {
          if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
            fprintf (vect_dump, "not vectorized: unsupported outerloop form.");
          destroy_loop_vec_info (inner_loop_vinfo, true);
          return NULL;
        }

      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "Considering outer-loop vectorization.");
    }
  if (!single_exit (loop)
      || EDGE_COUNT (loop->header->preds) != 2)
    {
      if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
        {
          if (!single_exit (loop))
            fprintf (vect_dump, "not vectorized: multiple exits.");
          else if (EDGE_COUNT (loop->header->preds) != 2)
            fprintf (vect_dump, "not vectorized: too many incoming edges.");
        }
      if (inner_loop_vinfo)
        destroy_loop_vec_info (inner_loop_vinfo, true);
      return NULL;
    }

  /* We assume that the loop exit condition is at the end of the loop, i.e.,
     that the loop is represented as a do-while (with a proper if-guard
     before the loop if needed), where the loop header contains all the
     executable statements, and the latch is empty.  */
  if (!empty_block_p (loop->latch)
      || !gimple_seq_empty_p (phi_nodes (loop->latch)))
    {
      if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
        fprintf (vect_dump, "not vectorized: unexpected loop form.");
      if (inner_loop_vinfo)
        destroy_loop_vec_info (inner_loop_vinfo, true);
      return NULL;
    }

  /* Make sure there exists a single-predecessor exit bb:  */
  if (!single_pred_p (single_exit (loop)->dest))
    {
      edge e = single_exit (loop);
      if (!(e->flags & EDGE_ABNORMAL))
        {
          split_loop_exit_edge (e);
          if (vect_print_dump_info (REPORT_DETAILS))
            fprintf (vect_dump, "split exit edge.");
        }
      else
        {
          if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
            fprintf (vect_dump, "not vectorized: abnormal loop exit edge.");
          if (inner_loop_vinfo)
            destroy_loop_vec_info (inner_loop_vinfo, true);
          return NULL;
        }
    }
  loop_cond = vect_get_loop_niters (loop, &number_of_iterations);
  if (!loop_cond)
    {
      if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
        fprintf (vect_dump, "not vectorized: complicated exit condition.");
      if (inner_loop_vinfo)
        destroy_loop_vec_info (inner_loop_vinfo, true);
      return NULL;
    }

  if (!number_of_iterations)
    {
      if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
        fprintf (vect_dump,
                 "not vectorized: number of iterations cannot be computed.");
      if (inner_loop_vinfo)
        destroy_loop_vec_info (inner_loop_vinfo, true);
      return NULL;
    }

  if (chrec_contains_undetermined (number_of_iterations))
    {
      if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
        fprintf (vect_dump, "Infinite number of iterations.");
      if (inner_loop_vinfo)
        destroy_loop_vec_info (inner_loop_vinfo, true);
      return NULL;
    }

  if (!NITERS_KNOWN_P (number_of_iterations))
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        {
          fprintf (vect_dump, "Symbolic number of iterations is ");
          print_generic_expr (vect_dump, number_of_iterations, TDF_DETAILS);
        }
    }
  else if (TREE_INT_CST_LOW (number_of_iterations) == 0)
    {
      if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
        fprintf (vect_dump, "not vectorized: number of iterations = 0.");
      if (inner_loop_vinfo)
        destroy_loop_vec_info (inner_loop_vinfo, false);
      return NULL;
    }

  loop_vinfo = new_loop_vec_info (loop);
  LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
  LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;

  STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond)) = loop_exit_ctrl_vec_info_type;

  /* CHECKME: May want to keep it around in the future.  */
  if (inner_loop_vinfo)
    destroy_loop_vec_info (inner_loop_vinfo, false);

  gcc_assert (!loop->aux);
  loop->aux = loop_vinfo;
  return loop_vinfo;
}
/* Function vect_analyze_loop_operations.

   Scan the loop stmts and make sure they are all vectorizable.  */
static bool
vect_analyze_loop_operations (loop_vec_info loop_vinfo)
{
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
  int nbbs = loop->num_nodes;
  gimple_stmt_iterator si;
  unsigned int vectorization_factor = 0;
  int i;
  gimple phi;
  stmt_vec_info stmt_info;
  bool need_to_vectorize = false;
  int min_profitable_iters;
  int min_scalar_loop_bound;
  unsigned int th;
  bool only_slp_in_loop = true, ok;

  if (vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump, "=== vect_analyze_loop_operations ===");

  gcc_assert (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
  vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);

  for (i = 0; i < nbbs; i++)
    {
      basic_block bb = bbs[i];

      for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
        {
          phi = gsi_stmt (si);
          ok = true;

          stmt_info = vinfo_for_stmt (phi);
          if (vect_print_dump_info (REPORT_DETAILS))
            {
              fprintf (vect_dump, "examining phi: ");
              print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
            }

          if (! is_loop_header_bb_p (bb))
            {
              /* inner-loop loop-closed exit phi in outer-loop vectorization
                 (i.e. a phi in the tail of the outer-loop).
                 FORNOW: we currently don't support the case that these phis
                 are not used in the outerloop (unless it is double reduction,
                 i.e., this phi is vect_reduction_def), cause this case
                 requires to actually do something here.  */
              if ((!STMT_VINFO_RELEVANT_P (stmt_info)
                   || STMT_VINFO_LIVE_P (stmt_info))
                  && STMT_VINFO_DEF_TYPE (stmt_info)
                     != vect_double_reduction_def)
                {
                  if (vect_print_dump_info (REPORT_DETAILS))
                    fprintf (vect_dump,
                             "Unsupported loop-closed phi in outer-loop.");
                  return false;
                }
              continue;
            }

          gcc_assert (stmt_info);
          if (STMT_VINFO_LIVE_P (stmt_info))
            {
              /* FORNOW: not yet supported.  */
              if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
                fprintf (vect_dump, "not vectorized: value used after loop.");
              return false;
            }

          if (STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
              && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
            {
              /* A scalar-dependence cycle that we don't support.  */
              if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
                fprintf (vect_dump, "not vectorized: scalar dependence cycle.");
              return false;
            }

          if (STMT_VINFO_RELEVANT_P (stmt_info))
            {
              need_to_vectorize = true;
              if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
                ok = vectorizable_induction (phi, NULL, NULL);
            }

          if (!ok)
            {
              if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
                {
                  fprintf (vect_dump,
                           "not vectorized: relevant phi not supported: ");
                  print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
                }
              return false;
            }
        }

      for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
        {
          gimple stmt = gsi_stmt (si);
          stmt_vec_info stmt_info = vinfo_for_stmt (stmt);

          gcc_assert (stmt_info);

          if (!vect_analyze_stmt (stmt, &need_to_vectorize, NULL))
            return false;

          if ((STMT_VINFO_RELEVANT_P (stmt_info)
               || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
              && !PURE_SLP_STMT (stmt_info))
            /* STMT needs both SLP and loop-based vectorization.  */
            only_slp_in_loop = false;
        }
    } /* bbs */
  /* All operations in the loop are either irrelevant (deal with loop
     control, or dead), or only used outside the loop and can be moved
     out of the loop (e.g. invariants, inductions).  The loop can be
     optimized away by scalar optimizations.  We're better off not
     touching this loop.  */
  if (!need_to_vectorize)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump,
                 "All the computation can be taken out of the loop.");
      if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
        fprintf (vect_dump,
                 "not vectorized: redundant loop. no profit to vectorize.");
      return false;
    }

  /* If all the stmts in the loop can be SLPed, we perform only SLP, and the
     vectorization factor of the loop is the unrolling factor required by the
     SLP instances.  If that unrolling factor is 1, we say that we perform
     pure SLP on the loop - cross iteration parallelism is not exploited.  */
  if (only_slp_in_loop)
    vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
  else
    vectorization_factor = least_common_multiple (vectorization_factor,
                                LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
  LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;

  if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
      && vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump,
             "vectorization_factor = %d, niters = " HOST_WIDE_INT_PRINT_DEC,
             vectorization_factor, LOOP_VINFO_INT_NITERS (loop_vinfo));

  if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
      && (LOOP_VINFO_INT_NITERS (loop_vinfo) < vectorization_factor))
    {
      if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
        fprintf (vect_dump, "not vectorized: iteration count too small.");
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "not vectorized: iteration count smaller than "
                 "vectorization factor.");
      return false;
    }
  /* Analyze cost.  Decide if worth while to vectorize.  */

  /* Once VF is set, SLP costs should be updated since the number of created
     vector stmts depends on VF.  */
  vect_update_slp_costs_according_to_vf (loop_vinfo);

  min_profitable_iters = vect_estimate_min_profitable_iters (loop_vinfo);
  LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo) = min_profitable_iters;

  if (min_profitable_iters < 0)
    {
      if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
        fprintf (vect_dump, "not vectorized: vectorization not profitable.");
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "not vectorized: vector version will never be "
                 "profitable.");
      return false;
    }

  min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
                            * vectorization_factor) - 1);

  /* Use the cost model only if it is more conservative than user specified
     threshold.  */

  th = (unsigned) min_scalar_loop_bound;
  if (min_profitable_iters
      && (!min_scalar_loop_bound
          || min_profitable_iters > min_scalar_loop_bound))
    th = (unsigned) min_profitable_iters;

  if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
      && LOOP_VINFO_INT_NITERS (loop_vinfo) <= th)
    {
      if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
        fprintf (vect_dump, "not vectorized: vectorization not "
                 "profitable.");
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "not vectorized: iteration count smaller than "
                 "user specified loop bound parameter or minimum "
                 "profitable iterations (whichever is more conservative).");
      return false;
    }
  if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
      || LOOP_VINFO_INT_NITERS (loop_vinfo) % vectorization_factor != 0
      || LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "epilog loop required.");
      if (!vect_can_advance_ivs_p (loop_vinfo))
        {
          if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
            fprintf (vect_dump,
                     "not vectorized: can't create epilog loop 1.");
          return false;
        }
      if (!slpeel_can_duplicate_loop_p (loop, single_exit (loop)))
        {
          if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
            fprintf (vect_dump,
                     "not vectorized: can't create epilog loop 2.");
          return false;
        }
    }

  return true;
}
/* Function vect_analyze_loop.

   Apply a set of analyses on LOOP, and create a loop_vec_info struct
   for it.  The different analyses will record information in the
   loop_vec_info struct.  */
loop_vec_info
vect_analyze_loop (struct loop *loop)
{
  bool ok;
  loop_vec_info loop_vinfo;
  int max_vf = MAX_VECTORIZATION_FACTOR;
  int min_vf = 2;

  if (vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump, "===== analyze_loop_nest =====");

  if (loop_outer (loop)
      && loop_vec_info_for_loop (loop_outer (loop))
      && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "outer-loop already vectorized.");
      return NULL;
    }

  /* Check the CFG characteristics of the loop (nesting, entry/exit, etc.).  */

  loop_vinfo = vect_analyze_loop_form (loop);
  if (!loop_vinfo)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "bad loop form.");
      return NULL;
    }
  /* Find all data references in the loop (which correspond to vdefs/vuses)
     and analyze their evolution in the loop.  Also adjust the minimal
     vectorization factor according to the loads and stores.

     FORNOW: Handle only simple array references whose
     alignment can be forced, and aligned pointer-references.  */

  ok = vect_analyze_data_refs (loop_vinfo, NULL, &min_vf);
  if (!ok)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "bad data references.");
      destroy_loop_vec_info (loop_vinfo, true);
      return NULL;
    }
  /* Classify all cross-iteration scalar data-flow cycles.
     Cross-iteration cycles caused by virtual phis are analyzed separately.  */

  vect_analyze_scalar_cycles (loop_vinfo);

  vect_pattern_recog (loop_vinfo);

  /* Data-flow analysis to detect stmts that do not need to be vectorized.  */

  ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
  if (!ok)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "unexpected pattern.");
      destroy_loop_vec_info (loop_vinfo, true);
      return NULL;
    }
  /* Analyze data dependences between the data-refs in the loop
     and adjust the maximum vectorization factor according to
     the dependences.
     FORNOW: fail at the first data dependence that we encounter.  */

  ok = vect_analyze_data_ref_dependences (loop_vinfo, NULL, &max_vf);
  if (!ok
      || max_vf < min_vf)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "bad data dependence.");
      destroy_loop_vec_info (loop_vinfo, true);
      return NULL;
    }

  ok = vect_determine_vectorization_factor (loop_vinfo);
  if (!ok)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "can't determine vectorization factor.");
      destroy_loop_vec_info (loop_vinfo, true);
      return NULL;
    }
  if (max_vf < LOOP_VINFO_VECT_FACTOR (loop_vinfo))
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "bad data dependence.");
      destroy_loop_vec_info (loop_vinfo, true);
      return NULL;
    }
  /* Analyze the alignment of the data-refs in the loop.
     Fail if a data reference is found that cannot be vectorized.  */

  ok = vect_analyze_data_refs_alignment (loop_vinfo, NULL);
  if (!ok)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "bad data alignment.");
      destroy_loop_vec_info (loop_vinfo, true);
      return NULL;
    }

  /* Analyze the access patterns of the data-refs in the loop (consecutive,
     complex, etc.).  FORNOW: Only handle consecutive access pattern.  */

  ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
  if (!ok)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "bad data access.");
      destroy_loop_vec_info (loop_vinfo, true);
      return NULL;
    }

  /* Prune the list of ddrs to be tested at run-time by versioning for alias.
     It is important to call pruning after vect_analyze_data_ref_accesses,
     since we use grouping information gathered by interleaving analysis.  */

  ok = vect_prune_runtime_alias_test_list (loop_vinfo);
  if (!ok)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "too long list of versioning for alias "
                 "run-time tests.");
      destroy_loop_vec_info (loop_vinfo, true);
      return NULL;
    }
  /* Check the SLP opportunities in the loop, analyze and build SLP trees.  */
  ok = vect_analyze_slp (loop_vinfo, NULL);
  if (ok)
    {
      /* Decide which possible SLP instances to SLP.  */
      vect_make_slp_decision (loop_vinfo);

      /* Find stmts that need to be both vectorized and SLPed.  */
      vect_detect_hybrid_slp (loop_vinfo);
    }

  /* This pass will decide on using loop versioning and/or loop peeling in
     order to enhance the alignment of data references in the loop.  */

  ok = vect_enhance_data_refs_alignment (loop_vinfo);
  if (!ok)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "bad data alignment.");
      destroy_loop_vec_info (loop_vinfo, true);
      return NULL;
    }

  /* Scan all the operations in the loop and make sure they are
     vectorizable.  */

  ok = vect_analyze_loop_operations (loop_vinfo);
  if (!ok)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "bad operation or unsupported loop bound.");
      destroy_loop_vec_info (loop_vinfo, true);
      return NULL;
    }

  LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;

  return loop_vinfo;
}
/* Function reduction_code_for_scalar_code

   Input:
   CODE - tree_code of a reduction operation.

   Output:
   REDUC_CODE - the corresponding tree-code to be used to reduce the
   vector of partial results into a single scalar result (which
   will also reside in a vector), or ERROR_MARK if the operation is
   a supported reduction operation, but does not have such a tree-code.

   Return FALSE if CODE currently cannot be vectorized as a reduction.  */
static bool
reduction_code_for_scalar_code (enum tree_code code,
                                enum tree_code *reduc_code)
{
  switch (code)
    {
      case MAX_EXPR:
        *reduc_code = REDUC_MAX_EXPR;
        return true;

      case MIN_EXPR:
        *reduc_code = REDUC_MIN_EXPR;
        return true;

      case PLUS_EXPR:
        *reduc_code = REDUC_PLUS_EXPR;
        return true;

      case MULT_EXPR:
      case MINUS_EXPR:
      case BIT_IOR_EXPR:
      case BIT_XOR_EXPR:
      case BIT_AND_EXPR:
        *reduc_code = ERROR_MARK;
        return true;

      default:
        return false;
    }
}
/* Error reporting helper for vect_is_simple_reduction below.  GIMPLE statement
   STMT is printed with a message MSG.  */

static void
report_vect_op (gimple stmt, const char *msg)
{
  fprintf (vect_dump, "%s", msg);
  print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
}
/* Function vect_is_simple_reduction

   (1) Detect a cross-iteration def-use cycle that represents a simple
   reduction computation.  We look for the following pattern:

   loop_header:
     a1 = phi < a0, a2 >
     a3 = ...
     a2 = operation (a3, a1)

   such that:
   1. operation is commutative and associative and it is safe to
      change the order of the computation (if CHECK_REDUCTION is true)
   2. no uses for a2 in the loop (a2 is used out of the loop)
   3. no uses of a1 in the loop besides the reduction operation.

   Condition 1 is tested here.
   Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.

   (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
   nested cycles, if CHECK_REDUCTION is false.

   (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
   reductions:

     a1 = phi < a0, a2 >
     inner loop (def of a3)
     a2 = phi < a3 >  */
gimple
vect_is_simple_reduction (loop_vec_info loop_info, gimple phi,
                          bool check_reduction, bool *double_reduc)
{
  struct loop *loop = (gimple_bb (phi))->loop_father;
  struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
  edge latch_e = loop_latch_edge (loop);
  tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
  gimple def_stmt, def1 = NULL, def2 = NULL;
  enum tree_code code;
  tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
  tree type;
  int nloop_uses;
  tree name;
  imm_use_iterator imm_iter;
  use_operand_p use_p;
  bool phi_def;

  *double_reduc = false;

  /* If CHECK_REDUCTION is true, we assume inner-most loop vectorization,
     otherwise, we assume outer loop vectorization.  */
  gcc_assert ((check_reduction && loop == vect_loop)
              || (!check_reduction && flow_loop_nested_p (vect_loop, loop)));
  name = PHI_RESULT (phi);
  nloop_uses = 0;
  FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
    {
      gimple use_stmt = USE_STMT (use_p);
      if (is_gimple_debug (use_stmt))
        continue;
      if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
          && vinfo_for_stmt (use_stmt)
          && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
        nloop_uses++;
      if (nloop_uses > 1)
        {
          if (vect_print_dump_info (REPORT_DETAILS))
            fprintf (vect_dump, "reduction used in loop.");
          return NULL;
        }
    }

  if (TREE_CODE (loop_arg) != SSA_NAME)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        {
          fprintf (vect_dump, "reduction: not ssa_name: ");
          print_generic_expr (vect_dump, loop_arg, TDF_SLIM);
        }
      return NULL;
    }
  def_stmt = SSA_NAME_DEF_STMT (loop_arg);
  if (!def_stmt)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "reduction: no def_stmt.");
      return NULL;
    }

  if (!is_gimple_assign (def_stmt) && gimple_code (def_stmt) != GIMPLE_PHI)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        print_gimple_stmt (vect_dump, def_stmt, 0, TDF_SLIM);
      return NULL;
    }

  if (is_gimple_assign (def_stmt))
    {
      name = gimple_assign_lhs (def_stmt);
      phi_def = false;
    }
  else
    {
      name = PHI_RESULT (def_stmt);
      phi_def = true;
    }
  nloop_uses = 0;
  FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
    {
      gimple use_stmt = USE_STMT (use_p);
      if (is_gimple_debug (use_stmt))
        continue;
      if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
          && vinfo_for_stmt (use_stmt)
          && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
        nloop_uses++;
      if (nloop_uses > 1)
        {
          if (vect_print_dump_info (REPORT_DETAILS))
            fprintf (vect_dump, "reduction used in loop.");
          return NULL;
        }
    }
  /* If DEF_STMT is a phi node itself, we expect it to have a single argument
     defined in the inner loop.  */
  if (phi_def)
    {
      op1 = PHI_ARG_DEF (def_stmt, 0);

      if (gimple_phi_num_args (def_stmt) != 1
          || TREE_CODE (op1) != SSA_NAME)
        {
          if (vect_print_dump_info (REPORT_DETAILS))
            fprintf (vect_dump, "unsupported phi node definition.");
          return NULL;
        }

      def1 = SSA_NAME_DEF_STMT (op1);
      if (flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
          && loop->inner
          && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
          && is_gimple_assign (def1))
        {
          if (vect_print_dump_info (REPORT_DETAILS))
            report_vect_op (def_stmt, "detected double reduction: ");

          *double_reduc = true;
          return def_stmt;
        }

      return NULL;
    }
  code = gimple_assign_rhs_code (def_stmt);

  if (check_reduction
      && (!commutative_tree_code (code) || !associative_tree_code (code)))
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        report_vect_op (def_stmt, "reduction: not commutative/associative: ");
      return NULL;
    }

  if (get_gimple_rhs_class (code) != GIMPLE_BINARY_RHS)
    {
      if (code != COND_EXPR)
        {
          if (vect_print_dump_info (REPORT_DETAILS))
            report_vect_op (def_stmt, "reduction: not binary operation: ");
          return NULL;
        }

      op3 = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
      if (COMPARISON_CLASS_P (op3))
        {
          op4 = TREE_OPERAND (op3, 1);
          op3 = TREE_OPERAND (op3, 0);
        }

      op1 = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 1);
      op2 = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 2);

      if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
        {
          if (vect_print_dump_info (REPORT_DETAILS))
            report_vect_op (def_stmt, "reduction: uses not ssa_names: ");
          return NULL;
        }
    }
  else
    {
      op1 = gimple_assign_rhs1 (def_stmt);
      op2 = gimple_assign_rhs2 (def_stmt);

      if (TREE_CODE (op1) != SSA_NAME || TREE_CODE (op2) != SSA_NAME)
        {
          if (vect_print_dump_info (REPORT_DETAILS))
            report_vect_op (def_stmt, "reduction: uses not ssa_names: ");
          return NULL;
        }
    }

  type = TREE_TYPE (gimple_assign_lhs (def_stmt));
  if ((TREE_CODE (op1) == SSA_NAME
       && !types_compatible_p (type, TREE_TYPE (op1)))
      || (TREE_CODE (op2) == SSA_NAME
          && !types_compatible_p (type, TREE_TYPE (op2)))
      || (op3 && TREE_CODE (op3) == SSA_NAME
          && !types_compatible_p (type, TREE_TYPE (op3)))
      || (op4 && TREE_CODE (op4) == SSA_NAME
          && !types_compatible_p (type, TREE_TYPE (op4))))
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        {
          fprintf (vect_dump, "reduction: multiple types: operation type: ");
          print_generic_expr (vect_dump, type, TDF_SLIM);
          fprintf (vect_dump, ", operands types: ");
          print_generic_expr (vect_dump, TREE_TYPE (op1), TDF_SLIM);
          fprintf (vect_dump, ",");
          print_generic_expr (vect_dump, TREE_TYPE (op2), TDF_SLIM);
          if (op3)
            {
              fprintf (vect_dump, ",");
              print_generic_expr (vect_dump, TREE_TYPE (op3), TDF_SLIM);
            }
          if (op4)
            {
              fprintf (vect_dump, ",");
              print_generic_expr (vect_dump, TREE_TYPE (op4), TDF_SLIM);
            }
        }
      return NULL;
    }
  /* Check that it's ok to change the order of the computation.
     Generally, when vectorizing a reduction we change the order of the
     computation.  This may change the behavior of the program in some
     cases, so we need to check that this is ok.  One exception is when
     vectorizing an outer-loop: the inner-loop is executed sequentially,
     and therefore vectorizing reductions in the inner-loop during
     outer-loop vectorization is safe.  */
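  /* For example (an added illustration): in floats,
     (1e20f + -1e20f) + 3.5f == 3.5f, but the reassociated
     1e20f + (-1e20f + 3.5f) rounds to 0.0f, so reordering a float
     reduction is only safe when -fassociative-math permits it.  */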
  /* CHECKME: check for !flag_finite_math_only too?  */
  if (SCALAR_FLOAT_TYPE_P (type) && !flag_associative_math
      && check_reduction)
    {
      /* Changing the order of operations changes the semantics.  */
      if (vect_print_dump_info (REPORT_DETAILS))
        report_vect_op (def_stmt, "reduction: unsafe fp math optimization: ");
      return NULL;
    }
  else if (INTEGRAL_TYPE_P (type) && TYPE_OVERFLOW_TRAPS (type)
           && check_reduction)
    {
      /* Changing the order of operations changes the semantics.  */
      if (vect_print_dump_info (REPORT_DETAILS))
        report_vect_op (def_stmt, "reduction: unsafe int math optimization: ");
      return NULL;
    }
  else if (SAT_FIXED_POINT_TYPE_P (type) && check_reduction)
    {
      /* Changing the order of operations changes the semantics.  */
      if (vect_print_dump_info (REPORT_DETAILS))
        report_vect_op (def_stmt,
                        "reduction: unsafe fixed-point math optimization: ");
      return NULL;
    }
  /* Reduction is safe.  We're dealing with one of the following:
     1) integer arithmetic and no trapv
     2) floating point arithmetic, and special flags permit this optimization
     3) nested cycle (i.e., outer loop vectorization).  */
  if (TREE_CODE (op1) == SSA_NAME)
    def1 = SSA_NAME_DEF_STMT (op1);

  if (TREE_CODE (op2) == SSA_NAME)
    def2 = SSA_NAME_DEF_STMT (op2);

  if (code != COND_EXPR
      && (!def1 || !def2 || gimple_nop_p (def1) || gimple_nop_p (def2)))
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        report_vect_op (def_stmt, "reduction: no defs for operands: ");
      return NULL;
    }
  /* Check that one def is the reduction def, defined by PHI,
     the other def is either defined in the loop ("vect_internal_def"),
     or it's an induction (defined by a loop-header phi-node).  */

  if (def2 && def2 == phi
      && (code == COND_EXPR
          || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1))
              && (is_gimple_assign (def1)
                  || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
                     == vect_induction_def
                  || (gimple_code (def1) == GIMPLE_PHI
                      && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
                         == vect_internal_def
                      && !is_loop_header_bb_p (gimple_bb (def1)))))))
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        report_vect_op (def_stmt, "detected reduction: ");
      return def_stmt;
    }
  else if (def1 && def1 == phi
           && (code == COND_EXPR
               || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2))
                   && (is_gimple_assign (def2)
                       || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
                          == vect_induction_def
                       || (gimple_code (def2) == GIMPLE_PHI
                           && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
                              == vect_internal_def
                           && !is_loop_header_bb_p (gimple_bb (def2)))))))
    {
      if (check_reduction)
        {
          /* Swap operands (just for simplicity - so that the rest of the code
             can assume that the reduction variable is always the last (second)
             argument).  */
          if (vect_print_dump_info (REPORT_DETAILS))
            report_vect_op (def_stmt,
                            "detected reduction: need to swap operands: ");

          swap_tree_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
                              gimple_assign_rhs2_ptr (def_stmt));
        }
      else
        {
          if (vect_print_dump_info (REPORT_DETAILS))
            report_vect_op (def_stmt, "detected reduction: ");
        }

      return def_stmt;
    }
  else
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        report_vect_op (def_stmt, "reduction: unknown pattern: ");

      return NULL;
    }
}
/* Function vect_estimate_min_profitable_iters

   Return the number of iterations required for the vector version of the
   loop to be profitable relative to the cost of the scalar version of the
   loop.

   TODO: Take profile info into account before making vectorization
   decisions, if available.  */
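/* In rough terms (an added sketch of the inequality this function encodes,
   not a quote of its exact arithmetic), vectorization is profitable once

     niters * scalar_single_iter_cost
       > vec_outside_cost + (niters / vf) * vec_inside_cost

   so the threshold returned is on the order of

     (vec_outside_cost - scalar_outside_cost) * vf
       / (scalar_single_iter_cost * vf - vec_inside_cost)

   adjusted below for peeled prologue/epilogue iterations and guard
   branches.  */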
int
vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo)
{
  int i;
  int min_profitable_iters;
  int peel_iters_prologue;
  int peel_iters_epilogue;
  int vec_inside_cost = 0;
  int vec_outside_cost = 0;
  int scalar_single_iter_cost = 0;
  int scalar_outside_cost = 0;
  int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
  int nbbs = loop->num_nodes;
  int byte_misalign = LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo);
  int peel_guard_costs = 0;
  int innerloop_iters = 0, factor;
  VEC (slp_instance, heap) *slp_instances;
  slp_instance instance;
  /* Cost model disabled.  */
  if (!flag_vect_cost_model)
    {
      if (vect_print_dump_info (REPORT_COST))
        fprintf (vect_dump, "cost model disabled.");
      return 0;
    }

  /* Requires loop versioning tests to handle misalignment.  */
  if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
    {
      /* FIXME: Make cost depend on complexity of individual check.  */
      vec_outside_cost +=
        VEC_length (gimple, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));
      if (vect_print_dump_info (REPORT_COST))
        fprintf (vect_dump, "cost model: Adding cost of checks for loop "
                 "versioning to treat misalignment.\n");
    }

  /* Requires loop versioning with alias checks.  */
  if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
    {
      /* FIXME: Make cost depend on complexity of individual check.  */
      vec_outside_cost +=
        VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo));
      if (vect_print_dump_info (REPORT_COST))
        fprintf (vect_dump, "cost model: Adding cost of checks for loop "
                 "versioning aliasing.\n");
    }

  if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
      || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
    vec_outside_cost += TARG_COND_TAKEN_BRANCH_COST;
  /* Count statements in scalar loop.  Using this as scalar cost for a single
     iteration for now.

     TODO: Add outer loop support.

     TODO: Consider assigning different costs to different scalar
     statements.  */

  /* FORNOW.  */
  if (loop->inner)
    innerloop_iters = 50; /* FIXME */

  for (i = 0; i < nbbs; i++)
    {
      gimple_stmt_iterator si;
      basic_block bb = bbs[i];

      if (bb->loop_father == loop->inner)
        factor = innerloop_iters;
      else
        factor = 1;

      for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
        {
          gimple stmt = gsi_stmt (si);
          stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
          /* Skip stmts that are not vectorized inside the loop.  */
          if (!STMT_VINFO_RELEVANT_P (stmt_info)
              && (!STMT_VINFO_LIVE_P (stmt_info)
                  || STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def))
            continue;
          scalar_single_iter_cost += cost_for_stmt (stmt) * factor;
          vec_inside_cost += STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) * factor;
          /* FIXME: for stmts in the inner-loop in outer-loop vectorization,
             some of the "outside" costs are generated inside the outer-loop.  */
          vec_outside_cost += STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info);
        }
    }
2047 /* Add additional cost for the peeled instructions in prologue and epilogue
2050 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
2051 at compile-time - we assume it's vf/2 (the worst would be vf-1).
2053 TODO: Build an expression that represents peel_iters for prologue and
2054 epilogue to be used in a run-time test. */
2056 if (byte_misalign < 0)
2058 peel_iters_prologue = vf/2;
2059 if (vect_print_dump_info (REPORT_COST))
2060 fprintf (vect_dump, "cost model: "
2061 "prologue peel iters set to vf/2.");
2063 /* If peeling for alignment is unknown, loop bound of main loop becomes unknown. */
2065 peel_iters_epilogue = vf/2;
2066 if (vect_print_dump_info (REPORT_COST))
2067 fprintf (vect_dump, "cost model: "
2068 "epilogue peel iters set to vf/2 because "
2069 "peeling for alignment is unknown .");
2071 /* If peeled iterations are unknown, count a taken branch and a not taken
2072 branch per peeled loop. Even if scalar loop iterations are known,
2073 vector iterations are not known since peeled prologue iterations are
2074 not known. Hence guards remain the same. */
2075 peel_guard_costs += 2 * (TARG_COND_TAKEN_BRANCH_COST
2076 + TARG_COND_NOT_TAKEN_BRANCH_COST);
}
else
{
if (byte_misalign)
{
2082 struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2083 int element_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
2084 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr)));
2085 int nelements = TYPE_VECTOR_SUBPARTS (vectype);
2087 peel_iters_prologue = nelements - (byte_misalign / element_size);
}
else
2090 peel_iters_prologue = 0;
2092 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2094 peel_iters_epilogue = vf/2;
2095 if (vect_print_dump_info (REPORT_COST))
2096 fprintf (vect_dump, "cost model: "
2097 "epilogue peel iters set to vf/2 because "
2098 "loop iterations are unknown .");
2100 /* If peeled iterations are known but the number of scalar loop
2101 iterations is unknown, count a taken branch per peeled loop. */
2102 peel_guard_costs += 2 * TARG_COND_TAKEN_BRANCH_COST;
}
else
{
2107 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
2108 peel_iters_prologue = niters < peel_iters_prologue ?
2109 niters : peel_iters_prologue;
2110 peel_iters_epilogue = (niters - peel_iters_prologue) % vf;
}
}
2114 vec_outside_cost += (peel_iters_prologue * scalar_single_iter_cost)
2115 + (peel_iters_epilogue * scalar_single_iter_cost)
+ peel_guard_costs;
2118 /* FORNOW: The scalar outside cost is incremented in one of the
following ways:

2121 1. The vectorizer checks for alignment and aliasing and generates
2122 a condition that allows dynamic vectorization. A cost model
2123 check is ANDED with the versioning condition. Hence scalar code
2124 path now has the added cost of the versioning check.

2126 if (cost > th & versioning_check)
jmp to vector code

2129 Hence run-time scalar is incremented by not-taken branch cost.

2131 2. The vectorizer then checks if a prologue is required. If the
2132 cost model check was not done before during versioning, it has to
2133 be done before the prologue check.

if (cost <= th)
2136 prologue = scalar_iters
if (prologue == 0)
jmp to vector code
else
execute prologue
2141 if (prologue == num_iters)
go to exit

2144 Hence the run-time scalar cost is incremented by a taken branch,
2145 plus a not-taken branch, plus a taken branch cost.

2147 3. The vectorizer then checks if an epilogue is required. If the
2148 cost model check was not done before during prologue check, it
2149 has to be done with the epilogue check.

if (prologue == 0)
jmp to vector code
else
execute prologue
2155 if (prologue == num_iters)
go to exit
vector code:
2158 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
jmp to epilogue

2161 Hence the run-time scalar cost should be incremented by 2 taken
branches.

2164 TODO: The back end may reorder the BBS's differently and reverse
2165 conditions/branch directions. Change the estimates below to
2166 something more reasonable. */
2168 /* If the number of iterations is known and we do not do versioning, we can
2169 decide whether to vectorize at compile time. Hence the scalar version
2170 does not carry cost model guard costs. */
2171 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2172 || LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2173 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2175 /* Cost model check occurs at versioning. */
2176 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2177 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2178 scalar_outside_cost += TARG_COND_NOT_TAKEN_BRANCH_COST;
else
{
2181 /* Cost model check occurs at prologue generation. */
2182 if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2183 scalar_outside_cost += 2 * TARG_COND_TAKEN_BRANCH_COST
2184 + TARG_COND_NOT_TAKEN_BRANCH_COST;
2185 /* Cost model check occurs at epilogue generation. */
else
2187 scalar_outside_cost += 2 * TARG_COND_TAKEN_BRANCH_COST;
}
2191 /* Add SLP costs. */
2192 slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
2193 for (i = 0; VEC_iterate (slp_instance, slp_instances, i, instance); i++)
2195 vec_outside_cost += SLP_INSTANCE_OUTSIDE_OF_LOOP_COST (instance);
2196 vec_inside_cost += SLP_INSTANCE_INSIDE_OF_LOOP_COST (instance);
2199 /* Calculate number of iterations required to make the vector version
2200 profitable, relative to the loop bodies only. The following condition
must hold true:
2202 SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
where
2204 SIC = scalar iteration cost, VIC = vector iteration cost,
2205 VOC = vector outside cost, VF = vectorization factor,
2206 PL_ITERS = prologue iterations, EP_ITERS = epilogue iterations,
2207 SOC = scalar outside cost for run time cost model check. */
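/* A worked instance with made-up costs, for illustration only: with
   SIC = 4, VIC = 6, VF = 4, VOC = 14, SOC = 0 and no peeling
   (PL_ITERS = EP_ITERS = 0), the condition reads
   4 * niters > 6 * (niters/4) + 14, i.e. 2.5 * niters > 14,
   so vectorization pays off for niters >= 6.  */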
2209 if ((scalar_single_iter_cost * vf) > vec_inside_cost)
2211 if (vec_outside_cost <= 0)
2212 min_profitable_iters = 1;
else
{
2215 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost) * vf
2216 - vec_inside_cost * peel_iters_prologue
2217 - vec_inside_cost * peel_iters_epilogue)
2218 / ((scalar_single_iter_cost * vf)
- vec_inside_cost);
2221 if ((scalar_single_iter_cost * vf * min_profitable_iters)
2222 <= ((vec_inside_cost * min_profitable_iters)
2223 + ((vec_outside_cost - scalar_outside_cost) * vf)))
2224 min_profitable_iters++;
}
}
2227 /* vector version will never be profitable. */
else
{
2230 if (vect_print_dump_info (REPORT_COST))
2231 fprintf (vect_dump, "cost model: the vector iteration cost = %d "
2232 "divided by the scalar iteration cost = %d "
2233 "is greater than or equal to the vectorization factor = %d.",
2234 vec_inside_cost, scalar_single_iter_cost, vf);
return -1;
}
2238 if (vect_print_dump_info (REPORT_COST))
2240 fprintf (vect_dump, "Cost model analysis: \n");
2241 fprintf (vect_dump, " Vector inside of loop cost: %d\n",
vec_inside_cost);
2243 fprintf (vect_dump, " Vector outside of loop cost: %d\n",
vec_outside_cost);
2245 fprintf (vect_dump, " Scalar iteration cost: %d\n",
2246 scalar_single_iter_cost);
2247 fprintf (vect_dump, " Scalar outside cost: %d\n", scalar_outside_cost);
2248 fprintf (vect_dump, " prologue iterations: %d\n",
2249 peel_iters_prologue);
2250 fprintf (vect_dump, " epilogue iterations: %d\n",
2251 peel_iters_epilogue);
2252 fprintf (vect_dump, " Calculated minimum iters for profitability: %d\n",
2253 min_profitable_iters);
2256 min_profitable_iters =
2257 min_profitable_iters < vf ? vf : min_profitable_iters;
2259 /* Because the condition we create is:
2260 if (niters <= min_profitable_iters)
2261 then skip the vectorized loop. */
2262 min_profitable_iters--;
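/* Illustration: if min_profitable_iters was computed as 6 (and VF <= 6,
   so the clamp above did not raise it), we return 5, and the run-time
   test "niters <= 5" skips the vectorized loop exactly when fewer than
   6 iterations would run.  */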
2264 if (vect_print_dump_info (REPORT_COST))
2265 fprintf (vect_dump, " Profitability threshold = %d\n",
2266 min_profitable_iters);
2268 return min_profitable_iters;
2272 /* TODO: There is a close dependency between the vect_model_*_cost and
2273 vectorizable_* functions. Redesign this coupling to avoid maintenance issues. */
2275 /* Function vect_model_reduction_cost.
2277 Models cost for a reduction operation, including the vector ops
2278 generated within the strip-mine loop, the initial definition before
2279 the loop, and the epilogue code that must be generated. */
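/* Illustration (the TARG_* costs below are target-dependent): a PLUS_EXPR
   reduction vectorized with ncopies = 1 pays one vector stmt per iteration
   inside the loop, one scalar-to-vector cost for the initial definition
   before the loop, and - when a reduc_code is available and the reduction
   is not nested - one vector stmt plus one vec-to-scalar extract in the
   epilogue.  */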
2282 vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
int ncopies)
2286 enum tree_code code;
2289 gimple stmt, orig_stmt;
2291 enum machine_mode mode;
2292 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2293 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2296 /* Cost of reduction op inside loop. */
2297 STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) += ncopies * TARG_VEC_STMT_COST;
2299 stmt = STMT_VINFO_STMT (stmt_info);
2301 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
2303 case GIMPLE_SINGLE_RHS:
2304 gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt)) == ternary_op);
2305 reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), 2);
break;
2307 case GIMPLE_UNARY_RHS:
2308 reduction_op = gimple_assign_rhs1 (stmt);
break;
2310 case GIMPLE_BINARY_RHS:
2311 reduction_op = gimple_assign_rhs2 (stmt);
break;
default:
gcc_unreachable ();
2317 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
if (!vectype)
{
2320 if (vect_print_dump_info (REPORT_COST))
2322 fprintf (vect_dump, "unsupported data-type ");
2323 print_generic_expr (vect_dump, TREE_TYPE (reduction_op), TDF_SLIM);
return false;
}
2328 mode = TYPE_MODE (vectype);
2329 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
if (!orig_stmt)
2332 orig_stmt = STMT_VINFO_STMT (stmt_info);
2334 code = gimple_assign_rhs_code (orig_stmt);
2336 /* Add in cost for initial definition. */
2337 outer_cost += TARG_SCALAR_TO_VEC_COST;
2339 /* Determine cost of epilogue code.
2341 We have a reduction operator that will reduce the vector in one statement.
2342 Also requires scalar extract. */
2344 if (!nested_in_vect_loop_p (loop, orig_stmt))
2346 if (reduc_code != ERROR_MARK)
2347 outer_cost += TARG_VEC_STMT_COST + TARG_VEC_TO_SCALAR_COST;
else
{
2350 int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
tree bitsize =
2352 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
2353 int element_bitsize = tree_low_cst (bitsize, 1);
2354 int nelements = vec_size_in_bits / element_bitsize;
2356 optab = optab_for_tree_code (code, vectype, optab_default);
2358 /* We have a whole vector shift available. */
2359 if (VECTOR_MODE_P (mode)
2360 && optab_handler (optab, mode)->insn_code != CODE_FOR_nothing
2361 && optab_handler (vec_shr_optab, mode)->insn_code != CODE_FOR_nothing)
2362 /* Final reduction via vector shifts and the reduction operator. Also
2363 requires scalar extract. */
2364 outer_cost += ((exact_log2(nelements) * 2) * TARG_VEC_STMT_COST
2365 + TARG_VEC_TO_SCALAR_COST);
else
2367 /* Use extracts and reduction op for final reduction. For N elements,
2368 we have N extracts and N-1 reduction ops. */
2369 outer_cost += ((nelements + nelements - 1) * TARG_VEC_STMT_COST);
2373 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info) = outer_cost;
2375 if (vect_print_dump_info (REPORT_COST))
2376 fprintf (vect_dump, "vect_model_reduction_cost: inside_cost = %d, "
2377 "outside_cost = %d.", STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info),
2378 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info));

return true;
2384 /* Function vect_model_induction_cost.
2386 Models cost for induction operations. */
2389 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
2391 /* loop cost for vec_loop. */
2392 STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) = ncopies * TARG_VEC_STMT_COST;
2393 /* prologue cost for vec_init and vec_step. */
2394 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info) = 2 * TARG_SCALAR_TO_VEC_COST;
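/* E.g., with VF = 8 and nunits = 4 (hence ncopies = 2), the loop body
   pays two vector stmts per iteration, while the prologue cost stays at
   two scalar-to-vector copies (one for vec_init, one for vec_step)
   regardless of ncopies.  */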
2396 if (vect_print_dump_info (REPORT_COST))
2397 fprintf (vect_dump, "vect_model_induction_cost: inside_cost = %d, "
2398 "outside_cost = %d .", STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info),
2399 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info));
2403 /* Function get_initial_def_for_induction
Input:
2406 STMT - a stmt that performs an induction operation in the loop.
2407 IV_PHI - the initial value of the induction variable
Output:
2410 Return a vector variable, initialized with the first VF values of
2411 the induction variable. E.g., for an iv with IV_PHI='X' and
2412 evolution S, for a vector of 4 units, we want to return:
2413 [X, X + S, X + 2*S, X + 3*S]. */
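/* A concrete instance (values invented for illustration): for an iv with
   IV_PHI X = 3, step S = 2 and a 4-unit vector, the returned vector is
   [3, 5, 7, 9]; the step vector used to advance it by a full vector
   iteration is [VF*S, VF*S, VF*S, VF*S] = [8, 8, 8, 8].  */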
2416 get_initial_def_for_induction (gimple iv_phi)
2418 stmt_vec_info stmt_vinfo = vinfo_for_stmt (iv_phi);
2419 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
2420 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2421 tree scalar_type = TREE_TYPE (gimple_phi_result (iv_phi));
2424 edge pe = loop_preheader_edge (loop);
2425 struct loop *iv_loop;
2427 tree vec, vec_init, vec_step, t;
2431 gimple init_stmt, induction_phi, new_stmt;
2432 tree induc_def, vec_def, vec_dest;
2433 tree init_expr, step_expr;
2434 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2439 stmt_vec_info phi_info = vinfo_for_stmt (iv_phi);
2440 bool nested_in_vect_loop = false;
2441 gimple_seq stmts = NULL;
2442 imm_use_iterator imm_iter;
2443 use_operand_p use_p;
2447 gimple_stmt_iterator si;
2448 basic_block bb = gimple_bb (iv_phi);
2451 vectype = get_vectype_for_scalar_type (scalar_type);
2452 gcc_assert (vectype);
2453 nunits = TYPE_VECTOR_SUBPARTS (vectype);
2454 ncopies = vf / nunits;
2456 gcc_assert (phi_info);
2457 gcc_assert (ncopies >= 1);
2459 /* Find the first insertion point in the BB. */
2460 si = gsi_after_labels (bb);
2462 if (INTEGRAL_TYPE_P (scalar_type))
2463 step_expr = build_int_cst (scalar_type, 0);
2464 else if (POINTER_TYPE_P (scalar_type))
2465 step_expr = build_int_cst (sizetype, 0);
else
2467 step_expr = build_real (scalar_type, dconst0);
2469 /* Is phi in an inner-loop, while vectorizing an enclosing outer-loop? */
2470 if (nested_in_vect_loop_p (loop, iv_phi))
2472 nested_in_vect_loop = true;
2473 iv_loop = loop->inner;
else
iv_loop = loop;
2477 gcc_assert (iv_loop == (gimple_bb (iv_phi))->loop_father);
2479 latch_e = loop_latch_edge (iv_loop);
2480 loop_arg = PHI_ARG_DEF_FROM_EDGE (iv_phi, latch_e);
2482 access_fn = analyze_scalar_evolution (iv_loop, PHI_RESULT (iv_phi));
2483 gcc_assert (access_fn);
2484 ok = vect_is_simple_iv_evolution (iv_loop->num, access_fn,
2485 &init_expr, &step_expr);
gcc_assert (ok);
2487 pe = loop_preheader_edge (iv_loop);
2489 /* Create the vector that holds the initial_value of the induction. */
2490 if (nested_in_vect_loop)
2492 /* iv_loop is nested in the loop to be vectorized. init_expr has already
2493 been created during vectorization of previous stmts; we obtain it from
2494 the STMT_VINFO_VEC_STMT of the defining stmt. */
2495 tree iv_def = PHI_ARG_DEF_FROM_EDGE (iv_phi,
2496 loop_preheader_edge (iv_loop));
2497 vec_init = vect_get_vec_def_for_operand (iv_def, iv_phi, NULL);
2501 /* iv_loop is the loop to be vectorized. Create:
2502 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
2503 new_var = vect_get_new_vect_var (scalar_type, vect_scalar_var, "var_");
2504 add_referenced_var (new_var);
2506 new_name = force_gimple_operand (init_expr, &stmts, false, new_var);
if (stmts)
2509 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
2510 gcc_assert (!new_bb);
t = NULL_TREE;
2514 t = tree_cons (NULL_TREE, init_expr, t);
2515 for (i = 1; i < nunits; i++)
2517 /* Create: new_name_i = new_name + step_expr */
2518 enum tree_code code = POINTER_TYPE_P (scalar_type)
2519 ? POINTER_PLUS_EXPR : PLUS_EXPR;
2520 init_stmt = gimple_build_assign_with_ops (code, new_var,
2521 new_name, step_expr);
2522 new_name = make_ssa_name (new_var, init_stmt);
2523 gimple_assign_set_lhs (init_stmt, new_name);
2525 new_bb = gsi_insert_on_edge_immediate (pe, init_stmt);
2526 gcc_assert (!new_bb);
2528 if (vect_print_dump_info (REPORT_DETAILS))
2530 fprintf (vect_dump, "created new init_stmt: ");
2531 print_gimple_stmt (vect_dump, init_stmt, 0, TDF_SLIM);
2533 t = tree_cons (NULL_TREE, new_name, t);
2535 /* Create a vector from [new_name_0, new_name_1, ..., new_name_nunits-1] */
2536 vec = build_constructor_from_list (vectype, nreverse (t));
2537 vec_init = vect_init_vector (iv_phi, vec, vectype, NULL);
2541 /* Create the vector that holds the step of the induction. */
2542 if (nested_in_vect_loop)
2543 /* iv_loop is nested in the loop to be vectorized. Generate:
2544 vec_step = [S, S, S, S] */
2545 new_name = step_expr;
2548 /* iv_loop is the loop to be vectorized. Generate:
2549 vec_step = [VF*S, VF*S, VF*S, VF*S] */
2550 expr = build_int_cst (TREE_TYPE (step_expr), vf);
2551 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
expr, step_expr);
t = NULL_TREE;
2556 for (i = 0; i < nunits; i++)
2557 t = tree_cons (NULL_TREE, unshare_expr (new_name), t);
2558 gcc_assert (CONSTANT_CLASS_P (new_name));
2559 stepvectype = get_vectype_for_scalar_type (TREE_TYPE (new_name));
2560 gcc_assert (stepvectype);
2561 vec = build_vector (stepvectype, t);
2562 vec_step = vect_init_vector (iv_phi, vec, stepvectype, NULL);
2565 /* Create the following def-use cycle:
loop prolog:
vec_init = ...
vec_step = ...
loop:
2570 vec_iv = PHI <vec_init, vec_loop>
...
STMT
...
2574 vec_loop = vec_iv + vec_step; */
2576 /* Create the induction-phi that defines the induction-operand. */
2577 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
2578 add_referenced_var (vec_dest);
2579 induction_phi = create_phi_node (vec_dest, iv_loop->header);
2580 set_vinfo_for_stmt (induction_phi,
2581 new_stmt_vec_info (induction_phi, loop_vinfo, NULL));
2582 induc_def = PHI_RESULT (induction_phi);
2584 /* Create the iv update inside the loop */
2585 new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, vec_dest,
2586 induc_def, vec_step);
2587 vec_def = make_ssa_name (vec_dest, new_stmt);
2588 gimple_assign_set_lhs (new_stmt, vec_def);
2589 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
2590 set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo,
NULL));
2593 /* Set the arguments of the phi node: */
2594 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
2595 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
UNKNOWN_LOCATION);
2599 /* In case the vectorization factor (VF) is bigger than the number
2600 of elements that we can fit in a vectype (nunits), we have to generate
2601 more than one vector stmt - i.e - we need to "unroll" the
2602 vector stmt by a factor VF/nunits. For more details see documentation
2603 in vectorizable_operation. */
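/* For example (illustrative values), with VF = 8 and nunits = 4 we have
   ncopies = 2: for X = 0, S = 1 the first vector is [0, 1, 2, 3], and the
   copy generated below adds vec_step = [nunits*S, ...] = [4, 4, 4, 4] to
   produce [4, 5, 6, 7].  */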
if (ncopies > 1)
{
2607 stmt_vec_info prev_stmt_vinfo;
2608 /* FORNOW. This restriction should be relaxed. */
2609 gcc_assert (!nested_in_vect_loop);
2611 /* Create the vector that holds the step of the induction. */
2612 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
2613 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
expr, step_expr);
t = NULL_TREE;
2616 for (i = 0; i < nunits; i++)
2617 t = tree_cons (NULL_TREE, unshare_expr (new_name), t);
2618 gcc_assert (CONSTANT_CLASS_P (new_name));
2619 vec = build_vector (stepvectype, t);
2620 vec_step = vect_init_vector (iv_phi, vec, stepvectype, NULL);
2622 vec_def = induc_def;
2623 prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
2624 for (i = 1; i < ncopies; i++)
2626 /* vec_i = vec_prev + vec_step */
2627 new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, vec_dest,
vec_def, vec_step);
2629 vec_def = make_ssa_name (vec_dest, new_stmt);
2630 gimple_assign_set_lhs (new_stmt, vec_def);
2632 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
2633 set_vinfo_for_stmt (new_stmt,
2634 new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
2635 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
2636 prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
2640 if (nested_in_vect_loop)
2642 /* Find the loop-closed exit-phi of the induction, and record
2643 the final vector of induction results: */
exit_phi = NULL;
2645 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
2647 if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (USE_STMT (use_p))))
{
2649 exit_phi = USE_STMT (use_p);
break;
}
if (exit_phi)
{
2655 stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
2656 /* FORNOW. Currently not supporting the case that an inner-loop induction
2657 is not used in the outer-loop (i.e. only outside the outer-loop). */
2658 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
2659 && !STMT_VINFO_LIVE_P (stmt_vinfo));
2661 STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
2662 if (vect_print_dump_info (REPORT_DETAILS))
2664 fprintf (vect_dump, "vector of inductions after inner-loop:");
2665 print_gimple_stmt (vect_dump, new_stmt, 0, TDF_SLIM);
2671 if (vect_print_dump_info (REPORT_DETAILS))
2673 fprintf (vect_dump, "transform induction: created def-use cycle: ");
2674 print_gimple_stmt (vect_dump, induction_phi, 0, TDF_SLIM);
2675 fprintf (vect_dump, "\n");
2676 print_gimple_stmt (vect_dump, SSA_NAME_DEF_STMT (vec_def), 0, TDF_SLIM);
2679 STMT_VINFO_VEC_STMT (phi_info) = induction_phi;
return induc_def;
2684 /* Function get_initial_def_for_reduction
Input:
2687 STMT - a stmt that performs a reduction operation in the loop.
2688 INIT_VAL - the initial value of the reduction variable
Output:
2691 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
2692 of the reduction (used for adjusting the epilog - see below).
2693 Return a vector variable, initialized according to the operation that STMT
2694 performs. This vector will be used as the initial value of the
2695 vector of partial results.
2697 Option1 (adjust in epilog): Initialize the vector as follows:
2698 add/bit or/xor: [0,0,...,0,0]
2699 mult/bit and: [1,1,...,1,1]
2700 min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
2701 and when necessary (e.g. add/mult case) let the caller know
2702 that it needs to adjust the result by init_val.
2704 Option2: Initialize the vector as follows:
2705 add/bit or/xor: [init_val,0,0,...,0]
2706 mult/bit and: [init_val,1,1,...,1]
2707 min/max/cond_expr: [init_val,init_val,...,init_val]
2708 and no adjustments are needed.
2710 For example, for the following code:

s = init_val;
for (i=0;i<n;i++)
s = s + a[i];

2716 STMT is 's = s + a[i]', and the reduction variable is 's'.
2717 For a vector of 4 units, we want to return either [0,0,0,init_val],
2718 or [0,0,0,0] and let the caller know that it needs to adjust
2719 the result at the end by 'init_val'.
2721 FORNOW, we use Option1 (adjust in epilog) when ADJUSTMENT_DEF is not
2722 NULL, because this way the initialization vector is simpler (same
2723 element in all entries), and Option2 otherwise.
2725 A cost model should help decide between these two schemes. */
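/* A concrete instance, with invented values: for a sum reduction with
   init_val = 5 on a 4-unit vector, Option1 starts from [0,0,0,0] and asks
   the caller to add 5 to the final scalar result, while Option2 starts
   from [5,0,0,0] and needs no adjustment; both compute init_val plus the
   sum of the elements.  */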
2728 get_initial_def_for_reduction (gimple stmt, tree init_val,
2729 tree *adjustment_def)
2731 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
2732 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
2733 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2734 tree scalar_type = TREE_TYPE (init_val);
2735 tree vectype = get_vectype_for_scalar_type (scalar_type);
2737 enum tree_code code = gimple_assign_rhs_code (stmt);
2742 bool nested_in_vect_loop = false;
2744 REAL_VALUE_TYPE real_init_val = dconst0;
2745 int int_init_val = 0;
2746 gimple def_stmt = NULL;
2748 gcc_assert (vectype);
2749 nunits = TYPE_VECTOR_SUBPARTS (vectype);
2751 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
2752 || SCALAR_FLOAT_TYPE_P (scalar_type));
2754 if (nested_in_vect_loop_p (loop, stmt))
2755 nested_in_vect_loop = true;
2757 gcc_assert (loop == (gimple_bb (stmt))->loop_father);
2759 /* In case of double reduction we only create a vector variable to be put
2760 in the reduction phi node. The actual statement creation is done in
2761 vect_create_epilog_for_reduction. */
2762 if (adjustment_def && nested_in_vect_loop
2763 && TREE_CODE (init_val) == SSA_NAME
2764 && (def_stmt = SSA_NAME_DEF_STMT (init_val))
2765 && gimple_code (def_stmt) == GIMPLE_PHI
2766 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2767 && vinfo_for_stmt (def_stmt)
2768 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2769 == vect_double_reduction_def)
2771 *adjustment_def = NULL;
2772 return vect_create_destination_var (init_val, vectype);
2775 if (TREE_CONSTANT (init_val))
{
2777 if (SCALAR_FLOAT_TYPE_P (scalar_type))
2778 init_value = build_real (scalar_type, TREE_REAL_CST (init_val));
else
2780 init_value = build_int_cst (scalar_type, TREE_INT_CST_LOW (init_val));
}
else
2783 init_value = init_val;
switch (code)
{
2787 case WIDEN_SUM_EXPR:
case DOT_PROD_EXPR:
case PLUS_EXPR:
case MINUS_EXPR:
case BIT_IOR_EXPR:
case BIT_XOR_EXPR:
case MULT_EXPR:
case BIT_AND_EXPR:
2795 /* ADJUSTMENT_DEF is NULL when called from
2796 vect_create_epilog_for_reduction to vectorize double reduction. */
if (adjustment_def)
{
2799 if (nested_in_vect_loop)
2800 *adjustment_def = vect_get_vec_def_for_operand (init_val, stmt,
NULL);
else
2803 *adjustment_def = init_val;
}
2806 if (code == MULT_EXPR || code == BIT_AND_EXPR)
{
2808 real_init_val = dconst1;
int_init_val = 1;
}
2812 if (SCALAR_FLOAT_TYPE_P (scalar_type))
2813 def_for_init = build_real (scalar_type, real_init_val);
else
2815 def_for_init = build_int_cst (scalar_type, int_init_val);
2817 /* Create a vector of '0' or '1' except the first element. */
2818 for (i = nunits - 2; i >= 0; --i)
2819 t = tree_cons (NULL_TREE, def_for_init, t);
2821 /* Option1: the first element is '0' or '1' as well. */
if (adjustment_def)
{
2824 t = tree_cons (NULL_TREE, def_for_init, t);
2825 init_def = build_vector (vectype, t);
break;
}
2829 /* Option2: the first element is INIT_VAL. */
2830 t = tree_cons (NULL_TREE, init_value, t);
2831 if (TREE_CONSTANT (init_val))
2832 init_def = build_vector (vectype, t);
else
2834 init_def = build_constructor_from_list (vectype, t);
break;
case MIN_EXPR:
case MAX_EXPR:
case COND_EXPR:
if (adjustment_def)
{
2843 *adjustment_def = NULL_TREE;
2844 init_def = vect_get_vec_def_for_operand (init_val, stmt, NULL);
break;
}
2848 for (i = nunits - 1; i >= 0; --i)
2849 t = tree_cons (NULL_TREE, init_value, t);
2851 if (TREE_CONSTANT (init_val))
2852 init_def = build_vector (vectype, t);
else
2854 init_def = build_constructor_from_list (vectype, t);
break;

default:
gcc_unreachable ();
}

return init_def;
}
2866 /* Function vect_create_epilog_for_reduction
2868 Create code at the loop-epilog to finalize the result of a reduction
computation.

2871 VECT_DEFS is a list of vectors of partial results, i.e., the lhs's of vector
2872 reduction statements.
2873 STMT is the scalar reduction stmt that is being vectorized.
2874 NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
2875 number of elements that we can fit in a vectype (nunits). In this case
2876 we have to generate more than one vector stmt - i.e - we need to "unroll"
2877 the vector stmt by a factor VF/nunits. For more details see documentation
2878 in vectorizable_operation.
2879 REDUC_CODE is the tree-code for the epilog reduction.
2880 REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
computation.
2882 REDUC_INDEX is the index of the operand in the right hand side of the
2883 statement that is defined by REDUCTION_PHI.
2884 DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
2885 SLP_NODE is an SLP node containing a group of reduction statements. The
2886 first one in this group is STMT.
This function:
2889 1. Creates the reduction def-use cycles: sets the arguments for
REDUCTION_PHIS:
2891 The loop-entry argument is the vectorized initial-value of the reduction.
2892 The loop-latch argument is taken from VECT_DEFS - the vector of partial
sums.
2894 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
2895 by applying the operation specified by REDUC_CODE if available, or by
2896 other means (whole-vector shifts or a scalar loop).
2897 The function also creates a new phi node at the loop exit to preserve
2898 loop-closed form, as illustrated below.
2900 The flow at the entry to this function:

loop:
2903 vec_def = phi <null, null> # REDUCTION_PHI
2904 VECT_DEF = vector_stmt # vectorized form of STMT
2905 s_loop = scalar_stmt # (scalar) STMT
loop_exit:
2907 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
use <s_out0>
use <s_out0>

2911 The above is transformed by this function into:

loop:
2914 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
2915 VECT_DEF = vector_stmt # vectorized form of STMT
2916 s_loop = scalar_stmt # (scalar) STMT
loop_exit:
2918 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
2919 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
2920 v_out2 = reduce <v_out1>
2921 s_out3 = extract_field <v_out2, 0>
2922 s_out4 = adjust_result <s_out3>
use <s_out4>
use <s_out4>
2928 vect_create_epilog_for_reduction (VEC (tree, heap) *vect_defs, gimple stmt,
2929 int ncopies, enum tree_code reduc_code,
2930 VEC (gimple, heap) *reduction_phis,
2931 int reduc_index, bool double_reduc,
slp_tree slp_node)
2934 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2935 stmt_vec_info prev_phi_info;
2937 enum machine_mode mode;
2938 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2939 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
2940 basic_block exit_bb;
2943 gimple new_phi = NULL, phi;
2944 gimple_stmt_iterator exit_gsi;
2946 tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
2947 gimple epilog_stmt = NULL;
2948 enum tree_code code = gimple_assign_rhs_code (stmt);
2950 tree bitsize, bitpos;
2951 tree adjustment_def = NULL;
2952 tree vec_initial_def = NULL;
2953 tree reduction_op, expr, def;
2954 tree orig_name, scalar_result;
2955 imm_use_iterator imm_iter;
2956 use_operand_p use_p;
2957 bool extract_scalar_result = false;
2958 gimple use_stmt, orig_stmt, reduction_phi = NULL;
2959 bool nested_in_vect_loop = false;
2960 VEC (gimple, heap) *new_phis = NULL;
2961 enum vect_def_type dt = vect_unknown_def_type;
2963 VEC (tree, heap) *scalar_results = NULL;
2964 int group_size = 1, k, ratio;
2965 VEC (tree, heap) *vec_initial_defs = NULL;
2966 VEC (gimple, heap) *phis;
if (slp_node)
2969 group_size = VEC_length (gimple, SLP_TREE_SCALAR_STMTS (slp_node));
2971 if (nested_in_vect_loop_p (loop, stmt))
{
outer_loop = loop;
loop = loop->inner;
2975 nested_in_vect_loop = true;
2976 gcc_assert (!slp_node);
2979 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
2981 case GIMPLE_SINGLE_RHS:
2982 gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt))
== ternary_op);
2984 reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), reduc_index);
break;
2986 case GIMPLE_UNARY_RHS:
2987 reduction_op = gimple_assign_rhs1 (stmt);
break;
2989 case GIMPLE_BINARY_RHS:
2990 reduction_op = reduc_index ?
2991 gimple_assign_rhs2 (stmt) : gimple_assign_rhs1 (stmt);
break;
default:
gcc_unreachable ();
2997 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
2998 gcc_assert (vectype);
2999 mode = TYPE_MODE (vectype);
3001 /* 1. Create the reduction def-use cycle:
3002 Set the arguments of REDUCTION_PHIS, i.e., transform

loop:
3005 vec_def = phi <null, null> # REDUCTION_PHI
3006 VECT_DEF = vector_stmt # vectorized form of STMT
s_loop = scalar_stmt # (scalar) STMT

into:

loop:
3012 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
3013 VECT_DEF = vector_stmt # vectorized form of STMT
s_loop = scalar_stmt # (scalar) STMT

3016 (in case of SLP, do it for all the phis). */
3018 /* Get the loop-entry arguments. */
if (slp_node)
3020 vect_get_slp_defs (slp_node, &vec_initial_defs, NULL, reduc_index);
else
{
3023 vec_initial_defs = VEC_alloc (tree, heap, 1);
3024 /* For the case of reduction, vect_get_vec_def_for_operand returns
3025 the scalar def before the loop, that defines the initial value
3026 of the reduction variable. */
3027 vec_initial_def = vect_get_vec_def_for_operand (reduction_op, stmt,
NULL);
3029 VEC_quick_push (tree, vec_initial_defs, vec_initial_def);
3032 /* Set phi nodes arguments. */
3033 for (i = 0; VEC_iterate (gimple, reduction_phis, i, phi); i++)
3035 tree vec_init_def = VEC_index (tree, vec_initial_defs, i);
3036 tree def = VEC_index (tree, vect_defs, i);
3037 for (j = 0; j < ncopies; j++)
3039 /* Set the loop-entry arg of the reduction-phi. */
3040 add_phi_arg (phi, vec_init_def, loop_preheader_edge (loop),
UNKNOWN_LOCATION);
3043 /* Set the loop-latch arg for the reduction-phi. */
if (j > 0)
3045 def = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type, def);
3047 add_phi_arg (phi, def, loop_latch_edge (loop), UNKNOWN_LOCATION);
3049 if (vect_print_dump_info (REPORT_DETAILS))
3051 fprintf (vect_dump, "transform reduction: created def-use"
" cycle: ");
3053 print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
3054 fprintf (vect_dump, "\n");
3055 print_gimple_stmt (vect_dump, SSA_NAME_DEF_STMT (def), 0,
TDF_SLIM);
3059 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
3063 VEC_free (tree, heap, vec_initial_defs);
3065 /* 2. Create epilog code.
3066 The reduction epilog code operates across the elements of the vector
3067 of partial results computed by the vectorized loop.
3068 The reduction epilog code consists of:
3070 step 1: compute the scalar result in a vector (v_out2)
3071 step 2: extract the scalar result (s_out3) from the vector (v_out2)
3072 step 3: adjust the scalar result (s_out3) if needed.
3074 Step 1 can be accomplished using one of the following three schemes:
3075 (scheme 1) using reduc_code, if available.
3076 (scheme 2) using whole-vector shifts, if available.
3077 (scheme 3) using a scalar loop. In this case steps 1+2 above are
combined.
3080 The overall epilog code looks like this:
3082 s_out0 = phi <s_loop> # original EXIT_PHI
3083 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
3084 v_out2 = reduce <v_out1> # step 1
3085 s_out3 = extract_field <v_out2, 0> # step 2
3086 s_out4 = adjust_result <s_out3> # step 3
3088 (step 3 is optional, and steps 1 and 2 may be combined).
3089 Lastly, the uses of s_out0 are replaced by s_out4. */
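/* Concretely (numbers invented for illustration): if the loop left the
   partial sums v_out1 = {10,20,30,40}, step 1 reduces them to a vector
   containing 100, step 2 extracts the scalar 100 into s_out3, and step 3
   adds ADJUSTMENT_DEF to s_out3 when Option1 of
   get_initial_def_for_reduction was used.  */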
3092 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
3093 v_out1 = phi <VECT_DEF>
3094 Store them in NEW_PHIS. */
3096 exit_bb = single_exit (loop)->dest;
3097 prev_phi_info = NULL;
3098 new_phis = VEC_alloc (gimple, heap, VEC_length (tree, vect_defs));
3099 for (i = 0; VEC_iterate (tree, vect_defs, i, def); i++)
3101 for (j = 0; j < ncopies; j++)
3103 phi = create_phi_node (SSA_NAME_VAR (def), exit_bb);
3104 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo, NULL));
if (j == 0)
3106 VEC_quick_push (gimple, new_phis, phi);
else
{
3109 def = vect_get_vec_def_for_stmt_copy (dt, def);
3110 STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
}
3113 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
3114 prev_phi_info = vinfo_for_stmt (phi);
3118 exit_gsi = gsi_after_labels (exit_bb);
3120 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
3121 (i.e. when reduc_code is not available) and in the final adjustment
3122 code (if needed). Also get the original scalar reduction variable as
3123 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
3124 represents a reduction pattern), the tree-code and scalar-def are
3125 taken from the original stmt that the pattern-stmt (STMT) replaces.
3126 Otherwise (it is a regular reduction) - the tree-code and scalar-def
3127 are taken from STMT. */
3129 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
if (!orig_stmt)
{
3132 /* Regular reduction */
orig_stmt = stmt;
}
else
{
3137 /* Reduction pattern */
3138 stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
3139 gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
3140 gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
3143 code = gimple_assign_rhs_code (orig_stmt);
3144 /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
3145 partial results are added and not subtracted. */
3146 if (code == MINUS_EXPR)
code = PLUS_EXPR;
3149 scalar_dest = gimple_assign_lhs (orig_stmt);
3150 scalar_type = TREE_TYPE (scalar_dest);
3151 scalar_results = VEC_alloc (tree, heap, group_size);
3152 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
3153 bitsize = TYPE_SIZE (scalar_type);
3155 /* In case this is a reduction in an inner-loop while vectorizing an outer
3156 loop - we don't need to extract a single scalar result at the end of the
3157 inner-loop (unless it is double reduction, i.e., the use of reduction is
3158 outside the outer-loop). The final vector of partial results will be used
3159 in the vectorized outer-loop, or reduced to a scalar result at the end of
the outer-loop. */
3161 if (nested_in_vect_loop && !double_reduc)
3162 goto vect_finalize_reduction;
3164 /* 2.3 Create the reduction code, using one of the three schemes described
3165 above. In SLP we simply need to extract all the elements from the
3166 vector (without reducing them), so we use scalar shifts. */
3167 if (reduc_code != ERROR_MARK && !slp_node)
3171 /*** Case 1: Create:
3172 v_out2 = reduc_expr <v_out1> */
3174 if (vect_print_dump_info (REPORT_DETAILS))
3175 fprintf (vect_dump, "Reduce using direct vector reduction.");
3177 vec_dest = vect_create_destination_var (scalar_dest, vectype);
3178 new_phi = VEC_index (gimple, new_phis, 0);
3179 tmp = build1 (reduc_code, vectype, PHI_RESULT (new_phi));
3180 epilog_stmt = gimple_build_assign (vec_dest, tmp);
3181 new_temp = make_ssa_name (vec_dest, epilog_stmt);
3182 gimple_assign_set_lhs (epilog_stmt, new_temp);
3183 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3185 extract_scalar_result = true;
3189 enum tree_code shift_code = ERROR_MARK;
3190 bool have_whole_vector_shift = true;
3192 int element_bitsize = tree_low_cst (bitsize, 1);
3193 int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
3196 if (optab_handler (vec_shr_optab, mode)->insn_code != CODE_FOR_nothing)
3197 shift_code = VEC_RSHIFT_EXPR;
else
3199 have_whole_vector_shift = false;
3201 /* Regardless of whether we have a whole vector shift, if we're
3202 emulating the operation via tree-vect-generic, we don't want
3203 to use it. Only the first round of the reduction is likely
3204 to still be profitable via emulation. */
3205 /* ??? It might be better to emit a reduction tree code here, so that
3206 tree-vect-generic can expand the first round via bit tricks. */
3207 if (!VECTOR_MODE_P (mode))
3208 have_whole_vector_shift = false;
else
{
3211 optab optab = optab_for_tree_code (code, vectype, optab_default);
3212 if (optab_handler (optab, mode)->insn_code == CODE_FOR_nothing)
3213 have_whole_vector_shift = false;
3216 if (have_whole_vector_shift && !slp_node)
3218 /*** Case 2: Create:
3219 for (offset = VS/2; offset >= element_size; offset/=2)
3221 Create: va' = vec_shift <va, offset>
3222 Create: va = vop <va, va'> */
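/* Worked example, assuming a 4-element vector and PLUS: starting from
   va = {1,2,3,4}, shifting by two elements gives {3,4,0,0} and adding
   gives {4,6,3,4}; shifting by one element gives {6,3,4,0} and adding
   gives {10,9,7,4}, so the complete sum ends up in one element (which
   lane depends on the target), from which it is extracted below.  */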
3225 if (vect_print_dump_info (REPORT_DETAILS))
3226 fprintf (vect_dump, "Reduce using vector shifts");
3228 vec_dest = vect_create_destination_var (scalar_dest, vectype);
3229 new_phi = VEC_index (gimple, new_phis, 0);
3230 new_temp = PHI_RESULT (new_phi);
3231 for (bit_offset = vec_size_in_bits/2;
3232 bit_offset >= element_bitsize;
bit_offset /= 2)
3235 tree bitpos = size_int (bit_offset);
3237 epilog_stmt = gimple_build_assign_with_ops (shift_code,
3238 vec_dest, new_temp, bitpos);
3239 new_name = make_ssa_name (vec_dest, epilog_stmt);
3240 gimple_assign_set_lhs (epilog_stmt, new_name);
3241 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3243 epilog_stmt = gimple_build_assign_with_ops (code, vec_dest,
3244 new_name, new_temp);
3245 new_temp = make_ssa_name (vec_dest, epilog_stmt);
3246 gimple_assign_set_lhs (epilog_stmt, new_temp);
3247 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3250 extract_scalar_result = true;
3256 /*** Case 3: Create:
3257 s = extract_field <v_out2, 0>
3258 for (offset = element_size;
3259 offset < vector_size;
3260 offset += element_size;)
3262 Create: s' = extract_field <v_out2, offset>
3263 Create: s = op <s, s'> // For non SLP cases */
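/* E.g., for a 4-element sum this emits four BIT_FIELD_REF extracts and
   three scalar adds: s = v[0]; s = s + v[1]; s = s + v[2]; s = s + v[3].
   In the SLP case the adds are skipped and all extracted values are kept
   in SCALAR_RESULTS.  */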
3266 if (vect_print_dump_info (REPORT_DETAILS))
3267 fprintf (vect_dump, "Reduce using scalar code. ");
3269 vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
3270 for (i = 0; VEC_iterate (gimple, new_phis, i, new_phi); i++)
3272 vec_temp = PHI_RESULT (new_phi);
3273 rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
bitsize_zero_node);
3275 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
3276 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
3277 gimple_assign_set_lhs (epilog_stmt, new_temp);
3278 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3280 /* In SLP we don't need to apply reduction operation, so we just
3281 collect s' values in SCALAR_RESULTS. */
if (slp_node)
3283 VEC_safe_push (tree, heap, scalar_results, new_temp);
3285 for (bit_offset = element_bitsize;
3286 bit_offset < vec_size_in_bits;
3287 bit_offset += element_bitsize)
3289 tree bitpos = bitsize_int (bit_offset);
3290 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
bitsize, bitpos);
3293 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
3294 new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
3295 gimple_assign_set_lhs (epilog_stmt, new_name);
3296 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
if (slp_node)
{
3300 /* In SLP we don't need to apply reduction operation, so
3301 we just collect s' values in SCALAR_RESULTS. */
3302 new_temp = new_name;
3303 VEC_safe_push (tree, heap, scalar_results, new_name);
}
else
{
3307 epilog_stmt = gimple_build_assign_with_ops (code,
3308 new_scalar_dest, new_name, new_temp);
3309 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
3310 gimple_assign_set_lhs (epilog_stmt, new_temp);
3311 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3316 /* The only case where we need to reduce scalar results in SLP, is
3317 unrolling. If the size of SCALAR_RESULTS is greater than
3318 GROUP_SIZE, we reduce them combining elements modulo
GROUP_SIZE. */
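/* For example, with GROUP_SIZE = 2 and an unroll factor of 2,
   SCALAR_RESULTS = {a0, a1, a2, a3} is reduced by the loop below to
   {a0+a2, a1+a3} - one final result per scalar stmt in the SLP group.  */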
if (slp_node)
{
3322 tree res, first_res, new_res;
gimple new_stmt;
3325 /* Reduce multiple scalar results in case of SLP unrolling. */
3326 for (j = group_size; VEC_iterate (tree, scalar_results, j, res);
j++)
3329 first_res = VEC_index (tree, scalar_results, j % group_size);
3330 new_stmt = gimple_build_assign_with_ops (code,
3331 new_scalar_dest, first_res, res);
3332 new_res = make_ssa_name (new_scalar_dest, new_stmt);
3333 gimple_assign_set_lhs (new_stmt, new_res);
3334 gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
3335 VEC_replace (tree, scalar_results, j % group_size, new_res);
else
3339 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
3340 VEC_safe_push (tree, heap, scalar_results, new_temp);
3342 extract_scalar_result = false;
3346 /* 2.4 Extract the final scalar result. Create:
3347 s_out3 = extract_field <v_out2, bitpos> */
3349 if (extract_scalar_result)
3353 if (vect_print_dump_info (REPORT_DETAILS))
3354 fprintf (vect_dump, "extract scalar result");
3356 if (BYTES_BIG_ENDIAN)
3357 bitpos = size_binop (MULT_EXPR,
3358 bitsize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1),
3359 TYPE_SIZE (scalar_type));
else
3361 bitpos = bitsize_zero_node;
3363 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp, bitsize, bitpos);
3364 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
3365 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
3366 gimple_assign_set_lhs (epilog_stmt, new_temp);
3367 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3368 VEC_safe_push (tree, heap, scalar_results, new_temp);
3371 vect_finalize_reduction:
3373 /* 2.5 Adjust the final result by the initial value of the reduction
3374 variable. (When such adjustment is not needed, then
3375 'adjustment_def' is zero). For example, if code is PLUS we create:
3376 new_temp = loop_exit_def + adjustment_def */
if (adjustment_def)
{
3380 gcc_assert (!slp_node);
3381 if (nested_in_vect_loop)
3383 new_phi = VEC_index (gimple, new_phis, 0);
3384 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
3385 expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
3386 new_dest = vect_create_destination_var (scalar_dest, vectype);
}
else
{
3390 new_temp = VEC_index (tree, scalar_results, 0);
3391 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);