2 Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010
3 Free Software Foundation, Inc.
4 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
5 Ira Rosen <irar@il.ibm.com>
7 This file is part of GCC.
9 GCC is free software; you can redistribute it and/or modify it under
10 the terms of the GNU General Public License as published by the Free
11 Software Foundation; either version 3, or (at your option) any later
14 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
15 WARRANTY; without even the implied warranty of MERCHANTABILITY or
16 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
19 You should have received a copy of the GNU General Public License
20 along with GCC; see the file COPYING3. If not see
21 <http://www.gnu.org/licenses/>. */
25 #include "coretypes.h"
29 #include "basic-block.h"
30 #include "diagnostic.h"
31 #include "tree-flow.h"
32 #include "tree-dump.h"
34 #include "cfglayout.h"
40 #include "tree-chrec.h"
41 #include "tree-scalar-evolution.h"
42 #include "tree-vectorizer.h"
44 /* Loop Vectorization Pass.
46 This pass tries to vectorize loops.
48 For example, the vectorizer transforms the following simple loop:
50 short a[N]; short b[N]; short c[N]; int i;
56 as if it was manually vectorized by rewriting the source code into:
58 typedef int __attribute__((mode(V8HI))) v8hi;
59 short a[N]; short b[N]; short c[N]; int i;
60 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
63 for (i=0; i<N/8; i++){
70 The main entry to this pass is vectorize_loops(), in which
71 the vectorizer applies a set of analyses on a given set of loops,
72 followed by the actual vectorization transformation for the loops that
73 had successfully passed the analysis phase.
74 Throughout this pass we make a distinction between two types of
75 data: scalars (which are represented by SSA_NAMES), and memory references
76 ("data-refs"). These two types of data require different handling both
77 during analysis and transformation. The types of data-refs that the
78 vectorizer currently supports are ARRAY_REFS which base is an array DECL
79 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
80 accesses are required to have a simple (consecutive) access pattern.
84 The driver for the analysis phase is vect_analyze_loop().
85 It applies a set of analyses, some of which rely on the scalar evolution
86 analyzer (scev) developed by Sebastian Pop.
88 During the analysis phase the vectorizer records some information
89 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
90 loop, as well as general information about the loop as a whole, which is
91 recorded in a "loop_vec_info" struct attached to each loop.
95 The loop transformation phase scans all the stmts in the loop, and
96 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
97 the loop that needs to be vectorized. It inserts the vector code sequence
98 just before the scalar stmt S, and records a pointer to the vector code
99 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
100 attached to S). This pointer will be used for the vectorization of following
101 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
102 otherwise, we rely on dead code elimination for removing it.
104 For example, say stmt S1 was vectorized into stmt VS1:
107 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
110 To vectorize stmt S2, the vectorizer first finds the stmt that defines
111 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
112 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
113 resulting sequence would be:
116 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
118 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
120 Operands that are not SSA_NAMEs, are data-refs that appear in
121 load/store operations (like 'x[i]' in S1), and are handled differently.
125 Currently the only target specific information that is used is the
126 size of the vector (in bytes) - "UNITS_PER_SIMD_WORD". Targets that can
127 support different sizes of vectors, for now will need to specify one value
128 for "UNITS_PER_SIMD_WORD". More flexibility will be added in the future.
130 Since we only vectorize operations which vector form can be
131 expressed using existing tree codes, to verify that an operation is
132 supported, the vectorizer checks the relevant optab at the relevant
133 machine_mode (e.g, optab_handler (add_optab, V8HImode)->insn_code). If
134 the value found is CODE_FOR_nothing, then there's no target support, and
135 we can't vectorize the stmt.
137 For additional information on this project see:
138 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
141 /* Function vect_determine_vectorization_factor
143 Determine the vectorization factor (VF). VF is the number of data elements
144 that are operated upon in parallel in a single iteration of the vectorized
145 loop. For example, when vectorizing a loop that operates on 4byte elements,
146 on a target with vector size (VS) 16byte, the VF is set to 4, since 4
147 elements can fit in a single vector register.
149 We currently support vectorization of loops in which all types operated upon
150 are of the same size. Therefore this function currently sets VF according to
151 the size of the types operated upon, and fails if there are multiple sizes
154 VF is also the factor by which the loop iterations are strip-mined, e.g.:
161 for (i=0; i<N; i+=VF){
162 a[i:VF] = b[i:VF] + c[i:VF];
167 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
169 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
170 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
171 int nbbs = loop->num_nodes;
172 gimple_stmt_iterator si;
173 unsigned int vectorization_factor = 0;
178 stmt_vec_info stmt_info;
182 if (vect_print_dump_info (REPORT_DETAILS))
183 fprintf (vect_dump, "=== vect_determine_vectorization_factor ===");
185 for (i = 0; i < nbbs; i++)
187 basic_block bb = bbs[i];
189 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
192 stmt_info = vinfo_for_stmt (phi);
193 if (vect_print_dump_info (REPORT_DETAILS))
195 fprintf (vect_dump, "==> examining phi: ");
196 print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
199 gcc_assert (stmt_info);
201 if (STMT_VINFO_RELEVANT_P (stmt_info))
203 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
204 scalar_type = TREE_TYPE (PHI_RESULT (phi));
206 if (vect_print_dump_info (REPORT_DETAILS))
208 fprintf (vect_dump, "get vectype for scalar type: ");
209 print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
212 vectype = get_vectype_for_scalar_type (scalar_type);
215 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
218 "not vectorized: unsupported data-type ");
219 print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
223 STMT_VINFO_VECTYPE (stmt_info) = vectype;
225 if (vect_print_dump_info (REPORT_DETAILS))
227 fprintf (vect_dump, "vectype: ");
228 print_generic_expr (vect_dump, vectype, TDF_SLIM);
231 nunits = TYPE_VECTOR_SUBPARTS (vectype);
232 if (vect_print_dump_info (REPORT_DETAILS))
233 fprintf (vect_dump, "nunits = %d", nunits);
235 if (!vectorization_factor
236 || (nunits > vectorization_factor))
237 vectorization_factor = nunits;
241 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
244 gimple stmt = gsi_stmt (si);
245 stmt_info = vinfo_for_stmt (stmt);
247 if (vect_print_dump_info (REPORT_DETAILS))
249 fprintf (vect_dump, "==> examining statement: ");
250 print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
253 gcc_assert (stmt_info);
255 /* skip stmts which do not need to be vectorized. */
256 if (!STMT_VINFO_RELEVANT_P (stmt_info)
257 && !STMT_VINFO_LIVE_P (stmt_info))
259 if (vect_print_dump_info (REPORT_DETAILS))
260 fprintf (vect_dump, "skip.");
264 if (gimple_get_lhs (stmt) == NULL_TREE)
266 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
268 fprintf (vect_dump, "not vectorized: irregular stmt.");
269 print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
274 if (VECTOR_MODE_P (TYPE_MODE (gimple_expr_type (stmt))))
276 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
278 fprintf (vect_dump, "not vectorized: vector stmt in loop:");
279 print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
284 if (STMT_VINFO_VECTYPE (stmt_info))
286 /* The only case when a vectype had been already set is for stmts
287 that contain a dataref, or for "pattern-stmts" (stmts generated
288 by the vectorizer to represent/replace a certain idiom). */
289 gcc_assert (STMT_VINFO_DATA_REF (stmt_info)
290 || is_pattern_stmt_p (stmt_info));
291 vectype = STMT_VINFO_VECTYPE (stmt_info);
295 gcc_assert (!STMT_VINFO_DATA_REF (stmt_info)
296 && !is_pattern_stmt_p (stmt_info));
298 scalar_type = TREE_TYPE (gimple_get_lhs (stmt));
299 if (vect_print_dump_info (REPORT_DETAILS))
301 fprintf (vect_dump, "get vectype for scalar type: ");
302 print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
304 vectype = get_vectype_for_scalar_type (scalar_type);
307 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
310 "not vectorized: unsupported data-type ");
311 print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
316 STMT_VINFO_VECTYPE (stmt_info) = vectype;
319 /* The vectorization factor is according to the smallest
320 scalar type (or the largest vector size, but we only
321 support one vector size per loop). */
322 scalar_type = vect_get_smallest_scalar_type (stmt, &dummy,
324 if (vect_print_dump_info (REPORT_DETAILS))
326 fprintf (vect_dump, "get vectype for scalar type: ");
327 print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
329 vf_vectype = get_vectype_for_scalar_type (scalar_type);
332 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
335 "not vectorized: unsupported data-type ");
336 print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
341 if ((GET_MODE_SIZE (TYPE_MODE (vectype))
342 != GET_MODE_SIZE (TYPE_MODE (vf_vectype))))
344 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
347 "not vectorized: different sized vector "
348 "types in statement, ");
349 print_generic_expr (vect_dump, vectype, TDF_SLIM);
350 fprintf (vect_dump, " and ");
351 print_generic_expr (vect_dump, vf_vectype, TDF_SLIM);
356 if (vect_print_dump_info (REPORT_DETAILS))
358 fprintf (vect_dump, "vectype: ");
359 print_generic_expr (vect_dump, vf_vectype, TDF_SLIM);
362 nunits = TYPE_VECTOR_SUBPARTS (vf_vectype);
363 if (vect_print_dump_info (REPORT_DETAILS))
364 fprintf (vect_dump, "nunits = %d", nunits);
366 if (!vectorization_factor
367 || (nunits > vectorization_factor))
368 vectorization_factor = nunits;
372 /* TODO: Analyze cost. Decide if worth while to vectorize. */
373 if (vect_print_dump_info (REPORT_DETAILS))
374 fprintf (vect_dump, "vectorization factor = %d", vectorization_factor);
375 if (vectorization_factor <= 1)
377 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
378 fprintf (vect_dump, "not vectorized: unsupported data-type");
381 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
387 /* Function vect_is_simple_iv_evolution.
389 FORNOW: A simple evolution of an induction variables in the loop is
390 considered a polynomial evolution with constant step. */
393 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
398 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
400 /* When there is no evolution in this loop, the evolution function
402 if (evolution_part == NULL_TREE)
405 /* When the evolution is a polynomial of degree >= 2
406 the evolution function is not "simple". */
407 if (tree_is_chrec (evolution_part))
410 step_expr = evolution_part;
411 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
413 if (vect_print_dump_info (REPORT_DETAILS))
415 fprintf (vect_dump, "step: ");
416 print_generic_expr (vect_dump, step_expr, TDF_SLIM);
417 fprintf (vect_dump, ", init: ");
418 print_generic_expr (vect_dump, init_expr, TDF_SLIM);
424 if (TREE_CODE (step_expr) != INTEGER_CST)
426 if (vect_print_dump_info (REPORT_DETAILS))
427 fprintf (vect_dump, "step unknown.");
434 /* Function vect_analyze_scalar_cycles_1.
436 Examine the cross iteration def-use cycles of scalar variables
437 in LOOP. LOOP_VINFO represents the loop that is now being
438 considered for vectorization (can be LOOP, or an outer-loop
442 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
444 basic_block bb = loop->header;
446 VEC(gimple,heap) *worklist = VEC_alloc (gimple, heap, 64);
447 gimple_stmt_iterator gsi;
450 if (vect_print_dump_info (REPORT_DETAILS))
451 fprintf (vect_dump, "=== vect_analyze_scalar_cycles ===");
453 /* First - identify all inductions. Reduction detection assumes that all the
454 inductions have been identified, therefore, this order must not be
456 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
458 gimple phi = gsi_stmt (gsi);
459 tree access_fn = NULL;
460 tree def = PHI_RESULT (phi);
461 stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
463 if (vect_print_dump_info (REPORT_DETAILS))
465 fprintf (vect_dump, "Analyze phi: ");
466 print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
469 /* Skip virtual phi's. The data dependences that are associated with
470 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
471 if (!is_gimple_reg (SSA_NAME_VAR (def)))
474 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
476 /* Analyze the evolution function. */
477 access_fn = analyze_scalar_evolution (loop, def);
478 if (access_fn && vect_print_dump_info (REPORT_DETAILS))
480 fprintf (vect_dump, "Access function of PHI: ");
481 print_generic_expr (vect_dump, access_fn, TDF_SLIM);
485 || !vect_is_simple_iv_evolution (loop->num, access_fn, &dumy, &dumy))
487 VEC_safe_push (gimple, heap, worklist, phi);
491 if (vect_print_dump_info (REPORT_DETAILS))
492 fprintf (vect_dump, "Detected induction.");
493 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
497 /* Second - identify all reductions and nested cycles. */
498 while (VEC_length (gimple, worklist) > 0)
500 gimple phi = VEC_pop (gimple, worklist);
501 tree def = PHI_RESULT (phi);
502 stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
506 if (vect_print_dump_info (REPORT_DETAILS))
508 fprintf (vect_dump, "Analyze phi: ");
509 print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
512 gcc_assert (is_gimple_reg (SSA_NAME_VAR (def)));
513 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
515 nested_cycle = (loop != LOOP_VINFO_LOOP (loop_vinfo));
516 reduc_stmt = vect_force_simple_reduction (loop_vinfo, phi, !nested_cycle,
522 if (vect_print_dump_info (REPORT_DETAILS))
523 fprintf (vect_dump, "Detected double reduction.");
525 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
526 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
527 vect_double_reduction_def;
533 if (vect_print_dump_info (REPORT_DETAILS))
534 fprintf (vect_dump, "Detected vectorizable nested cycle.");
536 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
537 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
542 if (vect_print_dump_info (REPORT_DETAILS))
543 fprintf (vect_dump, "Detected reduction.");
545 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
546 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
548 /* Store the reduction cycles for possible vectorization in
550 VEC_safe_push (gimple, heap,
551 LOOP_VINFO_REDUCTIONS (loop_vinfo),
557 if (vect_print_dump_info (REPORT_DETAILS))
558 fprintf (vect_dump, "Unknown def-use cycle pattern.");
561 VEC_free (gimple, heap, worklist);
565 /* Function vect_analyze_scalar_cycles.
567 Examine the cross iteration def-use cycles of scalar variables, by
568 analyzing the loop-header PHIs of scalar variables; Classify each
569 cycle as one of the following: invariant, induction, reduction, unknown.
570 We do that for the loop represented by LOOP_VINFO, and also to its
571 inner-loop, if exists.
572 Examples for scalar cycles:
587 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
589 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
591 vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
593 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
594 Reductions in such inner-loop therefore have different properties than
595 the reductions in the nest that gets vectorized:
596 1. When vectorized, they are executed in the same order as in the original
597 scalar loop, so we can't change the order of computation when
599 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
600 current checks are too strict. */
603 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
606 /* Function vect_get_loop_niters.
608 Determine how many iterations the loop is executed.
609 If an expression that represents the number of iterations
610 can be constructed, place it in NUMBER_OF_ITERATIONS.
611 Return the loop exit condition. */
614 vect_get_loop_niters (struct loop *loop, tree *number_of_iterations)
618 if (vect_print_dump_info (REPORT_DETAILS))
619 fprintf (vect_dump, "=== get_loop_niters ===");
621 niters = number_of_exit_cond_executions (loop);
623 if (niters != NULL_TREE
624 && niters != chrec_dont_know)
626 *number_of_iterations = niters;
628 if (vect_print_dump_info (REPORT_DETAILS))
630 fprintf (vect_dump, "==> get_loop_niters:" );
631 print_generic_expr (vect_dump, *number_of_iterations, TDF_SLIM);
635 return get_loop_exit_condition (loop);
639 /* Function bb_in_loop_p
641 Used as predicate for dfs order traversal of the loop bbs. */
644 bb_in_loop_p (const_basic_block bb, const void *data)
646 const struct loop *const loop = (const struct loop *)data;
647 if (flow_bb_inside_loop_p (loop, bb))
653 /* Function new_loop_vec_info.
655 Create and initialize a new loop_vec_info struct for LOOP, as well as
656 stmt_vec_info structs for all the stmts in LOOP. */
659 new_loop_vec_info (struct loop *loop)
663 gimple_stmt_iterator si;
664 unsigned int i, nbbs;
666 res = (loop_vec_info) xcalloc (1, sizeof (struct _loop_vec_info));
667 LOOP_VINFO_LOOP (res) = loop;
669 bbs = get_loop_body (loop);
671 /* Create/Update stmt_info for all stmts in the loop. */
672 for (i = 0; i < loop->num_nodes; i++)
674 basic_block bb = bbs[i];
676 /* BBs in a nested inner-loop will have been already processed (because
677 we will have called vect_analyze_loop_form for any nested inner-loop).
678 Therefore, for stmts in an inner-loop we just want to update the
679 STMT_VINFO_LOOP_VINFO field of their stmt_info to point to the new
680 loop_info of the outer-loop we are currently considering to vectorize
681 (instead of the loop_info of the inner-loop).
682 For stmts in other BBs we need to create a stmt_info from scratch. */
683 if (bb->loop_father != loop)
686 gcc_assert (loop->inner && bb->loop_father == loop->inner);
687 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
689 gimple phi = gsi_stmt (si);
690 stmt_vec_info stmt_info = vinfo_for_stmt (phi);
691 loop_vec_info inner_loop_vinfo =
692 STMT_VINFO_LOOP_VINFO (stmt_info);
693 gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
694 STMT_VINFO_LOOP_VINFO (stmt_info) = res;
696 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
698 gimple stmt = gsi_stmt (si);
699 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
700 loop_vec_info inner_loop_vinfo =
701 STMT_VINFO_LOOP_VINFO (stmt_info);
702 gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
703 STMT_VINFO_LOOP_VINFO (stmt_info) = res;
708 /* bb in current nest. */
709 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
711 gimple phi = gsi_stmt (si);
712 gimple_set_uid (phi, 0);
713 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, res, NULL));
716 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
718 gimple stmt = gsi_stmt (si);
719 gimple_set_uid (stmt, 0);
720 set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, res, NULL));
725 /* CHECKME: We want to visit all BBs before their successors (except for
726 latch blocks, for which this assertion wouldn't hold). In the simple
727 case of the loop forms we allow, a dfs order of the BBs would the same
728 as reversed postorder traversal, so we are safe. */
731 bbs = XCNEWVEC (basic_block, loop->num_nodes);
732 nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
733 bbs, loop->num_nodes, loop);
734 gcc_assert (nbbs == loop->num_nodes);
736 LOOP_VINFO_BBS (res) = bbs;
737 LOOP_VINFO_NITERS (res) = NULL;
738 LOOP_VINFO_NITERS_UNCHANGED (res) = NULL;
739 LOOP_VINFO_COST_MODEL_MIN_ITERS (res) = 0;
740 LOOP_VINFO_VECTORIZABLE_P (res) = 0;
741 LOOP_PEELING_FOR_ALIGNMENT (res) = 0;
742 LOOP_VINFO_VECT_FACTOR (res) = 0;
743 LOOP_VINFO_DATAREFS (res) = VEC_alloc (data_reference_p, heap, 10);
744 LOOP_VINFO_DDRS (res) = VEC_alloc (ddr_p, heap, 10 * 10);
745 LOOP_VINFO_UNALIGNED_DR (res) = NULL;
746 LOOP_VINFO_MAY_MISALIGN_STMTS (res) =
747 VEC_alloc (gimple, heap,
748 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS));
749 LOOP_VINFO_MAY_ALIAS_DDRS (res) =
750 VEC_alloc (ddr_p, heap,
751 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
752 LOOP_VINFO_STRIDED_STORES (res) = VEC_alloc (gimple, heap, 10);
753 LOOP_VINFO_REDUCTIONS (res) = VEC_alloc (gimple, heap, 10);
754 LOOP_VINFO_SLP_INSTANCES (res) = VEC_alloc (slp_instance, heap, 10);
755 LOOP_VINFO_SLP_UNROLLING_FACTOR (res) = 1;
761 /* Function destroy_loop_vec_info.
763 Free LOOP_VINFO struct, as well as all the stmt_vec_info structs of all the
764 stmts in the loop. */
767 destroy_loop_vec_info (loop_vec_info loop_vinfo, bool clean_stmts)
772 gimple_stmt_iterator si;
774 VEC (slp_instance, heap) *slp_instances;
775 slp_instance instance;
780 loop = LOOP_VINFO_LOOP (loop_vinfo);
782 bbs = LOOP_VINFO_BBS (loop_vinfo);
783 nbbs = loop->num_nodes;
787 free (LOOP_VINFO_BBS (loop_vinfo));
788 free_data_refs (LOOP_VINFO_DATAREFS (loop_vinfo));
789 free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
790 VEC_free (gimple, heap, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));
797 for (j = 0; j < nbbs; j++)
799 basic_block bb = bbs[j];
800 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
801 free_stmt_vec_info (gsi_stmt (si));
803 for (si = gsi_start_bb (bb); !gsi_end_p (si); )
805 gimple stmt = gsi_stmt (si);
806 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
810 /* Check if this is a "pattern stmt" (introduced by the
811 vectorizer during the pattern recognition pass). */
812 bool remove_stmt_p = false;
813 gimple orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
816 stmt_vec_info orig_stmt_info = vinfo_for_stmt (orig_stmt);
818 && STMT_VINFO_IN_PATTERN_P (orig_stmt_info))
819 remove_stmt_p = true;
822 /* Free stmt_vec_info. */
823 free_stmt_vec_info (stmt);
825 /* Remove dead "pattern stmts". */
827 gsi_remove (&si, true);
833 free (LOOP_VINFO_BBS (loop_vinfo));
834 free_data_refs (LOOP_VINFO_DATAREFS (loop_vinfo));
835 free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
836 VEC_free (gimple, heap, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));
837 VEC_free (ddr_p, heap, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo));
838 slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
839 for (j = 0; VEC_iterate (slp_instance, slp_instances, j, instance); j++)
840 vect_free_slp_instance (instance);
842 VEC_free (slp_instance, heap, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
843 VEC_free (gimple, heap, LOOP_VINFO_STRIDED_STORES (loop_vinfo));
844 VEC_free (gimple, heap, LOOP_VINFO_REDUCTIONS (loop_vinfo));
851 /* Function vect_analyze_loop_1.
853 Apply a set of analyses on LOOP, and create a loop_vec_info struct
854 for it. The different analyses will record information in the
855 loop_vec_info struct. This is a subset of the analyses applied in
856 vect_analyze_loop, to be applied on an inner-loop nested in the loop
857 that is now considered for (outer-loop) vectorization. */
860 vect_analyze_loop_1 (struct loop *loop)
862 loop_vec_info loop_vinfo;
864 if (vect_print_dump_info (REPORT_DETAILS))
865 fprintf (vect_dump, "===== analyze_loop_nest_1 =====");
867 /* Check the CFG characteristics of the loop (nesting, entry/exit, etc. */
869 loop_vinfo = vect_analyze_loop_form (loop);
872 if (vect_print_dump_info (REPORT_DETAILS))
873 fprintf (vect_dump, "bad inner-loop form.");
881 /* Function vect_analyze_loop_form.
883 Verify that certain CFG restrictions hold, including:
884 - the loop has a pre-header
885 - the loop has a single entry and exit
886 - the loop exit condition is simple enough, and the number of iterations
887 can be analyzed (a countable loop). */
890 vect_analyze_loop_form (struct loop *loop)
892 loop_vec_info loop_vinfo;
894 tree number_of_iterations = NULL;
895 loop_vec_info inner_loop_vinfo = NULL;
897 if (vect_print_dump_info (REPORT_DETAILS))
898 fprintf (vect_dump, "=== vect_analyze_loop_form ===");
900 /* Different restrictions apply when we are considering an inner-most loop,
901 vs. an outer (nested) loop.
902 (FORNOW. May want to relax some of these restrictions in the future). */
906 /* Inner-most loop. We currently require that the number of BBs is
907 exactly 2 (the header and latch). Vectorizable inner-most loops
918 if (loop->num_nodes != 2)
920 if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
921 fprintf (vect_dump, "not vectorized: control flow in loop.");
925 if (empty_block_p (loop->header))
927 if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
928 fprintf (vect_dump, "not vectorized: empty loop.");
934 struct loop *innerloop = loop->inner;
937 /* Nested loop. We currently require that the loop is doubly-nested,
938 contains a single inner loop, and the number of BBs is exactly 5.
939 Vectorizable outer-loops look like this:
951 The inner-loop has the properties expected of inner-most loops
952 as described above. */
954 if ((loop->inner)->inner || (loop->inner)->next)
956 if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
957 fprintf (vect_dump, "not vectorized: multiple nested loops.");
961 /* Analyze the inner-loop. */
962 inner_loop_vinfo = vect_analyze_loop_1 (loop->inner);
963 if (!inner_loop_vinfo)
965 if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
966 fprintf (vect_dump, "not vectorized: Bad inner loop.");
970 if (!expr_invariant_in_loop_p (loop,
971 LOOP_VINFO_NITERS (inner_loop_vinfo)))
973 if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
975 "not vectorized: inner-loop count not invariant.");
976 destroy_loop_vec_info (inner_loop_vinfo, true);
980 if (loop->num_nodes != 5)
982 if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
983 fprintf (vect_dump, "not vectorized: control flow in loop.");
984 destroy_loop_vec_info (inner_loop_vinfo, true);
988 gcc_assert (EDGE_COUNT (innerloop->header->preds) == 2);
989 entryedge = EDGE_PRED (innerloop->header, 0);
990 if (EDGE_PRED (innerloop->header, 0)->src == innerloop->latch)
991 entryedge = EDGE_PRED (innerloop->header, 1);
993 if (entryedge->src != loop->header
994 || !single_exit (innerloop)
995 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
997 if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
998 fprintf (vect_dump, "not vectorized: unsupported outerloop form.");
999 destroy_loop_vec_info (inner_loop_vinfo, true);
1003 if (vect_print_dump_info (REPORT_DETAILS))
1004 fprintf (vect_dump, "Considering outer-loop vectorization.");
1007 if (!single_exit (loop)
1008 || EDGE_COUNT (loop->header->preds) != 2)
1010 if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1012 if (!single_exit (loop))
1013 fprintf (vect_dump, "not vectorized: multiple exits.");
1014 else if (EDGE_COUNT (loop->header->preds) != 2)
1015 fprintf (vect_dump, "not vectorized: too many incoming edges.");
1017 if (inner_loop_vinfo)
1018 destroy_loop_vec_info (inner_loop_vinfo, true);
1022 /* We assume that the loop exit condition is at the end of the loop. i.e,
1023 that the loop is represented as a do-while (with a proper if-guard
1024 before the loop if needed), where the loop header contains all the
1025 executable statements, and the latch is empty. */
1026 if (!empty_block_p (loop->latch)
1027 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1029 if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1030 fprintf (vect_dump, "not vectorized: unexpected loop form.");
1031 if (inner_loop_vinfo)
1032 destroy_loop_vec_info (inner_loop_vinfo, true);
1036 /* Make sure there exists a single-predecessor exit bb: */
1037 if (!single_pred_p (single_exit (loop)->dest))
1039 edge e = single_exit (loop);
1040 if (!(e->flags & EDGE_ABNORMAL))
1042 split_loop_exit_edge (e);
1043 if (vect_print_dump_info (REPORT_DETAILS))
1044 fprintf (vect_dump, "split exit edge.");
1048 if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1049 fprintf (vect_dump, "not vectorized: abnormal loop exit edge.");
1050 if (inner_loop_vinfo)
1051 destroy_loop_vec_info (inner_loop_vinfo, true);
1056 loop_cond = vect_get_loop_niters (loop, &number_of_iterations);
1059 if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1060 fprintf (vect_dump, "not vectorized: complicated exit condition.");
1061 if (inner_loop_vinfo)
1062 destroy_loop_vec_info (inner_loop_vinfo, true);
1066 if (!number_of_iterations)
1068 if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1070 "not vectorized: number of iterations cannot be computed.");
1071 if (inner_loop_vinfo)
1072 destroy_loop_vec_info (inner_loop_vinfo, true);
1076 if (chrec_contains_undetermined (number_of_iterations))
1078 if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1079 fprintf (vect_dump, "Infinite number of iterations.");
1080 if (inner_loop_vinfo)
1081 destroy_loop_vec_info (inner_loop_vinfo, true);
1085 if (!NITERS_KNOWN_P (number_of_iterations))
1087 if (vect_print_dump_info (REPORT_DETAILS))
1089 fprintf (vect_dump, "Symbolic number of iterations is ");
1090 print_generic_expr (vect_dump, number_of_iterations, TDF_DETAILS);
1093 else if (TREE_INT_CST_LOW (number_of_iterations) == 0)
1095 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1096 fprintf (vect_dump, "not vectorized: number of iterations = 0.");
1097 if (inner_loop_vinfo)
1098 destroy_loop_vec_info (inner_loop_vinfo, false);
1102 loop_vinfo = new_loop_vec_info (loop);
1103 LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1104 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1106 STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond)) = loop_exit_ctrl_vec_info_type;
1108 /* CHECKME: May want to keep it around it in the future. */
1109 if (inner_loop_vinfo)
1110 destroy_loop_vec_info (inner_loop_vinfo, false);
1112 gcc_assert (!loop->aux);
1113 loop->aux = loop_vinfo;
1118 /* Function vect_analyze_loop_operations.
1120 Scan the loop stmts and make sure they are all vectorizable. */
1123 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1125 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1126 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1127 int nbbs = loop->num_nodes;
1128 gimple_stmt_iterator si;
1129 unsigned int vectorization_factor = 0;
1132 stmt_vec_info stmt_info;
1133 bool need_to_vectorize = false;
1134 int min_profitable_iters;
1135 int min_scalar_loop_bound;
1137 bool only_slp_in_loop = true, ok;
1139 if (vect_print_dump_info (REPORT_DETAILS))
1140 fprintf (vect_dump, "=== vect_analyze_loop_operations ===");
1142 gcc_assert (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
1143 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1145 for (i = 0; i < nbbs; i++)
1147 basic_block bb = bbs[i];
1149 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1151 phi = gsi_stmt (si);
1154 stmt_info = vinfo_for_stmt (phi);
1155 if (vect_print_dump_info (REPORT_DETAILS))
1157 fprintf (vect_dump, "examining phi: ");
1158 print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
1161 if (! is_loop_header_bb_p (bb))
1163 /* inner-loop loop-closed exit phi in outer-loop vectorization
1164 (i.e. a phi in the tail of the outer-loop).
1165 FORNOW: we currently don't support the case that these phis
1166 are not used in the outerloop (unless it is double reduction,
1167 i.e., this phi is vect_reduction_def), cause this case
1168 requires to actually do something here. */
1169 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
1170 || STMT_VINFO_LIVE_P (stmt_info))
1171 && STMT_VINFO_DEF_TYPE (stmt_info)
1172 != vect_double_reduction_def)
1174 if (vect_print_dump_info (REPORT_DETAILS))
1176 "Unsupported loop-closed phi in outer-loop.");
1182 gcc_assert (stmt_info);
1184 if (STMT_VINFO_LIVE_P (stmt_info))
1186 /* FORNOW: not yet supported. */
1187 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1188 fprintf (vect_dump, "not vectorized: value used after loop.");
1192 if (STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1193 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1195 /* A scalar-dependence cycle that we don't support. */
1196 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1197 fprintf (vect_dump, "not vectorized: scalar dependence cycle.");
1201 if (STMT_VINFO_RELEVANT_P (stmt_info))
1203 need_to_vectorize = true;
1204 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
1205 ok = vectorizable_induction (phi, NULL, NULL);
1210 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1213 "not vectorized: relevant phi not supported: ");
1214 print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
1220 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1222 gimple stmt = gsi_stmt (si);
1223 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1225 gcc_assert (stmt_info);
1227 if (!vect_analyze_stmt (stmt, &need_to_vectorize, NULL))
1230 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1231 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1232 && !PURE_SLP_STMT (stmt_info))
1233 /* STMT needs both SLP and loop-based vectorization. */
1234 only_slp_in_loop = false;
1238 /* All operations in the loop are either irrelevant (deal with loop
1239 control, or dead), or only used outside the loop and can be moved
1240 out of the loop (e.g. invariants, inductions). The loop can be
1241 optimized away by scalar optimizations. We're better off not
1242 touching this loop. */
1243 if (!need_to_vectorize)
1245 if (vect_print_dump_info (REPORT_DETAILS))
1247 "All the computation can be taken out of the loop.");
1248 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1250 "not vectorized: redundant loop. no profit to vectorize.");
1254 /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1255 vectorization factor of the loop is the unrolling factor required by the
1256 SLP instances. If that unrolling factor is 1, we say, that we perform
1257 pure SLP on loop - cross iteration parallelism is not exploited. */
1258 if (only_slp_in_loop)
1259 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1261 vectorization_factor = least_common_multiple (vectorization_factor,
1262 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1264 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1266 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1267 && vect_print_dump_info (REPORT_DETAILS))
1269 "vectorization_factor = %d, niters = " HOST_WIDE_INT_PRINT_DEC,
1270 vectorization_factor, LOOP_VINFO_INT_NITERS (loop_vinfo));
1272 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1273 && (LOOP_VINFO_INT_NITERS (loop_vinfo) < vectorization_factor))
1275 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1276 fprintf (vect_dump, "not vectorized: iteration count too small.");
1277 if (vect_print_dump_info (REPORT_DETAILS))
1278 fprintf (vect_dump,"not vectorized: iteration count smaller than "
1279 "vectorization factor.");
1283 /* Analyze cost. Decide if worth while to vectorize. */
1285 /* Once VF is set, SLP costs should be updated since the number of created
1286 vector stmts depends on VF. */
1287 vect_update_slp_costs_according_to_vf (loop_vinfo);
1289 min_profitable_iters = vect_estimate_min_profitable_iters (loop_vinfo);
1290 LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo) = min_profitable_iters;
1292 if (min_profitable_iters < 0)
1294 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1295 fprintf (vect_dump, "not vectorized: vectorization not profitable.");
1296 if (vect_print_dump_info (REPORT_DETAILS))
1297 fprintf (vect_dump, "not vectorized: vector version will never be "
1302 min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
1303 * vectorization_factor) - 1);
1305 /* Use the cost model only if it is more conservative than user specified
1308 th = (unsigned) min_scalar_loop_bound;
1309 if (min_profitable_iters
1310 && (!min_scalar_loop_bound
1311 || min_profitable_iters > min_scalar_loop_bound))
1312 th = (unsigned) min_profitable_iters;
1314 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1315 && LOOP_VINFO_INT_NITERS (loop_vinfo) <= th)
1317 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1318 fprintf (vect_dump, "not vectorized: vectorization not "
1320 if (vect_print_dump_info (REPORT_DETAILS))
1321 fprintf (vect_dump, "not vectorized: iteration count smaller than "
1322 "user specified loop bound parameter or minimum "
1323 "profitable iterations (whichever is more conservative).");
1327 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1328 || LOOP_VINFO_INT_NITERS (loop_vinfo) % vectorization_factor != 0
1329 || LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
1331 if (vect_print_dump_info (REPORT_DETAILS))
1332 fprintf (vect_dump, "epilog loop required.");
1333 if (!vect_can_advance_ivs_p (loop_vinfo))
1335 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1337 "not vectorized: can't create epilog loop 1.");
1340 if (!slpeel_can_duplicate_loop_p (loop, single_exit (loop)))
1342 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1344 "not vectorized: can't create epilog loop 2.");
1353 /* Function vect_analyze_loop.
1355 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1356 for it. The different analyses will record information in the
1357 loop_vec_info struct. */
1359 vect_analyze_loop (struct loop *loop)
1362 loop_vec_info loop_vinfo;
1363 int max_vf = MAX_VECTORIZATION_FACTOR;
1366 if (vect_print_dump_info (REPORT_DETAILS))
1367 fprintf (vect_dump, "===== analyze_loop_nest =====");
1369 if (loop_outer (loop)
1370 && loop_vec_info_for_loop (loop_outer (loop))
1371 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
1373 if (vect_print_dump_info (REPORT_DETAILS))
1374 fprintf (vect_dump, "outer-loop already vectorized.");
1378 /* Check the CFG characteristics of the loop (nesting, entry/exit, etc. */
1380 loop_vinfo = vect_analyze_loop_form (loop);
1383 if (vect_print_dump_info (REPORT_DETAILS))
1384 fprintf (vect_dump, "bad loop form.");
1388 /* Find all data references in the loop (which correspond to vdefs/vuses)
1389 and analyze their evolution in the loop. Also adjust the minimal
1390 vectorization factor according to the loads and stores.
1392 FORNOW: Handle only simple, array references, which
1393 alignment can be forced, and aligned pointer-references. */
1395 ok = vect_analyze_data_refs (loop_vinfo, NULL, &min_vf);
1398 if (vect_print_dump_info (REPORT_DETAILS))
1399 fprintf (vect_dump, "bad data references.");
1400 destroy_loop_vec_info (loop_vinfo, true);
1404 /* Classify all cross-iteration scalar data-flow cycles.
1405 Cross-iteration cycles caused by virtual phis are analyzed separately. */
1407 vect_analyze_scalar_cycles (loop_vinfo);
1409 vect_pattern_recog (loop_vinfo);
1411 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
1413 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
1416 if (vect_print_dump_info (REPORT_DETAILS))
1417 fprintf (vect_dump, "unexpected pattern.");
1418 destroy_loop_vec_info (loop_vinfo, true);
1422 /* Analyze data dependences between the data-refs in the loop
1423 and adjust the maximum vectorization factor according to
1425 FORNOW: fail at the first data dependence that we encounter. */
1427 ok = vect_analyze_data_ref_dependences (loop_vinfo, NULL, &max_vf);
1431 if (vect_print_dump_info (REPORT_DETAILS))
1432 fprintf (vect_dump, "bad data dependence.");
1433 destroy_loop_vec_info (loop_vinfo, true);
1437 ok = vect_determine_vectorization_factor (loop_vinfo);
1440 if (vect_print_dump_info (REPORT_DETAILS))
1441 fprintf (vect_dump, "can't determine vectorization factor.");
1442 destroy_loop_vec_info (loop_vinfo, true);
1445 if (max_vf < LOOP_VINFO_VECT_FACTOR (loop_vinfo))
1447 if (vect_print_dump_info (REPORT_DETAILS))
1448 fprintf (vect_dump, "bad data dependence.");
1449 destroy_loop_vec_info (loop_vinfo, true);
1453 /* Analyze the alignment of the data-refs in the loop.
1454 Fail if a data reference is found that cannot be vectorized. */
1456 ok = vect_analyze_data_refs_alignment (loop_vinfo, NULL);
1459 if (vect_print_dump_info (REPORT_DETAILS))
1460 fprintf (vect_dump, "bad data alignment.");
1461 destroy_loop_vec_info (loop_vinfo, true);
1465 /* Analyze the access patterns of the data-refs in the loop (consecutive,
1466 complex, etc.). FORNOW: Only handle consecutive access pattern. */
1468 ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
1471 if (vect_print_dump_info (REPORT_DETAILS))
1472 fprintf (vect_dump, "bad data access.");
1473 destroy_loop_vec_info (loop_vinfo, true);
1477 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
1478 It is important to call pruning after vect_analyze_data_ref_accesses,
1479 since we use grouping information gathered by interleaving analysis. */
1480 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
1483 if (vect_print_dump_info (REPORT_DETAILS))
1484 fprintf (vect_dump, "too long list of versioning for alias "
1486 destroy_loop_vec_info (loop_vinfo, true);
1490 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
1491 ok = vect_analyze_slp (loop_vinfo, NULL);
1494 /* Decide which possible SLP instances to SLP. */
1495 vect_make_slp_decision (loop_vinfo);
1497 /* Find stmts that need to be both vectorized and SLPed. */
1498 vect_detect_hybrid_slp (loop_vinfo);
1501 /* This pass will decide on using loop versioning and/or loop peeling in
1502 order to enhance the alignment of data references in the loop. */
1504 ok = vect_enhance_data_refs_alignment (loop_vinfo);
1507 if (vect_print_dump_info (REPORT_DETAILS))
1508 fprintf (vect_dump, "bad data alignment.");
1509 destroy_loop_vec_info (loop_vinfo, true);
1513 /* Scan all the operations in the loop and make sure they are
1516 ok = vect_analyze_loop_operations (loop_vinfo);
1519 if (vect_print_dump_info (REPORT_DETAILS))
1520 fprintf (vect_dump, "bad operation or unsupported loop bound.");
1521 destroy_loop_vec_info (loop_vinfo, true);
1525 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
1531 /* Function reduction_code_for_scalar_code
1534 CODE - tree_code of a reduction operations.
1537 REDUC_CODE - the corresponding tree-code to be used to reduce the
1538 vector of partial results into a single scalar result (which
1539 will also reside in a vector) or ERROR_MARK if the operation is
1540 a supported reduction operation, but does not have such tree-code.
1542 Return FALSE if CODE currently cannot be vectorized as reduction. */
1545 reduction_code_for_scalar_code (enum tree_code code,
1546 enum tree_code *reduc_code)
1551 *reduc_code = REDUC_MAX_EXPR;
1555 *reduc_code = REDUC_MIN_EXPR;
1559 *reduc_code = REDUC_PLUS_EXPR;
1567 *reduc_code = ERROR_MARK;
1576 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
1577 STMT is printed with a message MSG. */
1580 report_vect_op (gimple stmt, const char *msg)
1582 fprintf (vect_dump, "%s", msg);
1583 print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
1587 /* Function vect_is_simple_reduction_1
1589 (1) Detect a cross-iteration def-use cycle that represents a simple
1590 reduction computation. We look for the following pattern:
1595 a2 = operation (a3, a1)
1598 1. operation is commutative and associative and it is safe to
1599 change the order of the computation (if CHECK_REDUCTION is true)
1600 2. no uses for a2 in the loop (a2 is used out of the loop)
1601 3. no uses of a1 in the loop besides the reduction operation.
1603 Condition 1 is tested here.
1604 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
1606 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
1607 nested cycles, if CHECK_REDUCTION is false.
1609 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
1613 inner loop (def of a3)
1616 If MODIFY is true it tries also to rework the code in-place to enable
1617 detection of more reduction patterns. For the time being we rewrite
1618 "res -= RHS" into "rhs += -RHS" when it seems worthwhile.
1622 vect_is_simple_reduction_1 (loop_vec_info loop_info, gimple phi,
1623 bool check_reduction, bool *double_reduc,
1626 struct loop *loop = (gimple_bb (phi))->loop_father;
1627 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
1628 edge latch_e = loop_latch_edge (loop);
1629 tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
1630 gimple def_stmt, def1 = NULL, def2 = NULL;
1631 enum tree_code orig_code, code;
1632 tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
1636 imm_use_iterator imm_iter;
1637 use_operand_p use_p;
1640 *double_reduc = false;
1642 /* If CHECK_REDUCTION is true, we assume inner-most loop vectorization,
1643 otherwise, we assume outer loop vectorization. */
1644 gcc_assert ((check_reduction && loop == vect_loop)
1645 || (!check_reduction && flow_loop_nested_p (vect_loop, loop)));
1647 name = PHI_RESULT (phi);
1649 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
1651 gimple use_stmt = USE_STMT (use_p);
1652 if (is_gimple_debug (use_stmt))
1654 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
1655 && vinfo_for_stmt (use_stmt)
1656 && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
1660 if (vect_print_dump_info (REPORT_DETAILS))
1661 fprintf (vect_dump, "reduction used in loop.");
1666 if (TREE_CODE (loop_arg) != SSA_NAME)
1668 if (vect_print_dump_info (REPORT_DETAILS))
1670 fprintf (vect_dump, "reduction: not ssa_name: ");
1671 print_generic_expr (vect_dump, loop_arg, TDF_SLIM);
1676 def_stmt = SSA_NAME_DEF_STMT (loop_arg);
1679 if (vect_print_dump_info (REPORT_DETAILS))
1680 fprintf (vect_dump, "reduction: no def_stmt.");
1684 if (!is_gimple_assign (def_stmt) && gimple_code (def_stmt) != GIMPLE_PHI)
1686 if (vect_print_dump_info (REPORT_DETAILS))
1687 print_gimple_stmt (vect_dump, def_stmt, 0, TDF_SLIM);
1691 if (is_gimple_assign (def_stmt))
1693 name = gimple_assign_lhs (def_stmt);
1698 name = PHI_RESULT (def_stmt);
1703 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
1705 gimple use_stmt = USE_STMT (use_p);
1706 if (is_gimple_debug (use_stmt))
1708 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
1709 && vinfo_for_stmt (use_stmt)
1710 && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
1714 if (vect_print_dump_info (REPORT_DETAILS))
1715 fprintf (vect_dump, "reduction used in loop.");
1720 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
1721 defined in the inner loop. */
1724 op1 = PHI_ARG_DEF (def_stmt, 0);
1726 if (gimple_phi_num_args (def_stmt) != 1
1727 || TREE_CODE (op1) != SSA_NAME)
1729 if (vect_print_dump_info (REPORT_DETAILS))
1730 fprintf (vect_dump, "unsupported phi node definition.");
1735 def1 = SSA_NAME_DEF_STMT (op1);
1736 if (flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
1738 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
1739 && is_gimple_assign (def1))
1741 if (vect_print_dump_info (REPORT_DETAILS))
1742 report_vect_op (def_stmt, "detected double reduction: ");
1744 *double_reduc = true;
1751 code = orig_code = gimple_assign_rhs_code (def_stmt);
1753 /* We can handle "res -= x[i]", which is non-associative by
1754 simply rewriting this into "res += -x[i]". Avoid changing
1755 gimple instruction for the first simple tests and only do this
1756 if we're allowed to change code at all. */
1757 if (code == MINUS_EXPR && modify)
1761 && (!commutative_tree_code (code) || !associative_tree_code (code)))
1763 if (vect_print_dump_info (REPORT_DETAILS))
1764 report_vect_op (def_stmt, "reduction: not commutative/associative: ");
1768 if (get_gimple_rhs_class (code) != GIMPLE_BINARY_RHS)
1770 if (code != COND_EXPR)
1772 if (vect_print_dump_info (REPORT_DETAILS))
1773 report_vect_op (def_stmt, "reduction: not binary operation: ");
1778 op3 = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
1779 if (COMPARISON_CLASS_P (op3))
1781 op4 = TREE_OPERAND (op3, 1);
1782 op3 = TREE_OPERAND (op3, 0);
1785 op1 = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 1);
1786 op2 = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 2);
1788 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
1790 if (vect_print_dump_info (REPORT_DETAILS))
1791 report_vect_op (def_stmt, "reduction: uses not ssa_names: ");
1798 op1 = gimple_assign_rhs1 (def_stmt);
1799 op2 = gimple_assign_rhs2 (def_stmt);
1801 if (TREE_CODE (op1) != SSA_NAME || TREE_CODE (op2) != SSA_NAME)
1803 if (vect_print_dump_info (REPORT_DETAILS))
1804 report_vect_op (def_stmt, "reduction: uses not ssa_names: ");
1810 type = TREE_TYPE (gimple_assign_lhs (def_stmt));
1811 if ((TREE_CODE (op1) == SSA_NAME
1812 && !types_compatible_p (type,TREE_TYPE (op1)))
1813 || (TREE_CODE (op2) == SSA_NAME
1814 && !types_compatible_p (type, TREE_TYPE (op2)))
1815 || (op3 && TREE_CODE (op3) == SSA_NAME
1816 && !types_compatible_p (type, TREE_TYPE (op3)))
1817 || (op4 && TREE_CODE (op4) == SSA_NAME
1818 && !types_compatible_p (type, TREE_TYPE (op4))))
1820 if (vect_print_dump_info (REPORT_DETAILS))
1822 fprintf (vect_dump, "reduction: multiple types: operation type: ");
1823 print_generic_expr (vect_dump, type, TDF_SLIM);
1824 fprintf (vect_dump, ", operands types: ");
1825 print_generic_expr (vect_dump, TREE_TYPE (op1), TDF_SLIM);
1826 fprintf (vect_dump, ",");
1827 print_generic_expr (vect_dump, TREE_TYPE (op2), TDF_SLIM);
1830 fprintf (vect_dump, ",");
1831 print_generic_expr (vect_dump, TREE_TYPE (op3), TDF_SLIM);
1836 fprintf (vect_dump, ",");
1837 print_generic_expr (vect_dump, TREE_TYPE (op4), TDF_SLIM);
1844 /* Check that it's ok to change the order of the computation.
1845 Generally, when vectorizing a reduction we change the order of the
1846 computation. This may change the behavior of the program in some
1847 cases, so we need to check that this is ok. One exception is when
1848 vectorizing an outer-loop: the inner-loop is executed sequentially,
1849 and therefore vectorizing reductions in the inner-loop during
1850 outer-loop vectorization is safe. */
1852 /* CHECKME: check for !flag_finite_math_only too? */
1853 if (SCALAR_FLOAT_TYPE_P (type) && !flag_associative_math
1856 /* Changing the order of operations changes the semantics. */
1857 if (vect_print_dump_info (REPORT_DETAILS))
1858 report_vect_op (def_stmt, "reduction: unsafe fp math optimization: ");
1861 else if (INTEGRAL_TYPE_P (type) && TYPE_OVERFLOW_TRAPS (type)
1864 /* Changing the order of operations changes the semantics. */
1865 if (vect_print_dump_info (REPORT_DETAILS))
1866 report_vect_op (def_stmt, "reduction: unsafe int math optimization: ");
1869 else if (SAT_FIXED_POINT_TYPE_P (type) && check_reduction)
1871 /* Changing the order of operations changes the semantics. */
1872 if (vect_print_dump_info (REPORT_DETAILS))
1873 report_vect_op (def_stmt,
1874 "reduction: unsafe fixed-point math optimization: ");
1878 /* If we detected "res -= x[i]" earlier, rewrite it into
1879 "res += -x[i]" now. If this turns out to be useless reassoc
1880 will clean it up again. */
1881 if (orig_code == MINUS_EXPR)
1883 tree rhs = gimple_assign_rhs2 (def_stmt);
1884 tree negrhs = make_ssa_name (SSA_NAME_VAR (rhs), NULL);
1885 gimple negate_stmt = gimple_build_assign_with_ops (NEGATE_EXPR, negrhs,
1887 gimple_stmt_iterator gsi = gsi_for_stmt (def_stmt);
1888 set_vinfo_for_stmt (negate_stmt, new_stmt_vec_info (negate_stmt,
1890 gsi_insert_before (&gsi, negate_stmt, GSI_NEW_STMT);
1891 gimple_assign_set_rhs2 (def_stmt, negrhs);
1892 gimple_assign_set_rhs_code (def_stmt, PLUS_EXPR);
1893 update_stmt (def_stmt);
1896 /* Reduction is safe. We're dealing with one of the following:
1897 1) integer arithmetic and no trapv
1898 2) floating point arithmetic, and special flags permit this optimization
1899 3) nested cycle (i.e., outer loop vectorization). */
1900 if (TREE_CODE (op1) == SSA_NAME)
1901 def1 = SSA_NAME_DEF_STMT (op1);
1903 if (TREE_CODE (op2) == SSA_NAME)
1904 def2 = SSA_NAME_DEF_STMT (op2);
1906 if (code != COND_EXPR
1907 && (!def1 || !def2 || gimple_nop_p (def1) || gimple_nop_p (def2)))
1909 if (vect_print_dump_info (REPORT_DETAILS))
1910 report_vect_op (def_stmt, "reduction: no defs for operands: ");
1914 /* Check that one def is the reduction def, defined by PHI,
1915 the other def is either defined in the loop ("vect_internal_def"),
1916 or it's an induction (defined by a loop-header phi-node). */
1918 if (def2 && def2 == phi
1919 && (code == COND_EXPR
1920 || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1))
1921 && (is_gimple_assign (def1)
1922 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
1923 == vect_induction_def
1924 || (gimple_code (def1) == GIMPLE_PHI
1925 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
1926 == vect_internal_def
1927 && !is_loop_header_bb_p (gimple_bb (def1)))))))
1929 if (vect_print_dump_info (REPORT_DETAILS))
1930 report_vect_op (def_stmt, "detected reduction: ");
1933 else if (def1 && def1 == phi
1934 && (code == COND_EXPR
1935 || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2))
1936 && (is_gimple_assign (def2)
1937 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
1938 == vect_induction_def
1939 || (gimple_code (def2) == GIMPLE_PHI
1940 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
1941 == vect_internal_def
1942 && !is_loop_header_bb_p (gimple_bb (def2)))))))
1944 if (check_reduction)
1946 /* Swap operands (just for simplicity - so that the rest of the code
1947 can assume that the reduction variable is always the last (second)
1949 if (vect_print_dump_info (REPORT_DETAILS))
1950 report_vect_op (def_stmt,
1951 "detected reduction: need to swap operands: ");
1953 swap_tree_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
1954 gimple_assign_rhs2_ptr (def_stmt));
1958 if (vect_print_dump_info (REPORT_DETAILS))
1959 report_vect_op (def_stmt, "detected reduction: ");
1966 if (vect_print_dump_info (REPORT_DETAILS))
1967 report_vect_op (def_stmt, "reduction: unknown pattern: ");
1973 /* Wrapper around vect_is_simple_reduction_1, that won't modify code
1974 in-place. Arguments as there. */
1977 vect_is_simple_reduction (loop_vec_info loop_info, gimple phi,
1978 bool check_reduction, bool *double_reduc)
1980 return vect_is_simple_reduction_1 (loop_info, phi, check_reduction,
1981 double_reduc, false);
1984 /* Wrapper around vect_is_simple_reduction_1, which will modify code
1985 in-place if it enables detection of more reductions. Arguments
1989 vect_force_simple_reduction (loop_vec_info loop_info, gimple phi,
1990 bool check_reduction, bool *double_reduc)
1992 return vect_is_simple_reduction_1 (loop_info, phi, check_reduction,
1993 double_reduc, true);
1996 /* Function vect_estimate_min_profitable_iters
1998 Return the number of iterations required for the vector version of the
1999 loop to be profitable relative to the cost of the scalar version of the
2002 TODO: Take profile info into account before making vectorization
2003 decisions, if available. */
2006 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo)
2009 int min_profitable_iters;
2010 int peel_iters_prologue;
2011 int peel_iters_epilogue;
2012 int vec_inside_cost = 0;
2013 int vec_outside_cost = 0;
2014 int scalar_single_iter_cost = 0;
2015 int scalar_outside_cost = 0;
2016 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2017 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2018 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
2019 int nbbs = loop->num_nodes;
2020 int byte_misalign = LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo);
2021 int peel_guard_costs = 0;
2022 int innerloop_iters = 0, factor;
2023 VEC (slp_instance, heap) *slp_instances;
2024 slp_instance instance;
2026 /* Cost model disabled. */
2027 if (!flag_vect_cost_model)
2029 if (vect_print_dump_info (REPORT_COST))
2030 fprintf (vect_dump, "cost model disabled.");
2034 /* Requires loop versioning tests to handle misalignment. */
2035 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2037 /* FIXME: Make cost depend on complexity of individual check. */
2039 VEC_length (gimple, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));
2040 if (vect_print_dump_info (REPORT_COST))
2041 fprintf (vect_dump, "cost model: Adding cost of checks for loop "
2042 "versioning to treat misalignment.\n");
2045 /* Requires loop versioning with alias checks. */
2046 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2048 /* FIXME: Make cost depend on complexity of individual check. */
2050 VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo));
2051 if (vect_print_dump_info (REPORT_COST))
2052 fprintf (vect_dump, "cost model: Adding cost of checks for loop "
2053 "versioning aliasing.\n");
2056 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2057 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2058 vec_outside_cost += TARG_COND_TAKEN_BRANCH_COST;
2060 /* Count statements in scalar loop. Using this as scalar cost for a single
2063 TODO: Add outer loop support.
2065 TODO: Consider assigning different costs to different scalar
2070 innerloop_iters = 50; /* FIXME */
2072 for (i = 0; i < nbbs; i++)
2074 gimple_stmt_iterator si;
2075 basic_block bb = bbs[i];
2077 if (bb->loop_father == loop->inner)
2078 factor = innerloop_iters;
2082 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
2084 gimple stmt = gsi_stmt (si);
2085 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2086 /* Skip stmts that are not vectorized inside the loop. */
2087 if (!STMT_VINFO_RELEVANT_P (stmt_info)
2088 && (!STMT_VINFO_LIVE_P (stmt_info)
2089 || STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def))
2091 scalar_single_iter_cost += cost_for_stmt (stmt) * factor;
2092 vec_inside_cost += STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) * factor;
2093 /* FIXME: for stmts in the inner-loop in outer-loop vectorization,
2094 some of the "outside" costs are generated inside the outer-loop. */
2095 vec_outside_cost += STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info);
2099 /* Add additional cost for the peeled instructions in prologue and epilogue
2102 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
2103 at compile-time - we assume it's vf/2 (the worst would be vf-1).
2105 TODO: Build an expression that represents peel_iters for prologue and
2106 epilogue to be used in a run-time test. */
2108 if (byte_misalign < 0)
2110 peel_iters_prologue = vf/2;
2111 if (vect_print_dump_info (REPORT_COST))
2112 fprintf (vect_dump, "cost model: "
2113 "prologue peel iters set to vf/2.");
2115 /* If peeling for alignment is unknown, loop bound of main loop becomes
2117 peel_iters_epilogue = vf/2;
2118 if (vect_print_dump_info (REPORT_COST))
2119 fprintf (vect_dump, "cost model: "
2120 "epilogue peel iters set to vf/2 because "
2121 "peeling for alignment is unknown .");
2123 /* If peeled iterations are unknown, count a taken branch and a not taken
2124 branch per peeled loop. Even if scalar loop iterations are known,
2125 vector iterations are not known since peeled prologue iterations are
2126 not known. Hence guards remain the same. */
2127 peel_guard_costs += 2 * (TARG_COND_TAKEN_BRANCH_COST
2128 + TARG_COND_NOT_TAKEN_BRANCH_COST);
2134 struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2135 int element_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
2136 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr)));
2137 int nelements = TYPE_VECTOR_SUBPARTS (vectype);
2139 peel_iters_prologue = nelements - (byte_misalign / element_size);
2142 peel_iters_prologue = 0;
2144 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2146 peel_iters_epilogue = vf/2;
2147 if (vect_print_dump_info (REPORT_COST))
2148 fprintf (vect_dump, "cost model: "
2149 "epilogue peel iters set to vf/2 because "
2150 "loop iterations are unknown .");
2152 /* If peeled iterations are known but number of scalar loop
2153 iterations are unknown, count a taken branch per peeled loop. */
2154 peel_guard_costs += 2 * TARG_COND_TAKEN_BRANCH_COST;
2159 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
2160 peel_iters_prologue = niters < peel_iters_prologue ?
2161 niters : peel_iters_prologue;
2162 peel_iters_epilogue = (niters - peel_iters_prologue) % vf;
2166 vec_outside_cost += (peel_iters_prologue * scalar_single_iter_cost)
2167 + (peel_iters_epilogue * scalar_single_iter_cost)
2170 /* FORNOW: The scalar outside cost is incremented in one of the
2173 1. The vectorizer checks for alignment and aliasing and generates
2174 a condition that allows dynamic vectorization. A cost model
2175 check is ANDED with the versioning condition. Hence scalar code
2176 path now has the added cost of the versioning check.
2178 if (cost > th & versioning_check)
2181 Hence run-time scalar is incremented by not-taken branch cost.
2183 2. The vectorizer then checks if a prologue is required. If the
2184 cost model check was not done before during versioning, it has to
2185 be done before the prologue check.
2188 prologue = scalar_iters
2193 if (prologue == num_iters)
2196 Hence the run-time scalar cost is incremented by a taken branch,
2197 plus a not-taken branch, plus a taken branch cost.
2199 3. The vectorizer then checks if an epilogue is required. If the
2200 cost model check was not done before during prologue check, it
2201 has to be done with the epilogue check.
2207 if (prologue == num_iters)
2210 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
2213 Hence the run-time scalar cost should be incremented by 2 taken
2216 TODO: The back end may reorder the BBS's differently and reverse
2217 conditions/branch directions. Change the estimates below to
2218 something more reasonable. */
2220 /* If the number of iterations is known and we do not do versioning, we can
2221 decide whether to vectorize at compile time. Hence the scalar version
2222 do not carry cost model guard costs. */
2223 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2224 || LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2225 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2227 /* Cost model check occurs at versioning. */
2228 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2229 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2230 scalar_outside_cost += TARG_COND_NOT_TAKEN_BRANCH_COST;
2233 /* Cost model check occurs at prologue generation. */
2234 if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2235 scalar_outside_cost += 2 * TARG_COND_TAKEN_BRANCH_COST
2236 + TARG_COND_NOT_TAKEN_BRANCH_COST;
2237 /* Cost model check occurs at epilogue generation. */
2239 scalar_outside_cost += 2 * TARG_COND_TAKEN_BRANCH_COST;
2243 /* Add SLP costs. */
2244 slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
2245 for (i = 0; VEC_iterate (slp_instance, slp_instances, i, instance); i++)
2247 vec_outside_cost += SLP_INSTANCE_OUTSIDE_OF_LOOP_COST (instance);
2248 vec_inside_cost += SLP_INSTANCE_INSIDE_OF_LOOP_COST (instance);
2251 /* Calculate number of iterations required to make the vector version
2252 profitable, relative to the loop bodies only. The following condition
2254 SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
2256 SIC = scalar iteration cost, VIC = vector iteration cost,
2257 VOC = vector outside cost, VF = vectorization factor,
2258 PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
2259 SOC = scalar outside cost for run time cost model check. */
2261 if ((scalar_single_iter_cost * vf) > vec_inside_cost)
2263 if (vec_outside_cost <= 0)
2264 min_profitable_iters = 1;
2267 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost) * vf
2268 - vec_inside_cost * peel_iters_prologue
2269 - vec_inside_cost * peel_iters_epilogue)
2270 / ((scalar_single_iter_cost * vf)
2273 if ((scalar_single_iter_cost * vf * min_profitable_iters)
2274 <= ((vec_inside_cost * min_profitable_iters)
2275 + ((vec_outside_cost - scalar_outside_cost) * vf)))
2276 min_profitable_iters++;
2279 /* vector version will never be profitable. */
2282 if (vect_print_dump_info (REPORT_COST))
2283 fprintf (vect_dump, "cost model: the vector iteration cost = %d "
2284 "divided by the scalar iteration cost = %d "
2285 "is greater or equal to the vectorization factor = %d.",
2286 vec_inside_cost, scalar_single_iter_cost, vf);
2290 if (vect_print_dump_info (REPORT_COST))
2292 fprintf (vect_dump, "Cost model analysis: \n");
2293 fprintf (vect_dump, " Vector inside of loop cost: %d\n",
2295 fprintf (vect_dump, " Vector outside of loop cost: %d\n",
2297 fprintf (vect_dump, " Scalar iteration cost: %d\n",
2298 scalar_single_iter_cost);
2299 fprintf (vect_dump, " Scalar outside cost: %d\n", scalar_outside_cost);
2300 fprintf (vect_dump, " prologue iterations: %d\n",
2301 peel_iters_prologue);
2302 fprintf (vect_dump, " epilogue iterations: %d\n",
2303 peel_iters_epilogue);
2304 fprintf (vect_dump, " Calculated minimum iters for profitability: %d\n",
2305 min_profitable_iters);
2308 min_profitable_iters =
2309 min_profitable_iters < vf ? vf : min_profitable_iters;
2311 /* Because the condition we create is:
2312 if (niters <= min_profitable_iters)
2313 then skip the vectorized loop. */
2314 min_profitable_iters--;
2316 if (vect_print_dump_info (REPORT_COST))
2317 fprintf (vect_dump, " Profitability threshold = %d\n",
2318 min_profitable_iters);
2320 return min_profitable_iters;
2324 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
2325 functions. Design better to avoid maintenance issues. */
2327 /* Function vect_model_reduction_cost.
2329 Models cost for a reduction operation, including the vector ops
2330 generated within the strip-mine loop, the initial definition before
2331 the loop, and the epilogue code that must be generated. */
2334 vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
2338 enum tree_code code;
2341 gimple stmt, orig_stmt;
2343 enum machine_mode mode;
2344 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2345 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2348 /* Cost of reduction op inside loop. */
2349 STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) += ncopies * TARG_VEC_STMT_COST;
2351 stmt = STMT_VINFO_STMT (stmt_info);
2353 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
2355 case GIMPLE_SINGLE_RHS:
2356 gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt)) == ternary_op);
2357 reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), 2);
2359 case GIMPLE_UNARY_RHS:
2360 reduction_op = gimple_assign_rhs1 (stmt);
2362 case GIMPLE_BINARY_RHS:
2363 reduction_op = gimple_assign_rhs2 (stmt);
2369 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
2372 if (vect_print_dump_info (REPORT_COST))
2374 fprintf (vect_dump, "unsupported data-type ");
2375 print_generic_expr (vect_dump, TREE_TYPE (reduction_op), TDF_SLIM);
2380 mode = TYPE_MODE (vectype);
2381 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
2384 orig_stmt = STMT_VINFO_STMT (stmt_info);
2386 code = gimple_assign_rhs_code (orig_stmt);
2388 /* Add in cost for initial definition. */
2389 outer_cost += TARG_SCALAR_TO_VEC_COST;
2391 /* Determine cost of epilogue code.
2393 We have a reduction operator that will reduce the vector in one statement.
2394 Also requires scalar extract. */
2396 if (!nested_in_vect_loop_p (loop, orig_stmt))
2398 if (reduc_code != ERROR_MARK)
2399 outer_cost += TARG_VEC_STMT_COST + TARG_VEC_TO_SCALAR_COST;
2402 int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
2404 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
2405 int element_bitsize = tree_low_cst (bitsize, 1);
2406 int nelements = vec_size_in_bits / element_bitsize;
2408 optab = optab_for_tree_code (code, vectype, optab_default);
2410 /* We have a whole vector shift available. */
2411 if (VECTOR_MODE_P (mode)
2412 && optab_handler (optab, mode)->insn_code != CODE_FOR_nothing
2413 && optab_handler (vec_shr_optab, mode)->insn_code != CODE_FOR_nothing)
2414 /* Final reduction via vector shifts and the reduction operator. Also
2415 requires scalar extract. */
2416 outer_cost += ((exact_log2(nelements) * 2) * TARG_VEC_STMT_COST
2417 + TARG_VEC_TO_SCALAR_COST);
2419 /* Use extracts and reduction op for final reduction. For N elements,
2420 we have N extracts and N-1 reduction ops. */
2421 outer_cost += ((nelements + nelements - 1) * TARG_VEC_STMT_COST);
2425 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info) = outer_cost;
2427 if (vect_print_dump_info (REPORT_COST))
2428 fprintf (vect_dump, "vect_model_reduction_cost: inside_cost = %d, "
2429 "outside_cost = %d .", STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info),
2430 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info));
2436 /* Function vect_model_induction_cost.
2438 Models cost for induction operations. */
2441 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
2443 /* loop cost for vec_loop. */
2444 STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) = ncopies * TARG_VEC_STMT_COST;
2445 /* prologue cost for vec_init and vec_step. */
2446 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info) = 2 * TARG_SCALAR_TO_VEC_COST;
2448 if (vect_print_dump_info (REPORT_COST))
2449 fprintf (vect_dump, "vect_model_induction_cost: inside_cost = %d, "
2450 "outside_cost = %d .", STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info),
2451 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info));
2455 /* Function get_initial_def_for_induction
2458 STMT - a stmt that performs an induction operation in the loop.
2459 IV_PHI - the initial value of the induction variable
2462 Return a vector variable, initialized with the first VF values of
2463 the induction variable. E.g., for an iv with IV_PHI='X' and
2464 evolution S, for a vector of 4 units, we want to return:
2465 [X, X + S, X + 2*S, X + 3*S]. */
2468 get_initial_def_for_induction (gimple iv_phi)
2470 stmt_vec_info stmt_vinfo = vinfo_for_stmt (iv_phi);
2471 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
2472 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2473 tree scalar_type = TREE_TYPE (gimple_phi_result (iv_phi));
2476 edge pe = loop_preheader_edge (loop);
2477 struct loop *iv_loop;
2479 tree vec, vec_init, vec_step, t;
2483 gimple init_stmt, induction_phi, new_stmt;
2484 tree induc_def, vec_def, vec_dest;
2485 tree init_expr, step_expr;
2486 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2491 stmt_vec_info phi_info = vinfo_for_stmt (iv_phi);
2492 bool nested_in_vect_loop = false;
2493 gimple_seq stmts = NULL;
2494 imm_use_iterator imm_iter;
2495 use_operand_p use_p;
2499 gimple_stmt_iterator si;
2500 basic_block bb = gimple_bb (iv_phi);
2503 vectype = get_vectype_for_scalar_type (scalar_type);
2504 gcc_assert (vectype);
2505 nunits = TYPE_VECTOR_SUBPARTS (vectype);
2506 ncopies = vf / nunits;
2508 gcc_assert (phi_info);
2509 gcc_assert (ncopies >= 1);
2511 /* Find the first insertion point in the BB. */
2512 si = gsi_after_labels (bb);
2514 if (INTEGRAL_TYPE_P (scalar_type))
2515 step_expr = build_int_cst (scalar_type, 0);
2516 else if (POINTER_TYPE_P (scalar_type))
2517 step_expr = build_int_cst (sizetype, 0);
2519 step_expr = build_real (scalar_type, dconst0);
2521 /* Is phi in an inner-loop, while vectorizing an enclosing outer-loop? */
2522 if (nested_in_vect_loop_p (loop, iv_phi))
2524 nested_in_vect_loop = true;
2525 iv_loop = loop->inner;
2529 gcc_assert (iv_loop == (gimple_bb (iv_phi))->loop_father);
2531 latch_e = loop_latch_edge (iv_loop);
2532 loop_arg = PHI_ARG_DEF_FROM_EDGE (iv_phi, latch_e);
2534 access_fn = analyze_scalar_evolution (iv_loop, PHI_RESULT (iv_phi));
2535 gcc_assert (access_fn);
2536 ok = vect_is_simple_iv_evolution (iv_loop->num, access_fn,
2537 &init_expr, &step_expr);
2539 pe = loop_preheader_edge (iv_loop);
2541 /* Create the vector that holds the initial_value of the induction. */
2542 if (nested_in_vect_loop)
2544 /* iv_loop is nested in the loop to be vectorized. init_expr had already
2545 been created during vectorization of previous stmts; We obtain it from
2546 the STMT_VINFO_VEC_STMT of the defining stmt. */
2547 tree iv_def = PHI_ARG_DEF_FROM_EDGE (iv_phi,
2548 loop_preheader_edge (iv_loop));
2549 vec_init = vect_get_vec_def_for_operand (iv_def, iv_phi, NULL);
2553 /* iv_loop is the loop to be vectorized. Create:
2554 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
2555 new_var = vect_get_new_vect_var (scalar_type, vect_scalar_var, "var_");
2556 add_referenced_var (new_var);
2558 new_name = force_gimple_operand (init_expr, &stmts, false, new_var);
2561 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
2562 gcc_assert (!new_bb);
2566 t = tree_cons (NULL_TREE, init_expr, t);
2567 for (i = 1; i < nunits; i++)
2569 /* Create: new_name_i = new_name + step_expr */
2570 enum tree_code code = POINTER_TYPE_P (scalar_type)
2571 ? POINTER_PLUS_EXPR : PLUS_EXPR;
2572 init_stmt = gimple_build_assign_with_ops (code, new_var,
2573 new_name, step_expr);
2574 new_name = make_ssa_name (new_var, init_stmt);
2575 gimple_assign_set_lhs (init_stmt, new_name);
2577 new_bb = gsi_insert_on_edge_immediate (pe, init_stmt);
2578 gcc_assert (!new_bb);
2580 if (vect_print_dump_info (REPORT_DETAILS))
2582 fprintf (vect_dump, "created new init_stmt: ");
2583 print_gimple_stmt (vect_dump, init_stmt, 0, TDF_SLIM);
2585 t = tree_cons (NULL_TREE, new_name, t);
2587 /* Create a vector from [new_name_0, new_name_1, ..., new_name_nunits-1] */
2588 vec = build_constructor_from_list (vectype, nreverse (t));
2589 vec_init = vect_init_vector (iv_phi, vec, vectype, NULL);
2593 /* Create the vector that holds the step of the induction. */
2594 if (nested_in_vect_loop)
2595 /* iv_loop is nested in the loop to be vectorized. Generate:
2596 vec_step = [S, S, S, S] */
2597 new_name = step_expr;
2600 /* iv_loop is the loop to be vectorized. Generate:
2601 vec_step = [VF*S, VF*S, VF*S, VF*S] */
2602 expr = build_int_cst (TREE_TYPE (step_expr), vf);
2603 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
2608 for (i = 0; i < nunits; i++)
2609 t = tree_cons (NULL_TREE, unshare_expr (new_name), t);
2610 gcc_assert (CONSTANT_CLASS_P (new_name));
2611 stepvectype = get_vectype_for_scalar_type (TREE_TYPE (new_name));
2612 gcc_assert (stepvectype);
2613 vec = build_vector (stepvectype, t);
2614 vec_step = vect_init_vector (iv_phi, vec, stepvectype, NULL);
2617 /* Create the following def-use cycle:
2622 vec_iv = PHI <vec_init, vec_loop>
2626 vec_loop = vec_iv + vec_step; */
2628 /* Create the induction-phi that defines the induction-operand. */
2629 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
2630 add_referenced_var (vec_dest);
2631 induction_phi = create_phi_node (vec_dest, iv_loop->header);
2632 set_vinfo_for_stmt (induction_phi,
2633 new_stmt_vec_info (induction_phi, loop_vinfo, NULL));
2634 induc_def = PHI_RESULT (induction_phi);
2636 /* Create the iv update inside the loop */
2637 new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, vec_dest,
2638 induc_def, vec_step);
2639 vec_def = make_ssa_name (vec_dest, new_stmt);
2640 gimple_assign_set_lhs (new_stmt, vec_def);
2641 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
2642 set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo,
2645 /* Set the arguments of the phi node: */
2646 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
2647 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
2651 /* In case that vectorization factor (VF) is bigger than the number
2652 of elements that we can fit in a vectype (nunits), we have to generate
2653 more than one vector stmt - i.e - we need to "unroll" the
2654 vector stmt by a factor VF/nunits. For more details see documentation
2655 in vectorizable_operation. */
2659 stmt_vec_info prev_stmt_vinfo;
2660 /* FORNOW. This restriction should be relaxed. */
2661 gcc_assert (!nested_in_vect_loop);
2663 /* Create the vector that holds the step of the induction. */
2664 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
2665 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
2668 for (i = 0; i < nunits; i++)
2669 t = tree_cons (NULL_TREE, unshare_expr (new_name), t);
2670 gcc_assert (CONSTANT_CLASS_P (new_name));
2671 vec = build_vector (stepvectype, t);
2672 vec_step = vect_init_vector (iv_phi, vec, stepvectype, NULL);
2674 vec_def = induc_def;
2675 prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
2676 for (i = 1; i < ncopies; i++)
2678 /* vec_i = vec_prev + vec_step */
2679 new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, vec_dest,
2681 vec_def = make_ssa_name (vec_dest, new_stmt);
2682 gimple_assign_set_lhs (new_stmt, vec_def);
2684 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
2685 set_vinfo_for_stmt (new_stmt,
2686 new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
2687 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
2688 prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
2692 if (nested_in_vect_loop)
2694 /* Find the loop-closed exit-phi of the induction, and record
2695 the final vector of induction results: */
2697 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
2699 if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (USE_STMT (use_p))))
2701 exit_phi = USE_STMT (use_p);
2707 stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
2708 /* FORNOW. Currently not supporting the case that an inner-loop induction
2709 is not used in the outer-loop (i.e. only outside the outer-loop). */
2710 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
2711 && !STMT_VINFO_LIVE_P (stmt_vinfo));
2713 STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
2714 if (vect_print_dump_info (REPORT_DETAILS))
2716 fprintf (vect_dump, "vector of inductions after inner-loop:");
2717 print_gimple_stmt (vect_dump, new_stmt, 0, TDF_SLIM);
2723 if (vect_print_dump_info (REPORT_DETAILS))
2725 fprintf (vect_dump, "transform induction: created def-use cycle: ");
2726 print_gimple_stmt (vect_dump, induction_phi, 0, TDF_SLIM);
2727 fprintf (vect_dump, "\n");
2728 print_gimple_stmt (vect_dump, SSA_NAME_DEF_STMT (vec_def), 0, TDF_SLIM);
2731 STMT_VINFO_VEC_STMT (phi_info) = induction_phi;
2736 /* Function get_initial_def_for_reduction
2739 STMT - a stmt that performs a reduction operation in the loop.
2740 INIT_VAL - the initial value of the reduction variable
2743 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
2744 of the reduction (used for adjusting the epilog - see below).
2745 Return a vector variable, initialized according to the operation that STMT
2746 performs. This vector will be used as the initial value of the
2747 vector of partial results.
2749 Option1 (adjust in epilog): Initialize the vector as follows:
2750 add/bit or/xor: [0,0,...,0,0]
2751 mult/bit and: [1,1,...,1,1]
2752 min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
2753 and when necessary (e.g. add/mult case) let the caller know
2754 that it needs to adjust the result by init_val.
2756 Option2: Initialize the vector as follows:
2757 add/bit or/xor: [init_val,0,0,...,0]
2758 mult/bit and: [init_val,1,1,...,1]
2759 min/max/cond_expr: [init_val,init_val,...,init_val]
2760 and no adjustments are needed.
2762 For example, for the following code:
2768 STMT is 's = s + a[i]', and the reduction variable is 's'.
2769 For a vector of 4 units, we want to return either [0,0,0,init_val],
2770 or [0,0,0,0] and let the caller know that it needs to adjust
2771 the result at the end by 'init_val'.
2773 FORNOW, we are using the 'adjust in epilog' scheme, because this way the
2774 initialization vector is simpler (same element in all entries), if
2775 ADJUSTMENT_DEF is not NULL, and Option2 otherwise.
2777 A cost model should help decide between these two schemes. */
2780 get_initial_def_for_reduction (gimple stmt, tree init_val,
2781 tree *adjustment_def)
2783 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
2784 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
2785 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2786 tree scalar_type = TREE_TYPE (init_val);
2787 tree vectype = get_vectype_for_scalar_type (scalar_type);
2789 enum tree_code code = gimple_assign_rhs_code (stmt);
2794 bool nested_in_vect_loop = false;
2796 REAL_VALUE_TYPE real_init_val = dconst0;
2797 int int_init_val = 0;
2798 gimple def_stmt = NULL;
2800 gcc_assert (vectype);
2801 nunits = TYPE_VECTOR_SUBPARTS (vectype);
2803 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
2804 || SCALAR_FLOAT_TYPE_P (scalar_type));
2806 if (nested_in_vect_loop_p (loop, stmt))
2807 nested_in_vect_loop = true;
2809 gcc_assert (loop == (gimple_bb (stmt))->loop_father);
2811 /* In case of double reduction we only create a vector variable to be put
2812 in the reduction phi node. The actual statement creation is done in
2813 vect_create_epilog_for_reduction. */
2814 if (adjustment_def && nested_in_vect_loop
2815 && TREE_CODE (init_val) == SSA_NAME
2816 && (def_stmt = SSA_NAME_DEF_STMT (init_val))
2817 && gimple_code (def_stmt) == GIMPLE_PHI
2818 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2819 && vinfo_for_stmt (def_stmt)
2820 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2821 == vect_double_reduction_def)
2823 *adjustment_def = NULL;
2824 return vect_create_destination_var (init_val, vectype);
2827 if (TREE_CONSTANT (init_val))
2829 if (SCALAR_FLOAT_TYPE_P (scalar_type))
2830 init_value = build_real (scalar_type, TREE_REAL_CST (init_val));
2832 init_value = build_int_cst (scalar_type, TREE_INT_CST_LOW (init_val));
2835 init_value = init_val;
2839 case WIDEN_SUM_EXPR:
2847 /* ADJUSMENT_DEF is NULL when called from
2848 vect_create_epilog_for_reduction to vectorize double reduction. */
2851 if (nested_in_vect_loop)
2852 *adjustment_def = vect_get_vec_def_for_operand (init_val, stmt,
2855 *adjustment_def = init_val;
2858 if (code == MULT_EXPR || code == BIT_AND_EXPR)
2860 real_init_val = dconst1;
2864 if (SCALAR_FLOAT_TYPE_P (scalar_type))
2865 def_for_init = build_real (scalar_type, real_init_val);
2867 def_for_init = build_int_cst (scalar_type, int_init_val);
2869 /* Create a vector of '0' or '1' except the first element. */
2870 for (i = nunits - 2; i >= 0; --i)
2871 t = tree_cons (NULL_TREE, def_for_init, t);
2873 /* Option1: the first element is '0' or '1' as well. */
2876 t = tree_cons (NULL_TREE, def_for_init, t);
2877 init_def = build_vector (vectype, t);
2881 /* Option2: the first element is INIT_VAL. */
2882 t = tree_cons (NULL_TREE, init_value, t);
2883 if (TREE_CONSTANT (init_val))
2884 init_def = build_vector (vectype, t);
2886 init_def = build_constructor_from_list (vectype, t);
2895 *adjustment_def = NULL_TREE;
2896 init_def = vect_get_vec_def_for_operand (init_val, stmt, NULL);
2900 for (i = nunits - 1; i >= 0; --i)
2901 t = tree_cons (NULL_TREE, init_value, t);
2903 if (TREE_CONSTANT (init_val))
2904 init_def = build_vector (vectype, t);
2906 init_def = build_constructor_from_list (vectype, t);
2918 /* Function vect_create_epilog_for_reduction
2920 Create code at the loop-epilog to finalize the result of a reduction
2923 VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector
2924 reduction statements.
2925 STMT is the scalar reduction stmt that is being vectorized.
2926 NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
2927 number of elements that we can fit in a vectype (nunits). In this case
2928 we have to generate more than one vector stmt - i.e - we need to "unroll"
2929 the vector stmt by a factor VF/nunits. For more details see documentation
2930 in vectorizable_operation.
2931 REDUC_CODE is the tree-code for the epilog reduction.
2932 REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
2934 REDUC_INDEX is the index of the operand in the right hand side of the
2935 statement that is defined by REDUCTION_PHI.
2936 DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
2937 SLP_NODE is an SLP node containing a group of reduction statements. The
2938 first one in this group is STMT.
2941 1. Creates the reduction def-use cycles: sets the arguments for
2943 The loop-entry argument is the vectorized initial-value of the reduction.
2944 The loop-latch argument is taken from VECT_DEFS - the vector of partial
2946 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
2947 by applying the operation specified by REDUC_CODE if available, or by
2948 other means (whole-vector shifts or a scalar loop).
2949 The function also creates a new phi node at the loop exit to preserve
2950 loop-closed form, as illustrated below.
2952 The flow at the entry to this function:
2955 vec_def = phi <null, null> # REDUCTION_PHI
2956 VECT_DEF = vector_stmt # vectorized form of STMT
2957 s_loop = scalar_stmt # (scalar) STMT
2959 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
2963 The above is transformed by this function into:
2966 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
2967 VECT_DEF = vector_stmt # vectorized form of STMT
2968 s_loop = scalar_stmt # (scalar) STMT
2970 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
2971 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
2972 v_out2 = reduce <v_out1>
2973 s_out3 = extract_field <v_out2, 0>
2974 s_out4 = adjust_result <s_out3>
2980 vect_create_epilog_for_reduction (VEC (tree, heap) *vect_defs, gimple stmt,
2981 int ncopies, enum tree_code reduc_code,
2982 VEC (gimple, heap) *reduction_phis,
2983 int reduc_index, bool double_reduc,
2986 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2987 stmt_vec_info prev_phi_info;
2989 enum machine_mode mode;
2990 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2991 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
2992 basic_block exit_bb;
2995 gimple new_phi = NULL, phi;
2996 gimple_stmt_iterator exit_gsi;
2998 tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
2999 gimple epilog_stmt = NULL;
3000 enum tree_code code = gimple_assign_rhs_code (stmt);
3002 tree bitsize, bitpos;
3003 tree adjustment_def = NULL;
3004 tree vec_initial_def = NULL;
3005 tree reduction_op, expr, def;
3006 tree orig_name, scalar_result;
3007 imm_use_iterator imm_iter;
3008 use_operand_p use_p;
3009 bool extract_scalar_result = false;
3010 gimple use_stmt, orig_stmt, reduction_phi = NULL;
3011 bool nested_in_vect_loop = false;
3012 VEC (gimple, heap) *new_phis = NULL;
3013 enum vect_def_type dt = vect_unknown_def_type;
3015 VEC (tree, heap) *scalar_results = NULL;
3016 unsigned int group_size = 1, k, ratio;
3017 VEC (tree, heap) *vec_initial_defs = NULL;
3018 VEC (gimple, heap) *phis;
3021 group_size = VEC_length (gimple, SLP_TREE_SCALAR_STMTS (slp_node));
3023 if (nested_in_vect_loop_p (loop, stmt))
3027 nested_in_vect_loop = true;
3028 gcc_assert (!slp_node);
3031 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
3033 case GIMPLE_SINGLE_RHS:
3034 gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt))
3036 reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), reduc_index);
3038 case GIMPLE_UNARY_RHS:
3039 reduction_op = gimple_assign_rhs1 (stmt);
3041 case GIMPLE_BINARY_RHS:
3042 reduction_op = reduc_index ?
3043 gimple_assign_rhs2 (stmt) : gimple_assign_rhs1 (stmt);
3049 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
3050 gcc_assert (vectype);
3051 mode = TYPE_MODE (vectype);
3053 /* 1. Create the reduction def-use cycle:
3054 Set the arguments of REDUCTION_PHIS, i.e., transform
3057 vec_def = phi <null, null> # REDUCTION_PHI
3058 VECT_DEF = vector_stmt # vectorized form of STMT
3064 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
3065 VECT_DEF = vector_stmt # vectorized form of STMT
3068 (in case of SLP, do it for all the phis). */
3070 /* Get the loop-entry arguments. */
3072 vect_get_slp_defs (slp_node, &vec_initial_defs, NULL, reduc_index);
3075 vec_initial_defs = VEC_alloc (tree, heap, 1);
3076 /* For the case of reduction, vect_get_vec_def_for_operand returns
3077 the scalar def before the loop, that defines the initial value
3078 of the reduction variable. */
3079 vec_initial_def = vect_get_vec_def_for_operand (reduction_op, stmt,
3081 VEC_quick_push (tree, vec_initial_defs, vec_initial_def);
3084 /* Set phi nodes arguments. */
3085 for (i = 0; VEC_iterate (gimple, reduction_phis, i, phi); i++)
3087 tree vec_init_def = VEC_index (tree, vec_initial_defs, i);
3088 tree def = VEC_index (tree, vect_defs, i);
3089 for (j = 0; j < ncopies; j++)
3091 /* Set the loop-entry arg of the reduction-phi. */
3092 add_phi_arg (phi, vec_init_def, loop_preheader_edge (loop),
3095 /* Set the loop-latch arg for the reduction-phi. */
3097 def = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type, def);
3099 add_phi_arg (phi, def, loop_latch_edge (loop), UNKNOWN_LOCATION);
3101 if (vect_print_dump_info (REPORT_DETAILS))
3103 fprintf (vect_dump, "transform reduction: created def-use"
3105 print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
3106 fprintf (vect_dump, "\n");
3107 print_gimple_stmt (vect_dump, SSA_NAME_DEF_STMT (def), 0,
3111 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
3115 VEC_free (tree, heap, vec_initial_defs);
3117 /* 2. Create epilog code.
3118 The reduction epilog code operates across the elements of the vector
3119 of partial results computed by the vectorized loop.
3120 The reduction epilog code consists of:
3122 step 1: compute the scalar result in a vector (v_out2)
3123 step 2: extract the scalar result (s_out3) from the vector (v_out2)
3124 step 3: adjust the scalar result (s_out3) if needed.
3126 Step 1 can be accomplished using one the following three schemes:
3127 (scheme 1) using reduc_code, if available.
3128 (scheme 2) using whole-vector shifts, if available.
3129 (scheme 3) using a scalar loop. In this case steps 1+2 above are
3132 The overall epilog code looks like this:
3134 s_out0 = phi <s_loop> # original EXIT_PHI
3135 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
3136 v_out2 = reduce <v_out1> # step 1
3137 s_out3 = extract_field <v_out2, 0> # step 2
3138 s_out4 = adjust_result <s_out3> # step 3
3140 (step 3 is optional, and steps 1 and 2 may be combined).
3141 Lastly, the uses of s_out0 are replaced by s_out4. */
3144 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
3145 v_out1 = phi <VECT_DEF>
3146 Store them in NEW_PHIS. */
3148 exit_bb = single_exit (loop)->dest;
3149 prev_phi_info = NULL;
3150 new_phis = VEC_alloc (gimple, heap, VEC_length (tree, vect_defs));
3151 for (i = 0; VEC_iterate (tree, vect_defs, i, def); i++)
3153 for (j = 0; j < ncopies; j++)
3155 phi = create_phi_node (SSA_NAME_VAR (def), exit_bb);
3156 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo, NULL));
3158 VEC_quick_push (gimple, new_phis, phi);
3161 def = vect_get_vec_def_for_stmt_copy (dt, def);
3162 STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
3165 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
3166 prev_phi_info = vinfo_for_stmt (phi);
3170 exit_gsi = gsi_after_labels (exit_bb);
3172 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
3173 (i.e. when reduc_code is not available) and in the final adjustment
3174 code (if needed). Also get the original scalar reduction variable as
3175 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
3176 represents a reduction pattern), the tree-code and scalar-def are
3177 taken from the original stmt that the pattern-stmt (STMT) replaces.
3178 Otherwise (it is a regular reduction) - the tree-code and scalar-def
3179 are taken from STMT. */
3181 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3184 /* Regular reduction */
3189 /* Reduction pattern */
3190 stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
3191 gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
3192 gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
3195 code = gimple_assign_rhs_code (orig_stmt);
3196 /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
3197 partial results are added and not subtracted. */
3198 if (code == MINUS_EXPR)
3201 scalar_dest = gimple_assign_lhs (orig_stmt);
3202 scalar_type = TREE_TYPE (scalar_dest);
3203 scalar_results = VEC_alloc (tree, heap, group_size);
3204 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
3205 bitsize = TYPE_SIZE (scalar_type);
3207 /* In case this is a reduction in an inner-loop while vectorizing an outer
3208 loop - we don't need to extract a single scalar result at the end of the
3209 inner-loop (unless it is double reduction, i.e., the use of reduction is
3210 outside the outer-loop). The final vector of partial results will be used
3211 in the vectorized outer-loop, or reduced to a scalar result at the end of
3213 if (nested_in_vect_loop && !double_reduc)
3214 goto vect_finalize_reduction;
3216 /* 2.3 Create the reduction code, using one of the three schemes described
3217 above. In SLP we simply need to extract all the elements from the
3218 vector (without reducing them), so we use scalar shifts. */
3219 if (reduc_code != ERROR_MARK && !slp_node)
3223 /*** Case 1: Create:
3224 v_out2 = reduc_expr <v_out1> */
3226 if (vect_print_dump_info (REPORT_DETAILS))
3227 fprintf (vect_dump, "Reduce using direct vector reduction.");
3229 vec_dest = vect_create_destination_var (scalar_dest, vectype);
3230 new_phi = VEC_index (gimple, new_phis, 0);
3231 tmp = build1 (reduc_code, vectype, PHI_RESULT (new_phi));
3232 epilog_stmt = gimple_build_assign (vec_dest, tmp);
3233 new_temp = make_ssa_name (vec_dest, epilog_stmt);
3234 gimple_assign_set_lhs (epilog_stmt, new_temp);
3235 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3237 extract_scalar_result = true;
3241 enum tree_code shift_code = ERROR_MARK;
3242 bool have_whole_vector_shift = true;
3244 int element_bitsize = tree_low_cst (bitsize, 1);
3245 int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
3248 if (optab_handler (vec_shr_optab, mode)->insn_code != CODE_FOR_nothing)
3249 shift_code = VEC_RSHIFT_EXPR;
3251 have_whole_vector_shift = false;
3253 /* Regardless of whether we have a whole vector shift, if we're
3254 emulating the operation via tree-vect-generic, we don't want
3255 to use it. Only the first round of the reduction is likely
3256 to still be profitable via emulation. */
3257 /* ??? It might be better to emit a reduction tree code here, so that
3258 tree-vect-generic can expand the first round via bit tricks. */
3259 if (!VECTOR_MODE_P (mode))
3260 have_whole_vector_shift = false;
3263 optab optab = optab_for_tree_code (code, vectype, optab_default);
3264 if (optab_handler (optab, mode)->insn_code == CODE_FOR_nothing)
3265 have_whole_vector_shift = false;
3268 if (have_whole_vector_shift && !slp_node)
3270 /*** Case 2: Create:
3271 for (offset = VS/2; offset >= element_size; offset/=2)
3273 Create: va' = vec_shift <va, offset>
3274 Create: va = vop <va, va'>
3277 if (vect_print_dump_info (REPORT_DETAILS))
3278 fprintf (vect_dump, "Reduce using vector shifts");
3280 vec_dest = vect_create_destination_var (scalar_dest, vectype);
3281 new_phi = VEC_index (gimple, new_phis, 0);
3282 new_temp = PHI_RESULT (new_phi);
3283 for (bit_offset = vec_size_in_bits/2;
3284 bit_offset >= element_bitsize;
3287 tree bitpos = size_int (bit_offset);
3289 epilog_stmt = gimple_build_assign_with_ops (shift_code,
3290 vec_dest, new_temp, bitpos);
3291 new_name = make_ssa_name (vec_dest, epilog_stmt);
3292 gimple_assign_set_lhs (epilog_stmt, new_name);
3293 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3295 epilog_stmt = gimple_build_assign_with_ops (code, vec_dest,
3296 new_name, new_temp);
3297 new_temp = make_ssa_name (vec_dest, epilog_stmt);
3298 gimple_assign_set_lhs (epilog_stmt, new_temp);
3299 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3302 extract_scalar_result = true;
3308 /*** Case 3: Create:
3309 s = extract_field <v_out2, 0>
3310 for (offset = element_size;
3311 offset < vector_size;
3312 offset += element_size;)
3314 Create: s' = extract_field <v_out2, offset>
3315 Create: s = op <s, s'> // For non SLP cases
3318 if (vect_print_dump_info (REPORT_DETAILS))
3319 fprintf (vect_dump, "Reduce using scalar code. ");
3321 vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
3322 for (i = 0; VEC_iterate (gimple, new_phis, i, new_phi); i++)
3324 vec_temp = PHI_RESULT (new_phi);
3325 rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
3327 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
3328 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
3329 gimple_assign_set_lhs (epilog_stmt, new_temp);
3330 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3332 /* In SLP we don't need to apply reduction operation, so we just
3333 collect s' values in SCALAR_RESULTS. */
3335 VEC_safe_push (tree, heap, scalar_results, new_temp);
3337 for (bit_offset = element_bitsize;
3338 bit_offset < vec_size_in_bits;
3339 bit_offset += element_bitsize)
3341 tree bitpos = bitsize_int (bit_offset);
3342 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
3345 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
3346 new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
3347 gimple_assign_set_lhs (epilog_stmt, new_name);
3348 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3352 /* In SLP we don't need to apply reduction operation, so
3353 we just collect s' values in SCALAR_RESULTS. */
3354 new_temp = new_name;
3355 VEC_safe_push (tree, heap, scalar_results, new_name);
3359 epilog_stmt = gimple_build_assign_with_ops (code,
3360 new_scalar_dest, new_name, new_temp);
3361 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
3362 gimple_assign_set_lhs (epilog_stmt, new_temp);
3363 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3368 /* The only case where we need to reduce scalar results in SLP, is
3369 unrolling. If the size of SCALAR_RESULTS is greater than
3370 GROUP_SIZE, we reduce them combining elements modulo
3374 tree res, first_res, new_res;
3377 /* Reduce multiple scalar results in case of SLP unrolling. */
3378 for (j = group_size; VEC_iterate (tree, scalar_results, j, res);
3381 first_res = VEC_index (tree, scalar_results, j % group_size);
3382 new_stmt = gimple_build_assign_with_ops (code,
3383 new_scalar_dest, first_res, res);
3384 new_res = make_ssa_name (new_scalar_dest, new_stmt);
3385 gimple_assign_set_lhs (new_stmt, new_res);
3386 gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
3387 VEC_replace (tree, scalar_results, j % group_size, new_res);
3391 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
3392 VEC_safe_push (tree, heap, scalar_results, new_temp);
3394 extract_scalar_result = false;
3398 /* 2.4 Extract the final scalar result. Create:
3399 s_out3 = extract_field <v_out2, bitpos> */
3401 if (extract_scalar_result)
3405 if (vect_print_dump_info (REPORT_DETAILS))
3406 fprintf (vect_dump, "extract scalar result");
3408 if (BYTES_BIG_ENDIAN)
3409 bitpos = size_binop (MULT_EXPR,
3410 bitsize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1),
3411 TYPE_SIZE (scalar_type));
3413 bitpos = bitsize_zero_node;
3415 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp, bitsize, bitpos);
3416 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
3417 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
3418 gimple_assign_set_lhs (epilog_stmt, new_temp);
3419 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3420 VEC_safe_push (tree, heap, scalar_results, new_temp);
3423 vect_finalize_reduction:
3425 /* 2.5 Adjust the final result by the initial value of the reduction
3426 variable. (When such adjustment is not needed, then
3427 'adjustment_def' is zero). For example, if code is PLUS we create:
3428 new_temp = loop_exit_def + adjustment_def */
3432 gcc_assert (!slp_node);
3433 if (nested_in_vect_loop)
3435 new_phi = VEC_index (gimple, new_phis, 0);
3436 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
3437 expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
3438 new_dest = vect_create_destination_var (scalar_dest, vectype);
3442 new_temp = VEC_index (tree, scalar_results, 0);
3443 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
3444 expr = build2 (code, scalar_type, new_temp, adjustment_def);
3445 new_dest = vect_create_destination_var (scalar_dest, scalar_type);
3448 epilog_stmt = gimple_build_assign (new_dest, expr);
3449 new_temp = make_ssa_name (new_dest, epilog_stmt);
3450 gimple_assign_set_lhs (epilog_stmt, new_temp);
3451 SSA_NAME_DEF_STMT (new_temp) = epilog_stmt;
3452 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3453 if (nested_in_vect_loop)
3455 set_vinfo_for_stmt (epilog_stmt,
3456 new_stmt_vec_info (epilog_stmt, loop_vinfo,
3458 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
3459 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
3462 VEC_quick_push (tree, scalar_results, new_temp);
3464 VEC_replace (tree, scalar_results, 0, new_temp);
3467 VEC_replace (tree, scalar_results, 0, new_temp);
3469 VEC_replace (gimple, new_phis, 0, epilog_stmt);
3472 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
3473 phis with new adjusted scalar results, i.e., replace use <s_out0>
3478 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
3479 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
3480 v_out2 = reduce <v_out1>
3481 s_out3 = extract_field <v_out2, 0>
3482 s_out4 = adjust_result <s_out3>
3489 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
3490 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
3491 v_out2 = reduce <v_out1>
3492 s_out3 = extract_field <v_out2, 0>
3493 s_out4 = adjust_result <s_out3>
3497 /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
3498 case that GROUP_SIZE is greater than vectorization factor). Therefore, we
3499 need to match SCALAR_RESULTS with corresponding statements. The first
3500 (GROUP_SIZE / number of new vector stmts) scalar results correspond to
3501 the first vector stmt, etc.
3502 (RATIO is equal to (GROUP_SIZE / number of new vector stmts)). */
3503 if (group_size > VEC_length (gimple, new_phis))
3505 ratio = group_size / VEC_length (gimple, new_phis);
3506 gcc_assert (!(group_size % VEC_length (gimple, new_phis)));
3511 for (k = 0; k < group_size; k++)
3515 epilog_stmt = VEC_index (gimple, new_phis, k / ratio);
3516 reduction_phi = VEC_index (gimple, reduction_phis, k / ratio);
3521 gimple current_stmt = VEC_index (gimple,
3522 SLP_TREE_SCALAR_STMTS (slp_node), k);
3524 orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (current_stmt));
3525 /* SLP statements can't participate in patterns. */
3526 gcc_assert (!orig_stmt);
3527 scalar_dest = gimple_assign_lhs (current_stmt);
3530 phis = VEC_alloc (gimple, heap, 3);
3531 /* Find the loop-closed-use at the loop exit of the original scalar
3532 result. (The reduction result is expected to have two immediate uses -
3533 one at the latch block, and one at the loop exit). */
3534 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
3535 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
3536 VEC_safe_push (gimple, heap, phis, USE_STMT (use_p));
3538 /* We expect to have found an exit_phi because of loop-closed-ssa
3540 gcc_assert (!VEC_empty (gimple, phis));
3542 for (i = 0; VEC_iterate (gimple, phis, i, exit_phi); i++)
3546 stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
3549 /* FORNOW. Currently not supporting the case that an inner-loop
3550 reduction is not used in the outer-loop (but only outside the
3551 outer-loop), unless it is double reduction. */
3552 gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
3553 && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
3556 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt;
3558 || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
3559 != vect_double_reduction_def)
3562 /* Handle double reduction:
3564 stmt1: s1 = phi <s0, s2> - double reduction phi (outer loop)
3565 stmt2: s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
3566 stmt3: s4 = use (s3) - (regular) reduc stmt (inner loop)
3567 stmt4: s2 = phi <s4> - double reduction stmt (outer loop)
3569 At that point the regular reduction (stmt2 and stmt3) is
3570 already vectorized, as well as the exit phi node, stmt4.
3571 Here we vectorize the phi node of double reduction, stmt1, and
3572 update all relevant statements. */
3574 /* Go through all the uses of s2 to find double reduction phi
3575 node, i.e., stmt1 above. */
3576 orig_name = PHI_RESULT (exit_phi);
3577 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
3579 stmt_vec_info use_stmt_vinfo = vinfo_for_stmt (use_stmt);
3580 stmt_vec_info new_phi_vinfo;
3581 tree vect_phi_init, preheader_arg, vect_phi_res, init_def;
3582 basic_block bb = gimple_bb (use_stmt);
3585 /* Check that USE_STMT is really double reduction phi
3587 if (gimple_code (use_stmt) != GIMPLE_PHI
3588 || gimple_phi_num_args (use_stmt) != 2
3590 || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
3591 != vect_double_reduction_def
3592 || bb->loop_father != outer_loop)
3595 /* Create vector phi node for double reduction:
3596 vs1 = phi <vs0, vs2>
3597 vs1 was created previously in this function by a call to
3598 vect_get_vec_def_for_operand and is stored in
3600 vs2 is defined by EPILOG_STMT, the vectorized EXIT_PHI;
3601 vs0 is created here. */
3603 /* Create vector phi node. */
3604 vect_phi = create_phi_node (vec_initial_def, bb);
3605 new_phi_vinfo = new_stmt_vec_info (vect_phi,
3606 loop_vec_info_for_loop (outer_loop), NULL);
3607 set_vinfo_for_stmt (vect_phi, new_phi_vinfo);
3609 /* Create vs0 - initial def of the double reduction phi. */
3610 preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
3611 loop_preheader_edge (outer_loop));
3612 init_def = get_initial_def_for_reduction (stmt,
3613 preheader_arg, NULL);
3614 vect_phi_init = vect_init_vector (use_stmt, init_def,
3617 /* Update phi node arguments with vs0 and vs2. */
3618 add_phi_arg (vect_phi, vect_phi_init,
3619 loop_preheader_edge (outer_loop),
3621 add_phi_arg (vect_phi, PHI_RESULT (epilog_stmt),
3622 loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
3623 if (vect_print_dump_info (REPORT_DETAILS))
3625 fprintf (vect_dump, "created double reduction phi "
3627 print_gimple_stmt (vect_dump, vect_phi, 0, TDF_SLIM);
3630 vect_phi_res = PHI_RESULT (vect_phi);
3632 /* Replace the use, i.e., set the correct vs1 in the regular
3633 reduction phi node. FORNOW, NCOPIES is always 1, so the
3634 loop is redundant. */
3635 use = reduction_phi;
3636 for (j = 0; j < ncopies; j++)
3638 edge pr_edge = loop_preheader_edge (loop);
3639 SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res);
3640 use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
3645 /* Replace the uses: */
3646 orig_name = PHI_RESULT (exit_phi);
3647 scalar_result = VEC_index (tree, scalar_results, k);
3648 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
3649 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
3650 SET_USE (use_p, scalar_result);
3653 VEC_free (gimple, heap, phis);
3656 VEC_free (tree, heap, scalar_results);
3657 VEC_free (gimple, heap, new_phis);
3661 /* Function vectorizable_reduction.
3663 Check if STMT performs a reduction operation that can be vectorized.
3664 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
3665 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
3666 Return FALSE if not a vectorizable STMT, TRUE otherwise.
3668 This function also handles reduction idioms (patterns) that have been
3669 recognized in advance during vect_pattern_recog. In this case, STMT may be
3671 X = pattern_expr (arg0, arg1, ..., X)
3672 and it's STMT_VINFO_RELATED_STMT points to the last stmt in the original
3673 sequence that had been detected and replaced by the pattern-stmt (STMT).
3675 In some cases of reduction patterns, the type of the reduction variable X is
3676 different than the type of the other arguments of STMT.
3677 In such cases, the vectype that is used when transforming STMT into a vector
3678 stmt is different than the vectype that is used to determine the
3679 vectorization factor, because it consists of a different number of elements
3680 than the actual number of elements that are being operated upon in parallel.
3682 For example, consider an accumulation of shorts into an int accumulator.
3683 On some targets it's possible to vectorize this pattern operating on 8
3684 shorts at a time (hence, the vectype for purposes of determining the
3685 vectorization factor should be V8HI); on the other hand, the vectype that
3686 is used to create the vector form is actually V4SI (the type of the result).
3688 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
3689 indicates what is the actual level of parallelism (V8HI in the example), so
3690 that the right vectorization factor would be derived. This vectype
3691 corresponds to the type of arguments to the reduction stmt, and should *NOT*
3692 be used to create the vectorized stmt. The right vectype for the vectorized
3693 stmt is obtained from the type of the result X:
3694 get_vectype_for_scalar_type (TREE_TYPE (X))
3696 This means that, contrary to "regular" reductions (or "regular" stmts in
3697 general), the following equation:
3698 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
3699 does *NOT* necessarily hold for reduction patterns. */
3702 vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
3703 gimple *vec_stmt, slp_tree slp_node)
3707 tree loop_vec_def0 = NULL_TREE, loop_vec_def1 = NULL_TREE;
3708 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3709 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
3710 tree vectype_in = NULL_TREE;
3711 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3712 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3713 enum tree_code code, orig_code, epilog_reduc_code;
3714 enum machine_mode vec_mode;
3716 optab optab, reduc_optab;
3717 tree new_temp = NULL_TREE;
3720 enum vect_def_type dt;
3721 gimple new_phi = NULL;
3725 stmt_vec_info orig_stmt_info;
3726 tree expr = NULL_TREE;
3730 stmt_vec_info prev_stmt_info, prev_phi_info;
3731 bool single_defuse_cycle = false;
3732 tree reduc_def = NULL_TREE;
3733 gimple new_stmt = NULL;
3736 bool nested_cycle = false, found_nested_cycle_def = false;
3737 gimple reduc_def_stmt = NULL;
3738 /* The default is that the reduction variable is the last in statement. */
3739 int reduc_index = 2;
3740 bool double_reduc = false, dummy;
3742 struct loop * def_stmt_loop, *outer_loop = NULL;
3744 gimple def_arg_stmt;
3745 VEC (tree, heap) *vec_oprnds0 = NULL, *vec_oprnds1 = NULL, *vect_defs = NULL;
3746 VEC (gimple, heap) *phis = NULL;
3750 if (nested_in_vect_loop_p (loop, stmt))
3754 nested_cycle = true;
3757 /* 1. Is vectorizable reduction? */
3758 /* Not supportable if the reduction variable is used in the loop. */
3759 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer)
3762 /* Reductions that are not used even in an enclosing outer-loop,
3763 are expected to be "live" (used out of the loop). */
3764 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
3765 && !STMT_VINFO_LIVE_P (stmt_info))
3768 /* Make sure it was already recognized as a reduction computation. */
3769 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
3770 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
3773 /* 2. Has this been recognized as a reduction pattern?
3775 Check if STMT represents a pattern that has been recognized
3776 in earlier analysis stages. For stmts that represent a pattern,
3777 the STMT_VINFO_RELATED_STMT field records the last stmt in
3778 the original sequence that constitutes the pattern. */
3780 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3783 orig_stmt_info = vinfo_for_stmt (orig_stmt);
3784 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt);
3785 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
3786 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
3789 /* 3. Check the operands of the operation. The first operands are defined
3790 inside the loop body. The last operand is the reduction variable,
3791 which is defined by the loop-header-phi. */
3793 gcc_assert (is_gimple_assign (stmt));
3796 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
3798 case GIMPLE_SINGLE_RHS:
3799 op_type = TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt));
3800 if (op_type == ternary_op)
3802 tree rhs = gimple_assign_rhs1 (stmt);
3803 ops[0] = TREE_OPERAND (rhs, 0);
3804 ops[1] = TREE_OPERAND (rhs, 1);
3805 ops[2] = TREE_OPERAND (rhs, 2);
3806 code = TREE_CODE (rhs);
3812 case GIMPLE_BINARY_RHS:
3813 code = gimple_assign_rhs_code (stmt);
3814 op_type = TREE_CODE_LENGTH (code);
3815 gcc_assert (op_type == binary_op);
3816 ops[0] = gimple_assign_rhs1 (stmt);
3817 ops[1] = gimple_assign_rhs2 (stmt);
3820 case GIMPLE_UNARY_RHS:
3827 scalar_dest = gimple_assign_lhs (stmt);
3828 scalar_type = TREE_TYPE (scalar_dest);
3829 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
3830 && !SCALAR_FLOAT_TYPE_P (scalar_type))
3833 /* All uses but the last are expected to be defined in the loop.
3834 The last use is the reduction variable. In case of nested cycle this
3835 assumption is not true: we use reduc_index to record the index of the
3836 reduction variable. */
3837 for (i = 0; i < op_type-1; i++)
3841 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
3842 if (i == 0 && code == COND_EXPR)
3845 is_simple_use = vect_is_simple_use_1 (ops[i], loop_vinfo, NULL,
3846 &def_stmt, &def, &dt, &tem);
3849 gcc_assert (is_simple_use);
3850 if (dt != vect_internal_def
3851 && dt != vect_external_def
3852 && dt != vect_constant_def
3853 && dt != vect_induction_def
3854 && !(dt == vect_nested_cycle && nested_cycle))
3857 if (dt == vect_nested_cycle)
3859 found_nested_cycle_def = true;
3860 reduc_def_stmt = def_stmt;
3865 is_simple_use = vect_is_simple_use (ops[i], loop_vinfo, NULL, &def_stmt,
3867 gcc_assert (is_simple_use);
3868 gcc_assert (dt == vect_reduction_def
3869 || dt == vect_nested_cycle
3870 || ((dt == vect_internal_def || dt == vect_external_def
3871 || dt == vect_constant_def || dt == vect_induction_def)
3872 && nested_cycle && found_nested_cycle_def));
3873 if (!found_nested_cycle_def)
3874 reduc_def_stmt = def_stmt;
3876 gcc_assert (gimple_code (reduc_def_stmt) == GIMPLE_PHI);
3878 gcc_assert (orig_stmt == vect_is_simple_reduction (loop_vinfo,
3883 gcc_assert (stmt == vect_is_simple_reduction (loop_vinfo, reduc_def_stmt,
3884 !nested_cycle, &dummy));
3886 if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt)))
3892 ncopies = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
3893 / TYPE_VECTOR_SUBPARTS (vectype_in));
3895 gcc_assert (ncopies >= 1);
3897 vec_mode = TYPE_MODE (vectype_in);
3899 if (code == COND_EXPR)
3901 if (!vectorizable_condition (stmt, gsi, NULL, ops[reduc_index], 0))
3903 if (vect_print_dump_info (REPORT_DETAILS))
3904 fprintf (vect_dump, "unsupported condition in reduction");
3911 /* 4. Supportable by target? */
3913 /* 4.1. check support for the operation in the loop */
3914 optab = optab_for_tree_code (code, vectype_in, optab_default);
3917 if (vect_print_dump_info (REPORT_DETAILS))
3918 fprintf (vect_dump, "no optab.");
3923 if (optab_handler (optab, vec_mode)->insn_code == CODE_FOR_nothing)
3925 if (vect_print_dump_info (REPORT_DETAILS))
3926 fprintf (vect_dump, "op not supported by target.");
3928 if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
3929 || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
3930 < vect_min_worthwhile_factor (code))
3933 if (vect_print_dump_info (REPORT_DETAILS))
3934 fprintf (vect_dump, "proceeding using word mode.");
3937 /* Worthwhile without SIMD support? */
3938 if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
3939 && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
3940 < vect_min_worthwhile_factor (code))
3942 if (vect_print_dump_info (REPORT_DETAILS))
3943 fprintf (vect_dump, "not worthwhile without SIMD support.");
3949 /* 4.2. Check support for the epilog operation.
3951 If STMT represents a reduction pattern, then the type of the
3952 reduction variable may be different than the type of the rest
3953 of the arguments. For example, consider the case of accumulation
3954 of shorts into an int accumulator; The original code:
3955 S1: int_a = (int) short_a;
3956 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
3959 STMT: int_acc = widen_sum <short_a, int_acc>
3962 1. The tree-code that is used to create the vector operation in the
3963 epilog code (that reduces the partial results) is not the
3964 tree-code of STMT, but is rather the tree-code of the original
3965 stmt from the pattern that STMT is replacing. I.e, in the example
3966 above we want to use 'widen_sum' in the loop, but 'plus' in the
3968 2. The type (mode) we use to check available target support
3969 for the vector operation to be created in the *epilog*, is
3970 determined by the type of the reduction variable (in the example
3971 above we'd check this: plus_optab[vect_int_mode]).
3972 However the type (mode) we use to check available target support
3973 for the vector operation to be created *inside the loop*, is
3974 determined by the type of the other arguments to STMT (in the
3975 example we'd check this: widen_sum_optab[vect_short_mode]).
3977 This is contrary to "regular" reductions, in which the types of all
3978 the arguments are the same as the type of the reduction variable.
3979 For "regular" reductions we can therefore use the same vector type
3980 (and also the same tree-code) when generating the epilog code and
3981 when generating the code inside the loop. */
3985 /* This is a reduction pattern: get the vectype from the type of the
3986 reduction variable, and get the tree-code from orig_stmt. */
3987 orig_code = gimple_assign_rhs_code (orig_stmt);
3988 gcc_assert (vectype_out);
3989 vec_mode = TYPE_MODE (vectype_out);
3993 /* Regular reduction: use the same vectype and tree-code as used for
3994 the vector code inside the loop can be used for the epilog code. */
4000 def_bb = gimple_bb (reduc_def_stmt);
4001 def_stmt_loop = def_bb->loop_father;
4002 def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
4003 loop_preheader_edge (def_stmt_loop));
4004 if (TREE_CODE (def_arg) == SSA_NAME
4005 && (def_arg_stmt = SSA_NAME_DEF_STMT (def_arg))
4006 && gimple_code (def_arg_stmt) == GIMPLE_PHI
4007 && flow_bb_inside_loop_p (outer_loop, gimple_bb (def_arg_stmt))
4008 && vinfo_for_stmt (def_arg_stmt)
4009 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_arg_stmt))
4010 == vect_double_reduction_def)
4011 double_reduc = true;
4014 epilog_reduc_code = ERROR_MARK;
4015 if (reduction_code_for_scalar_code (orig_code, &epilog_reduc_code))
4017 reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype_out,
4021 if (vect_print_dump_info (REPORT_DETAILS))
4022 fprintf (vect_dump, "no optab for reduction.");
4024 epilog_reduc_code = ERROR_MARK;
4028 && optab_handler (reduc_optab, vec_mode)->insn_code
4029 == CODE_FOR_nothing)
4031 if (vect_print_dump_info (REPORT_DETAILS))
4032 fprintf (vect_dump, "reduc op not supported by target.");
4034 epilog_reduc_code = ERROR_MARK;
4039 if (!nested_cycle || double_reduc)
4041 if (vect_print_dump_info (REPORT_DETAILS))
4042 fprintf (vect_dump, "no reduc code for scalar code.");
4048 if (double_reduc && ncopies > 1)
4050 if (vect_print_dump_info (REPORT_DETAILS))
4051 fprintf (vect_dump, "multiple types in double reduction");
4056 if (!vec_stmt) /* transformation not required. */
4058 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
4059 if (!vect_model_reduction_cost (stmt_info, epilog_reduc_code, ncopies))
4066 if (vect_print_dump_info (REPORT_DETAILS))
4067 fprintf (vect_dump, "transform reduction.");
4069 /* FORNOW: Multiple types are not supported for condition. */
4070 if (code == COND_EXPR)
4071 gcc_assert (ncopies == 1);
4073 /* Create the destination vector */
4074 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
4076 /* In case the vectorization factor (VF) is bigger than the number
4077 of elements that we can fit in a vectype (nunits), we have to generate
4078 more than one vector stmt - i.e - we need to "unroll" the
4079 vector stmt by a factor VF/nunits. For more details see documentation
4080 in vectorizable_operation. */
4082 /* If the reduction is used in an outer loop we need to generate
4083 VF intermediate results, like so (e.g. for ncopies=2):
4088 (i.e. we generate VF results in 2 registers).
4089 In this case we have a separate def-use cycle for each copy, and therefore
4090 for each copy we get the vector def for the reduction variable from the
4091 respective phi node created for this copy.
4093 Otherwise (the reduction is unused in the loop nest), we can combine
4094 together intermediate results, like so (e.g. for ncopies=2):
4098 (i.e. we generate VF/2 results in a single register).
4099 In this case for each copy we get the vector def for the reduction variable
4100 from the vectorized reduction operation generated in the previous iteration.
4103 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope)
4105 single_defuse_cycle = true;
4109 epilog_copies = ncopies;
4111 prev_stmt_info = NULL;
4112 prev_phi_info = NULL;
4115 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
4116 gcc_assert (TYPE_VECTOR_SUBPARTS (vectype_out)
4117 == TYPE_VECTOR_SUBPARTS (vectype_in));
4122 vec_oprnds0 = VEC_alloc (tree, heap, 1);
4123 if (op_type == ternary_op)
4124 vec_oprnds1 = VEC_alloc (tree, heap, 1);
4127 phis = VEC_alloc (gimple, heap, vec_num);
4128 vect_defs = VEC_alloc (tree, heap, vec_num);
4130 VEC_quick_push (tree, vect_defs, NULL_TREE);
4132 for (j = 0; j < ncopies; j++)
4134 if (j == 0 || !single_defuse_cycle)
4136 for (i = 0; i < vec_num; i++)
4138 /* Create the reduction-phi that defines the reduction
4140 new_phi = create_phi_node (vec_dest, loop->header);
4141 set_vinfo_for_stmt (new_phi,
4142 new_stmt_vec_info (new_phi, loop_vinfo,
4144 if (j == 0 || slp_node)
4145 VEC_quick_push (gimple, phis, new_phi);
4149 if (code == COND_EXPR)
4151 gcc_assert (!slp_node);
4152 vectorizable_condition (stmt, gsi, vec_stmt,
4153 PHI_RESULT (VEC_index (gimple, phis, 0)),
4155 /* Multiple types are not supported for condition. */
4163 vect_get_slp_defs (slp_node, &vec_oprnds0, &vec_oprnds1, -1);
4166 loop_vec_def0 = vect_get_vec_def_for_operand (ops[!reduc_index],
4168 VEC_quick_push (tree, vec_oprnds0, loop_vec_def0);
4169 if (op_type == ternary_op)
4171 if (reduc_index == 0)
4172 loop_vec_def1 = vect_get_vec_def_for_operand (ops[2], stmt,
4175 loop_vec_def1 = vect_get_vec_def_for_operand (ops[1], stmt,
4178 VEC_quick_push (tree, vec_oprnds1, loop_vec_def1);
4186 enum vect_def_type dt = vect_unknown_def_type; /* Dummy */
4187 loop_vec_def0 = vect_get_vec_def_for_stmt_copy (dt, loop_vec_def0);
4188 VEC_replace (tree, vec_oprnds0, 0, loop_vec_def0);
4189 if (op_type == ternary_op)
4191 loop_vec_def1 = vect_get_vec_def_for_stmt_copy (dt,
4193 VEC_replace (tree, vec_oprnds1, 0, loop_vec_def1);
4197 if (single_defuse_cycle)
4198 reduc_def = gimple_assign_lhs (new_stmt);
4200 STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
4203 for (i = 0; VEC_iterate (tree, vec_oprnds0, i, def0); i++)
4206 reduc_def = PHI_RESULT (VEC_index (gimple, phis, i));
4209 if (!single_defuse_cycle || j == 0)
4210 reduc_def = PHI_RESULT (new_phi);
4213 def1 = ((op_type == ternary_op)
4214 ? VEC_index (tree, vec_oprnds1, i) : NULL);
4215 if (op_type == binary_op)
4217 if (reduc_index == 0)
4218 expr = build2 (code, vectype_out, reduc_def, def0);
4220 expr = build2 (code, vectype_out, def0, reduc_def);
4224 if (reduc_index == 0)
4225 expr = build3 (code, vectype_out, reduc_def, def0, def1);
4228 if (reduc_index == 1)
4229 expr = build3 (code, vectype_out, def0, reduc_def, def1);
4231 expr = build3 (code, vectype_out, def0, def1, reduc_def);
4235 new_stmt = gimple_build_assign (vec_dest, expr);
4236 new_temp = make_ssa_name (vec_dest, new_stmt);
4237 gimple_assign_set_lhs (new_stmt, new_temp);
4238 vect_finish_stmt_generation (stmt, new_stmt, gsi);
4241 VEC_quick_push (gimple, SLP_TREE_VEC_STMTS (slp_node), new_stmt);
4242 VEC_quick_push (tree, vect_defs, new_temp);
4245 VEC_replace (tree, vect_defs, 0, new_temp);
4252 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
4254 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
4256 prev_stmt_info = vinfo_for_stmt (new_stmt);
4257 prev_phi_info = vinfo_for_stmt (new_phi);
4260 /* Finalize the reduction-phi (set its arguments) and create the
4261 epilog reduction code. */
4262 if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
4264 new_temp = gimple_assign_lhs (*vec_stmt);
4265 VEC_replace (tree, vect_defs, 0, new_temp);
4268 vect_create_epilog_for_reduction (vect_defs, stmt, epilog_copies,
4269 epilog_reduc_code, phis, reduc_index,
4270 double_reduc, slp_node);
4272 VEC_free (gimple, heap, phis);
4273 VEC_free (tree, heap, vec_oprnds0);
4275 VEC_free (tree, heap, vec_oprnds1);
4280 /* Function vect_min_worthwhile_factor.
4282 For a loop where we could vectorize the operation indicated by CODE,
4283 return the minimum vectorization factor that makes it worthwhile
4284 to use generic vectors. */
4286 vect_min_worthwhile_factor (enum tree_code code)
4307 /* Function vectorizable_induction
4309 Check if PHI performs an induction computation that can be vectorized.
4310 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
4311 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
4312 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
4315 vectorizable_induction (gimple phi, gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
4318 stmt_vec_info stmt_info = vinfo_for_stmt (phi);
4319 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4320 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4321 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4322 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
4323 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
4326 gcc_assert (ncopies >= 1);
4327 /* FORNOW. This restriction should be relaxed. */
4328 if (nested_in_vect_loop_p (loop, phi) && ncopies > 1)
4330 if (vect_print_dump_info (REPORT_DETAILS))
4331 fprintf (vect_dump, "multiple types in nested loop.");
4335 if (!STMT_VINFO_RELEVANT_P (stmt_info))
4338 /* FORNOW: SLP not supported. */
4339 if (STMT_SLP_TYPE (stmt_info))
4342 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def);
4344 if (gimple_code (phi) != GIMPLE_PHI)
4347 if (!vec_stmt) /* transformation not required. */
4349 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
4350 if (vect_print_dump_info (REPORT_DETAILS))
4351 fprintf (vect_dump, "=== vectorizable_induction ===");
4352 vect_model_induction_cost (stmt_info, ncopies);
4358 if (vect_print_dump_info (REPORT_DETAILS))
4359 fprintf (vect_dump, "transform induction phi.");
4361 vec_def = get_initial_def_for_induction (phi);
4362 *vec_stmt = SSA_NAME_DEF_STMT (vec_def);
4366 /* Function vectorizable_live_operation.
4368 STMT computes a value that is used outside the loop. Check if
4369 it can be supported. */
4372 vectorizable_live_operation (gimple stmt,
4373 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
4374 gimple *vec_stmt ATTRIBUTE_UNUSED)
4376 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4377 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4378 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4384 enum vect_def_type dt;
4385 enum tree_code code;
4386 enum gimple_rhs_class rhs_class;
4388 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
4390 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
4393 if (!is_gimple_assign (stmt))
4396 if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
4399 /* FORNOW. CHECKME. */
4400 if (nested_in_vect_loop_p (loop, stmt))
4403 code = gimple_assign_rhs_code (stmt);
4404 op_type = TREE_CODE_LENGTH (code);
4405 rhs_class = get_gimple_rhs_class (code);
4406 gcc_assert (rhs_class != GIMPLE_UNARY_RHS || op_type == unary_op);
4407 gcc_assert (rhs_class != GIMPLE_BINARY_RHS || op_type == binary_op);
4409 /* FORNOW: support only if all uses are invariant. This means
4410 that the scalar operations can remain in place, unvectorized.
4411 The original last scalar value that they compute will be used. */
4413 for (i = 0; i < op_type; i++)
4415 if (rhs_class == GIMPLE_SINGLE_RHS)
4416 op = TREE_OPERAND (gimple_op (stmt, 1), i);
4418 op = gimple_op (stmt, i + 1);
4420 && !vect_is_simple_use (op, loop_vinfo, NULL, &def_stmt, &def, &dt))
4422 if (vect_print_dump_info (REPORT_DETAILS))
4423 fprintf (vect_dump, "use not simple.");
4427 if (dt != vect_external_def && dt != vect_constant_def)
4431 /* No transformation is required for the cases we currently support. */
4435 /* Kill any debug uses outside LOOP of SSA names defined in STMT. */
4438 vect_loop_kill_debug_uses (struct loop *loop, gimple stmt)
4440 ssa_op_iter op_iter;
4441 imm_use_iterator imm_iter;
4442 def_operand_p def_p;
4445 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt, op_iter, SSA_OP_DEF)
4447 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
4451 if (!is_gimple_debug (ustmt))
4454 bb = gimple_bb (ustmt);
4456 if (!flow_bb_inside_loop_p (loop, bb))
4458 if (gimple_debug_bind_p (ustmt))
4460 if (vect_print_dump_info (REPORT_DETAILS))
4461 fprintf (vect_dump, "killing debug use");
4463 gimple_debug_bind_reset_value (ustmt);
4464 update_stmt (ustmt);
4473 /* Function vect_transform_loop.
4475 The analysis phase has determined that the loop is vectorizable.
4476 Vectorize the loop - created vectorized stmts to replace the scalar
4477 stmts in the loop, and update the loop exit condition. */
4480 vect_transform_loop (loop_vec_info loop_vinfo)
4482 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4483 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
4484 int nbbs = loop->num_nodes;
4485 gimple_stmt_iterator si;
4488 int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
4490 bool slp_scheduled = false;
4491 unsigned int nunits;
4492 tree cond_expr = NULL_TREE;
4493 gimple_seq cond_expr_stmt_list = NULL;
4494 bool do_peeling_for_loop_bound;
4496 if (vect_print_dump_info (REPORT_DETAILS))
4497 fprintf (vect_dump, "=== vec_transform_loop ===");
4499 /* Peel the loop if there are data refs with unknown alignment.
4500 Only one data ref with unknown store is allowed. */
4502 if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
4503 vect_do_peeling_for_alignment (loop_vinfo);
4505 do_peeling_for_loop_bound
4506 = (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
4507 || (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
4508 && LOOP_VINFO_INT_NITERS (loop_vinfo) % vectorization_factor != 0));
4510 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
4511 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
4512 vect_loop_versioning (loop_vinfo,
4513 !do_peeling_for_loop_bound,
4514 &cond_expr, &cond_expr_stmt_list);
4516 /* If the loop has a symbolic number of iterations 'n' (i.e. it's not a
4517 compile time constant), or it is a constant that doesn't divide by the
4518 vectorization factor, then an epilog loop needs to be created.
4519 We therefore duplicate the loop: the original loop will be vectorized,
4520 and will compute the first (n/VF) iterations. The second copy of the loop
4521 will remain scalar and will compute the remaining (n%VF) iterations.
4522 (VF is the vectorization factor). */
4524 if (do_peeling_for_loop_bound)
4525 vect_do_peeling_for_loop_bound (loop_vinfo, &ratio,
4526 cond_expr, cond_expr_stmt_list);
4528 ratio = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
4529 LOOP_VINFO_INT_NITERS (loop_vinfo) / vectorization_factor);
4531 /* 1) Make sure the loop header has exactly two entries
4532 2) Make sure we have a preheader basic block. */
4534 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
4536 split_edge (loop_preheader_edge (loop));
4538 /* FORNOW: the vectorizer supports only loops which body consist
4539 of one basic block (header + empty latch). When the vectorizer will
4540 support more involved loop forms, the order by which the BBs are
4541 traversed need to be reconsidered. */
4543 for (i = 0; i < nbbs; i++)
4545 basic_block bb = bbs[i];
4546 stmt_vec_info stmt_info;
4549 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
4551 phi = gsi_stmt (si);
4552 if (vect_print_dump_info (REPORT_DETAILS))
4554 fprintf (vect_dump, "------>vectorizing phi: ");
4555 print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
4557 stmt_info = vinfo_for_stmt (phi);
4561 if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
4562 vect_loop_kill_debug_uses (loop, phi);
4564 if (!STMT_VINFO_RELEVANT_P (stmt_info)
4565 && !STMT_VINFO_LIVE_P (stmt_info))
4568 if ((TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
4569 != (unsigned HOST_WIDE_INT) vectorization_factor)
4570 && vect_print_dump_info (REPORT_DETAILS))
4571 fprintf (vect_dump, "multiple-types.");
4573 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
4575 if (vect_print_dump_info (REPORT_DETAILS))
4576 fprintf (vect_dump, "transform phi.");
4577 vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
4581 for (si = gsi_start_bb (bb); !gsi_end_p (si);)
4583 gimple stmt = gsi_stmt (si);
4586 if (vect_print_dump_info (REPORT_DETAILS))
4588 fprintf (vect_dump, "------>vectorizing statement: ");
4589 print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
4592 stmt_info = vinfo_for_stmt (stmt);
4594 /* vector stmts created in the outer-loop during vectorization of
4595 stmts in an inner-loop may not have a stmt_info, and do not
4596 need to be vectorized. */
4603 if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
4604 vect_loop_kill_debug_uses (loop, stmt);
4606 if (!STMT_VINFO_RELEVANT_P (stmt_info)
4607 && !STMT_VINFO_LIVE_P (stmt_info))
4613 gcc_assert (STMT_VINFO_VECTYPE (stmt_info));
4615 (unsigned int) TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
4616 if (!STMT_SLP_TYPE (stmt_info)
4617 && nunits != (unsigned int) vectorization_factor
4618 && vect_print_dump_info (REPORT_DETAILS))
4619 /* For SLP VF is set according to unrolling factor, and not to
4620 vector size, hence for SLP this print is not valid. */
4621 fprintf (vect_dump, "multiple-types.");
4623 /* SLP. Schedule all the SLP instances when the first SLP stmt is
4625 if (STMT_SLP_TYPE (stmt_info))
4629 slp_scheduled = true;
4631 if (vect_print_dump_info (REPORT_DETAILS))
4632 fprintf (vect_dump, "=== scheduling SLP instances ===");
4634 vect_schedule_slp (loop_vinfo, NULL);
4637 /* Hybrid SLP stmts must be vectorized in addition to SLP. */
4638 if (!vinfo_for_stmt (stmt) || PURE_SLP_STMT (stmt_info))
4645 /* -------- vectorize statement ------------ */
4646 if (vect_print_dump_info (REPORT_DETAILS))
4647 fprintf (vect_dump, "transform statement.");
4649 strided_store = false;
4650 is_store = vect_transform_stmt (stmt, &si, &strided_store, NULL, NULL);
4653 if (STMT_VINFO_STRIDED_ACCESS (stmt_info))
4655 /* Interleaving. If IS_STORE is TRUE, the vectorization of the
4656 interleaving chain was completed - free all the stores in
4658 vect_remove_stores (DR_GROUP_FIRST_DR (stmt_info));
4659 gsi_remove (&si, true);
4664 /* Free the attached stmt_vec_info and remove the stmt. */
4665 free_stmt_vec_info (stmt);
4666 gsi_remove (&si, true);
4674 slpeel_make_loop_iterate_ntimes (loop, ratio);
4676 /* The memory tags and pointers in vectorized statements need to
4677 have their SSA forms updated. FIXME, why can't this be delayed
4678 until all the loops have been transformed? */
4679 update_ssa (TODO_update_ssa);
4681 if (vect_print_dump_info (REPORT_VECTORIZED_LOCATIONS))
4682 fprintf (vect_dump, "LOOP VECTORIZED.");
4683 if (loop->inner && vect_print_dump_info (REPORT_VECTORIZED_LOCATIONS))
4684 fprintf (vect_dump, "OUTER LOOP VECTORIZED.");