1 /* Transformation Utilities for Loop Vectorization.
2 Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
5 This file is part of GCC.
7 GCC is free software; you can redistribute it and/or modify it under
8 the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.
12 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
13 WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
for more details.
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
23 #include "coretypes.h"
29 #include "basic-block.h"
30 #include "diagnostic.h"
31 #include "tree-flow.h"
32 #include "tree-dump.h"
39 #include "tree-data-ref.h"
40 #include "tree-chrec.h"
41 #include "tree-scalar-evolution.h"
42 #include "tree-vectorizer.h"
43 #include "langhooks.h"
44 #include "tree-pass.h"
48 /* Utility functions for the code transformation. */
49 static bool vect_transform_stmt (gimple, gimple_stmt_iterator *, bool *,
50 slp_tree, slp_instance);
51 static tree vect_create_destination_var (tree, tree);
52 static tree vect_create_data_ref_ptr
53 (gimple, struct loop*, tree, tree *, gimple *, bool, bool *);
54 static tree vect_create_addr_base_for_vector_ref
55 (gimple, gimple_seq *, tree, struct loop *);
56 static tree vect_get_new_vect_var (tree, enum vect_var_kind, const char *);
57 static tree vect_get_vec_def_for_operand (tree, gimple, tree *);
58 static tree vect_init_vector (gimple, tree, tree, gimple_stmt_iterator *);
59 static void vect_finish_stmt_generation
60 (gimple stmt, gimple vec_stmt, gimple_stmt_iterator *);
61 static bool vect_is_simple_cond (tree, loop_vec_info);
62 static void vect_create_epilog_for_reduction
63 (tree, gimple, int, enum tree_code, gimple);
64 static tree get_initial_def_for_reduction (gimple, tree, tree *);
/* Utility functions dealing with loop peeling (not peeling itself).  */
67 static void vect_generate_tmps_on_preheader
68 (loop_vec_info, tree *, tree *, tree *);
69 static tree vect_build_loop_niters (loop_vec_info);
70 static void vect_update_ivs_after_vectorizer (loop_vec_info, tree, edge);
71 static tree vect_gen_niters_for_prolog_loop (loop_vec_info, tree);
72 static void vect_update_init_of_dr (struct data_reference *, tree niters);
73 static void vect_update_inits_of_drs (loop_vec_info, tree);
74 static int vect_min_worthwhile_factor (enum tree_code);
78 cost_for_stmt (gimple stmt)
80 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
82 switch (STMT_VINFO_TYPE (stmt_info))
84 case load_vec_info_type:
85 return TARG_SCALAR_LOAD_COST;
86 case store_vec_info_type:
87 return TARG_SCALAR_STORE_COST;
88 case op_vec_info_type:
89 case condition_vec_info_type:
90 case assignment_vec_info_type:
91 case reduc_vec_info_type:
92 case induc_vec_info_type:
93 case type_promotion_vec_info_type:
94 case type_demotion_vec_info_type:
95 case type_conversion_vec_info_type:
96 case call_vec_info_type:
97 return TARG_SCALAR_STMT_COST;
98 case undef_vec_info_type:
105 /* Function vect_estimate_min_profitable_iters
107 Return the number of iterations required for the vector version of the
   loop to be profitable relative to the cost of the scalar version of the
   loop.
111 TODO: Take profile info into account before making vectorization
112 decisions, if available. */
115 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo)
118 int min_profitable_iters;
119 int peel_iters_prologue;
120 int peel_iters_epilogue;
121 int vec_inside_cost = 0;
122 int vec_outside_cost = 0;
123 int scalar_single_iter_cost = 0;
124 int scalar_outside_cost = 0;
125 bool runtime_test = false;
126 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
127 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
128 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
129 int nbbs = loop->num_nodes;
130 int byte_misalign = LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo);
131 int peel_guard_costs = 0;
132 int innerloop_iters = 0, factor;
133 VEC (slp_instance, heap) *slp_instances;
134 slp_instance instance;
136 /* Cost model disabled. */
137 if (!flag_vect_cost_model)
139 if (vect_print_dump_info (REPORT_COST))
140 fprintf (vect_dump, "cost model disabled.");
144 /* If the number of iterations is unknown, or the
145 peeling-for-misalignment amount is unknown, we will have to generate
146 a runtime test to test the loop count against the threshold. */
147 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
148 || (byte_misalign < 0))
151 /* Requires loop versioning tests to handle misalignment. */
153 if (VEC_length (gimple, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo)))
155 /* FIXME: Make cost depend on complexity of individual check. */
      vec_outside_cost +=
	VEC_length (gimple, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));
158 if (vect_print_dump_info (REPORT_COST))
159 fprintf (vect_dump, "cost model: Adding cost of checks for loop "
160 "versioning to treat misalignment.\n");
163 if (VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
165 /* FIXME: Make cost depend on complexity of individual check. */
      vec_outside_cost +=
	VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo));
168 if (vect_print_dump_info (REPORT_COST))
169 fprintf (vect_dump, "cost model: Adding cost of checks for loop "
170 "versioning aliasing.\n");
173 if (VEC_length (gimple, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo))
174 || VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
176 vec_outside_cost += TARG_COND_TAKEN_BRANCH_COST;
  /* Count statements in scalar loop.  Using this as scalar cost for a single
     iteration for now.

     TODO: Add outer loop support.

     TODO: Consider assigning different costs to different scalar
     statements.  */
189 innerloop_iters = 50; /* FIXME */
191 for (i = 0; i < nbbs; i++)
193 gimple_stmt_iterator si;
194 basic_block bb = bbs[i];
196 if (bb->loop_father == loop->inner)
197 factor = innerloop_iters;
201 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
203 gimple stmt = gsi_stmt (si);
204 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
205 /* Skip stmts that are not vectorized inside the loop. */
206 if (!STMT_VINFO_RELEVANT_P (stmt_info)
207 && (!STMT_VINFO_LIVE_P (stmt_info)
208 || STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def))
210 scalar_single_iter_cost += cost_for_stmt (stmt) * factor;
211 vec_inside_cost += STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) * factor;
212 /* FIXME: for stmts in the inner-loop in outer-loop vectorization,
213 some of the "outside" costs are generated inside the outer-loop. */
214 vec_outside_cost += STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info);
  /* Add additional cost for the peeled instructions in prologue and epilogue
     loop.
221 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
222 at compile-time - we assume it's vf/2 (the worst would be vf-1).
224 TODO: Build an expression that represents peel_iters for prologue and
225 epilogue to be used in a run-time test. */
227 if (byte_misalign < 0)
229 peel_iters_prologue = vf/2;
230 if (vect_print_dump_info (REPORT_COST))
231 fprintf (vect_dump, "cost model: "
232 "prologue peel iters set to vf/2.");
      /* If peeling for alignment is unknown, loop bound of main loop becomes
	 unknown.  */
236 peel_iters_epilogue = vf/2;
237 if (vect_print_dump_info (REPORT_COST))
238 fprintf (vect_dump, "cost model: "
239 "epilogue peel iters set to vf/2 because "
240 "peeling for alignment is unknown .");
242 /* If peeled iterations are unknown, count a taken branch and a not taken
243 branch per peeled loop. Even if scalar loop iterations are known,
244 vector iterations are not known since peeled prologue iterations are
245 not known. Hence guards remain the same. */
246 peel_guard_costs += 2 * (TARG_COND_TAKEN_BRANCH_COST
247 + TARG_COND_NOT_TAKEN_BRANCH_COST);
254 struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
255 int element_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
256 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr)));
257 int nelements = TYPE_VECTOR_SUBPARTS (vectype);
259 peel_iters_prologue = nelements - (byte_misalign / element_size);
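      /* Illustrative example (made-up numbers): for a V4SI access
	 (nelements = 4, element_size = 4) that is misaligned by 8 bytes,
	 the prologue peels 4 - (8 / 4) = 2 scalar iterations to reach an
	 aligned address.  */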
262 peel_iters_prologue = 0;
264 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
266 peel_iters_epilogue = vf/2;
267 if (vect_print_dump_info (REPORT_COST))
268 fprintf (vect_dump, "cost model: "
269 "epilogue peel iters set to vf/2 because "
270 "loop iterations are unknown .");
272 /* If peeled iterations are known but number of scalar loop
273 iterations are unknown, count a taken branch per peeled loop. */
274 peel_guard_costs += 2 * TARG_COND_TAKEN_BRANCH_COST;
279 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
280 peel_iters_prologue = niters < peel_iters_prologue ?
281 niters : peel_iters_prologue;
282 peel_iters_epilogue = (niters - peel_iters_prologue) % vf;
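      /* E.g. (illustrative): with niters = 100, peel_iters_prologue = 2 and
	 vf = 4, the epilogue peels (100 - 2) % 4 = 2 leftover iterations.  */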
286 vec_outside_cost += (peel_iters_prologue * scalar_single_iter_cost)
287 + (peel_iters_epilogue * scalar_single_iter_cost)
  /* FORNOW: The scalar outside cost is incremented in one of the
     following ways:
293 1. The vectorizer checks for alignment and aliasing and generates
294 a condition that allows dynamic vectorization. A cost model
295 check is ANDED with the versioning condition. Hence scalar code
296 path now has the added cost of the versioning check.
298 if (cost > th & versioning_check)
301 Hence run-time scalar is incremented by not-taken branch cost.
303 2. The vectorizer then checks if a prologue is required. If the
304 cost model check was not done before during versioning, it has to
305 be done before the prologue check.
308 prologue = scalar_iters
313 if (prologue == num_iters)
316 Hence the run-time scalar cost is incremented by a taken branch,
317 plus a not-taken branch, plus a taken branch cost.
319 3. The vectorizer then checks if an epilogue is required. If the
320 cost model check was not done before during prologue check, it
321 has to be done with the epilogue check.
327 if (prologue == num_iters)
330 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
     Hence the run-time scalar cost should be incremented by 2 taken
     branches.
     TODO: The back end may reorder the BBs differently and reverse
     conditions/branch directions.  Change the estimates below to
     something more reasonable.  */
342 /* Cost model check occurs at versioning. */
343 if (VEC_length (gimple, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo))
344 || VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
345 scalar_outside_cost += TARG_COND_NOT_TAKEN_BRANCH_COST;
  /* Cost model check occurs at prologue generation.  */
349 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
350 scalar_outside_cost += 2 * TARG_COND_TAKEN_BRANCH_COST
351 + TARG_COND_NOT_TAKEN_BRANCH_COST;
352 /* Cost model check occurs at epilogue generation. */
354 scalar_outside_cost += 2 * TARG_COND_TAKEN_BRANCH_COST;
359 slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
360 for (i = 0; VEC_iterate (slp_instance, slp_instances, i, instance); i++)
362 vec_outside_cost += SLP_INSTANCE_OUTSIDE_OF_LOOP_COST (instance);
363 vec_inside_cost += SLP_INSTANCE_INSIDE_OF_LOOP_COST (instance);
366 /* Calculate number of iterations required to make the vector version
367 profitable, relative to the loop bodies only. The following condition
369 SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
371 SIC = scalar iteration cost, VIC = vector iteration cost,
372 VOC = vector outside cost, VF = vectorization factor,
373 PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
374 SOC = scalar outside cost for run time cost model check. */
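  /* Worked example with made-up costs: for SIC = 4, VIC = 6, VF = 4,
     VOC = 24, SOC = 6 and one peeled iteration on each side, the condition
     4 * niters + 6 > 6 * ((niters - 2) / 4) + 24 reduces to
     2.5 * niters > 15, so the vector loop starts to win for niters > 6.  */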
376 if ((scalar_single_iter_cost * vf) > vec_inside_cost)
378 if (vec_outside_cost <= 0)
379 min_profitable_iters = 1;
382 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost) * vf
383 - vec_inside_cost * peel_iters_prologue
384 - vec_inside_cost * peel_iters_epilogue)
385 / ((scalar_single_iter_cost * vf)
388 if ((scalar_single_iter_cost * vf * min_profitable_iters)
389 <= ((vec_inside_cost * min_profitable_iters)
390 + ((vec_outside_cost - scalar_outside_cost) * vf)))
391 min_profitable_iters++;
394 /* vector version will never be profitable. */
397 if (vect_print_dump_info (REPORT_COST))
398 fprintf (vect_dump, "cost model: vector iteration cost = %d "
399 "is divisible by scalar iteration cost = %d by a factor "
400 "greater than or equal to the vectorization factor = %d .",
401 vec_inside_cost, scalar_single_iter_cost, vf);
405 if (vect_print_dump_info (REPORT_COST))
407 fprintf (vect_dump, "Cost model analysis: \n");
      fprintf (vect_dump, "  Vector inside of loop cost: %d\n",
	       vec_inside_cost);
      fprintf (vect_dump, "  Vector outside of loop cost: %d\n",
	       vec_outside_cost);
412 fprintf (vect_dump, " Scalar iteration cost: %d\n",
413 scalar_single_iter_cost);
414 fprintf (vect_dump, " Scalar outside cost: %d\n", scalar_outside_cost);
415 fprintf (vect_dump, " prologue iterations: %d\n",
416 peel_iters_prologue);
417 fprintf (vect_dump, " epilogue iterations: %d\n",
418 peel_iters_epilogue);
419 fprintf (vect_dump, " Calculated minimum iters for profitability: %d\n",
420 min_profitable_iters);
423 min_profitable_iters =
424 min_profitable_iters < vf ? vf : min_profitable_iters;
426 /* Because the condition we create is:
427 if (niters <= min_profitable_iters)
428 then skip the vectorized loop. */
429 min_profitable_iters--;
431 if (vect_print_dump_info (REPORT_COST))
432 fprintf (vect_dump, " Profitability threshold = %d\n",
433 min_profitable_iters);
435 return min_profitable_iters;
439 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
440 functions. Design better to avoid maintenance issues. */
442 /* Function vect_model_reduction_cost.
444 Models cost for a reduction operation, including the vector ops
445 generated within the strip-mine loop, the initial definition before
446 the loop, and the epilogue code that must be generated. */
449 vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
456 gimple stmt, orig_stmt;
458 enum machine_mode mode;
459 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
460 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
463 /* Cost of reduction op inside loop. */
464 STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) += ncopies * TARG_VEC_STMT_COST;
466 stmt = STMT_VINFO_STMT (stmt_info);
468 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
470 case GIMPLE_SINGLE_RHS:
471 gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt)) == ternary_op);
472 reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), 2);
474 case GIMPLE_UNARY_RHS:
475 reduction_op = gimple_assign_rhs1 (stmt);
477 case GIMPLE_BINARY_RHS:
478 reduction_op = gimple_assign_rhs2 (stmt);
484 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
487 if (vect_print_dump_info (REPORT_COST))
489 fprintf (vect_dump, "unsupported data-type ");
490 print_generic_expr (vect_dump, TREE_TYPE (reduction_op), TDF_SLIM);
495 mode = TYPE_MODE (vectype);
496 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
499 orig_stmt = STMT_VINFO_STMT (stmt_info);
501 code = gimple_assign_rhs_code (orig_stmt);
503 /* Add in cost for initial definition. */
504 outer_cost += TARG_SCALAR_TO_VEC_COST;
506 /* Determine cost of epilogue code.
508 We have a reduction operator that will reduce the vector in one statement.
509 Also requires scalar extract. */
511 if (!nested_in_vect_loop_p (loop, orig_stmt))
513 if (reduc_code < NUM_TREE_CODES)
514 outer_cost += TARG_VEC_STMT_COST + TARG_VEC_TO_SCALAR_COST;
517 int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
	  tree bitsize =
	    TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
520 int element_bitsize = tree_low_cst (bitsize, 1);
521 int nelements = vec_size_in_bits / element_bitsize;
523 optab = optab_for_tree_code (code, vectype, optab_default);
525 /* We have a whole vector shift available. */
526 if (VECTOR_MODE_P (mode)
527 && optab_handler (optab, mode)->insn_code != CODE_FOR_nothing
528 && optab_handler (vec_shr_optab, mode)->insn_code != CODE_FOR_nothing)
529 /* Final reduction via vector shifts and the reduction operator. Also
530 requires scalar extract. */
531 outer_cost += ((exact_log2(nelements) * 2) * TARG_VEC_STMT_COST
532 + TARG_VEC_TO_SCALAR_COST);
534 /* Use extracts and reduction op for final reduction. For N elements,
535 we have N extracts and N-1 reduction ops. */
536 outer_cost += ((nelements + nelements - 1) * TARG_VEC_STMT_COST);
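	  /* Illustration: for V8HI (nelements = 8) the extract-based
	     epilogue costs 8 + 7 = 15 vector stmts, whereas the
	     whole-vector-shift variant above costs only
	     2 * log2(8) = 6 stmts plus one extract.  */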
540 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info) = outer_cost;
542 if (vect_print_dump_info (REPORT_COST))
543 fprintf (vect_dump, "vect_model_reduction_cost: inside_cost = %d, "
544 "outside_cost = %d .", STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info),
545 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info));
551 /* Function vect_model_induction_cost.
553 Models cost for induction operations. */
556 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
558 /* loop cost for vec_loop. */
559 STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) = ncopies * TARG_VEC_STMT_COST;
560 /* prologue cost for vec_init and vec_step. */
561 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info) = 2 * TARG_SCALAR_TO_VEC_COST;
563 if (vect_print_dump_info (REPORT_COST))
564 fprintf (vect_dump, "vect_model_induction_cost: inside_cost = %d, "
565 "outside_cost = %d .", STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info),
566 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info));
570 /* Function vect_model_simple_cost.
572 Models cost for simple operations, i.e. those that only emit ncopies of a
573 single op. Right now, this does not account for multiple insns that could
574 be generated for the single vector op. We will handle that shortly. */
577 vect_model_simple_cost (stmt_vec_info stmt_info, int ncopies,
578 enum vect_def_type *dt, slp_tree slp_node)
581 int inside_cost = 0, outside_cost = 0;
583 /* The SLP costs were already calculated during SLP tree build. */
584 if (PURE_SLP_STMT (stmt_info))
587 inside_cost = ncopies * TARG_VEC_STMT_COST;
  /* FORNOW: Assuming maximum 2 args per stmt.  */
590 for (i = 0; i < 2; i++)
592 if (dt[i] == vect_constant_def || dt[i] == vect_invariant_def)
593 outside_cost += TARG_SCALAR_TO_VEC_COST;
596 if (vect_print_dump_info (REPORT_COST))
597 fprintf (vect_dump, "vect_model_simple_cost: inside_cost = %d, "
598 "outside_cost = %d .", inside_cost, outside_cost);
600 /* Set the costs either in STMT_INFO or SLP_NODE (if exists). */
601 stmt_vinfo_set_inside_of_loop_cost (stmt_info, slp_node, inside_cost);
602 stmt_vinfo_set_outside_of_loop_cost (stmt_info, slp_node, outside_cost);
606 /* Function vect_cost_strided_group_size
608 For strided load or store, return the group_size only if it is the first
609 load or store of a group, else return 1. This ensures that group size is
610 only returned once per group. */
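/* Illustration: for an interleaved group of four stores {S1,S2,S3,S4},
   the call on S1 (the DR_GROUP_FIRST_DR) returns 4 while the calls on
   S2..S4 return 1, so the permutation overhead is charged only once.  */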
613 vect_cost_strided_group_size (stmt_vec_info stmt_info)
615 gimple first_stmt = DR_GROUP_FIRST_DR (stmt_info);
617 if (first_stmt == STMT_VINFO_STMT (stmt_info))
618 return DR_GROUP_SIZE (stmt_info);
624 /* Function vect_model_store_cost
626 Models cost for stores. In the case of strided accesses, one access
627 has the overhead of the strided access attributed to it. */
630 vect_model_store_cost (stmt_vec_info stmt_info, int ncopies,
631 enum vect_def_type dt, slp_tree slp_node)
634 int inside_cost = 0, outside_cost = 0;
636 /* The SLP costs were already calculated during SLP tree build. */
637 if (PURE_SLP_STMT (stmt_info))
640 if (dt == vect_constant_def || dt == vect_invariant_def)
641 outside_cost = TARG_SCALAR_TO_VEC_COST;
643 /* Strided access? */
644 if (DR_GROUP_FIRST_DR (stmt_info) && !slp_node)
645 group_size = vect_cost_strided_group_size (stmt_info);
646 /* Not a strided access. */
650 /* Is this an access in a group of stores, which provide strided access?
651 If so, add in the cost of the permutes. */
654 /* Uses a high and low interleave operation for each needed permute. */
655 inside_cost = ncopies * exact_log2(group_size) * group_size
656 * TARG_VEC_STMT_COST;
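      /* E.g. (illustrative): interleaving a group of 4 stores with
	 ncopies = 1 costs log2(4) * 4 = 8 high/low interleave stmts.  */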
658 if (vect_print_dump_info (REPORT_COST))
659 fprintf (vect_dump, "vect_model_store_cost: strided group_size = %d .",
664 /* Costs of the stores. */
665 inside_cost += ncopies * TARG_VEC_STORE_COST;
667 if (vect_print_dump_info (REPORT_COST))
668 fprintf (vect_dump, "vect_model_store_cost: inside_cost = %d, "
669 "outside_cost = %d .", inside_cost, outside_cost);
671 /* Set the costs either in STMT_INFO or SLP_NODE (if exists). */
672 stmt_vinfo_set_inside_of_loop_cost (stmt_info, slp_node, inside_cost);
673 stmt_vinfo_set_outside_of_loop_cost (stmt_info, slp_node, outside_cost);
677 /* Function vect_model_load_cost
679 Models cost for loads. In the case of strided accesses, the last access
680 has the overhead of the strided access attributed to it. Since unaligned
681 accesses are supported for loads, we also account for the costs of the
682 access scheme chosen. */
685 vect_model_load_cost (stmt_vec_info stmt_info, int ncopies, slp_tree slp_node)
  int alignment_support_scheme;
691 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info), *first_dr;
692 int inside_cost = 0, outside_cost = 0;
694 /* The SLP costs were already calculated during SLP tree build. */
695 if (PURE_SLP_STMT (stmt_info))
698 /* Strided accesses? */
699 first_stmt = DR_GROUP_FIRST_DR (stmt_info);
700 if (first_stmt && !slp_node)
702 group_size = vect_cost_strided_group_size (stmt_info);
703 first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
705 /* Not a strided access. */
  alignment_support_scheme = vect_supportable_dr_alignment (first_dr);
714 /* Is this an access in a group of loads providing strided access?
715 If so, add in the cost of the permutes. */
      /* Uses even and odd extract operations for each needed permute.  */
719 inside_cost = ncopies * exact_log2(group_size) * group_size
720 * TARG_VEC_STMT_COST;
722 if (vect_print_dump_info (REPORT_COST))
723 fprintf (vect_dump, "vect_model_load_cost: strided group_size = %d .",
728 /* The loads themselves. */
  switch (alignment_support_scheme)
733 inside_cost += ncopies * TARG_VEC_LOAD_COST;
735 if (vect_print_dump_info (REPORT_COST))
736 fprintf (vect_dump, "vect_model_load_cost: aligned.");
740 case dr_unaligned_supported:
742 /* Here, we assign an additional cost for the unaligned load. */
743 inside_cost += ncopies * TARG_VEC_UNALIGNED_LOAD_COST;
745 if (vect_print_dump_info (REPORT_COST))
746 fprintf (vect_dump, "vect_model_load_cost: unaligned supported by "
751 case dr_explicit_realign:
753 inside_cost += ncopies * (2*TARG_VEC_LOAD_COST + TARG_VEC_STMT_COST);
755 /* FIXME: If the misalignment remains fixed across the iterations of
	     the containing loop, the following cost should be added to the
	     outside costs.  */
758 if (targetm.vectorize.builtin_mask_for_load)
759 inside_cost += TARG_VEC_STMT_COST;
763 case dr_explicit_realign_optimized:
765 if (vect_print_dump_info (REPORT_COST))
766 fprintf (vect_dump, "vect_model_load_cost: unaligned software "
769 /* Unaligned software pipeline has a load of an address, an initial
770 load, and possibly a mask operation to "prime" the loop. However,
771 if this is an access in a group of loads, which provide strided
772 access, then the above cost should only be considered for one
773 access in the group. Inside the loop, there is a load op
774 and a realignment op. */
776 if ((!DR_GROUP_FIRST_DR (stmt_info)) || group_size > 1 || slp_node)
778 outside_cost = 2*TARG_VEC_STMT_COST;
779 if (targetm.vectorize.builtin_mask_for_load)
780 outside_cost += TARG_VEC_STMT_COST;
783 inside_cost += ncopies * (TARG_VEC_LOAD_COST + TARG_VEC_STMT_COST);
792 if (vect_print_dump_info (REPORT_COST))
793 fprintf (vect_dump, "vect_model_load_cost: inside_cost = %d, "
794 "outside_cost = %d .", inside_cost, outside_cost);
796 /* Set the costs either in STMT_INFO or SLP_NODE (if exists). */
797 stmt_vinfo_set_inside_of_loop_cost (stmt_info, slp_node, inside_cost);
798 stmt_vinfo_set_outside_of_loop_cost (stmt_info, slp_node, outside_cost);
802 /* Function vect_get_new_vect_var.
804 Returns a name for a new variable. The current naming scheme appends the
805 prefix "vect_" or "vect_p" (depending on the value of VAR_KIND) to
   the name of vectorizer generated variables, and appends that to NAME if
   provided.  */
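/* For example (per the prefixes described above), given NAME "x",
   vect_simple_var produces a variable named "vect_x" and vect_pointer_var
   produces "vect_px"; vect_scalar_var uses a scalar prefix instead.  */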
810 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
817 case vect_simple_var:
820 case vect_scalar_var:
823 case vect_pointer_var:
832 char* tmp = concat (prefix, name, NULL);
833 new_vect_var = create_tmp_var (type, tmp);
837 new_vect_var = create_tmp_var (type, prefix);
839 /* Mark vector typed variable as a gimple register variable. */
840 if (TREE_CODE (type) == VECTOR_TYPE)
841 DECL_GIMPLE_REG_P (new_vect_var) = true;
847 /* Function vect_create_addr_base_for_vector_ref.
849 Create an expression that computes the address of the first memory location
850 that will be accessed for a data reference.
853 STMT: The statement containing the data reference.
854 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
   OFFSET: Optional. If supplied, it is to be added to the initial address.
856 LOOP: Specify relative to which loop-nest should the address be computed.
857 For example, when the dataref is in an inner-loop nested in an
858 outer-loop that is now being vectorized, LOOP can be either the
859 outer-loop, or the inner-loop. The first memory location accessed
860 by the following dataref ('in' points to short):
867 if LOOP=i_loop: &in (relative to i_loop)
868 if LOOP=j_loop: &in+i*2B (relative to j_loop)
   Output:
   1. Return an SSA_NAME whose value is the address of the memory location of
872 the first vector of the data reference.
873 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
874 these statement(s) which define the returned SSA_NAME.
876 FORNOW: We are only handling array accesses with step 1. */
879 vect_create_addr_base_for_vector_ref (gimple stmt,
880 gimple_seq *new_stmt_list,
884 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
885 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
886 struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
887 tree data_ref_base = unshare_expr (DR_BASE_ADDRESS (dr));
889 tree data_ref_base_var;
891 tree addr_base, addr_expr;
893 gimple_seq seq = NULL;
894 tree base_offset = unshare_expr (DR_OFFSET (dr));
895 tree init = unshare_expr (DR_INIT (dr));
896 tree vect_ptr_type, addr_expr2;
897 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
900 if (loop != containing_loop)
902 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
903 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
905 gcc_assert (nested_in_vect_loop_p (loop, stmt));
907 data_ref_base = unshare_expr (STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
908 base_offset = unshare_expr (STMT_VINFO_DR_OFFSET (stmt_info));
909 init = unshare_expr (STMT_VINFO_DR_INIT (stmt_info));
912 /* Create data_ref_base */
913 base_name = build_fold_indirect_ref (data_ref_base);
914 data_ref_base_var = create_tmp_var (TREE_TYPE (data_ref_base), "batmp");
915 add_referenced_var (data_ref_base_var);
916 data_ref_base = force_gimple_operand (data_ref_base, &seq, true,
918 gimple_seq_add_seq (new_stmt_list, seq);
920 /* Create base_offset */
921 base_offset = size_binop (PLUS_EXPR, base_offset, init);
922 base_offset = fold_convert (sizetype, base_offset);
923 dest = create_tmp_var (TREE_TYPE (base_offset), "base_off");
924 add_referenced_var (dest);
925 base_offset = force_gimple_operand (base_offset, &seq, true, dest);
926 gimple_seq_add_seq (new_stmt_list, seq);
930 tree tmp = create_tmp_var (sizetype, "offset");
932 add_referenced_var (tmp);
933 offset = fold_build2 (MULT_EXPR, TREE_TYPE (offset), offset, step);
934 base_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (base_offset),
935 base_offset, offset);
936 base_offset = force_gimple_operand (base_offset, &seq, false, tmp);
937 gimple_seq_add_seq (new_stmt_list, seq);
940 /* base + base_offset */
941 addr_base = fold_build2 (POINTER_PLUS_EXPR, TREE_TYPE (data_ref_base),
942 data_ref_base, base_offset);
944 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
946 /* addr_expr = addr_base */
947 addr_expr = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var,
948 get_name (base_name));
949 add_referenced_var (addr_expr);
950 vec_stmt = fold_convert (vect_ptr_type, addr_base);
951 addr_expr2 = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var,
952 get_name (base_name));
953 add_referenced_var (addr_expr2);
954 vec_stmt = force_gimple_operand (vec_stmt, &seq, false, addr_expr2);
955 gimple_seq_add_seq (new_stmt_list, seq);
957 if (vect_print_dump_info (REPORT_DETAILS))
959 fprintf (vect_dump, "created ");
960 print_generic_expr (vect_dump, vec_stmt, TDF_SLIM);
966 /* Function vect_create_data_ref_ptr.
968 Create a new pointer to vector type (vp), that points to the first location
969 accessed in the loop by STMT, along with the def-use update chain to
970 appropriately advance the pointer through the loop iterations. Also set
971 aliasing information for the pointer. This vector pointer is used by the
   callers to this function to create a memory reference expression for vector
   load/store access.

   Input:
976 1. STMT: a stmt that references memory. Expected to be of the form
977 GIMPLE_ASSIGN <name, data-ref> or
978 GIMPLE_ASSIGN <data-ref, name>.
979 2. AT_LOOP: the loop where the vector memref is to be created.
980 3. OFFSET (optional): an offset to be added to the initial address accessed
981 by the data-ref in STMT.
982 4. ONLY_INIT: indicate if vp is to be updated in the loop, or remain
983 pointing to the initial address.
   Output:
   1. Declare a new ptr to vector_type, and have it point to the base of the
      data reference (initial address accessed by the data reference).
988 For example, for vector of type V8HI, the following code is generated:
991 vp = (v8hi *)initial_address;
993 if OFFSET is not supplied:
994 initial_address = &a[init];
995 if OFFSET is supplied:
996 initial_address = &a[init + OFFSET];
998 Return the initial_address in INITIAL_ADDRESS.
1000 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
1001 update the pointer in each iteration of the loop.
1003 Return the increment stmt that updates the pointer in PTR_INCR.
1005 3. Set INV_P to true if the access pattern of the data reference in the
1006 vectorized loop is invariant. Set it to false otherwise.
1008 4. Return the pointer. */
1011 vect_create_data_ref_ptr (gimple stmt, struct loop *at_loop,
1012 tree offset, tree *initial_address, gimple *ptr_incr,
1013 bool only_init, bool *inv_p)
1016 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1017 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1018 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1019 bool nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
1020 struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
1021 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1027 gimple_seq new_stmt_list = NULL;
1031 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
1033 gimple_stmt_iterator incr_gsi;
1035 tree indx_before_incr, indx_after_incr;
1039 /* Check the step (evolution) of the load in LOOP, and record
1040 whether it's invariant. */
1041 if (nested_in_vect_loop)
1042 step = STMT_VINFO_DR_STEP (stmt_info);
1044 step = DR_STEP (STMT_VINFO_DATA_REF (stmt_info));
1046 if (tree_int_cst_compare (step, size_zero_node) == 0)
  /* Create an expression for the first address accessed by this load
     in LOOP.  */
1053 base_name = build_fold_indirect_ref (unshare_expr (DR_BASE_ADDRESS (dr)));
1055 if (vect_print_dump_info (REPORT_DETAILS))
1057 tree data_ref_base = base_name;
1058 fprintf (vect_dump, "create vector-pointer variable to type: ");
1059 print_generic_expr (vect_dump, vectype, TDF_SLIM);
1060 if (TREE_CODE (data_ref_base) == VAR_DECL)
1061 fprintf (vect_dump, " vectorizing a one dimensional array ref: ");
1062 else if (TREE_CODE (data_ref_base) == ARRAY_REF)
1063 fprintf (vect_dump, " vectorizing a multidimensional array ref: ");
1064 else if (TREE_CODE (data_ref_base) == COMPONENT_REF)
1065 fprintf (vect_dump, " vectorizing a record based array ref: ");
1066 else if (TREE_CODE (data_ref_base) == SSA_NAME)
1067 fprintf (vect_dump, " vectorizing a pointer ref: ");
1068 print_generic_expr (vect_dump, base_name, TDF_SLIM);
1071 /** (1) Create the new vector-pointer variable: **/
1072 vect_ptr_type = build_pointer_type (vectype);
1074 vect_ptr = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var,
1075 get_name (base_name));
1076 add_referenced_var (vect_ptr);
1078 /** (2) Add aliasing information to the new vector-pointer:
1079 (The points-to info (DR_PTR_INFO) may be defined later.) **/
1081 tag = DR_SYMBOL_TAG (dr);
  /* If tag is a variable (and NOT_A_TAG) then a new symbol memory
     tag must be created with tag added to its may alias list.  */
1087 new_type_alias (vect_ptr, tag, DR_REF (dr));
1089 set_symbol_mem_tag (vect_ptr, tag);
1091 /** Note: If the dataref is in an inner-loop nested in LOOP, and we are
1092 vectorizing LOOP (i.e. outer-loop vectorization), we need to create two
1093 def-use update cycles for the pointer: One relative to the outer-loop
1094 (LOOP), which is what steps (3) and (4) below do. The other is relative
1095 to the inner-loop (which is the inner-most loop containing the dataref),
     and this is done by step (5) below.
1098 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
1099 inner-most loop, and so steps (3),(4) work the same, and step (5) is
1100 redundant. Steps (3),(4) create the following:
1103 LOOP: vp1 = phi(vp0,vp2)
1109 If there is an inner-loop nested in loop, then step (5) will also be
1110 applied, and an additional update in the inner-loop will be created:
1113 LOOP: vp1 = phi(vp0,vp2)
1115 inner: vp3 = phi(vp1,vp4)
		   vp4 = vp3 + inner_step  **/
1122 /** (3) Calculate the initial address the vector-pointer, and set
1123 the vector-pointer to point to it before the loop: **/
1125 /* Create: (&(base[init_val+offset]) in the loop preheader. */
1127 new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
1129 pe = loop_preheader_edge (loop);
1132 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
1133 gcc_assert (!new_bb);
1136 *initial_address = new_temp;
1138 /* Create: p = (vectype *) initial_base */
1139 vec_stmt = gimple_build_assign (vect_ptr,
1140 fold_convert (vect_ptr_type, new_temp));
1141 vect_ptr_init = make_ssa_name (vect_ptr, vec_stmt);
1142 gimple_assign_set_lhs (vec_stmt, vect_ptr_init);
1143 new_bb = gsi_insert_on_edge_immediate (pe, vec_stmt);
1144 gcc_assert (!new_bb);
1147 /** (4) Handle the updating of the vector-pointer inside the loop.
1148 This is needed when ONLY_INIT is false, and also when AT_LOOP
	 is the inner-loop nested in LOOP (during outer-loop vectorization).  **/
1152 if (only_init && at_loop == loop) /* No update in loop is required. */
1154 /* Copy the points-to information if it exists. */
1155 if (DR_PTR_INFO (dr))
1156 duplicate_ssa_name_ptr_info (vect_ptr_init, DR_PTR_INFO (dr));
1157 vptr = vect_ptr_init;
1161 /* The step of the vector pointer is the Vector Size. */
1162 tree step = TYPE_SIZE_UNIT (vectype);
1163 /* One exception to the above is when the scalar step of the load in
1164 LOOP is zero. In this case the step here is also zero. */
1166 step = size_zero_node;
1168 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
1170 create_iv (vect_ptr_init,
1171 fold_convert (vect_ptr_type, step),
1172 NULL_TREE, loop, &incr_gsi, insert_after,
1173 &indx_before_incr, &indx_after_incr);
1174 incr = gsi_stmt (incr_gsi);
1175 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
1177 /* Copy the points-to information if it exists. */
1178 if (DR_PTR_INFO (dr))
1180 duplicate_ssa_name_ptr_info (indx_before_incr, DR_PTR_INFO (dr));
1181 duplicate_ssa_name_ptr_info (indx_after_incr, DR_PTR_INFO (dr));
1183 merge_alias_info (vect_ptr_init, indx_before_incr);
1184 merge_alias_info (vect_ptr_init, indx_after_incr);
1188 vptr = indx_before_incr;
1191 if (!nested_in_vect_loop || only_init)
1195 /** (5) Handle the updating of the vector-pointer inside the inner-loop
	 nested in LOOP, if it exists: **/
1198 gcc_assert (nested_in_vect_loop);
1201 standard_iv_increment_position (containing_loop, &incr_gsi,
1203 create_iv (vptr, fold_convert (vect_ptr_type, DR_STEP (dr)), NULL_TREE,
1204 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
1206 incr = gsi_stmt (incr_gsi);
1207 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
1209 /* Copy the points-to information if it exists. */
1210 if (DR_PTR_INFO (dr))
1212 duplicate_ssa_name_ptr_info (indx_before_incr, DR_PTR_INFO (dr));
1213 duplicate_ssa_name_ptr_info (indx_after_incr, DR_PTR_INFO (dr));
1215 merge_alias_info (vect_ptr_init, indx_before_incr);
1216 merge_alias_info (vect_ptr_init, indx_after_incr);
1220 return indx_before_incr;
1227 /* Function bump_vector_ptr
1229 Increment a pointer (to a vector type) by vector-size. If requested,
1230 i.e. if PTR-INCR is given, then also connect the new increment stmt
1231 to the existing def-use update-chain of the pointer, by modifying
1232 the PTR_INCR as illustrated below:
1234 The pointer def-use update-chain before this function:
1235 DATAREF_PTR = phi (p_0, p_2)
1237 PTR_INCR: p_2 = DATAREF_PTR + step
1239 The pointer def-use update-chain after this function:
1240 DATAREF_PTR = phi (p_0, p_2)
1242 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
1244 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
   DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
		 in the loop.
1249 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
	      the loop.  The increment amount across iterations is expected
	      to be vector_size.
   GSI - location where the new update stmt is to be placed.
1253 STMT - the original scalar memory-access stmt that is being vectorized.
1254 BUMP - optional. The offset by which to bump the pointer. If not given,
1255 the offset is assumed to be vector_size.
   Output: Return NEW_DATAREF_PTR as illustrated above.  */
1262 bump_vector_ptr (tree dataref_ptr, gimple ptr_incr, gimple_stmt_iterator *gsi,
1263 gimple stmt, tree bump)
1265 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1266 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
1267 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1268 tree ptr_var = SSA_NAME_VAR (dataref_ptr);
1269 tree update = TYPE_SIZE_UNIT (vectype);
1272 use_operand_p use_p;
1273 tree new_dataref_ptr;
1278 incr_stmt = gimple_build_assign_with_ops (POINTER_PLUS_EXPR, ptr_var,
1279 dataref_ptr, update);
1280 new_dataref_ptr = make_ssa_name (ptr_var, incr_stmt);
1281 gimple_assign_set_lhs (incr_stmt, new_dataref_ptr);
1282 vect_finish_stmt_generation (stmt, incr_stmt, gsi);
1284 /* Copy the points-to information if it exists. */
1285 if (DR_PTR_INFO (dr))
1286 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
1287 merge_alias_info (new_dataref_ptr, dataref_ptr);
1290 return new_dataref_ptr;
1292 /* Update the vector-pointer's cross-iteration increment. */
1293 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
1295 tree use = USE_FROM_PTR (use_p);
1297 if (use == dataref_ptr)
1298 SET_USE (use_p, new_dataref_ptr);
1300 gcc_assert (tree_int_cst_compare (use, update) == 0);
1303 return new_dataref_ptr;
1307 /* Function vect_create_destination_var.
1309 Create a new temporary of type VECTYPE. */
1312 vect_create_destination_var (tree scalar_dest, tree vectype)
1315 const char *new_name;
1317 enum vect_var_kind kind;
1319 kind = vectype ? vect_simple_var : vect_scalar_var;
1320 type = vectype ? vectype : TREE_TYPE (scalar_dest);
1322 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
1324 new_name = get_name (scalar_dest);
1327 vec_dest = vect_get_new_vect_var (type, kind, new_name);
1328 add_referenced_var (vec_dest);
1334 /* Function vect_init_vector.
1336 Insert a new stmt (INIT_STMT) that initializes a new vector variable with
   the vector elements of VECTOR_VAR.  Place the initialization at GSI if it
1338 is not NULL. Otherwise, place the initialization at the loop preheader.
1339 Return the DEF of INIT_STMT.
1340 It will be used in the vectorization of STMT. */
1343 vect_init_vector (gimple stmt, tree vector_var, tree vector_type,
1344 gimple_stmt_iterator *gsi)
1346 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
1354 new_var = vect_get_new_vect_var (vector_type, vect_simple_var, "cst_");
1355 add_referenced_var (new_var);
1356 init_stmt = gimple_build_assign (new_var, vector_var);
1357 new_temp = make_ssa_name (new_var, init_stmt);
1358 gimple_assign_set_lhs (init_stmt, new_temp);
1361 vect_finish_stmt_generation (stmt, init_stmt, gsi);
1364 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
1365 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1367 if (nested_in_vect_loop_p (loop, stmt))
1369 pe = loop_preheader_edge (loop);
1370 new_bb = gsi_insert_on_edge_immediate (pe, init_stmt);
1371 gcc_assert (!new_bb);
1374 if (vect_print_dump_info (REPORT_DETAILS))
1376 fprintf (vect_dump, "created new init_stmt: ");
1377 print_gimple_stmt (vect_dump, init_stmt, 0, TDF_SLIM);
1380 vec_oprnd = gimple_assign_lhs (init_stmt);
1385 /* For constant and loop invariant defs of SLP_NODE this function returns
1386 (vector) defs (VEC_OPRNDS) that will be used in the vectorized stmts.
1387 OP_NUM determines if we gather defs for operand 0 or operand 1 of the scalar
1388 stmts. NUMBER_OF_VECTORS is the number of vector defs to create. */
1391 vect_get_constant_vectors (slp_tree slp_node, VEC(tree,heap) **vec_oprnds,
1392 unsigned int op_num, unsigned int number_of_vectors)
1394 VEC (gimple, heap) *stmts = SLP_TREE_SCALAR_STMTS (slp_node);
1395 gimple stmt = VEC_index (gimple, stmts, 0);
1396 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
1397 tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
1398 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
1401 int j, number_of_places_left_in_vector;
1404 int group_size = VEC_length (gimple, stmts);
1405 unsigned int vec_num, i;
1406 int number_of_copies = 1;
1407 bool is_store = false;
1408 VEC (tree, heap) *voprnds = VEC_alloc (tree, heap, number_of_vectors);
1411 if (STMT_VINFO_DATA_REF (stmt_vinfo))
1414 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
1415 created vectors. It is greater than 1 if unrolling is performed.
1417 For example, we have two scalar operands, s1 and s2 (e.g., group of
1418 strided accesses of size two), while NUNITS is four (i.e., four scalars
1419 of this type can be packed in a vector). The output vector will contain
   two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
   is 2).
1423 If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
1424 containing the operands.
1426 For example, NUNITS is four as before, and the group size is 8
1427 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
1428 {s5, s6, s7, s8}. */
1430 number_of_copies = least_common_multiple (nunits, group_size) / group_size;
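  /* E.g. (illustrative): nunits = 4 and group_size = 2 give
     lcm (4, 2) / 2 = 2 copies ({s1, s2, s1, s2}); group_size = 8 gives
     lcm (4, 8) / 8 = 1 copy spread over two vectors.  */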
1432 number_of_places_left_in_vector = nunits;
1434 for (j = 0; j < number_of_copies; j++)
1436 for (i = group_size - 1; VEC_iterate (gimple, stmts, i, stmt); i--)
1439 op = gimple_assign_rhs1 (stmt);
1441 op = gimple_op (stmt, op_num + 1);
1442 if (!CONSTANT_CLASS_P (op))
1445 /* Create 'vect_ = {op0,op1,...,opn}'. */
1446 t = tree_cons (NULL_TREE, op, t);
1448 number_of_places_left_in_vector--;
1450 if (number_of_places_left_in_vector == 0)
1452 number_of_places_left_in_vector = nunits;
1454 vector_type = get_vectype_for_scalar_type (TREE_TYPE (op));
1455 gcc_assert (vector_type);
1457 vec_cst = build_vector (vector_type, t);
1459 vec_cst = build_constructor_from_list (vector_type, t);
1461 VEC_quick_push (tree, voprnds,
1462 vect_init_vector (stmt, vec_cst, vector_type,
  /* Since the vectors are created in the reverse order, we should invert
     them.  */
1471 vec_num = VEC_length (tree, voprnds);
1472 for (j = vec_num - 1; j >= 0; j--)
1474 vop = VEC_index (tree, voprnds, j);
1475 VEC_quick_push (tree, *vec_oprnds, vop);
1478 VEC_free (tree, heap, voprnds);
1480 /* In case that VF is greater than the unrolling factor needed for the SLP
1481 group of stmts, NUMBER_OF_VECTORS to be created is greater than
1482 NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
1483 to replicate the vectors. */
1484 while (number_of_vectors > VEC_length (tree, *vec_oprnds))
1486 for (i = 0; VEC_iterate (tree, *vec_oprnds, i, vop) && i < vec_num; i++)
1487 VEC_quick_push (tree, *vec_oprnds, vop);
1492 /* Get vectorized definitions from SLP_NODE that contains corresponding
1493 vectorized def-stmts. */
1496 vect_get_slp_vect_defs (slp_tree slp_node, VEC (tree,heap) **vec_oprnds)
1499 gimple vec_def_stmt;
1502 gcc_assert (SLP_TREE_VEC_STMTS (slp_node));
1505 VEC_iterate (gimple, SLP_TREE_VEC_STMTS (slp_node), i, vec_def_stmt);
1508 gcc_assert (vec_def_stmt);
1509 vec_oprnd = gimple_get_lhs (vec_def_stmt);
1510 VEC_quick_push (tree, *vec_oprnds, vec_oprnd);
1515 /* Get vectorized definitions for SLP_NODE.
1516 If the scalar definitions are loop invariants or constants, collect them and
1517 call vect_get_constant_vectors() to create vector stmts.
1518 Otherwise, the def-stmts must be already vectorized and the vectorized stmts
1519 must be stored in the LEFT/RIGHT node of SLP_NODE, and we call
1520 vect_get_slp_vect_defs() to retrieve them.
1521 If VEC_OPRNDS1 is NULL, don't get vector defs for the second operand (from
   the right node).  This is used when the second operand must remain scalar.  */
1525 vect_get_slp_defs (slp_tree slp_node, VEC (tree,heap) **vec_oprnds0,
1526 VEC (tree,heap) **vec_oprnds1)
1529 enum tree_code code;
1530 int number_of_vects;
1531 HOST_WIDE_INT lhs_size_unit, rhs_size_unit;
1533 first_stmt = VEC_index (gimple, SLP_TREE_SCALAR_STMTS (slp_node), 0);
1534 /* The number of vector defs is determined by the number of vector statements
1535 in the node from which we get those statements. */
1536 if (SLP_TREE_LEFT (slp_node))
1537 number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_LEFT (slp_node));
1540 number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
1541 /* Number of vector stmts was calculated according to LHS in
1542 vect_schedule_slp_instance(), fix it by replacing LHS with RHS, if
1543 necessary. See vect_get_smallest_scalar_type() for details. */
  vect_get_smallest_scalar_type (first_stmt, &lhs_size_unit,
				 &rhs_size_unit);
1546 if (rhs_size_unit != lhs_size_unit)
1548 number_of_vects *= rhs_size_unit;
1549 number_of_vects /= lhs_size_unit;
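      /* E.g. (illustrative): storing ints converted to shorts gives
	 lhs_size_unit = 2 and rhs_size_unit = 4, so twice as many vector
	 defs are needed for the RHS as there are LHS vector stmts.  */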
1553 /* Allocate memory for vectorized defs. */
1554 *vec_oprnds0 = VEC_alloc (tree, heap, number_of_vects);
1556 /* SLP_NODE corresponds either to a group of stores or to a group of
1557 unary/binary operations. We don't call this function for loads. */
1558 if (SLP_TREE_LEFT (slp_node))
1559 /* The defs are already vectorized. */
1560 vect_get_slp_vect_defs (SLP_TREE_LEFT (slp_node), vec_oprnds0);
1562 /* Build vectors from scalar defs. */
1563 vect_get_constant_vectors (slp_node, vec_oprnds0, 0, number_of_vects);
1565 if (STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt)))
    /* Since we don't call this function with loads, this is a group of
       stores.  */
1570 code = gimple_assign_rhs_code (first_stmt);
1571 if (get_gimple_rhs_class (code) != GIMPLE_BINARY_RHS || !vec_oprnds1)
1574 /* The number of vector defs is determined by the number of vector statements
1575 in the node from which we get those statements. */
1576 if (SLP_TREE_RIGHT (slp_node))
1577 number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_RIGHT (slp_node));
1579 number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
1581 *vec_oprnds1 = VEC_alloc (tree, heap, number_of_vects);
1583 if (SLP_TREE_RIGHT (slp_node))
1584 /* The defs are already vectorized. */
1585 vect_get_slp_vect_defs (SLP_TREE_RIGHT (slp_node), vec_oprnds1);
1587 /* Build vectors from scalar defs. */
1588 vect_get_constant_vectors (slp_node, vec_oprnds1, 1, number_of_vects);
1592 /* Function get_initial_def_for_induction
1595 STMT - a stmt that performs an induction operation in the loop.
1596 IV_PHI - the initial value of the induction variable
1599 Return a vector variable, initialized with the first VF values of
1600 the induction variable. E.g., for an iv with IV_PHI='X' and
1601 evolution S, for a vector of 4 units, we want to return:
1602 [X, X + S, X + 2*S, X + 3*S]. */
1605 get_initial_def_for_induction (gimple iv_phi)
1607 stmt_vec_info stmt_vinfo = vinfo_for_stmt (iv_phi);
1608 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
1609 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1610 tree scalar_type = TREE_TYPE (gimple_phi_result (iv_phi));
1613 edge pe = loop_preheader_edge (loop);
1614 struct loop *iv_loop;
1616 tree vec, vec_init, vec_step, t;
1620 gimple init_stmt, induction_phi, new_stmt;
1621 tree induc_def, vec_def, vec_dest;
1622 tree init_expr, step_expr;
1623 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1628 stmt_vec_info phi_info = vinfo_for_stmt (iv_phi);
1629 bool nested_in_vect_loop = false;
1630 gimple_seq stmts = NULL;
1631 imm_use_iterator imm_iter;
1632 use_operand_p use_p;
1636 gimple_stmt_iterator si;
1637 basic_block bb = gimple_bb (iv_phi);
1639 vectype = get_vectype_for_scalar_type (scalar_type);
1640 gcc_assert (vectype);
1641 nunits = TYPE_VECTOR_SUBPARTS (vectype);
1642 ncopies = vf / nunits;
1644 gcc_assert (phi_info);
1645 gcc_assert (ncopies >= 1);
1647 /* Find the first insertion point in the BB. */
1648 si = gsi_after_labels (bb);
1650 if (INTEGRAL_TYPE_P (scalar_type) || POINTER_TYPE_P (scalar_type))
1651 step_expr = build_int_cst (scalar_type, 0);
1653 step_expr = build_real (scalar_type, dconst0);
1655 /* Is phi in an inner-loop, while vectorizing an enclosing outer-loop? */
1656 if (nested_in_vect_loop_p (loop, iv_phi))
1658 nested_in_vect_loop = true;
1659 iv_loop = loop->inner;
1663 gcc_assert (iv_loop == (gimple_bb (iv_phi))->loop_father);
1665 latch_e = loop_latch_edge (iv_loop);
1666 loop_arg = PHI_ARG_DEF_FROM_EDGE (iv_phi, latch_e);
1668 access_fn = analyze_scalar_evolution (iv_loop, PHI_RESULT (iv_phi));
1669 gcc_assert (access_fn);
1670 ok = vect_is_simple_iv_evolution (iv_loop->num, access_fn,
1671 &init_expr, &step_expr);
1673 pe = loop_preheader_edge (iv_loop);
1675 /* Create the vector that holds the initial_value of the induction. */
1676 if (nested_in_vect_loop)
1678 /* iv_loop is nested in the loop to be vectorized. init_expr had already
     been created during vectorization of previous stmts; we obtain it from
1680 the STMT_VINFO_VEC_STMT of the defining stmt. */
1681 tree iv_def = PHI_ARG_DEF_FROM_EDGE (iv_phi, loop_preheader_edge (iv_loop));
1682 vec_init = vect_get_vec_def_for_operand (iv_def, iv_phi, NULL);
1686 /* iv_loop is the loop to be vectorized. Create:
1687 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
1688 new_var = vect_get_new_vect_var (scalar_type, vect_scalar_var, "var_");
1689 add_referenced_var (new_var);
1691 new_name = force_gimple_operand (init_expr, &stmts, false, new_var);
1694 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
1695 gcc_assert (!new_bb);
1699 t = tree_cons (NULL_TREE, init_expr, t);
1700 for (i = 1; i < nunits; i++)
1702 /* Create: new_name_i = new_name + step_expr */
1703 enum tree_code code = POINTER_TYPE_P (scalar_type)
1704 ? POINTER_PLUS_EXPR : PLUS_EXPR;
1705 init_stmt = gimple_build_assign_with_ops (code, new_var,
1706 new_name, step_expr);
1707 new_name = make_ssa_name (new_var, init_stmt);
1708 gimple_assign_set_lhs (init_stmt, new_name);
1710 new_bb = gsi_insert_on_edge_immediate (pe, init_stmt);
1711 gcc_assert (!new_bb);
1713 if (vect_print_dump_info (REPORT_DETAILS))
1715 fprintf (vect_dump, "created new init_stmt: ");
1716 print_gimple_stmt (vect_dump, init_stmt, 0, TDF_SLIM);
1718 t = tree_cons (NULL_TREE, new_name, t);
1720 /* Create a vector from [new_name_0, new_name_1, ..., new_name_nunits-1] */
1721 vec = build_constructor_from_list (vectype, nreverse (t));
1722 vec_init = vect_init_vector (iv_phi, vec, vectype, NULL);
1726 /* Create the vector that holds the step of the induction. */
1727 if (nested_in_vect_loop)
1728 /* iv_loop is nested in the loop to be vectorized. Generate:
1729 vec_step = [S, S, S, S] */
1730 new_name = step_expr;
1733 /* iv_loop is the loop to be vectorized. Generate:
1734 vec_step = [VF*S, VF*S, VF*S, VF*S] */
1735 expr = build_int_cst (scalar_type, vf);
1736 new_name = fold_build2 (MULT_EXPR, scalar_type, expr, step_expr);
1740 for (i = 0; i < nunits; i++)
1741 t = tree_cons (NULL_TREE, unshare_expr (new_name), t);
1742 gcc_assert (CONSTANT_CLASS_P (new_name));
1743 vec = build_vector (vectype, t);
1744 vec_step = vect_init_vector (iv_phi, vec, vectype, NULL);
1747 /* Create the following def-use cycle:
1752 vec_iv = PHI <vec_init, vec_loop>
1756 vec_loop = vec_iv + vec_step; */
1758 /* Create the induction-phi that defines the induction-operand. */
1759 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
1760 add_referenced_var (vec_dest);
1761 induction_phi = create_phi_node (vec_dest, iv_loop->header);
1762 set_vinfo_for_stmt (induction_phi,
1763 new_stmt_vec_info (induction_phi, loop_vinfo));
1764 induc_def = PHI_RESULT (induction_phi);
1766 /* Create the iv update inside the loop */
1767 new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, vec_dest,
1768 induc_def, vec_step);
1769 vec_def = make_ssa_name (vec_dest, new_stmt);
1770 gimple_assign_set_lhs (new_stmt, vec_def);
1771 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
1772 set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo));
1774 /* Set the arguments of the phi node: */
1775 add_phi_arg (induction_phi, vec_init, pe);
1776 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop));
1779 /* In case that vectorization factor (VF) is bigger than the number
1780 of elements that we can fit in a vectype (nunits), we have to generate
     more than one vector stmt - i.e., we need to "unroll" the
1782 vector stmt by a factor VF/nunits. For more details see documentation
1783 in vectorizable_operation. */
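  /* Illustration (made-up numbers): with VF = 8 and V4SI (nunits = 4),
     ncopies = 2; the code below chains the extra copy as
     vec_1 = vec_0 + [4*S, 4*S, 4*S, 4*S] via STMT_VINFO_RELATED_STMT.  */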
1787 stmt_vec_info prev_stmt_vinfo;
1788 /* FORNOW. This restriction should be relaxed. */
1789 gcc_assert (!nested_in_vect_loop);
1791 /* Create the vector that holds the step of the induction. */
1792 expr = build_int_cst (scalar_type, nunits);
1793 new_name = fold_build2 (MULT_EXPR, scalar_type, expr, step_expr);
1795 for (i = 0; i < nunits; i++)
1796 t = tree_cons (NULL_TREE, unshare_expr (new_name), t);
1797 gcc_assert (CONSTANT_CLASS_P (new_name));
1798 vec = build_vector (vectype, t);
1799 vec_step = vect_init_vector (iv_phi, vec, vectype, NULL);
1801 vec_def = induc_def;
1802 prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
1803 for (i = 1; i < ncopies; i++)
1805 /* vec_i = vec_prev + vec_step */
1806 new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, vec_dest,
1808 vec_def = make_ssa_name (vec_dest, new_stmt);
1809 gimple_assign_set_lhs (new_stmt, vec_def);
1811 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
1812 set_vinfo_for_stmt (new_stmt,
1813 new_stmt_vec_info (new_stmt, loop_vinfo));
1814 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
1815 prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
1819 if (nested_in_vect_loop)
1821 /* Find the loop-closed exit-phi of the induction, and record
1822 the final vector of induction results: */
1824 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
1826 if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (USE_STMT (use_p))))
1828 exit_phi = USE_STMT (use_p);
1834 stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
1835 /* FORNOW. Currently not supporting the case that an inner-loop induction
1836 is not used in the outer-loop (i.e. only outside the outer-loop). */
1837 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
1838 && !STMT_VINFO_LIVE_P (stmt_vinfo));
1840 STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
1841 if (vect_print_dump_info (REPORT_DETAILS))
1843 fprintf (vect_dump, "vector of inductions after inner-loop:");
1844 print_gimple_stmt (vect_dump, new_stmt, 0, TDF_SLIM);
1850 if (vect_print_dump_info (REPORT_DETAILS))
1852 fprintf (vect_dump, "transform induction: created def-use cycle: ");
1853 print_gimple_stmt (vect_dump, induction_phi, 0, TDF_SLIM);
1854 fprintf (vect_dump, "\n");
1855 print_gimple_stmt (vect_dump, SSA_NAME_DEF_STMT (vec_def), 0, TDF_SLIM);
1858 STMT_VINFO_VEC_STMT (phi_info) = induction_phi;
1863 /* Function vect_get_vec_def_for_operand.
1865 OP is an operand in STMT. This function returns a (vector) def that will be
1866 used in the vectorized stmt for STMT.
1868 In the case that OP is an SSA_NAME which is defined in the loop, then
1869 STMT_VINFO_VEC_STMT of the defining stmt holds the relevant def.
1871 In case OP is an invariant or constant, a new stmt that creates a vector def
1872 needs to be introduced. */
1874 static tree
1875 vect_get_vec_def_for_operand (tree op, gimple stmt, tree *scalar_def)
1876 {
1877 tree vec_oprnd;
1878 gimple vec_stmt;
1879 gimple def_stmt;
1880 stmt_vec_info def_stmt_info = NULL;
1881 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
1882 tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
1883 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
1884 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
1885 tree vec_inv;
1886 tree vec_cst;
1887 tree t = NULL_TREE;
1888 tree def;
1889 int i;
1890 enum vect_def_type dt;
1891 bool is_simple_use;
1892 tree vector_type;
1894 if (vect_print_dump_info (REPORT_DETAILS))
1895 {
1896 fprintf (vect_dump, "vect_get_vec_def_for_operand: ");
1897 print_generic_expr (vect_dump, op, TDF_SLIM);
1898 }
1900 is_simple_use = vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt);
1901 gcc_assert (is_simple_use);
1902 if (vect_print_dump_info (REPORT_DETAILS))
1903 {
1904 if (def)
1905 {
1906 fprintf (vect_dump, "def = ");
1907 print_generic_expr (vect_dump, def, TDF_SLIM);
1908 }
1909 if (def_stmt)
1910 {
1911 fprintf (vect_dump, " def_stmt = ");
1912 print_gimple_stmt (vect_dump, def_stmt, 0, TDF_SLIM);
1913 }
1914 }
1916 switch (dt)
1917 {
1918 /* Case 1: operand is a constant. */
1919 case vect_constant_def:
1920 {
1921 if (scalar_def)
1922 *scalar_def = op;
1924 /* Create 'vect_cst_ = {cst,cst,...,cst}' */
1925 if (vect_print_dump_info (REPORT_DETAILS))
1926 fprintf (vect_dump, "Create vector_cst. nunits = %d", nunits);
1928 for (i = nunits - 1; i >= 0; --i)
1930 t = tree_cons (NULL_TREE, op, t);
1932 vector_type = get_vectype_for_scalar_type (TREE_TYPE (op));
1933 gcc_assert (vector_type);
1934 vec_cst = build_vector (vector_type, t);
1936 return vect_init_vector (stmt, vec_cst, vector_type, NULL);
1937 }
1939 /* Case 2: operand is defined outside the loop - loop invariant. */
1940 case vect_invariant_def:
1941 {
1942 if (scalar_def)
1943 *scalar_def = def;
1945 /* Create 'vec_inv = {inv,inv,..,inv}' */
1946 if (vect_print_dump_info (REPORT_DETAILS))
1947 fprintf (vect_dump, "Create vector_inv.");
1949 for (i = nunits - 1; i >= 0; --i)
1951 t = tree_cons (NULL_TREE, def, t);
1954 /* FIXME: use build_constructor directly. */
1955 vector_type = get_vectype_for_scalar_type (TREE_TYPE (def));
1956 gcc_assert (vector_type);
1957 vec_inv = build_constructor_from_list (vector_type, t);
1958 return vect_init_vector (stmt, vec_inv, vector_type, NULL);
1959 }
1961 /* Case 3: operand is defined inside the loop. */
1962 case vect_loop_def:
1963 {
1964 if (scalar_def)
1965 *scalar_def = NULL/* FIXME tuples: def_stmt*/;
1967 /* Get the def from the vectorized stmt. */
1968 def_stmt_info = vinfo_for_stmt (def_stmt);
1969 vec_stmt = STMT_VINFO_VEC_STMT (def_stmt_info);
1970 gcc_assert (vec_stmt);
1971 if (gimple_code (vec_stmt) == GIMPLE_PHI)
1972 vec_oprnd = PHI_RESULT (vec_stmt);
1973 else if (is_gimple_call (vec_stmt))
1974 vec_oprnd = gimple_call_lhs (vec_stmt);
1975 else
1976 vec_oprnd = gimple_assign_lhs (vec_stmt);
1977 return vec_oprnd;
1978 }
1980 /* Case 4: operand is defined by a loop header phi - reduction */
1981 case vect_reduction_def:
1982 {
1983 struct loop *loop;
1985 gcc_assert (gimple_code (def_stmt) == GIMPLE_PHI);
1986 loop = (gimple_bb (def_stmt))->loop_father;
1988 /* Get the def before the loop */
1989 op = PHI_ARG_DEF_FROM_EDGE (def_stmt, loop_preheader_edge (loop));
1990 return get_initial_def_for_reduction (stmt, op, scalar_def);
1991 }
1993 /* Case 5: operand is defined by loop-header phi - induction. */
1994 case vect_induction_def:
1995 {
1996 gcc_assert (gimple_code (def_stmt) == GIMPLE_PHI);
1998 /* Get the def from the vectorized stmt. */
1999 def_stmt_info = vinfo_for_stmt (def_stmt);
2000 vec_stmt = STMT_VINFO_VEC_STMT (def_stmt_info);
2001 gcc_assert (vec_stmt && gimple_code (vec_stmt) == GIMPLE_PHI);
2002 vec_oprnd = PHI_RESULT (vec_stmt);
2003 return vec_oprnd;
2004 }
2006 default:
2007 gcc_unreachable ();
2008 }
2009 }
2012 /* Function vect_get_vec_def_for_stmt_copy
2014 Return a vector-def for an operand. This function is used when the
2015 vectorized stmt to be created (by the caller to this function) is a "copy"
2016 created in case the vectorized result cannot fit in one vector, and several
2017 copies of the vector-stmt are required. In this case the vector-def is
2018 retrieved from the vector stmt recorded in the STMT_VINFO_RELATED_STMT field
2019 of the stmt that defines VEC_OPRND.
2020 DT is the type of the vector def VEC_OPRND.
2023 In case the vectorization factor (VF) is bigger than the number
2024 of elements that can fit in a vectype (nunits), we have to generate
2025 more than one vector stmt to vectorize the scalar stmt. This situation
2026 arises when there are multiple data-types operated upon in the loop; the
2027 smallest data-type determines the VF, and as a result, when vectorizing
2028 stmts operating on wider types we need to create 'VF/nunits' "copies" of the
2029 vector stmt (each computing a vector of 'nunits' results, and together
2030 computing 'VF' results in each iteration). This function is called when
2031 vectorizing such a stmt (e.g. vectorizing S2 in the illustration below, in
2032 which VF=16 and nunits=4, so the number of copies required is 4):
2034 scalar stmt: vectorized into: STMT_VINFO_RELATED_STMT
2036 S1: x = load VS1.0: vx.0 = memref0 VS1.1
2037 VS1.1: vx.1 = memref1 VS1.2
2038 VS1.2: vx.2 = memref2 VS1.3
2039 VS1.3: vx.3 = memref3
2041 S2: z = x + ... VSnew.0: vz0 = vx.0 + ... VSnew.1
2042 VSnew.1: vz1 = vx.1 + ... VSnew.2
2043 VSnew.2: vz2 = vx.2 + ... VSnew.3
2044 VSnew.3: vz3 = vx.3 + ...
2046 The vectorization of S1 is explained in vectorizable_load.
2047 The vectorization of S2:
2048 To create the first vector-stmt out of the 4 copies - VSnew.0 -
2049 the function 'vect_get_vec_def_for_operand' is called to
2050 get the relevant vector-def for each operand of S2. For operand x it
2051 returns the vector-def 'vx.0'.
2053 To create the remaining copies of the vector-stmt (VSnew.j), this
2054 function is called to get the relevant vector-def for each operand. It is
2055 obtained from the respective VS1.j stmt, which is recorded in the
2056 STMT_VINFO_RELATED_STMT field of the stmt that defines VEC_OPRND.
2058 For example, to obtain the vector-def 'vx.1' in order to create the
2059 vector stmt 'VSnew.1', this function is called with VEC_OPRND='vx.0'.
2060 Given 'vx.0' we obtain the stmt that defines it ('VS1.0'); from the
2061 STMT_VINFO_RELATED_STMT field of 'VS1.0' we obtain the next copy - 'VS1.1',
2062 and return its def ('vx.1').
2063 Overall, to create the above sequence this function will be called 3 times:
2064 vx.1 = vect_get_vec_def_for_stmt_copy (dt, vx.0);
2065 vx.2 = vect_get_vec_def_for_stmt_copy (dt, vx.1);
2066 vx.3 = vect_get_vec_def_for_stmt_copy (dt, vx.2); */
2068 static tree
2069 vect_get_vec_def_for_stmt_copy (enum vect_def_type dt, tree vec_oprnd)
2070 {
2071 gimple vec_stmt_for_operand;
2072 stmt_vec_info def_stmt_info;
2074 /* Do nothing; can reuse same def. */
2075 if (dt == vect_invariant_def || dt == vect_constant_def)
2076 return vec_oprnd;
2078 vec_stmt_for_operand = SSA_NAME_DEF_STMT (vec_oprnd);
2079 def_stmt_info = vinfo_for_stmt (vec_stmt_for_operand);
2080 gcc_assert (def_stmt_info);
2081 vec_stmt_for_operand = STMT_VINFO_RELATED_STMT (def_stmt_info);
2082 gcc_assert (vec_stmt_for_operand);
2083 vec_oprnd = gimple_get_lhs (vec_stmt_for_operand);
2084 if (gimple_code (vec_stmt_for_operand) == GIMPLE_PHI)
2085 vec_oprnd = PHI_RESULT (vec_stmt_for_operand);
2086 else
2087 vec_oprnd = gimple_get_lhs (vec_stmt_for_operand);
2089 return vec_oprnd;
2090 }
2092 /* Get vectorized definitions for the operands to create a copy of an original
2093 stmt. See vect_get_vec_def_for_stmt_copy() for details. */
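/* Minimal usage sketch (illustration only): if *VEC_OPRNDS0 holds {vx.0}
   on entry, the code below pops vx.0, follows STMT_VINFO_RELATED_STMT of
   its defining stmt to obtain the next copy's def vx.1, and pushes it
   back, leaving {vx.1}.  */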
2095 static void
2096 vect_get_vec_defs_for_stmt_copy (enum vect_def_type *dt,
2097 VEC(tree,heap) **vec_oprnds0,
2098 VEC(tree,heap) **vec_oprnds1)
2099 {
2100 tree vec_oprnd = VEC_pop (tree, *vec_oprnds0);
2102 vec_oprnd = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd);
2103 VEC_quick_push (tree, *vec_oprnds0, vec_oprnd);
2105 if (vec_oprnds1 && *vec_oprnds1)
2106 {
2107 vec_oprnd = VEC_pop (tree, *vec_oprnds1);
2108 vec_oprnd = vect_get_vec_def_for_stmt_copy (dt[1], vec_oprnd);
2109 VEC_quick_push (tree, *vec_oprnds1, vec_oprnd);
2110 }
2111 }
2114 /* Get vectorized definitions for OP0 and OP1, or SLP_NODE if it is not NULL. */
2116 static void
2117 vect_get_vec_defs (tree op0, tree op1, gimple stmt,
2118 VEC(tree,heap) **vec_oprnds0, VEC(tree,heap) **vec_oprnds1,
2119 slp_tree slp_node)
2120 {
2121 if (slp_node)
2122 vect_get_slp_defs (slp_node, vec_oprnds0, vec_oprnds1);
2123 else
2124 {
2125 tree vec_oprnd;
2127 *vec_oprnds0 = VEC_alloc (tree, heap, 1);
2128 vec_oprnd = vect_get_vec_def_for_operand (op0, stmt, NULL);
2129 VEC_quick_push (tree, *vec_oprnds0, vec_oprnd);
2131 if (op1)
2132 {
2133 *vec_oprnds1 = VEC_alloc (tree, heap, 1);
2134 vec_oprnd = vect_get_vec_def_for_operand (op1, stmt, NULL);
2135 VEC_quick_push (tree, *vec_oprnds1, vec_oprnd);
2136 }
2137 }
2138 }
2141 /* Function vect_finish_stmt_generation.
2143 Insert a new stmt. */
2145 static void
2146 vect_finish_stmt_generation (gimple stmt, gimple vec_stmt,
2147 gimple_stmt_iterator *gsi)
2148 {
2149 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2150 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2152 gcc_assert (stmt == gsi_stmt (*gsi));
2153 gcc_assert (gimple_code (stmt) != GIMPLE_LABEL);
2155 gsi_insert_before (gsi, vec_stmt, GSI_SAME_STMT);
2157 set_vinfo_for_stmt (vec_stmt, new_stmt_vec_info (vec_stmt, loop_vinfo));
2159 if (vect_print_dump_info (REPORT_DETAILS))
2160 {
2161 fprintf (vect_dump, "add new stmt: ");
2162 print_gimple_stmt (vect_dump, vec_stmt, 0, TDF_SLIM);
2163 }
2165 /* Make sure gsi points to the stmt that is being vectorized. */
2166 gcc_assert (stmt == gsi_stmt (*gsi));
2168 gimple_set_location (vec_stmt, gimple_location (stmt));
2169 }
2172 /* Function get_initial_def_for_reduction
2174 Input:
2175 STMT - a stmt that performs a reduction operation in the loop.
2176 INIT_VAL - the initial value of the reduction variable
2178 Output:
2179 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
2180 of the reduction (used for adjusting the epilog - see below).
2181 Return a vector variable, initialized according to the operation that STMT
2182 performs. This vector will be used as the initial value of the
2183 vector of partial results.
2185 Option1 (adjust in epilog): Initialize the vector as follows:
2186 add: [0,0,...,0,0]
2187 mult: [1,1,...,1,1]
2188 min/max: [init_val,init_val,..,init_val,init_val]
2189 bit and/or: [init_val,init_val,..,init_val,init_val]
2190 and when necessary (e.g. add/mult case) let the caller know
2191 that it needs to adjust the result by init_val.
2193 Option2: Initialize the vector as follows:
2194 add: [0,0,...,0,init_val]
2195 mult: [1,1,...,1,init_val]
2196 min/max: [init_val,init_val,...,init_val]
2197 bit and/or: [init_val,init_val,...,init_val]
2198 and no adjustments are needed.
2200 For example, for the following code:
2202 s = init_val;
2203 for (i=0;i<n;i++)
2204 s = s + a[i];
2206 STMT is 's = s + a[i]', and the reduction variable is 's'.
2207 For a vector of 4 units, we want to return either [0,0,0,init_val],
2208 or [0,0,0,0] and let the caller know that it needs to adjust
2209 the result at the end by 'init_val'.
2211 FORNOW, we are using the 'adjust in epilog' scheme, because this way the
2212 initialization vector is simpler (same element in all entries).
2213 A cost model should help decide between these two schemes. */
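/* Worked example (illustration only): for the sum reduction above with
   init_val = 5 and a 4-element vectype, the 'adjust in epilog' scheme
   returns {0,0,0,0} and sets *ADJUSTMENT_DEF to 5; after the partial
   results are reduced to a scalar s, the caller computes s + 5.  Option2
   would instead return {0,0,0,5} and need no adjustment.  */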
2215 static tree
2216 get_initial_def_for_reduction (gimple stmt, tree init_val, tree *adjustment_def)
2217 {
2218 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
2219 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
2220 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2221 tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
2222 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
2223 enum tree_code code = gimple_assign_rhs_code (stmt);
2224 tree type = TREE_TYPE (init_val);
2225 tree vecdef;
2226 tree def_for_init;
2227 tree init_def;
2228 tree t = NULL_TREE;
2229 int i;
2230 tree vector_type;
2231 bool nested_in_vect_loop = false;
2233 gcc_assert (POINTER_TYPE_P (type) || INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type));
2234 if (nested_in_vect_loop_p (loop, stmt))
2235 nested_in_vect_loop = true;
2236 else
2237 gcc_assert (loop == (gimple_bb (stmt))->loop_father);
2239 vecdef = vect_get_vec_def_for_operand (init_val, stmt, NULL);
2241 switch (code)
2242 {
2243 case WIDEN_SUM_EXPR:
2244 case DOT_PROD_EXPR:
2245 case PLUS_EXPR:
2246 if (nested_in_vect_loop)
2247 *adjustment_def = vecdef;
2248 else
2249 *adjustment_def = init_val;
2250 /* Create a vector of zeros for init_def. */
2251 if (SCALAR_FLOAT_TYPE_P (type))
2252 def_for_init = build_real (type, dconst0);
2253 else
2254 def_for_init = build_int_cst (type, 0);
2255 for (i = nunits - 1; i >= 0; --i)
2256 t = tree_cons (NULL_TREE, def_for_init, t);
2257 vector_type = get_vectype_for_scalar_type (TREE_TYPE (def_for_init));
2258 gcc_assert (vector_type);
2259 init_def = build_vector (vector_type, t);
2260 break;
2262 case MIN_EXPR:
2263 case MAX_EXPR:
2264 *adjustment_def = NULL_TREE;
2265 init_def = vecdef;
2266 break;
2268 default:
2269 gcc_unreachable ();
2270 }
2272 return init_def;
2273 }
2276 /* Function vect_create_epilog_for_reduction
2278 Create code at the loop-epilog to finalize the result of a reduction
2279 computation.
2281 VECT_DEF is a vector of partial results.
2282 REDUC_CODE is the tree-code for the epilog reduction.
2283 NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
2284 number of elements that we can fit in a vectype (nunits). In this case
2285 we have to generate more than one vector stmt - i.e - we need to "unroll"
2286 the vector stmt by a factor VF/nunits. For more details see documentation
2287 in vectorizable_operation.
2288 STMT is the scalar reduction stmt that is being vectorized.
2289 REDUCTION_PHI is the phi-node that carries the reduction computation.
2291 This function:
2292 1. Creates the reduction def-use cycle: sets the arguments for
2293 REDUCTION_PHI:
2294 The loop-entry argument is the vectorized initial-value of the reduction.
2295 The loop-latch argument is VECT_DEF - the vector of partial sums.
2296 2. "Reduces" the vector of partial results VECT_DEF into a single result,
2297 by applying the operation specified by REDUC_CODE if available, or by
2298 other means (whole-vector shifts or a scalar loop).
2299 The function also creates a new phi node at the loop exit to preserve
2300 loop-closed form, as illustrated below.
2302 The flow at the entry to this function:
2304 loop:
2305 vec_def = phi <null, null> # REDUCTION_PHI
2306 VECT_DEF = vector_stmt # vectorized form of STMT
2307 s_loop = scalar_stmt # (scalar) STMT
2308 loop_exit:
2309 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
2310 use <s_out0>
2311 use <s_out0>
2313 The above is transformed by this function into:
2315 loop:
2316 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
2317 VECT_DEF = vector_stmt # vectorized form of STMT
2318 s_loop = scalar_stmt # (scalar) STMT
2319 loop_exit:
2320 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
2321 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
2322 v_out2 = reduce <v_out1>
2323 s_out3 = extract_field <v_out2, 0>
2324 s_out4 = adjust_result <s_out3>
2325 use <s_out4>
2326 use <s_out4> */
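/* For illustration only - the whole-vector-shift scheme (case 2 below),
   written as scalar C for a hypothetical 4-element vector of partial sums:
       int v[4] = {a, b, c, d};
       v[0] += v[2]; v[1] += v[3];   /+ shift by VS/2, then vop +/
       v[0] += v[1];                 /+ shift by VS/4, then vop +/
       /+ v[0] now holds a+b+c+d; step 2 extracts it +/
   The code below does the same with VEC_RSHIFT_EXPR, halving the shift
   offset until it reaches the element size.  */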
2329 static void
2330 vect_create_epilog_for_reduction (tree vect_def, gimple stmt,
2331 int ncopies,
2332 enum tree_code reduc_code,
2333 gimple reduction_phi)
2334 {
2335 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2336 stmt_vec_info prev_phi_info;
2337 tree vectype;
2338 enum machine_mode mode;
2339 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2340 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2341 basic_block exit_bb;
2342 tree scalar_dest;
2343 tree scalar_type;
2344 gimple new_phi = NULL, phi;
2345 gimple_stmt_iterator exit_gsi;
2346 tree vec_dest;
2347 tree new_temp = NULL_TREE;
2348 tree new_name;
2349 gimple epilog_stmt = NULL;
2350 tree new_scalar_dest, new_dest;
2351 gimple exit_phi;
2352 tree bitsize, bitpos, bytesize;
2353 enum tree_code code = gimple_assign_rhs_code (stmt);
2354 tree adjustment_def;
2355 tree vec_initial_def, def;
2356 tree orig_name;
2357 imm_use_iterator imm_iter;
2358 use_operand_p use_p;
2359 bool extract_scalar_result = false;
2360 tree reduction_op, expr;
2361 gimple orig_stmt;
2362 gimple use_stmt;
2363 bool nested_in_vect_loop = false;
2364 VEC(gimple,heap) *phis = NULL;
2365 enum vect_def_type dt = vect_unknown_def_type;
2366 int j, i;
2368 if (nested_in_vect_loop_p (loop, stmt))
2369 {
2370 loop = loop->inner;
2371 nested_in_vect_loop = true;
2372 }
2374 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
2375 {
2376 case GIMPLE_SINGLE_RHS:
2377 gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt)) == ternary_op);
2378 reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), 2);
2379 break;
2380 case GIMPLE_UNARY_RHS:
2381 reduction_op = gimple_assign_rhs1 (stmt);
2382 break;
2383 case GIMPLE_BINARY_RHS:
2384 reduction_op = gimple_assign_rhs2 (stmt);
2385 break;
2386 default:
2387 gcc_unreachable ();
2388 }
2390 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
2391 gcc_assert (vectype);
2392 mode = TYPE_MODE (vectype);
2394 /*** 1. Create the reduction def-use cycle ***/
2396 /* For the case of reduction, vect_get_vec_def_for_operand returns
2397 the scalar def before the loop, that defines the initial value
2398 of the reduction variable. */
2399 vec_initial_def = vect_get_vec_def_for_operand (reduction_op, stmt,
2400 &adjustment_def);
2402 phi = reduction_phi;
2403 def = vect_def;
2404 for (j = 0; j < ncopies; j++)
2405 {
2406 /* 1.1 set the loop-entry arg of the reduction-phi: */
2407 add_phi_arg (phi, vec_initial_def, loop_preheader_edge (loop));
2409 /* 1.2 set the loop-latch arg for the reduction-phi: */
2410 if (j > 0)
2411 def = vect_get_vec_def_for_stmt_copy (dt, def);
2412 add_phi_arg (phi, def, loop_latch_edge (loop));
2414 if (vect_print_dump_info (REPORT_DETAILS))
2415 {
2416 fprintf (vect_dump, "transform reduction: created def-use cycle: ");
2417 print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
2418 fprintf (vect_dump, "\n");
2419 print_gimple_stmt (vect_dump, SSA_NAME_DEF_STMT (def), 0, TDF_SLIM);
2420 }
2422 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
2423 }
2425 /*** 2. Create epilog code
2426 The reduction epilog code operates across the elements of the vector
2427 of partial results computed by the vectorized loop.
2428 The reduction epilog code consists of:
2429 step 1: compute the scalar result in a vector (v_out2)
2430 step 2: extract the scalar result (s_out3) from the vector (v_out2)
2431 step 3: adjust the scalar result (s_out3) if needed.
2433 Step 1 can be accomplished using one of the following three schemes:
2434 (scheme 1) using reduc_code, if available.
2435 (scheme 2) using whole-vector shifts, if available.
2436 (scheme 3) using a scalar loop. In this case steps 1+2 above are
2437 combined.
2439 The overall epilog code looks like this:
2441 s_out0 = phi <s_loop> # original EXIT_PHI
2442 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
2443 v_out2 = reduce <v_out1> # step 1
2444 s_out3 = extract_field <v_out2, 0> # step 2
2445 s_out4 = adjust_result <s_out3> # step 3
2447 (step 3 is optional, and steps 1 and 2 may be combined).
2448 Lastly, the uses of s_out0 are replaced by s_out4. */
2452 /* 2.1 Create new loop-exit-phi to preserve loop-closed form:
2453 v_out1 = phi <v_loop> */
2455 exit_bb = single_exit (loop)->dest;
2456 def = vect_def;
2457 prev_phi_info = NULL;
2458 for (j = 0; j < ncopies; j++)
2459 {
2460 phi = create_phi_node (SSA_NAME_VAR (vect_def), exit_bb);
2461 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo));
2462 if (j == 0)
2463 new_phi = phi;
2464 else
2465 {
2466 def = vect_get_vec_def_for_stmt_copy (dt, def);
2467 STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
2468 }
2469 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
2470 prev_phi_info = vinfo_for_stmt (phi);
2471 }
2472 exit_gsi = gsi_after_labels (exit_bb);
2474 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
2475 (i.e. when reduc_code is not available) and in the final adjustment
2476 code (if needed). Also get the original scalar reduction variable as
2477 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
2478 represents a reduction pattern), the tree-code and scalar-def are
2479 taken from the original stmt that the pattern-stmt (STMT) replaces.
2480 Otherwise (it is a regular reduction) - the tree-code and scalar-def
2481 are taken from STMT. */
2483 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
2484 if (!orig_stmt)
2485 {
2486 /* Regular reduction */
2487 orig_stmt = stmt;
2488 }
2489 else
2490 {
2491 /* Reduction pattern */
2492 stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
2493 gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
2494 gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
2495 }
2496 code = gimple_assign_rhs_code (orig_stmt);
2497 scalar_dest = gimple_assign_lhs (orig_stmt);
2498 scalar_type = TREE_TYPE (scalar_dest);
2499 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
2500 bitsize = TYPE_SIZE (scalar_type);
2501 bytesize = TYPE_SIZE_UNIT (scalar_type);
2504 /* In case this is a reduction in an inner-loop while vectorizing an outer
2505 loop - we don't need to extract a single scalar result at the end of the
2506 inner-loop. The final vector of partial results will be used in the
2507 vectorized outer-loop, or reduced to a scalar result at the end of the
2508 outer-loop. */
2509 if (nested_in_vect_loop)
2510 goto vect_finalize_reduction;
2513 gcc_assert (ncopies == 1);
2515 /* 2.3 Create the reduction code, using one of the three schemes described
2518 if (reduc_code < NUM_TREE_CODES)
2519 {
2520 tree tmp;
2522 /*** Case 1: Create:
2523 v_out2 = reduc_expr <v_out1> */
2525 if (vect_print_dump_info (REPORT_DETAILS))
2526 fprintf (vect_dump, "Reduce using direct vector reduction.");
2528 vec_dest = vect_create_destination_var (scalar_dest, vectype);
2529 tmp = build1 (reduc_code, vectype, PHI_RESULT (new_phi));
2530 epilog_stmt = gimple_build_assign (vec_dest, tmp);
2531 new_temp = make_ssa_name (vec_dest, epilog_stmt);
2532 gimple_assign_set_lhs (epilog_stmt, new_temp);
2533 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
2535 extract_scalar_result = true;
2536 }
2537 else
2538 {
2539 enum tree_code shift_code = 0;
2540 bool have_whole_vector_shift = true;
2541 int bit_offset;
2542 int element_bitsize = tree_low_cst (bitsize, 1);
2543 int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
2544 tree vec_temp;
2546 if (optab_handler (vec_shr_optab, mode)->insn_code != CODE_FOR_nothing)
2547 shift_code = VEC_RSHIFT_EXPR;
2548 else
2549 have_whole_vector_shift = false;
2551 /* Regardless of whether we have a whole vector shift, if we're
2552 emulating the operation via tree-vect-generic, we don't want
2553 to use it. Only the first round of the reduction is likely
2554 to still be profitable via emulation. */
2555 /* ??? It might be better to emit a reduction tree code here, so that
2556 tree-vect-generic can expand the first round via bit tricks. */
2557 if (!VECTOR_MODE_P (mode))
2558 have_whole_vector_shift = false;
2559 else
2560 {
2561 optab optab = optab_for_tree_code (code, vectype, optab_default);
2562 if (optab_handler (optab, mode)->insn_code == CODE_FOR_nothing)
2563 have_whole_vector_shift = false;
2564 }
2566 if (have_whole_vector_shift)
2567 {
2568 /*** Case 2: Create:
2569 for (offset = VS/2; offset >= element_size; offset/=2)
2570 {
2571 Create: va' = vec_shift <va, offset>
2572 Create: va = vop <va, va'>
2573 } */
2575 if (vect_print_dump_info (REPORT_DETAILS))
2576 fprintf (vect_dump, "Reduce using vector shifts");
2578 vec_dest = vect_create_destination_var (scalar_dest, vectype);
2579 new_temp = PHI_RESULT (new_phi);
2581 for (bit_offset = vec_size_in_bits/2;
2582 bit_offset >= element_bitsize;
2583 bit_offset /= 2)
2584 {
2585 tree bitpos = size_int (bit_offset);
2586 epilog_stmt = gimple_build_assign_with_ops (shift_code, vec_dest,
2587 new_temp, bitpos);
2588 new_name = make_ssa_name (vec_dest, epilog_stmt);
2589 gimple_assign_set_lhs (epilog_stmt, new_name);
2590 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
2592 epilog_stmt = gimple_build_assign_with_ops (code, vec_dest,
2593 new_name, new_temp);
2594 new_temp = make_ssa_name (vec_dest, epilog_stmt);
2595 gimple_assign_set_lhs (epilog_stmt, new_temp);
2596 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
2597 }
2599 extract_scalar_result = true;
2600 }
2601 else
2602 {
2603 tree rhs;
2605 /*** Case 3: Create:
2606 s = extract_field <v_out2, 0>
2607 for (offset = element_size;
2608 offset < vector_size;
2609 offset += element_size;)
2610 {
2611 Create: s' = extract_field <v_out2, offset>
2612 Create: s = op <s, s'>
2613 } */
2615 if (vect_print_dump_info (REPORT_DETAILS))
2616 fprintf (vect_dump, "Reduce using scalar code. ");
2618 vec_temp = PHI_RESULT (new_phi);
2619 vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
2620 rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
2621 bitsize_zero_node);
2622 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
2623 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
2624 gimple_assign_set_lhs (epilog_stmt, new_temp);
2625 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
2627 for (bit_offset = element_bitsize;
2628 bit_offset < vec_size_in_bits;
2629 bit_offset += element_bitsize)
2630 {
2631 tree bitpos = bitsize_int (bit_offset);
2632 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
2633 bitpos);
2635 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
2636 new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
2637 gimple_assign_set_lhs (epilog_stmt, new_name);
2638 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
2640 epilog_stmt = gimple_build_assign_with_ops (code,
2641 new_scalar_dest,
2642 new_name, new_temp);
2643 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
2644 gimple_assign_set_lhs (epilog_stmt, new_temp);
2645 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
2646 }
2648 extract_scalar_result = false;
2649 }
2650 }
2652 /* 2.4 Extract the final scalar result. Create:
2653 s_out3 = extract_field <v_out2, bitpos> */
2655 if (extract_scalar_result)
2656 {
2657 tree rhs;
2659 gcc_assert (!nested_in_vect_loop);
2660 if (vect_print_dump_info (REPORT_DETAILS))
2661 fprintf (vect_dump, "extract scalar result");
2663 if (BYTES_BIG_ENDIAN)
2664 bitpos = size_binop (MULT_EXPR,
2665 bitsize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1),
2666 TYPE_SIZE (scalar_type));
2667 else
2668 bitpos = bitsize_zero_node;
2670 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp, bitsize, bitpos);
2671 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
2672 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
2673 gimple_assign_set_lhs (epilog_stmt, new_temp);
2674 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
2675 }
2677 vect_finalize_reduction:
2679 /* 2.5 Adjust the final result by the initial value of the reduction
2680 variable. (When such adjustment is not needed, then
2681 'adjustment_def' is zero). For example, if code is PLUS we create:
2682 new_temp = loop_exit_def + adjustment_def */
2684 if (adjustment_def)
2685 {
2686 if (nested_in_vect_loop)
2687 {
2688 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
2689 expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
2690 new_dest = vect_create_destination_var (scalar_dest, vectype);
2691 }
2692 else
2693 {
2694 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
2695 expr = build2 (code, scalar_type, new_temp, adjustment_def);
2696 new_dest = vect_create_destination_var (scalar_dest, scalar_type);
2697 }
2698 epilog_stmt = gimple_build_assign (new_dest, expr);
2699 new_temp = make_ssa_name (new_dest, epilog_stmt);
2700 gimple_assign_set_lhs (epilog_stmt, new_temp);
2701 SSA_NAME_DEF_STMT (new_temp) = epilog_stmt;
2702 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
2703 }
2706 /* 2.6 Handle the loop-exit phi */
2708 /* Replace uses of s_out0 with uses of s_out3:
2709 Find the loop-closed-use at the loop exit of the original scalar result.
2710 (The reduction result is expected to have two immediate uses - one at the
2711 latch block, and one at the loop exit). */
2712 phis = VEC_alloc (gimple, heap, 10);
2713 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
2714 {
2715 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
2716 {
2717 exit_phi = USE_STMT (use_p);
2718 VEC_quick_push (gimple, phis, exit_phi);
2719 }
2720 }
2721 /* We expect to have found an exit_phi because of loop-closed-ssa form. */
2722 gcc_assert (!VEC_empty (gimple, phis));
2724 for (i = 0; VEC_iterate (gimple, phis, i, exit_phi); i++)
2725 {
2726 if (nested_in_vect_loop)
2727 {
2728 stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
2730 /* FORNOW. Currently not supporting the case that an inner-loop
2731 reduction is not used in the outer-loop (but only outside the
2732 outer-loop). */
2733 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
2734 && !STMT_VINFO_LIVE_P (stmt_vinfo));
2736 epilog_stmt = adjustment_def ? epilog_stmt : new_phi;
2737 STMT_VINFO_VEC_STMT (stmt_vinfo) = epilog_stmt;
2738 set_vinfo_for_stmt (epilog_stmt,
2739 new_stmt_vec_info (epilog_stmt, loop_vinfo));
2740 if (adjustment_def)
2741 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
2742 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
2743 continue;
2744 }
2746 /* Replace the uses: */
2747 orig_name = PHI_RESULT (exit_phi);
2748 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
2749 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
2750 SET_USE (use_p, new_temp);
2751 }
2752 VEC_free (gimple, heap, phis);
2753 }
2756 /* Function vectorizable_reduction.
2758 Check if STMT performs a reduction operation that can be vectorized.
2759 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
2760 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
2761 Return FALSE if not a vectorizable STMT, TRUE otherwise.
2763 This function also handles reduction idioms (patterns) that have been
2764 recognized in advance during vect_pattern_recog. In this case, STMT may be
2766 X = pattern_expr (arg0, arg1, ..., X)
2767 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
2768 sequence that had been detected and replaced by the pattern-stmt (STMT).
2770 In some cases of reduction patterns, the type of the reduction variable X is
2771 different than the type of the other arguments of STMT.
2772 In such cases, the vectype that is used when transforming STMT into a vector
2773 stmt is different than the vectype that is used to determine the
2774 vectorization factor, because it consists of a different number of elements
2775 than the actual number of elements that are being operated upon in parallel.
2777 For example, consider an accumulation of shorts into an int accumulator.
2778 On some targets it's possible to vectorize this pattern operating on 8
2779 shorts at a time (hence, the vectype for purposes of determining the
2780 vectorization factor should be V8HI); on the other hand, the vectype that
2781 is used to create the vector form is actually V4SI (the type of the result).
2783 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
2784 indicates what is the actual level of parallelism (V8HI in the example), so
2785 that the right vectorization factor would be derived. This vectype
2786 corresponds to the type of arguments to the reduction stmt, and should *NOT*
2787 be used to create the vectorized stmt. The right vectype for the vectorized
2788 stmt is obtained from the type of the result X:
2789 get_vectype_for_scalar_type (TREE_TYPE (X))
2791 This means that, contrary to "regular" reductions (or "regular" stmts in
2792 general), the following equation:
2793 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
2794 does *NOT* necessarily hold for reduction patterns. */
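/* Concrete instance of the above (assuming a target with a widening-sum
   pattern): for an 'int' accumulator fed by 'short' operands,
   STMT_VINFO_VECTYPE may be V8HI (8 shorts, so VF is derived from 8),
   while the vectorized stmt itself is created with
   get_vectype_for_scalar_type (int), i.e. V4SI.  */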
2796 bool
2797 vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
2798 gimple *vec_stmt)
2799 {
2800 tree vec_dest;
2801 tree scalar_dest;
2802 tree loop_vec_def0 = NULL_TREE, loop_vec_def1 = NULL_TREE;
2803 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2804 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2805 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2806 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2807 enum tree_code code, orig_code, epilog_reduc_code = 0;
2808 enum machine_mode vec_mode;
2810 optab optab, reduc_optab;
2811 tree new_temp = NULL_TREE;
2812 tree def;
2813 gimple def_stmt;
2814 enum vect_def_type dt;
2815 gimple new_phi = NULL;
2816 tree scalar_type;
2817 bool is_simple_use;
2818 gimple orig_stmt;
2819 stmt_vec_info orig_stmt_info;
2820 tree expr = NULL_TREE;
2821 int i;
2822 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
2823 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
2824 int epilog_copies;
2825 stmt_vec_info prev_stmt_info, prev_phi_info;
2826 gimple first_phi = NULL;
2827 bool single_defuse_cycle = false;
2828 tree reduc_def;
2829 gimple new_stmt = NULL;
2830 int op_type;
2831 tree ops[3];
2832 int j;
2833 if (nested_in_vect_loop_p (loop, stmt))
2834 loop = loop->inner;
2836 gcc_assert (ncopies >= 1);
2838 /* FORNOW: SLP not supported. */
2839 if (STMT_SLP_TYPE (stmt_info))
2840 return false;
2842 /* 1. Is vectorizable reduction? */
2844 /* Not supportable if the reduction variable is used in the loop. */
2845 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer)
2846 return false;
2848 /* Reductions that are not used even in an enclosing outer-loop,
2849 are expected to be "live" (used out of the loop). */
2850 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_loop
2851 && !STMT_VINFO_LIVE_P (stmt_info))
2852 return false;
2854 /* Make sure it was already recognized as a reduction computation. */
2855 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def)
2856 return false;
2858 /* 2. Has this been recognized as a reduction pattern?
2860 Check if STMT represents a pattern that has been recognized
2861 in earlier analysis stages. For stmts that represent a pattern,
2862 the STMT_VINFO_RELATED_STMT field records the last stmt in
2863 the original sequence that constitutes the pattern. */
2865 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
2866 if (orig_stmt)
2867 {
2868 orig_stmt_info = vinfo_for_stmt (orig_stmt);
2869 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt);
2870 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
2871 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
2872 }
2874 /* 3. Check the operands of the operation. The first operands are defined
2875 inside the loop body. The last operand is the reduction variable,
2876 which is defined by the loop-header-phi. */
2878 gcc_assert (is_gimple_assign (stmt));
2881 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
2882 {
2883 case GIMPLE_SINGLE_RHS:
2884 op_type = TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt));
2885 if (op_type == ternary_op)
2886 {
2887 tree rhs = gimple_assign_rhs1 (stmt);
2888 ops[0] = TREE_OPERAND (rhs, 0);
2889 ops[1] = TREE_OPERAND (rhs, 1);
2890 ops[2] = TREE_OPERAND (rhs, 2);
2891 code = TREE_CODE (rhs);
2892 }
2893 else
2894 return false;
2895 break;
2897 case GIMPLE_BINARY_RHS:
2898 code = gimple_assign_rhs_code (stmt);
2899 op_type = TREE_CODE_LENGTH (code);
2900 gcc_assert (op_type == binary_op);
2901 ops[0] = gimple_assign_rhs1 (stmt);
2902 ops[1] = gimple_assign_rhs2 (stmt);
2903 break;
2905 case GIMPLE_UNARY_RHS:
2906 return false;
2908 default:
2909 gcc_unreachable ();
2910 }
2912 scalar_dest = gimple_assign_lhs (stmt);
2913 scalar_type = TREE_TYPE (scalar_dest);
2914 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
2915 && !SCALAR_FLOAT_TYPE_P (scalar_type))
2916 return false;
2918 /* All uses but the last are expected to be defined in the loop.
2919 The last use is the reduction variable. */
2920 for (i = 0; i < op_type-1; i++)
2921 {
2922 is_simple_use = vect_is_simple_use (ops[i], loop_vinfo, &def_stmt,
2923 &def, &dt);
2924 gcc_assert (is_simple_use);
2925 if (dt != vect_loop_def
2926 && dt != vect_invariant_def
2927 && dt != vect_constant_def
2928 && dt != vect_induction_def)
2929 return false;
2930 }
2932 is_simple_use = vect_is_simple_use (ops[i], loop_vinfo, &def_stmt, &def, &dt);
2933 gcc_assert (is_simple_use);
2934 gcc_assert (dt == vect_reduction_def);
2935 gcc_assert (gimple_code (def_stmt) == GIMPLE_PHI);
2936 if (orig_stmt)
2937 gcc_assert (orig_stmt == vect_is_simple_reduction (loop_vinfo, def_stmt));
2938 else
2939 gcc_assert (stmt == vect_is_simple_reduction (loop_vinfo, def_stmt));
2941 if (STMT_VINFO_LIVE_P (vinfo_for_stmt (def_stmt)))
2942 return false;
2944 /* 4. Supportable by target? */
2946 /* 4.1. check support for the operation in the loop */
2947 optab = optab_for_tree_code (code, vectype, optab_default);
2948 if (!optab)
2949 {
2950 if (vect_print_dump_info (REPORT_DETAILS))
2951 fprintf (vect_dump, "no optab.");
2952 return false;
2953 }
2954 vec_mode = TYPE_MODE (vectype);
2955 if (optab_handler (optab, vec_mode)->insn_code == CODE_FOR_nothing)
2956 {
2957 if (vect_print_dump_info (REPORT_DETAILS))
2958 fprintf (vect_dump, "op not supported by target.");
2959 if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
2960 || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
2961 < vect_min_worthwhile_factor (code))
2962 return false;
2963 if (vect_print_dump_info (REPORT_DETAILS))
2964 fprintf (vect_dump, "proceeding using word mode.");
2965 }
2967 /* Worthwhile without SIMD support? */
2968 if (!VECTOR_MODE_P (TYPE_MODE (vectype))
2969 && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
2970 < vect_min_worthwhile_factor (code))
2971 {
2972 if (vect_print_dump_info (REPORT_DETAILS))
2973 fprintf (vect_dump, "not worthwhile without SIMD support.");
2974 return false;
2975 }
2977 /* 4.2. Check support for the epilog operation.
2979 If STMT represents a reduction pattern, then the type of the
2980 reduction variable may be different than the type of the rest
2981 of the arguments. For example, consider the case of accumulation
2982 of shorts into an int accumulator; The original code:
2983 S1: int_a = (int) short_a;
2984 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
2986 was replaced by:
2987 STMT: int_acc = widen_sum <short_a, int_acc>
2989 This means that:
2990 1. The tree-code that is used to create the vector operation in the
2991 epilog code (that reduces the partial results) is not the
2992 tree-code of STMT, but is rather the tree-code of the original
2993 stmt from the pattern that STMT is replacing. I.e, in the example
2994 above we want to use 'widen_sum' in the loop, but 'plus' in the
2995 epilog.
2996 2. The type (mode) we use to check available target support
2997 for the vector operation to be created in the *epilog*, is
2998 determined by the type of the reduction variable (in the example
2999 above we'd check this: plus_optab[vect_int_mode]).
3000 However the type (mode) we use to check available target support
3001 for the vector operation to be created *inside the loop*, is
3002 determined by the type of the other arguments to STMT (in the
3003 example we'd check this: widen_sum_optab[vect_short_mode]).
3005 This is contrary to "regular" reductions, in which the types of all
3006 the arguments are the same as the type of the reduction variable.
3007 For "regular" reductions we can therefore use the same vector type
3008 (and also the same tree-code) when generating the epilog code and
3009 when generating the code inside the loop. */
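/* Continuing the example above (a sketch): the loop stmt is checked
   against the widening-sum optab in the mode of the V8HI vectype, while
   the epilog reduction is checked in the mode of V4SI - the vectype of
   the 'int' reduction variable.  */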
3011 if (orig_stmt)
3012 {
3013 /* This is a reduction pattern: get the vectype from the type of the
3014 reduction variable, and get the tree-code from orig_stmt. */
3015 orig_code = gimple_assign_rhs_code (orig_stmt);
3016 vectype = get_vectype_for_scalar_type (TREE_TYPE (def));
3017 if (!vectype)
3018 {
3019 if (vect_print_dump_info (REPORT_DETAILS))
3020 {
3021 fprintf (vect_dump, "unsupported data-type ");
3022 print_generic_expr (vect_dump, TREE_TYPE (def), TDF_SLIM);
3023 }
3024 return false;
3025 }
3027 vec_mode = TYPE_MODE (vectype);
3028 }
3029 else
3030 {
3031 /* Regular reduction: the vectype and tree-code used for the vector
3032 code inside the loop can also be used for the epilog code. */
3033 orig_code = code;
3034 }
3036 if (!reduction_code_for_scalar_code (orig_code, &epilog_reduc_code))
3037 epilog_reduc_code = NUM_TREE_CODES;
3038 reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype, optab_default);
3039 if (!reduc_optab)
3040 {
3041 if (vect_print_dump_info (REPORT_DETAILS))
3042 fprintf (vect_dump, "no optab for reduction.");
3043 epilog_reduc_code = NUM_TREE_CODES;
3044 }
3045 if (optab_handler (reduc_optab, vec_mode)->insn_code == CODE_FOR_nothing)
3046 {
3047 if (vect_print_dump_info (REPORT_DETAILS))
3048 fprintf (vect_dump, "reduc op not supported by target.");
3049 epilog_reduc_code = NUM_TREE_CODES;
3050 }
3052 if (!vec_stmt) /* transformation not required. */
3053 {
3054 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
3055 if (!vect_model_reduction_cost (stmt_info, epilog_reduc_code, ncopies))
3056 return false;
3057 return true;
3058 }
3060 /** Transform. **/
3062 if (vect_print_dump_info (REPORT_DETAILS))
3063 fprintf (vect_dump, "transform reduction.");
3065 /* Create the destination vector */
3066 vec_dest = vect_create_destination_var (scalar_dest, vectype);
3068 /* In case the vectorization factor (VF) is bigger than the number
3069 of elements that we can fit in a vectype (nunits), we have to generate
3070 more than one vector stmt - i.e. - we need to "unroll" the
3071 vector stmt by a factor VF/nunits. For more details see documentation
3072 in vectorizable_operation. */
3074 /* If the reduction is used in an outer loop we need to generate
3075 VF intermediate results, like so (e.g. for ncopies=2):
3076 r0 = phi (init, r0)
3077 r1 = phi (init, r1)
3078 r0 = x0 + r0;
3079 r1 = x1 + r1;
3080 (i.e. we generate VF results in 2 registers).
3081 In this case we have a separate def-use cycle for each copy, and therefore
3082 for each copy we get the vector def for the reduction variable from the
3083 respective phi node created for this copy.
3085 Otherwise (the reduction is unused in the loop nest), we can combine
3086 together intermediate results, like so (e.g. for ncopies=2):
3087 r = phi (init, r)
3088 r = x0 + r;
3089 r = x1 + r;
3090 (i.e. we generate VF/2 results in a single register).
3091 In this case for each copy we get the vector def for the reduction variable
3092 from the vectorized reduction operation generated in the previous iteration.
3093 */
3095 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_loop)
3096 {
3097 single_defuse_cycle = true;
3098 epilog_copies = 1;
3099 }
3100 else
3101 epilog_copies = ncopies;
3103 prev_stmt_info = NULL;
3104 prev_phi_info = NULL;
3105 for (j = 0; j < ncopies; j++)
3106 {
3107 if (j == 0 || !single_defuse_cycle)
3108 {
3109 /* Create the reduction-phi that defines the reduction-operand. */
3110 new_phi = create_phi_node (vec_dest, loop->header);
3111 set_vinfo_for_stmt (new_phi, new_stmt_vec_info (new_phi, loop_vinfo));
3112 }
3114 /* Handle uses. */
3115 if (j == 0)
3116 {
3117 loop_vec_def0 = vect_get_vec_def_for_operand (ops[0], stmt, NULL);
3118 if (op_type == ternary_op)
3119 {
3120 loop_vec_def1 = vect_get_vec_def_for_operand (ops[1], stmt, NULL);
3121 }
3123 /* Get the vector def for the reduction variable from the phi node */
3124 reduc_def = PHI_RESULT (new_phi);
3125 first_phi = new_phi;
3126 }
3127 else
3128 {
3129 enum vect_def_type dt = vect_unknown_def_type; /* Dummy */
3130 loop_vec_def0 = vect_get_vec_def_for_stmt_copy (dt, loop_vec_def0);
3131 if (op_type == ternary_op)
3132 loop_vec_def1 = vect_get_vec_def_for_stmt_copy (dt, loop_vec_def1);
3134 if (single_defuse_cycle)
3135 reduc_def = gimple_assign_lhs (new_stmt);
3137 reduc_def = PHI_RESULT (new_phi);
3139 STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
3140 }
3142 /* Arguments are ready. create the new vector stmt. */
3143 if (op_type == binary_op)
3144 expr = build2 (code, vectype, loop_vec_def0, reduc_def);
3145 else
3146 expr = build3 (code, vectype, loop_vec_def0, loop_vec_def1,
3147 reduc_def);
3148 new_stmt = gimple_build_assign (vec_dest, expr);
3149 new_temp = make_ssa_name (vec_dest, new_stmt);
3150 gimple_assign_set_lhs (new_stmt, new_temp);
3151 vect_finish_stmt_generation (stmt, new_stmt, gsi);
3153 if (j == 0)
3154 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
3155 else
3156 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3157 prev_stmt_info = vinfo_for_stmt (new_stmt);
3158 prev_phi_info = vinfo_for_stmt (new_phi);
3159 }
3161 /* Finalize the reduction-phi (set its arguments) and create the
3162 epilog reduction code. */
3163 if (!single_defuse_cycle)
3164 new_temp = gimple_assign_lhs (*vec_stmt);
3165 vect_create_epilog_for_reduction (new_temp, stmt, epilog_copies,
3166 epilog_reduc_code, first_phi);
3167 return true;
3168 }
3170 /* Checks if CALL can be vectorized in type VECTYPE. Returns
3171 a function declaration if the target has a vectorized version
3172 of the function, or NULL_TREE if the function cannot be vectorized. */
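/* For example (hypothetical, target-dependent): a call to a sqrt builtin
   may map to a vector-sqrt builtin on targets whose
   builtin_vectorized_function hook provides one; otherwise NULL_TREE is
   returned and the call is not vectorized.  */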
3174 tree
3175 vectorizable_function (gimple call, tree vectype_out, tree vectype_in)
3176 {
3177 tree fndecl = gimple_call_fndecl (call);
3178 enum built_in_function code;
3180 /* We only handle functions that do not read or clobber memory -- i.e.
3181 const or novops ones. */
3182 if (!(gimple_call_flags (call) & (ECF_CONST | ECF_NOVOPS)))
3183 return NULL_TREE;
3185 if (fndecl == NULL_TREE
3186 || TREE_CODE (fndecl) != FUNCTION_DECL
3187 || !DECL_BUILT_IN (fndecl))
3188 return NULL_TREE;
3190 code = DECL_FUNCTION_CODE (fndecl);
3191 return targetm.vectorize.builtin_vectorized_function (code, vectype_out,
3192 vectype_in);
3193 }
3195 /* Function vectorizable_call.
3197 Check if STMT performs a function call that can be vectorized.
3198 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
3199 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
3200 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
3202 bool
3203 vectorizable_call (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt)
3204 {
3205 tree vec_dest;
3206 tree scalar_dest;
3207 tree op, type;
3208 tree vec_oprnd0 = NULL_TREE, vec_oprnd1 = NULL_TREE;
3209 stmt_vec_info stmt_info = vinfo_for_stmt (stmt), prev_stmt_info;
3210 tree vectype_out, vectype_in;
3211 int nunits_in;
3212 int nunits_out;
3213 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3214 tree fndecl, new_temp, def, rhs_type, lhs_type;
3215 gimple def_stmt;
3216 enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
3217 gimple new_stmt;
3218 int ncopies, j;
3219 VEC(tree, heap) *vargs = NULL;
3220 enum { NARROW, NONE, WIDEN } modifier;
3221 size_t i, nargs;
3223 if (!STMT_VINFO_RELEVANT_P (stmt_info))
3224 return false;
3226 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
3227 return false;
3229 /* FORNOW: SLP not supported. */
3230 if (STMT_SLP_TYPE (stmt_info))
3231 return false;
3233 /* Is STMT a vectorizable call? */
3234 if (!is_gimple_call (stmt))
3235 return false;
3237 if (TREE_CODE (gimple_call_lhs (stmt)) != SSA_NAME)
3238 return false;
3240 /* Process function arguments. */
3241 rhs_type = NULL_TREE;
3242 nargs = gimple_call_num_args (stmt);
3244 /* Bail out if the function has more than two arguments, we
3245 do not have interesting builtin functions to vectorize with
3246 more than two arguments. No arguments is also not good. */
3247 if (nargs == 0 || nargs > 2)
3248 return false;
3250 for (i = 0; i < nargs; i++)
3251 {
3252 op = gimple_call_arg (stmt, i);
3254 /* We can only handle calls with arguments of the same type. */
3255 if (rhs_type
3256 && rhs_type != TREE_TYPE (op))
3257 {
3258 if (vect_print_dump_info (REPORT_DETAILS))
3259 fprintf (vect_dump, "argument types differ.");
3260 return false;
3261 }
3262 rhs_type = TREE_TYPE (op);
3264 if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt[i]))
3265 {
3266 if (vect_print_dump_info (REPORT_DETAILS))
3267 fprintf (vect_dump, "use not simple.");
3268 return false;
3269 }
3270 }
3272 vectype_in = get_vectype_for_scalar_type (rhs_type);
3273 if (!vectype_in)
3274 return false;
3275 nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
3277 lhs_type = TREE_TYPE (gimple_call_lhs (stmt));
3278 vectype_out = get_vectype_for_scalar_type (lhs_type);
3279 if (!vectype_out)
3280 return false;
3281 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
3284 if (nunits_in == nunits_out / 2)
3285 modifier = NARROW;
3286 else if (nunits_out == nunits_in)
3287 modifier = NONE;
3288 else if (nunits_out == nunits_in / 2)
3289 modifier = WIDEN;
3290 else
3291 return false;
3293 /* For now, we only vectorize functions if a target specific builtin
3294 is available. TODO -- in some cases, it might be profitable to
3295 insert the calls for pieces of the vector, in order to be able
3296 to vectorize other operations in the loop. */
3297 fndecl = vectorizable_function (stmt, vectype_out, vectype_in);
3298 if (fndecl == NULL_TREE)
3299 {
3300 if (vect_print_dump_info (REPORT_DETAILS))
3301 fprintf (vect_dump, "function is not vectorizable.");
3302 return false;
3303 }
3306 gcc_assert (ZERO_SSA_OPERANDS (stmt, SSA_OP_ALL_VIRTUALS));
3308 if (modifier == NARROW)
3309 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_out;
3311 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in;
3313 /* Sanity check: make sure that at least one copy of the vectorized stmt
3314 needs to be generated. */
3315 gcc_assert (ncopies >= 1);
3317 if (!vec_stmt) /* transformation not required. */
3318 {
3319 STMT_VINFO_TYPE (stmt_info) = call_vec_info_type;
3320 if (vect_print_dump_info (REPORT_DETAILS))
3321 fprintf (vect_dump, "=== vectorizable_call ===");
3322 vect_model_simple_cost (stmt_info, ncopies, dt, NULL);
3323 return true;
3324 }
3326 /** Transform. **/
3328 if (vect_print_dump_info (REPORT_DETAILS))
3329 fprintf (vect_dump, "transform operation.");
3332 scalar_dest = gimple_call_lhs (stmt);
3333 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
3335 prev_stmt_info = NULL;
3336 switch (modifier)
3337 {
3338 case NONE:
3339 for (j = 0; j < ncopies; ++j)
3340 {
3341 /* Build argument list for the vectorized call. */
3342 if (j == 0)
3343 vargs = VEC_alloc (tree, heap, nargs);
3344 else
3345 VEC_truncate (tree, vargs, 0);
3347 for (i = 0; i < nargs; i++)
3348 {
3349 op = gimple_call_arg (stmt, i);
3350 if (j == 0)
3351 vec_oprnd0
3352 = vect_get_vec_def_for_operand (op, stmt, NULL);
3353 else
3354 vec_oprnd0
3355 = vect_get_vec_def_for_stmt_copy (dt[nargs], vec_oprnd0);
3357 VEC_quick_push (tree, vargs, vec_oprnd0);
3358 }
3360 new_stmt = gimple_build_call_vec (fndecl, vargs);
3361 new_temp = make_ssa_name (vec_dest, new_stmt);
3362 gimple_call_set_lhs (new_stmt, new_temp);
3364 vect_finish_stmt_generation (stmt, new_stmt, gsi);
3366 if (j == 0)
3367 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
3368 else
3369 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3371 prev_stmt_info = vinfo_for_stmt (new_stmt);
3372 }
3374 break;
3376 case NARROW:
3377 for (j = 0; j < ncopies; ++j)
3378 {
3379 /* Build argument list for the vectorized call. */
3380 if (j == 0)
3381 vargs = VEC_alloc (tree, heap, nargs * 2);
3382 else
3383 VEC_truncate (tree, vargs, 0);
3385 for (i = 0; i < nargs; i++)
3386 {
3387 op = gimple_call_arg (stmt, i);
3388 if (j == 0)
3389 {
3390 vec_oprnd0
3391 = vect_get_vec_def_for_operand (op, stmt, NULL);
3392 vec_oprnd1
3393 = vect_get_vec_def_for_stmt_copy (dt[nargs], vec_oprnd0);
3394 }
3395 else
3396 {
3397 vec_oprnd0
3398 = vect_get_vec_def_for_stmt_copy (dt[nargs], vec_oprnd1);
3399 vec_oprnd1
3400 = vect_get_vec_def_for_stmt_copy (dt[nargs], vec_oprnd0);
3401 }
3403 VEC_quick_push (tree, vargs, vec_oprnd0);
3404 VEC_quick_push (tree, vargs, vec_oprnd1);
3405 }
3407 new_stmt = gimple_build_call_vec (fndecl, vargs);
3408 new_temp = make_ssa_name (vec_dest, new_stmt);
3409 gimple_call_set_lhs (new_stmt, new_temp);
3411 vect_finish_stmt_generation (stmt, new_stmt, gsi);
3413 if (j == 0)
3414 STMT_VINFO_VEC_STMT (stmt_info) = new_stmt;
3415 else
3416 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3418 prev_stmt_info = vinfo_for_stmt (new_stmt);
3419 }
3421 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
3423 break;
3425 case WIDEN:
3426 /* No current target implements this case. */
3427 return false;
3428 }
3430 VEC_free (tree, heap, vargs);
3432 /* The call in STMT might prevent it from being removed in dce.
3433 We however cannot remove it here, due to the way the ssa name
3434 it defines is mapped to the new definition. So just replace
3435 rhs of the statement with something harmless. */
3437 type = TREE_TYPE (scalar_dest);
3438 new_stmt = gimple_build_assign (gimple_call_lhs (stmt),
3439 fold_convert (type, integer_zero_node));
3440 set_vinfo_for_stmt (new_stmt, stmt_info);
3441 set_vinfo_for_stmt (stmt, NULL);
3442 STMT_VINFO_STMT (stmt_info) = new_stmt;
3443 gsi_replace (gsi, new_stmt, false);
3444 SSA_NAME_DEF_STMT (gimple_assign_lhs (new_stmt)) = new_stmt;
3446 return true;
3447 }
3450 /* Function vect_gen_widened_results_half
3452 Create a vector stmt whose code, type, number of arguments, and result
3453 variable are CODE, OP_TYPE, and VEC_DEST, and its arguments are
3454 VEC_OPRND0 and VEC_OPRND1. The new vector stmt is to be inserted at BSI.
3455 In the case that CODE is a CALL_EXPR, this means that a call to DECL
3456 needs to be created (DECL is a function-decl of a target-builtin).
3457 STMT is the original scalar stmt that we are vectorizing. */
3459 static gimple
3460 vect_gen_widened_results_half (enum tree_code code,
3461 tree decl,
3462 tree vec_oprnd0, tree vec_oprnd1, int op_type,
3463 tree vec_dest, gimple_stmt_iterator *gsi,
3464 gimple stmt)
3465 {
3466 gimple new_stmt;
3467 tree new_temp;
3468 tree sym;
3469 ssa_op_iter iter;
3471 /* Generate half of the widened result: */
3472 if (code == CALL_EXPR)
3473 {
3474 /* Target specific support */
3475 if (op_type == binary_op)
3476 new_stmt = gimple_build_call (decl, 2, vec_oprnd0, vec_oprnd1);
3478 new_stmt = gimple_build_call (decl, 1, vec_oprnd0);
3479 new_temp = make_ssa_name (vec_dest, new_stmt);
3480 gimple_call_set_lhs (new_stmt, new_temp);
3481 }
3482 else
3483 {
3484 /* Generic support */
3485 gcc_assert (op_type == TREE_CODE_LENGTH (code));
3486 if (op_type != binary_op)
3487 vec_oprnd1 = NULL;
3488 new_stmt = gimple_build_assign_with_ops (code, vec_dest, vec_oprnd0,
3489 vec_oprnd1);
3490 new_temp = make_ssa_name (vec_dest, new_stmt);
3491 gimple_assign_set_lhs (new_stmt, new_temp);
3492 }
3493 vect_finish_stmt_generation (stmt, new_stmt, gsi);
3495 if (code == CALL_EXPR)
3496 {
3497 FOR_EACH_SSA_TREE_OPERAND (sym, new_stmt, iter, SSA_OP_ALL_VIRTUALS)
3498 {
3499 if (TREE_CODE (sym) == SSA_NAME)
3500 sym = SSA_NAME_VAR (sym);
3501 mark_sym_for_renaming (sym);
3502 }
3503 }
3505 return new_stmt;
3506 }
3509 /* Check if STMT performs a conversion operation, that can be vectorized.
3510 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
3511 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
3512 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
3514 bool
3515 vectorizable_conversion (gimple stmt, gimple_stmt_iterator *gsi,
3516 gimple *vec_stmt, slp_tree slp_node)
3517 {
3518 tree vec_dest;
3519 tree scalar_dest;
3520 tree op0;
3521 tree vec_oprnd0 = NULL_TREE, vec_oprnd1 = NULL_TREE;
3522 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3523 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3524 enum tree_code code, code1 = ERROR_MARK, code2 = ERROR_MARK;
3525 tree decl1 = NULL_TREE, decl2 = NULL_TREE;
3526 tree new_temp;
3527 tree def;
3528 gimple def_stmt;
3529 enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
3530 gimple new_stmt = NULL;
3531 stmt_vec_info prev_stmt_info;
3532 int nunits_in;
3533 int nunits_out;
3534 tree vectype_out, vectype_in;
3535 int ncopies, j;
3536 tree expr;
3537 tree rhs_type, lhs_type;
3538 tree builtin_decl;
3539 enum { NARROW, NONE, WIDEN } modifier;
3540 int i;
3541 VEC(tree,heap) *vec_oprnds0 = NULL;
3542 tree vop0;
3543 tree integral_type;
3544 VEC(tree,heap) *dummy = NULL;
3545 int dummy_int;
3547 /* Is STMT a vectorizable conversion? */
3549 if (!STMT_VINFO_RELEVANT_P (stmt_info))
3550 return false;
3552 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
3553 return false;
3555 if (!is_gimple_assign (stmt))
3556 return false;
3558 if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
3559 return false;
3561 code = gimple_assign_rhs_code (stmt);
3562 if (code != FIX_TRUNC_EXPR && code != FLOAT_EXPR)
3563 return false;
3565 /* Check types of lhs and rhs. */
3566 op0 = gimple_assign_rhs1 (stmt);
3567 rhs_type = TREE_TYPE (op0);
3568 vectype_in = get_vectype_for_scalar_type (rhs_type);
3569 if (!vectype_in)
3570 return false;
3571 nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
3573 scalar_dest = gimple_assign_lhs (stmt);
3574 lhs_type = TREE_TYPE (scalar_dest);
3575 vectype_out = get_vectype_for_scalar_type (lhs_type);
3576 if (!vectype_out)
3577 return false;
3578 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
3581 if (nunits_in == nunits_out / 2)
3582 modifier = NARROW;
3583 else if (nunits_out == nunits_in)
3584 modifier = NONE;
3585 else if (nunits_out == nunits_in / 2)
3586 modifier = WIDEN;
3587 else
3588 return false;
3590 if (modifier == NONE)
3591 gcc_assert (STMT_VINFO_VECTYPE (stmt_info) == vectype_out);
3593 /* Bail out if the types are both integral or non-integral. */
3594 if ((INTEGRAL_TYPE_P (rhs_type) && INTEGRAL_TYPE_P (lhs_type))
3595 || (!INTEGRAL_TYPE_P (rhs_type) && !INTEGRAL_TYPE_P (lhs_type)))
3596 return false;
3598 integral_type = INTEGRAL_TYPE_P (rhs_type) ? vectype_in : vectype_out;
3600 if (modifier == NARROW)
3601 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_out;
3603 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in;
3605 /* FORNOW: SLP with multiple types is not supported. The SLP analysis verifies
3606 this, so we can safely override NCOPIES with 1 here. */
3607 if (slp_node)
3608 ncopies = 1;
3610 /* Sanity check: make sure that at least one copy of the vectorized stmt
3611 needs to be generated. */
3612 gcc_assert (ncopies >= 1);
3614 /* Check the operands of the operation. */
3615 if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt[0]))
3616 {
3617 if (vect_print_dump_info (REPORT_DETAILS))
3618 fprintf (vect_dump, "use not simple.");
3619 return false;
3620 }
3622 /* Supportable by target? */
3623 if ((modifier == NONE
3624 && !targetm.vectorize.builtin_conversion (code, integral_type))
3625 || (modifier == WIDEN
3626 && !supportable_widening_operation (code, stmt, vectype_in,
3627 &decl1, &decl2,
3628 &code1, &code2,
3629 &dummy_int, &dummy))
3630 || (modifier == NARROW
3631 && !supportable_narrowing_operation (code, stmt, vectype_in,
3632 &code1, &dummy_int, &dummy)))
3633 {
3634 if (vect_print_dump_info (REPORT_DETAILS))
3635 fprintf (vect_dump, "conversion not supported by target.");
3636 return false;
3637 }
3639 if (modifier != NONE)
3640 {
3641 STMT_VINFO_VECTYPE (stmt_info) = vectype_in;
3642 /* FORNOW: SLP not supported. */
3643 if (STMT_SLP_TYPE (stmt_info))
3644 return false;
3645 }
3647 if (!vec_stmt) /* transformation not required. */
3648 {
3649 STMT_VINFO_TYPE (stmt_info) = type_conversion_vec_info_type;
3650 return true;
3651 }
3653 /** Transform. **/
3654 if (vect_print_dump_info (REPORT_DETAILS))
3655 fprintf (vect_dump, "transform conversion.");
3658 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
3660 if (modifier == NONE && !slp_node)
3661 vec_oprnds0 = VEC_alloc (tree, heap, 1);
3663 prev_stmt_info = NULL;
3664 switch (modifier)
3665 {
3666 case NONE:
3667 for (j = 0; j < ncopies; j++)
3668 {
3670 /* Handle uses. */
3672 if (j == 0)
3673 vect_get_vec_defs (op0, NULL, stmt, &vec_oprnds0, NULL, slp_node);
3674 else
3675 vect_get_vec_defs_for_stmt_copy (dt, &vec_oprnds0, NULL);
3677 builtin_decl =
3678 targetm.vectorize.builtin_conversion (code, integral_type);
3679 for (i = 0; VEC_iterate (tree, vec_oprnds0, i, vop0); i++)
3680 {
3681 /* Arguments are ready. create the new vector stmt. */
3682 new_stmt = gimple_build_call (builtin_decl, 1, vop0);
3683 new_temp = make_ssa_name (vec_dest, new_stmt);
3684 gimple_call_set_lhs (new_stmt, new_temp);
3685 vect_finish_stmt_generation (stmt, new_stmt, gsi);
3686 FOR_EACH_SSA_TREE_OPERAND (sym, new_stmt, iter,
3687 SSA_OP_ALL_VIRTUALS)
3688 {
3689 if (TREE_CODE (sym) == SSA_NAME)
3690 sym = SSA_NAME_VAR (sym);
3691 mark_sym_for_renaming (sym);
3692 }
3693 if (slp_node)
3694 VEC_quick_push (gimple, SLP_TREE_VEC_STMTS (slp_node), new_stmt);
3695 }
3697 if (j == 0)
3698 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
3699 else
3700 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3701 prev_stmt_info = vinfo_for_stmt (new_stmt);
3702 }
3703 break;
3705 case WIDEN:
3706 /* In case the vectorization factor (VF) is bigger than the number
3707 of elements that we can fit in a vectype (nunits), we have to
3708 generate more than one vector stmt - i.e. - we need to "unroll"
3709 the vector stmt by a factor VF/nunits. */
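/* For example (illustration only): widening an int->double conversion
   (FLOAT_EXPR) with vectype_in V4SI and vectype_out V2DF has
   nunits_out == nunits_in / 2, so each input vector produces two output
   vectors - the two "halves" generated below.  */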
3710 for (j = 0; j < ncopies; j++)
3711 {
3712 if (j == 0)
3713 vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
3714 else
3715 vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd0);
3717 STMT_VINFO_VECTYPE (stmt_info) = vectype_in;
3719 /* Generate first half of the widened result: */
3720 new_stmt
3721 = vect_gen_widened_results_half (code1, decl1,
3722 vec_oprnd0, vec_oprnd1,
3723 unary_op, vec_dest, gsi, stmt);
3724 if (j == 0)
3725 STMT_VINFO_VEC_STMT (stmt_info) = new_stmt;
3726 else
3727 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3728 prev_stmt_info = vinfo_for_stmt (new_stmt);
3730 /* Generate second half of the widened result: */
3731 new_stmt
3732 = vect_gen_widened_results_half (code2, decl2,
3733 vec_oprnd0, vec_oprnd1,
3734 unary_op, vec_dest, gsi, stmt);
3735 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3736 prev_stmt_info = vinfo_for_stmt (new_stmt);
3737 }
3738 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
3739 break;
3740 case NARROW:
3741 /* In case the vectorization factor (VF) is bigger than the number
3742 of elements that we can fit in a vectype (nunits), we have to
3743 generate more than one vector stmt - i.e. - we need to "unroll"
3744 the vector stmt by a factor VF/nunits. */
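/* For example (illustration only): narrowing a double->int conversion
   (FIX_TRUNC_EXPR) with vectype_in V2DF and vectype_out V4SI has
   nunits_in == nunits_out / 2, so each vectorized stmt consumes two
   input vector defs (vec_oprnd0 and vec_oprnd1 below).  */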
3745 for (j = 0; j < ncopies; j++)
3746 {
3747 /* Handle uses. */
3748 if (j == 0)
3749 {
3750 vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
3751 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd0);
3752 }
3753 else
3754 {
3755 vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd1);
3756 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd0);
3757 }
3759 /* Arguments are ready. Create the new vector stmt. */
3760 expr = build2 (code1, vectype_out, vec_oprnd0, vec_oprnd1);
3761 new_stmt = gimple_build_assign_with_ops (code1, vec_dest, vec_oprnd0,
3762 vec_oprnd1);
3763 new_temp = make_ssa_name (vec_dest, new_stmt);
3764 gimple_assign_set_lhs (new_stmt, new_temp);
3765 vect_finish_stmt_generation (stmt, new_stmt, gsi);
3767 if (j == 0)
3768 STMT_VINFO_VEC_STMT (stmt_info) = new_stmt;
3769 else
3770 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3772 prev_stmt_info = vinfo_for_stmt (new_stmt);
3773 }
3775 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
3776 break;
3777 }
3778 if (vec_oprnds0)
3779 VEC_free (tree, heap, vec_oprnds0);
3781 return true;
3782 }
3785 /* Function vectorizable_assignment.
3787 Check if STMT performs an assignment (copy) that can be vectorized.
3788 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
3789 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
3790 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
3793 vectorizable_assignment (gimple stmt, gimple_stmt_iterator *gsi,
3794 gimple *vec_stmt, slp_tree slp_node)
3799 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3800 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3801 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3805 enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
3806 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
3809 VEC(tree,heap) *vec_oprnds = NULL;
3812 /* Multiple types in SLP are handled by creating the appropriate number of
3813 vectorized stmts for each SLP node. Hence, NCOPIES is always 1 in case of SLP. */
3818 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
3820 gcc_assert (ncopies >= 1);
3822 return false; /* FORNOW */
3824 if (!STMT_VINFO_RELEVANT_P (stmt_info))
3827 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
3830 /* Is vectorizable assignment? */
3831 if (!is_gimple_assign (stmt))
3834 scalar_dest = gimple_assign_lhs (stmt);
3835 if (TREE_CODE (scalar_dest) != SSA_NAME)
3838 if (gimple_assign_single_p (stmt)
3839 || gimple_assign_rhs_code (stmt) == PAREN_EXPR)
3840 op = gimple_assign_rhs1 (stmt);
3844 if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt[0]))
3846 if (vect_print_dump_info (REPORT_DETAILS))
3847 fprintf (vect_dump, "use not simple.");
3851 if (!vec_stmt) /* transformation not required. */
3853 STMT_VINFO_TYPE (stmt_info) = assignment_vec_info_type;
3854 if (vect_print_dump_info (REPORT_DETAILS))
3855 fprintf (vect_dump, "=== vectorizable_assignment ===");
3856 vect_model_simple_cost (stmt_info, ncopies, dt, NULL);
3861 if (vect_print_dump_info (REPORT_DETAILS))
3862 fprintf (vect_dump, "transform assignment.");
3865 vec_dest = vect_create_destination_var (scalar_dest, vectype);
3868 vect_get_vec_defs (op, NULL, stmt, &vec_oprnds, NULL, slp_node);
3870 /* Arguments are ready. Create the new vector stmt. */
3871 for (i = 0; VEC_iterate (tree, vec_oprnds, i, vop); i++)
3873 *vec_stmt = gimple_build_assign (vec_dest, vop);
3874 new_temp = make_ssa_name (vec_dest, *vec_stmt);
3875 gimple_assign_set_lhs (*vec_stmt, new_temp);
3876 vect_finish_stmt_generation (stmt, *vec_stmt, gsi);
3877 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt;
3880 VEC_quick_push (gimple, SLP_TREE_VEC_STMTS (slp_node), *vec_stmt);
3883 VEC_free (tree, heap, vec_oprnds);
3888 /* Function vect_min_worthwhile_factor.
3890 For a loop where we could vectorize the operation indicated by CODE,
3891 return the minimum vectorization factor that makes it worthwhile
3892 to use generic vectors. */
3894 vect_min_worthwhile_factor (enum tree_code code)
3915 /* Function vectorizable_induction
3917 Check if PHI performs an induction computation that can be vectorized.
3918 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
3919 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
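   For example (an illustrative sketch): for an induction variable with
   step 1 and VF = 4, the vectorized phi is initialized with the vector
   {init, init+1, init+2, init+3}, and each vector iteration adds the
   step vector {4, 4, 4, 4}.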
3920 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
3923 vectorizable_induction (gimple phi, gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED, gimple *vec_stmt)
3926 stmt_vec_info stmt_info = vinfo_for_stmt (phi);
3927 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3928 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3929 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3930 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
3931 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
3934 gcc_assert (ncopies >= 1);
3935 /* FORNOW. This restriction should be relaxed. */
3936 if (nested_in_vect_loop_p (loop, phi) && ncopies > 1)
3938 if (vect_print_dump_info (REPORT_DETAILS))
3939 fprintf (vect_dump, "multiple types in nested loop.");
3943 if (!STMT_VINFO_RELEVANT_P (stmt_info))
3946 /* FORNOW: SLP not supported. */
3947 if (STMT_SLP_TYPE (stmt_info))
3950 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def);
3952 if (gimple_code (phi) != GIMPLE_PHI)
3955 if (!vec_stmt) /* transformation not required. */
3957 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
3958 if (vect_print_dump_info (REPORT_DETAILS))
3959 fprintf (vect_dump, "=== vectorizable_induction ===");
3960 vect_model_induction_cost (stmt_info, ncopies);
3966 if (vect_print_dump_info (REPORT_DETAILS))
3967 fprintf (vect_dump, "transform induction phi.");
3969 vec_def = get_initial_def_for_induction (phi);
3970 *vec_stmt = SSA_NAME_DEF_STMT (vec_def);
3975 /* Function vectorizable_operation.
3977 Check if STMT performs a binary or unary operation that can be vectorized.
3978 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
3979 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
3980 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
3983 vectorizable_operation (gimple stmt, gimple_stmt_iterator *gsi,
3984 gimple *vec_stmt, slp_tree slp_node)
3988 tree op0, op1 = NULL;
3989 tree vec_oprnd1 = NULL_TREE;
3990 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3991 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3992 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3993 enum tree_code code;
3994 enum machine_mode vec_mode;
3999 enum machine_mode optab_op2_mode;
4002 enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
4003 gimple new_stmt = NULL;
4004 stmt_vec_info prev_stmt_info;
4005 int nunits_in = TYPE_VECTOR_SUBPARTS (vectype);
4010 VEC(tree,heap) *vec_oprnds0 = NULL, *vec_oprnds1 = NULL;
4013 bool shift_p = false;
4014 bool scalar_shift_arg = false;
4016 /* Multiple types in SLP are handled by creating the appropriate number of
4017 vectorized stmts for each SLP node. Hence, NCOPIES is always 1 in case of SLP. */
4022 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in;
4024 gcc_assert (ncopies >= 1);
4026 if (!STMT_VINFO_RELEVANT_P (stmt_info))
4029 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
4032 /* Is STMT a vectorizable binary/unary operation? */
4033 if (!is_gimple_assign (stmt))
4036 if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
4039 scalar_dest = gimple_assign_lhs (stmt);
4040 vectype_out = get_vectype_for_scalar_type (TREE_TYPE (scalar_dest));
4043 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
4044 if (nunits_out != nunits_in)
4047 code = gimple_assign_rhs_code (stmt);
4049 /* For pointer addition, we should use the normal plus for
4050 the vector addition. */
4051 if (code == POINTER_PLUS_EXPR)
4054 /* Support only unary or binary operations. */
4055 op_type = TREE_CODE_LENGTH (code);
4056 if (op_type != unary_op && op_type != binary_op)
4058 if (vect_print_dump_info (REPORT_DETAILS))
4059 fprintf (vect_dump, "num. args = %d (not unary/binary op).", op_type);
4063 op0 = gimple_assign_rhs1 (stmt);
4064 if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt[0]))
4066 if (vect_print_dump_info (REPORT_DETAILS))
4067 fprintf (vect_dump, "use not simple.");
4071 if (op_type == binary_op)
4073 op1 = gimple_assign_rhs2 (stmt);
4074 if (!vect_is_simple_use (op1, loop_vinfo, &def_stmt, &def, &dt[1]))
4076 if (vect_print_dump_info (REPORT_DETAILS))
4077 fprintf (vect_dump, "use not simple.");
4082 /* If this is a shift/rotate, determine whether the shift amount is a vector,
4083 or scalar. If the shift/rotate amount is a vector, use the vector/vector shift optabs. */
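  /* An illustrative example (hypothetical source, not from a dump): in
        a[i] = b[i] << c[i]
     the shift amount varies per element, so only the vector/vector form
     applies, whereas in
        a[i] = b[i] << 3
     the target may be able to shift the whole vector by a single scalar
     amount.  */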
4085 if (code == LSHIFT_EXPR || code == RSHIFT_EXPR || code == LROTATE_EXPR
4086 || code == RROTATE_EXPR)
4090 /* vector shifted by vector */
4091 if (dt[1] == vect_loop_def)
4093 optab = optab_for_tree_code (code, vectype, optab_vector);
4094 if (vect_print_dump_info (REPORT_DETAILS))
4095 fprintf (vect_dump, "vector/vector shift/rotate found.");
4098 /* See if the machine has a vector shifted by scalar insn and if not
4099 then see if it has a vector shifted by vector insn. */
4100 else if (dt[1] == vect_constant_def || dt[1] == vect_invariant_def)
4102 optab = optab_for_tree_code (code, vectype, optab_scalar);
4104 if (optab && (optab_handler (optab, TYPE_MODE (vectype))->insn_code
4105 != CODE_FOR_nothing))
4107 scalar_shift_arg = true;
4108 if (vect_print_dump_info (REPORT_DETAILS))
4109 fprintf (vect_dump, "vector/scalar shift/rotate found.");
4113 optab = optab_for_tree_code (code, vectype, optab_vector);
4114 if (vect_print_dump_info (REPORT_DETAILS)
4116 && (optab_handler (optab, TYPE_MODE (vectype))->insn_code
4117 != CODE_FOR_nothing))
4118 fprintf (vect_dump, "vector/vector shift/rotate found.");
4124 if (vect_print_dump_info (REPORT_DETAILS))
4125 fprintf (vect_dump, "operand mode requires invariant argument.");
4130 optab = optab_for_tree_code (code, vectype, optab_default);
4132 /* Supportable by target? */
4135 if (vect_print_dump_info (REPORT_DETAILS))
4136 fprintf (vect_dump, "no optab.");
4139 vec_mode = TYPE_MODE (vectype);
4140 icode = (int) optab_handler (optab, vec_mode)->insn_code;
4141 if (icode == CODE_FOR_nothing)
4143 if (vect_print_dump_info (REPORT_DETAILS))
4144 fprintf (vect_dump, "op not supported by target.");
4145 /* Check only during analysis. */
4146 if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
4147 || (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4148 < vect_min_worthwhile_factor (code)
4151 if (vect_print_dump_info (REPORT_DETAILS))
4152 fprintf (vect_dump, "proceeding using word mode.");
4155 /* Worthwhile without SIMD support? Check only during analysis. */
4156 if (!VECTOR_MODE_P (TYPE_MODE (vectype))
4157 && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4158 < vect_min_worthwhile_factor (code)
4161 if (vect_print_dump_info (REPORT_DETAILS))
4162 fprintf (vect_dump, "not worthwhile without SIMD support.");
4166 if (!vec_stmt) /* transformation not required. */
4168 STMT_VINFO_TYPE (stmt_info) = op_vec_info_type;
4169 if (vect_print_dump_info (REPORT_DETAILS))
4170 fprintf (vect_dump, "=== vectorizable_operation ===");
4171 vect_model_simple_cost (stmt_info, ncopies, dt, NULL);
4177 if (vect_print_dump_info (REPORT_DETAILS))
4178 fprintf (vect_dump, "transform binary/unary operation.");
4181 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4183 /* Allocate VECs for vector operands. In case of SLP, vector operands are
4184 created in the previous stages of the recursion, so no allocation is
4185 needed, except for the case of shift with scalar shift argument. In that
4186 case we store the scalar operand in VEC_OPRNDS1 for every vector stmt to
4187 be created to vectorize the SLP group, i.e., SLP_NODE->VEC_STMTS_SIZE.
4188 In case of loop-based vectorization we allocate VECs of size 1. We
4189 allocate VEC_OPRNDS1 only in case of binary operation. */
4192 vec_oprnds0 = VEC_alloc (tree, heap, 1);
4193 if (op_type == binary_op)
4194 vec_oprnds1 = VEC_alloc (tree, heap, 1);
4196 else if (scalar_shift_arg)
4197 vec_oprnds1 = VEC_alloc (tree, heap, slp_node->vec_stmts_size);
4199 /* In case the vectorization factor (VF) is bigger than the number
4200 of elements that we can fit in a vectype (nunits), we have to generate
4201 more than one vector stmt - i.e - we need to "unroll" the
4202 vector stmt by a factor VF/nunits. In doing so, we record a pointer
4203 from one copy of the vector stmt to the next, in the field
4204 STMT_VINFO_RELATED_STMT. This is necessary in order to allow following
4205 stages to find the correct vector defs to be used when vectorizing
4206 stmts that use the defs of the current stmt. The example below illustrates
4207 the vectorization process when VF=16 and nunits=4 (i.e - we need to create
4208 4 vectorized stmts):
4210 before vectorization:
4211 RELATED_STMT VEC_STMT
4215 step 1: vectorize stmt S1 (done in vectorizable_load. See more details there):
4217 RELATED_STMT VEC_STMT
4218 VS1_0: vx0 = memref0 VS1_1 -
4219 VS1_1: vx1 = memref1 VS1_2 -
4220 VS1_2: vx2 = memref2 VS1_3 -
4221 VS1_3: vx3 = memref3 - -
4222 S1: x = load - VS1_0
4225 step2: vectorize stmt S2 (done here):
4226 To vectorize stmt S2 we first need to find the relevant vector
4227 def for the first operand 'x'. This is, as usual, obtained from
4228 the vector stmt recorded in the STMT_VINFO_VEC_STMT of the stmt
4229 that defines 'x' (S1). This way we find the stmt VS1_0, and the
4230 relevant vector def 'vx0'. Having found 'vx0' we can generate
4231 the vector stmt VS2_0, and as usual, record it in the
4232 STMT_VINFO_VEC_STMT of stmt S2.
4233 When creating the second copy (VS2_1), we obtain the relevant vector
4234 def from the vector stmt recorded in the STMT_VINFO_RELATED_STMT of
4235 stmt VS1_0. This way we find the stmt VS1_1 and the relevant
4236 vector def 'vx1'. Using 'vx1' we create stmt VS2_1 and record a
4237 pointer to it in the STMT_VINFO_RELATED_STMT of the vector stmt VS2_0.
4238 Similarly when creating stmts VS2_2 and VS2_3. This is the resulting
4239 chain of stmts and pointers:
4240 RELATED_STMT VEC_STMT
4241 VS1_0: vx0 = memref0 VS1_1 -
4242 VS1_1: vx1 = memref1 VS1_2 -
4243 VS1_2: vx2 = memref2 VS1_3 -
4244 VS1_3: vx3 = memref3 - -
4245 S1: x = load - VS1_0
4246 VS2_0: vz0 = vx0 + v1 VS2_1 -
4247 VS2_1: vz1 = vx1 + v1 VS2_2 -
4248 VS2_2: vz2 = vx2 + v1 VS2_3 -
4249 VS2_3: vz3 = vx3 + v1 - -
4250 S2: z = x + 1 - VS2_0 */
4252 prev_stmt_info = NULL;
4253 for (j = 0; j < ncopies; j++)
4258 if (op_type == binary_op && scalar_shift_arg)
4260 /* Vector shl and shr insn patterns can be defined with scalar
4261 operand 2 (shift operand). In this case, use constant or loop
4262 invariant op1 directly, without extending it to vector mode first. */
4264 optab_op2_mode = insn_data[icode].operand[2].mode;
4265 if (!VECTOR_MODE_P (optab_op2_mode))
4267 if (vect_print_dump_info (REPORT_DETAILS))
4268 fprintf (vect_dump, "operand 1 using scalar mode.");
4270 VEC_quick_push (tree, vec_oprnds1, vec_oprnd1);
4273 /* Store vec_oprnd1 for every vector stmt to be created
4274 for SLP_NODE. We check during the analysis that all the
4275 shift arguments are the same.
4276 TODO: Allow different constants for different vector
4277 stmts generated for an SLP instance. */
4278 for (k = 0; k < slp_node->vec_stmts_size - 1; k++)
4279 VEC_quick_push (tree, vec_oprnds1, vec_oprnd1);
4284 /* vec_oprnd1 is available if operand 1 should be of a scalar-type
4285 (a special case for certain kinds of vector shifts); otherwise,
4286 operand 1 should be of a vector type (the usual case). */
4287 if (op_type == binary_op && !vec_oprnd1)
4288 vect_get_vec_defs (op0, op1, stmt, &vec_oprnds0, &vec_oprnds1,
4291 vect_get_vec_defs (op0, NULL_TREE, stmt, &vec_oprnds0, NULL,
4295 vect_get_vec_defs_for_stmt_copy (dt, &vec_oprnds0, &vec_oprnds1);
4297 /* Arguments are ready. Create the new vector stmt. */
4298 for (i = 0; VEC_iterate (tree, vec_oprnds0, i, vop0); i++)
4300 vop1 = ((op_type == binary_op)
4301 ? VEC_index (tree, vec_oprnds1, i) : NULL);
4302 new_stmt = gimple_build_assign_with_ops (code, vec_dest, vop0, vop1);
4303 new_temp = make_ssa_name (vec_dest, new_stmt);
4304 gimple_assign_set_lhs (new_stmt, new_temp);
4305 vect_finish_stmt_generation (stmt, new_stmt, gsi);
4307 VEC_quick_push (gimple, SLP_TREE_VEC_STMTS (slp_node), new_stmt);
4314 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
4316 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
4317 prev_stmt_info = vinfo_for_stmt (new_stmt);
4320 VEC_free (tree, heap, vec_oprnds0);
4322 VEC_free (tree, heap, vec_oprnds1);
4328 /* Get vectorized definitions for loop-based vectorization. For the first
4329 operand we call vect_get_vec_def_for_operand() (with OPRND containing
4330 scalar operand), and for the rest we get a copy with
4331 vect_get_vec_def_for_stmt_copy() using the previous vector definition
4332 (stored in OPRND). See vect_get_vec_def_for_stmt_copy() for details.
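   For example (a sketch assuming MULTI_STEP_CVT == 1): the outer call
   pushes the vector def created for the scalar OPRND plus one copy, and
   the recursive call pushes two further copies - four defs in all,
   matching the vect_pow2 (multi_step_cvt) * 2 allocation in the caller.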
4333 The vectors are collected into VEC_OPRNDS. */
4336 vect_get_loop_based_defs (tree *oprnd, gimple stmt, enum vect_def_type dt,
4337 VEC (tree, heap) **vec_oprnds, int multi_step_cvt)
4341 /* Get first vector operand. */
4342 /* All the vector operands except the very first one (that is the scalar oprnd) are stmt copies. */
4344 if (TREE_CODE (TREE_TYPE (*oprnd)) != VECTOR_TYPE)
4345 vec_oprnd = vect_get_vec_def_for_operand (*oprnd, stmt, NULL);
4347 else vec_oprnd = vect_get_vec_def_for_stmt_copy (dt, *oprnd);
4349 VEC_quick_push (tree, *vec_oprnds, vec_oprnd);
4351 /* Get second vector operand. */
4352 vec_oprnd = vect_get_vec_def_for_stmt_copy (dt, vec_oprnd);
4353 VEC_quick_push (tree, *vec_oprnds, vec_oprnd);
4357 /* For conversion in multiple steps, continue to get operands recursively. */
4360 vect_get_loop_based_defs (oprnd, stmt, dt, vec_oprnds, multi_step_cvt - 1);
4364 /* Create vectorized demotion statements for vector operands from VEC_OPRNDS.
4365 For multi-step conversions store the resulting vectors and call the function recursively. */
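/* For example (an illustrative sketch): for an int -> short demotion,
   each pair (vop0, vop1) of V4SI operands is packed into a single V8HI
   result, so every step halves the number of vectors; an int -> char
   demotion would take two such steps (V4SI -> V8HI -> V16QI).  */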
4369 vect_create_vectorized_demotion_stmts (VEC (tree, heap) **vec_oprnds,
4370 int multi_step_cvt, gimple stmt,
4371 VEC (tree, heap) *vec_dsts,
4372 gimple_stmt_iterator *gsi,
4373 slp_tree slp_node, enum tree_code code,
4374 stmt_vec_info *prev_stmt_info)
4377 tree vop0, vop1, new_tmp, vec_dest;
4379 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4381 vec_dest = VEC_pop (tree, vec_dsts);
4383 for (i = 0; i < VEC_length (tree, *vec_oprnds); i += 2)
4385 /* Create demotion operation. */
4386 vop0 = VEC_index (tree, *vec_oprnds, i);
4387 vop1 = VEC_index (tree, *vec_oprnds, i + 1);
4388 new_stmt = gimple_build_assign_with_ops (code, vec_dest, vop0, vop1);
4389 new_tmp = make_ssa_name (vec_dest, new_stmt);
4390 gimple_assign_set_lhs (new_stmt, new_tmp);
4391 vect_finish_stmt_generation (stmt, new_stmt, gsi);
4394 /* Store the resulting vector for next recursive call. */
4395 VEC_replace (tree, *vec_oprnds, i/2, new_tmp);
4398 /* This is the last step of the conversion sequence. Store the
4399 vectors in SLP_NODE or in vector info of the scalar statement
4400 (or in STMT_VINFO_RELATED_STMT chain). */
4402 VEC_quick_push (gimple, SLP_TREE_VEC_STMTS (slp_node), new_stmt);
4405 if (!*prev_stmt_info)
4406 STMT_VINFO_VEC_STMT (stmt_info) = new_stmt;
4408 else STMT_VINFO_RELATED_STMT (*prev_stmt_info) = new_stmt;
4410 *prev_stmt_info = vinfo_for_stmt (new_stmt);
4415 /* For multi-step demotion operations we first generate demotion operations
4416 from the source type to the intermediate types, and then combine the
4417 results (stored in VEC_OPRNDS) in a demotion operation to the destination type. */
4421 /* At each level of recursion we have half of the operands we had at the previous level. */
4423 VEC_truncate (tree, *vec_oprnds, (i+1)/2);
4424 vect_create_vectorized_demotion_stmts (vec_oprnds, multi_step_cvt - 1,
4425 stmt, vec_dsts, gsi, slp_node,
4426 code, prev_stmt_info);
4431 /* Function vectorizable_type_demotion
4433 Check if STMT performs a binary or unary operation that involves
4434 type demotion, and if it can be vectorized.
4435 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
4436 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
4437 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
4440 vectorizable_type_demotion (gimple stmt, gimple_stmt_iterator *gsi,
4441 gimple *vec_stmt, slp_tree slp_node)
4446 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4447 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4448 enum tree_code code, code1 = ERROR_MARK;
4451 enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
4452 stmt_vec_info prev_stmt_info;
4459 int multi_step_cvt = 0;
4460 VEC (tree, heap) *vec_oprnds0 = NULL;
4461 VEC (tree, heap) *vec_dsts = NULL, *interm_types = NULL, *tmp_vec_dsts = NULL;
4462 tree last_oprnd, intermediate_type;
4464 if (!STMT_VINFO_RELEVANT_P (stmt_info))
4467 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
4470 /* Is STMT a vectorizable type-demotion operation? */
4471 if (!is_gimple_assign (stmt))
4474 if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
4477 code = gimple_assign_rhs_code (stmt);
4478 if (!CONVERT_EXPR_CODE_P (code))
4481 op0 = gimple_assign_rhs1 (stmt);
4482 vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op0));
4485 nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
4487 scalar_dest = gimple_assign_lhs (stmt);
4488 vectype_out = get_vectype_for_scalar_type (TREE_TYPE (scalar_dest));
4491 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
4492 if (nunits_in >= nunits_out)
4495 /* Multiple types in SLP are handled by creating the appropriate number of
4496 vectorized stmts for each SLP node. Hence, NCOPIES is always 1 in case of SLP. */
4501 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_out;
4503 gcc_assert (ncopies >= 1);
4505 if (! ((INTEGRAL_TYPE_P (TREE_TYPE (scalar_dest))
4506 && INTEGRAL_TYPE_P (TREE_TYPE (op0)))
4507 || (SCALAR_FLOAT_TYPE_P (TREE_TYPE (scalar_dest))
4508 && SCALAR_FLOAT_TYPE_P (TREE_TYPE (op0))
4509 && CONVERT_EXPR_CODE_P (code))))
4512 /* Check the operands of the operation. */
4513 if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt[0]))
4515 if (vect_print_dump_info (REPORT_DETAILS))
4516 fprintf (vect_dump, "use not simple.");
4520 /* Supportable by target? */
4521 if (!supportable_narrowing_operation (code, stmt, vectype_in, &code1,
4522 &multi_step_cvt, &interm_types))
4525 STMT_VINFO_VECTYPE (stmt_info) = vectype_in;
4527 if (!vec_stmt) /* transformation not required. */
4529 STMT_VINFO_TYPE (stmt_info) = type_demotion_vec_info_type;
4530 if (vect_print_dump_info (REPORT_DETAILS))
4531 fprintf (vect_dump, "=== vectorizable_demotion ===");
4532 vect_model_simple_cost (stmt_info, ncopies, dt, NULL);
4537 if (vect_print_dump_info (REPORT_DETAILS))
4538 fprintf (vect_dump, "transform type demotion operation. ncopies = %d.", ncopies);
4541 /* In case of multi-step demotion, we first generate demotion operations to
4542 the intermediate types, and then from those types to the final one.
4543 We create vector destinations for the intermediate type (TYPES) received
4544 from supportable_narrowing_operation, and store them in the correct order
4545 for future use in vect_create_vectorized_demotion_stmts(). */
4547 vec_dsts = VEC_alloc (tree, heap, multi_step_cvt + 1);
4549 vec_dsts = VEC_alloc (tree, heap, 1);
4551 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
4552 VEC_quick_push (tree, vec_dsts, vec_dest);
4556 for (i = VEC_length (tree, interm_types) - 1;
4557 VEC_iterate (tree, interm_types, i, intermediate_type); i--)
4559 vec_dest = vect_create_destination_var (scalar_dest, intermediate_type);
4561 VEC_quick_push (tree, vec_dsts, vec_dest);
4565 /* In case the vectorization factor (VF) is bigger than the number
4566 of elements that we can fit in a vectype (nunits), we have to generate
4567 more than one vector stmt - i.e - we need to "unroll" the
4568 vector stmt by a factor VF/nunits. */
4570 prev_stmt_info = NULL;
4571 for (j = 0; j < ncopies; j++)
4575 vect_get_slp_defs (slp_node, &vec_oprnds0, NULL);
4578 VEC_free (tree, heap, vec_oprnds0);
4579 vec_oprnds0 = VEC_alloc (tree, heap,
4580 (multi_step_cvt ? vect_pow2 (multi_step_cvt) * 2 : 2));
4581 vect_get_loop_based_defs (&last_oprnd, stmt, dt[0], &vec_oprnds0,
4582 vect_pow2 (multi_step_cvt) - 1);
4585 /* Arguments are ready. Create the new vector stmts. */
4586 tmp_vec_dsts = VEC_copy (tree, heap, vec_dsts);
4587 vect_create_vectorized_demotion_stmts (&vec_oprnds0,
4588 multi_step_cvt, stmt, tmp_vec_dsts,
4589 gsi, slp_node, code1, &prev_stmt_info);
4593 VEC_free (tree, heap, vec_oprnds0);
4594 VEC_free (tree, heap, vec_dsts);
4595 VEC_free (tree, heap, tmp_vec_dsts);
4596 VEC_free (tree, heap, interm_types);
4598 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
4603 /* Create vectorized promotion statements for vector operands from VEC_OPRNDS0
4604 and VEC_OPRNDS1 (for binary operations). For multi-step conversions store
4605 the resulting vectors and call the function recursively. */
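/* For example (an illustrative sketch): for a short -> int promotion,
   each V8HI operand yields two V4SI results (a low half and a high
   half, one stmt each), so every step doubles the number of vectors;
   a char -> int promotion would take two such steps
   (V16QI -> V8HI -> V4SI).  */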
4608 vect_create_vectorized_promotion_stmts (VEC (tree, heap) **vec_oprnds0,
4609 VEC (tree, heap) **vec_oprnds1,
4610 int multi_step_cvt, gimple stmt,
4611 VEC (tree, heap) *vec_dsts,
4612 gimple_stmt_iterator *gsi,
4613 slp_tree slp_node, enum tree_code code1,
4614 enum tree_code code2, tree decl1,
4615 tree decl2, int op_type,
4616 stmt_vec_info *prev_stmt_info)
4619 tree vop0, vop1, new_tmp1, new_tmp2, vec_dest;
4620 gimple new_stmt1, new_stmt2;
4621 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4622 VEC (tree, heap) *vec_tmp;
4624 vec_dest = VEC_pop (tree, vec_dsts);
4625 vec_tmp = VEC_alloc (tree, heap, VEC_length (tree, *vec_oprnds0) * 2);
4627 for (i = 0; VEC_iterate (tree, *vec_oprnds0, i, vop0); i++)
4629 if (op_type == binary_op)
4630 vop1 = VEC_index (tree, *vec_oprnds1, i);
4634 /* Generate the two halves of the promotion operation. */
4635 new_stmt1 = vect_gen_widened_results_half (code1, decl1, vop0, vop1,
4636 op_type, vec_dest, gsi, stmt);
4637 new_stmt2 = vect_gen_widened_results_half (code2, decl2, vop0, vop1,
4638 op_type, vec_dest, gsi, stmt);
4639 if (is_gimple_call (new_stmt1))
4641 new_tmp1 = gimple_call_lhs (new_stmt1);
4642 new_tmp2 = gimple_call_lhs (new_stmt2);
4646 new_tmp1 = gimple_assign_lhs (new_stmt1);
4647 new_tmp2 = gimple_assign_lhs (new_stmt2);
4652 /* Store the results for the recursive call. */
4653 VEC_quick_push (tree, vec_tmp, new_tmp1);
4654 VEC_quick_push (tree, vec_tmp, new_tmp2);
4658 /* Last step of the promotion sequence - store the results. */
4661 VEC_quick_push (gimple, SLP_TREE_VEC_STMTS (slp_node), new_stmt1);
4662 VEC_quick_push (gimple, SLP_TREE_VEC_STMTS (slp_node), new_stmt2);
4666 if (!*prev_stmt_info)
4667 STMT_VINFO_VEC_STMT (stmt_info) = new_stmt1;
4669 else STMT_VINFO_RELATED_STMT (*prev_stmt_info) = new_stmt1;
4671 *prev_stmt_info = vinfo_for_stmt (new_stmt1);
4672 STMT_VINFO_RELATED_STMT (*prev_stmt_info) = new_stmt2;
4673 *prev_stmt_info = vinfo_for_stmt (new_stmt2);
4680 /* For a multi-step promotion operation we call the function recursively
4681 for every stage. We start from the input type,
4682 create promotion operations to the intermediate types, and then
4683 create promotions to the output type. */
4684 *vec_oprnds0 = VEC_copy (tree, heap, vec_tmp);
4685 VEC_free (tree, heap, vec_tmp);
4686 vect_create_vectorized_promotion_stmts (vec_oprnds0, vec_oprnds1,
4687 multi_step_cvt - 1, stmt,
4688 vec_dsts, gsi, slp_node, code1,
4689 code2, decl1, decl2, op_type, prev_stmt_info);
4695 /* Function vectorizable_type_promotion
4697 Check if STMT performs a binary or unary operation that involves
4698 type promotion, and if it can be vectorized.
4699 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
4700 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
4701 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
4704 vectorizable_type_promotion (gimple stmt, gimple_stmt_iterator *gsi,
4705 gimple *vec_stmt, slp_tree slp_node)
4709 tree op0, op1 = NULL;
4710 tree vec_oprnd0 = NULL, vec_oprnd1 = NULL;
4711 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4712 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4713 enum tree_code code, code1 = ERROR_MARK, code2 = ERROR_MARK;
4714 tree decl1 = NULL_TREE, decl2 = NULL_TREE;
4718 enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
4719 stmt_vec_info prev_stmt_info;
4726 tree intermediate_type = NULL_TREE;
4727 int multi_step_cvt = 0;
4728 VEC (tree, heap) *vec_oprnds0 = NULL, *vec_oprnds1 = NULL;
4729 VEC (tree, heap) *vec_dsts = NULL, *interm_types = NULL, *tmp_vec_dsts = NULL;
4731 if (!STMT_VINFO_RELEVANT_P (stmt_info))
4734 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
4737 /* Is STMT a vectorizable type-promotion operation? */
4738 if (!is_gimple_assign (stmt))
4741 if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
4744 code = gimple_assign_rhs_code (stmt);
4745 if (!CONVERT_EXPR_CODE_P (code)
4746 && code != WIDEN_MULT_EXPR)
4749 op0 = gimple_assign_rhs1 (stmt);
4750 vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op0));
4753 nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
4755 scalar_dest = gimple_assign_lhs (stmt);
4756 vectype_out = get_vectype_for_scalar_type (TREE_TYPE (scalar_dest));
4759 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
4760 if (nunits_in <= nunits_out)
4763 /* Multiple types in SLP are handled by creating the appropriate number of
4764 vectorized stmts for each SLP node. Hence, NCOPIES is always 1 in case of SLP. */
4769 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in;
4771 gcc_assert (ncopies >= 1);
4773 if (! ((INTEGRAL_TYPE_P (TREE_TYPE (scalar_dest))
4774 && INTEGRAL_TYPE_P (TREE_TYPE (op0)))
4775 || (SCALAR_FLOAT_TYPE_P (TREE_TYPE (scalar_dest))
4776 && SCALAR_FLOAT_TYPE_P (TREE_TYPE (op0))
4777 && CONVERT_EXPR_CODE_P (code))))
4780 /* Check the operands of the operation. */
4781 if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt[0]))
4783 if (vect_print_dump_info (REPORT_DETAILS))
4784 fprintf (vect_dump, "use not simple.");
4788 op_type = TREE_CODE_LENGTH (code);
4789 if (op_type == binary_op)
4791 op1 = gimple_assign_rhs2 (stmt);
4792 if (!vect_is_simple_use (op1, loop_vinfo, &def_stmt, &def, &dt[1]))
4794 if (vect_print_dump_info (REPORT_DETAILS))
4795 fprintf (vect_dump, "use not simple.");
4800 /* Supportable by target? */
4801 if (!supportable_widening_operation (code, stmt, vectype_in,
4802 &decl1, &decl2, &code1, &code2,
4803 &multi_step_cvt, &interm_types))
4806 /* Binary widening operation can only be supported directly by the architecture. */
4808 gcc_assert (!(multi_step_cvt && op_type == binary_op));
4810 STMT_VINFO_VECTYPE (stmt_info) = vectype_in;
4812 if (!vec_stmt) /* transformation not required. */
4814 STMT_VINFO_TYPE (stmt_info) = type_promotion_vec_info_type;
4815 if (vect_print_dump_info (REPORT_DETAILS))
4816 fprintf (vect_dump, "=== vectorizable_promotion ===");
4817 vect_model_simple_cost (stmt_info, 2*ncopies, dt, NULL);
4823 if (vect_print_dump_info (REPORT_DETAILS))
4824 fprintf (vect_dump, "transform type promotion operation. ncopies = %d.", ncopies);
4828 /* In case of multi-step promotion, we first generate promotion operations
4829 to the intermediate types, and then from those types to the final one.
4830 We store vector destination in VEC_DSTS in the correct order for
4831 recursive creation of promotion operations in
4832 vect_create_vectorized_promotion_stmts(). Vector destinations are created
4833 according to TYPES received from supportable_widening_operation(). */
4835 vec_dsts = VEC_alloc (tree, heap, multi_step_cvt + 1);
4837 vec_dsts = VEC_alloc (tree, heap, 1);
4839 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
4840 VEC_quick_push (tree, vec_dsts, vec_dest);
4844 for (i = VEC_length (tree, interm_types) - 1;
4845 VEC_iterate (tree, interm_types, i, intermediate_type); i--)
4847 vec_dest = vect_create_destination_var (scalar_dest, intermediate_type);
4849 VEC_quick_push (tree, vec_dsts, vec_dest);
4855 vec_oprnds0 = VEC_alloc (tree, heap,
4856 (multi_step_cvt ? vect_pow2 (multi_step_cvt) : 1));
4857 if (op_type == binary_op)
4858 vec_oprnds1 = VEC_alloc (tree, heap, 1);
4861 /* In case the vectorization factor (VF) is bigger than the number
4862 of elements that we can fit in a vectype (nunits), we have to generate
4863 more than one vector stmt - i.e - we need to "unroll" the
4864 vector stmt by a factor VF/nunits. */
4866 prev_stmt_info = NULL;
4867 for (j = 0; j < ncopies; j++)
4873 vect_get_slp_defs (slp_node, &vec_oprnds0, &vec_oprnds1);
4876 vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
4877 VEC_quick_push (tree, vec_oprnds0, vec_oprnd0);
4878 if (op_type == binary_op)
4880 vec_oprnd1 = vect_get_vec_def_for_operand (op1, stmt, NULL);
4881 VEC_quick_push (tree, vec_oprnds1, vec_oprnd1);
4887 vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd0);
4888 VEC_replace (tree, vec_oprnds0, 0, vec_oprnd0);
4889 if (op_type == binary_op)
4891 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt[1], vec_oprnd1);
4892 VEC_replace (tree, vec_oprnds1, 0, vec_oprnd1);
4896 /* Arguments are ready. Create the new vector stmts. */
4897 tmp_vec_dsts = VEC_copy (tree, heap, vec_dsts);
4898 vect_create_vectorized_promotion_stmts (&vec_oprnds0, &vec_oprnds1,
4899 multi_step_cvt, stmt,
4901 tmp_vec_dsts, gsi, slp_node, code1, code2,
4902 decl1, decl2, op_type, &prev_stmt_info);
4906 VEC_free (tree, heap, vec_dsts);
4907 VEC_free (tree, heap, tmp_vec_dsts);
4908 VEC_free (tree, heap, interm_types);
4909 VEC_free (tree, heap, vec_oprnds0);
4910 VEC_free (tree, heap, vec_oprnds1);
4912 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
4917 /* Function vect_strided_store_supported.
4919 Returns TRUE if INTERLEAVE_HIGH and INTERLEAVE_LOW operations are supported,
4920 and FALSE otherwise. */
4923 vect_strided_store_supported (tree vectype)
4925 optab interleave_high_optab, interleave_low_optab;
4928 mode = (int) TYPE_MODE (vectype);
4930 /* Check that the operation is supported. */
4931 interleave_high_optab = optab_for_tree_code (VEC_INTERLEAVE_HIGH_EXPR,
4932 vectype, optab_default);
4933 interleave_low_optab = optab_for_tree_code (VEC_INTERLEAVE_LOW_EXPR,
4934 vectype, optab_default);
4935 if (!interleave_high_optab || !interleave_low_optab)
4937 if (vect_print_dump_info (REPORT_DETAILS))
4938 fprintf (vect_dump, "no optab for interleave.");
4942 if (optab_handler (interleave_high_optab, mode)->insn_code == CODE_FOR_nothing
4944 || optab_handler (interleave_low_optab, mode)->insn_code
4945 == CODE_FOR_nothing)
4947 if (vect_print_dump_info (REPORT_DETAILS))
4948 fprintf (vect_dump, "interleave op not supported by target.");
4956 /* Function vect_permute_store_chain.
4958 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
4959 a power of 2, generate interleave_high/low stmts to reorder the data
4960 correctly for the stores. Return the final references for stores in
4963 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4964 The input is 4 vectors each containing 8 elements. We assign a number to each
4965 element; the input sequence is:
4967 1st vec: 0 1 2 3 4 5 6 7
4968 2nd vec: 8 9 10 11 12 13 14 15
4969 3rd vec: 16 17 18 19 20 21 22 23
4970 4th vec: 24 25 26 27 28 29 30 31
4972 The output sequence should be:
4974 1st vec: 0 8 16 24 1 9 17 25
4975 2nd vec: 2 10 18 26 3 11 19 27
4976 3rd vec: 4 12 20 28 5 13 21 29
4977 4th vec: 6 14 22 30 7 15 23 31
4979 i.e., we interleave the contents of the four vectors in their order.
4981 We use interleave_high/low instructions to create such output. The input of
4982 each interleave_high/low operation is two vectors:
4983 1st vec    2nd vec
4984 0 1 2 3    4 5 6 7
4985 the even elements of the result vector are obtained left-to-right from the
4986 high/low elements of the first vector. The odd elements of the result are
4987 obtained left-to-right from the high/low elements of the second vector.
4988 The output of interleave_high will be: 0 4 1 5
4989 and of interleave_low: 2 6 3 7
4992 The permutation is done in log LENGTH stages. In each stage interleave_high
4993 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
4994 where the first argument is taken from the first half of DR_CHAIN and the
4995 second argument from its second half.
4998 I1: interleave_high (1st vec, 3rd vec)
4999 I2: interleave_low (1st vec, 3rd vec)
5000 I3: interleave_high (2nd vec, 4th vec)
5001 I4: interleave_low (2nd vec, 4th vec)
5003 The output for the first stage is:
5005 I1: 0 16 1 17 2 18 3 19
5006 I2: 4 20 5 21 6 22 7 23
5007 I3: 8 24 9 25 10 26 11 27
5008 I4: 12 28 13 29 14 30 15 31
5010 The output of the second stage, i.e. the final result is:
5012 I1: 0 8 16 24 1 9 17 25
5013 I2: 2 10 18 26 3 11 19 27
5014 I3: 4 12 20 28 5 13 21 29
5015 I4: 6 14 22 30 7 15 23 31. */
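/* Note: with LENGTH == 4 as in the example above, the permutation takes
   exact_log2 (4) == 2 stages, which is exactly the bound of the stage
   loop below.  */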
5018 vect_permute_store_chain (VEC(tree,heap) *dr_chain,
5019 unsigned int length, gimple stmt,
5021 gimple_stmt_iterator *gsi,
5022 VEC(tree,heap) **result_chain)
5024 tree perm_dest, vect1, vect2, high, low;
5026 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5030 enum tree_code high_code, low_code;
5032 scalar_dest = gimple_assign_lhs (stmt);
5034 /* Check that the operation is supported. */
5035 if (!vect_strided_store_supported (vectype))
5038 *result_chain = VEC_copy (tree, heap, dr_chain);
5040 for (i = 0; i < exact_log2 (length); i++)
5042 for (j = 0; j < length/2; j++)
5044 vect1 = VEC_index (tree, dr_chain, j);
5045 vect2 = VEC_index (tree, dr_chain, j+length/2);
5047 /* Create interleaving stmt:
5048 in the case of big endian:
5049 high = interleave_high (vect1, vect2)
5050 and in the case of little endian:
5051 high = interleave_low (vect1, vect2). */
5052 perm_dest = create_tmp_var (vectype, "vect_inter_high");
5053 DECL_GIMPLE_REG_P (perm_dest) = 1;
5054 add_referenced_var (perm_dest);
5055 if (BYTES_BIG_ENDIAN)
5057 high_code = VEC_INTERLEAVE_HIGH_EXPR;
5058 low_code = VEC_INTERLEAVE_LOW_EXPR;
5062 low_code = VEC_INTERLEAVE_HIGH_EXPR;
5063 high_code = VEC_INTERLEAVE_LOW_EXPR;
5065 perm_stmt = gimple_build_assign_with_ops (high_code, perm_dest, vect1, vect2);
5067 high = make_ssa_name (perm_dest, perm_stmt);
5068 gimple_assign_set_lhs (perm_stmt, high);
5069 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5070 VEC_replace (tree, *result_chain, 2*j, high);
5072 /* Create interleaving stmt:
5073 in the case of big endian:
5074 low = interleave_low (vect1, vect2)
5075 and in the case of little endian:
5076 low = interleave_high (vect1, vect2). */
5077 perm_dest = create_tmp_var (vectype, "vect_inter_low");
5078 DECL_GIMPLE_REG_P (perm_dest) = 1;
5079 add_referenced_var (perm_dest);
5080 perm_stmt = gimple_build_assign_with_ops (low_code, perm_dest, vect1, vect2);
5082 low = make_ssa_name (perm_dest, perm_stmt);
5083 gimple_assign_set_lhs (perm_stmt, low);
5084 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5085 VEC_replace (tree, *result_chain, 2*j+1, low);
5087 dr_chain = VEC_copy (tree, heap, *result_chain);
5093 /* Function vectorizable_store.
5095 Check if STMT defines a non-scalar data-ref (array/pointer/structure) that can be vectorized.
5097 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
5098 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
5099 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
5102 vectorizable_store (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt, slp_tree slp_node)
5108 tree vec_oprnd = NULL_TREE;
5109 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5110 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info), *first_dr = NULL;
5111 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5112 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5113 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5114 enum machine_mode vec_mode;
5116 enum dr_alignment_support alignment_support_scheme;
5119 enum vect_def_type dt;
5120 stmt_vec_info prev_stmt_info = NULL;
5121 tree dataref_ptr = NULL_TREE;
5122 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
5125 gimple next_stmt, first_stmt = NULL;
5126 bool strided_store = false;
5127 unsigned int group_size, i;
5128 VEC(tree,heap) *dr_chain = NULL, *oprnds = NULL, *result_chain = NULL;
5130 VEC(tree,heap) *vec_oprnds = NULL;
5131 bool slp = (slp_node != NULL);
5132 stmt_vec_info first_stmt_vinfo;
5133 unsigned int vec_num;
5135 /* Multiple types in SLP are handled by creating the appropriate number of
5136 vectorized stmts for each SLP node. Hence, NCOPIES is always 1 in case of SLP. */
5141 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
5143 gcc_assert (ncopies >= 1);
5145 /* FORNOW. This restriction should be relaxed. */
5146 if (nested_in_vect_loop_p (loop, stmt) && ncopies > 1)
5148 if (vect_print_dump_info (REPORT_DETAILS))
5149 fprintf (vect_dump, "multiple types in nested loop.");
5153 if (!STMT_VINFO_RELEVANT_P (stmt_info))
5156 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
5159 /* Is vectorizable store? */
5161 if (!is_gimple_assign (stmt))
5164 scalar_dest = gimple_assign_lhs (stmt);
5165 if (TREE_CODE (scalar_dest) != ARRAY_REF
5166 && TREE_CODE (scalar_dest) != INDIRECT_REF
5167 && !STMT_VINFO_STRIDED_ACCESS (stmt_info))
5170 gcc_assert (gimple_assign_single_p (stmt));
5171 op = gimple_assign_rhs1 (stmt);
5172 if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt))
5174 if (vect_print_dump_info (REPORT_DETAILS))
5175 fprintf (vect_dump, "use not simple.");
5179 /* If accesses through a pointer to vectype do not alias the original
5180 memory reference we have a problem. */
5181 if (get_alias_set (vectype) != get_alias_set (TREE_TYPE (scalar_dest))
5182 && !alias_set_subset_of (get_alias_set (vectype),
5183 get_alias_set (TREE_TYPE (scalar_dest))))
5185 if (vect_print_dump_info (REPORT_DETAILS))
5186 fprintf (vect_dump, "vector type does not alias scalar type");
5190 if (!useless_type_conversion_p (TREE_TYPE (op), TREE_TYPE (scalar_dest)))
5192 if (vect_print_dump_info (REPORT_DETAILS))
5193 fprintf (vect_dump, "operands of different types");
5197 vec_mode = TYPE_MODE (vectype);
5198 /* FORNOW. In some cases we can vectorize even if the data-type is not supported
5199 (e.g., array initialization with 0). */
5200 if (optab_handler (mov_optab, (int)vec_mode)->insn_code == CODE_FOR_nothing)
5203 if (!STMT_VINFO_DATA_REF (stmt_info))
5206 if (STMT_VINFO_STRIDED_ACCESS (stmt_info))
5208 strided_store = true;
5209 first_stmt = DR_GROUP_FIRST_DR (stmt_info);
5210 if (!vect_strided_store_supported (vectype)
5211 && !PURE_SLP_STMT (stmt_info) && !slp)
5214 if (first_stmt == stmt)
5216 /* STMT is the leader of the group. Check the operands of all the
5217 stmts of the group. */
5218 next_stmt = DR_GROUP_NEXT_DR (stmt_info);
5221 gcc_assert (gimple_assign_single_p (next_stmt));
5222 op = gimple_assign_rhs1 (next_stmt);
5223 if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt))
5225 if (vect_print_dump_info (REPORT_DETAILS))
5226 fprintf (vect_dump, "use not simple.");
5229 next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt));
5234 if (!vec_stmt) /* transformation not required. */
5236 STMT_VINFO_TYPE (stmt_info) = store_vec_info_type;
5237 vect_model_store_cost (stmt_info, ncopies, dt, NULL);
5245 first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
5246 group_size = DR_GROUP_SIZE (vinfo_for_stmt (first_stmt));
5248 DR_GROUP_STORE_COUNT (vinfo_for_stmt (first_stmt))++;
5251 gcc_assert (!nested_in_vect_loop_p (loop, stmt));
5253 /* We vectorize all the stmts of the interleaving group when we
5254 reach the last stmt in the group. */
5255 if (DR_GROUP_STORE_COUNT (vinfo_for_stmt (first_stmt))
5256 < DR_GROUP_SIZE (vinfo_for_stmt (first_stmt))
5264 strided_store = false;
5266 /* VEC_NUM is the number of vect stmts to be created for this group. */
5268 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
5270 vec_num = group_size;
5276 group_size = vec_num = 1;
5277 first_stmt_vinfo = stmt_info;
5280 if (vect_print_dump_info (REPORT_DETAILS))
5281 fprintf (vect_dump, "transform store. ncopies = %d", ncopies);
5283 dr_chain = VEC_alloc (tree, heap, group_size);
5284 oprnds = VEC_alloc (tree, heap, group_size);
5286 alignment_support_scheme = vect_supportable_dr_alignment (first_dr);
5287 gcc_assert (alignment_support_scheme);
5288 gcc_assert (alignment_support_scheme == dr_aligned); /* FORNOW */
5290 /* In case the vectorization factor (VF) is bigger than the number
5291 of elements that we can fit in a vectype (nunits), we have to generate
5292 more than one vector stmt - i.e - we need to "unroll" the
5293 vector stmt by a factor VF/nunits. For more details see documentation in
5294 vect_get_vec_def_for_copy_stmt. */
5296 /* In case of interleaving (non-unit strided access):
5298 S1: &base + 2 = x2
5299 S2: &base = x0
5300 S3: &base + 1 = x1
5301 S4: &base + 3 = x3
5303 We create vectorized stores starting from base address (the access of the
5304 first stmt in the chain (S2 in the above example), when the last store stmt
5305 of the chain (S4) is reached:
5307 VS1: &base = vx2
5308 VS2: &base + vec_size*1 = vx0
5309 VS3: &base + vec_size*2 = vx1
5310 VS4: &base + vec_size*3 = vx3
5312 Then permutation statements are generated:
5314 VS5: vx5 = VEC_INTERLEAVE_HIGH_EXPR < vx0, vx3 >
5315 VS6: vx6 = VEC_INTERLEAVE_LOW_EXPR < vx0, vx3 >
5318 And they are put in STMT_VINFO_VEC_STMT of the corresponding scalar stmts
5319 (the order of the data-refs in the output of vect_permute_store_chain
5320 corresponds to the order of scalar stmts in the interleaving chain - see
5321 the documentation of vect_permute_store_chain()).
5323 In case of both multiple types and interleaving, above vector stores and
5324 permutation stmts are created for every copy. The result vector stmts are
5325 put in STMT_VINFO_VEC_STMT for the first copy and in the corresponding
5326 STMT_VINFO_RELATED_STMT for the next copies.
5329 prev_stmt_info = NULL;
5330 for (j = 0; j < ncopies; j++)
5339 /* Get vectorized arguments for SLP_NODE. */
5340 vect_get_slp_defs (slp_node, &vec_oprnds, NULL);
5342 vec_oprnd = VEC_index (tree, vec_oprnds, 0);
5346 /* For interleaved stores we collect vectorized defs for all the
5347 stores in the group in DR_CHAIN and OPRNDS. DR_CHAIN is then
5348 used as an input to vect_permute_store_chain(), and OPRNDS as
5349 an input to vect_get_vec_def_for_stmt_copy() for the next copy.
5351 If the store is not strided, GROUP_SIZE is 1, and DR_CHAIN and
5352 OPRNDS are of size 1. */
5353 next_stmt = first_stmt;
5354 for (i = 0; i < group_size; i++)
5356 /* Since gaps are not supported for interleaved stores,
5357 GROUP_SIZE is the exact number of stmts in the chain.
5358 Therefore, NEXT_STMT can't be NULL_TREE. In case that
5359 there is no interleaving, GROUP_SIZE is 1, and only one
5360 iteration of the loop will be executed. */
5361 gcc_assert (next_stmt);
5362 gcc_assert (gimple_assign_single_p (next_stmt));
5363 op = gimple_assign_rhs1 (next_stmt);
5365 vec_oprnd = vect_get_vec_def_for_operand (op, next_stmt,
5367 VEC_quick_push(tree, dr_chain, vec_oprnd);
5368 VEC_quick_push(tree, oprnds, vec_oprnd);
5369 next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt));
5373 dataref_ptr = vect_create_data_ref_ptr (first_stmt, NULL, NULL_TREE,
5374 &dummy, &ptr_incr, false, &inv_p);
5376 gcc_assert (!inv_p);
5380 /* For interleaved stores we created vectorized defs for all the
5381 defs stored in OPRNDS in the previous iteration (previous copy).
5382 DR_CHAIN is then used as an input to vect_permute_store_chain(),
5383 and OPRNDS as an input to vect_get_vec_def_for_stmt_copy() for the
5385 If the store is not strided, GROUP_SIZE is 1, and DR_CHAIN and
5386 OPRNDS are of size 1. */
5387 for (i = 0; i < group_size; i++)
5389 op = VEC_index (tree, oprnds, i);
5390 vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt);
5391 vec_oprnd = vect_get_vec_def_for_stmt_copy (dt, op);
5392 VEC_replace(tree, dr_chain, i, vec_oprnd);
5393 VEC_replace(tree, oprnds, i, vec_oprnd);
5396 dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, gsi, stmt, NULL_TREE);
5401 result_chain = VEC_alloc (tree, heap, group_size);
5403 if (!vect_permute_store_chain (dr_chain, group_size, stmt, gsi, &result_chain)) return false;
5408 next_stmt = first_stmt;
5409 for (i = 0; i < vec_num; i++)
5412 /* Bump the vector pointer. */
5413 dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, gsi, stmt,
5417 vec_oprnd = VEC_index (tree, vec_oprnds, i);
5418 else if (strided_store)
5419 /* For strided stores vectorized defs are interleaved in
5420 vect_permute_store_chain(). */
5421 vec_oprnd = VEC_index (tree, result_chain, i);
5423 data_ref = build_fold_indirect_ref (dataref_ptr);
5424 /* Arguments are ready. Create the new vector stmt. */
5425 new_stmt = gimple_build_assign (data_ref, vec_oprnd);
5426 vect_finish_stmt_generation (stmt, new_stmt, gsi);
5427 mark_symbols_for_renaming (new_stmt);
5433 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
5435 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
5437 prev_stmt_info = vinfo_for_stmt (new_stmt);
5438 next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt));
5444 VEC_free (tree, heap, dr_chain);
5445 VEC_free (tree, heap, oprnds);
5447 VEC_free (tree, heap, result_chain);
5453 /* Function vect_setup_realignment
5455 This function is called when vectorizing an unaligned load using
5456 the dr_explicit_realign[_optimized] scheme.
5457 This function generates the following code at the loop prolog:
5460 x msq_init = *(floor(p)); # prolog load
5461 realignment_token = call target_builtin;
5463 x msq = phi (msq_init, ---)
5465 The stmts marked with x are generated only for the case of
5466 dr_explicit_realign_optimized.
5468 The code above sets up a new (vector) pointer, pointing to the first
5469 location accessed by STMT, and a "floor-aligned" load using that pointer.
5470 It also generates code to compute the "realignment-token" (if the relevant
5471 target hook was defined), and creates a phi-node at the loop-header bb
5472 whose arguments are the result of the prolog-load (created by this
5473 function) and the result of a load that takes place in the loop (to be
5474 created by the caller to this function).
5476 For the case of dr_explicit_realign_optimized:
5477 The caller to this function uses the phi-result (msq) to create the
5478 realignment code inside the loop, and sets up the missing phi argument,
5481 msq = phi (msq_init, lsq)
5482 lsq = *(floor(p')); # load in loop
5483 result = realign_load (msq, lsq, realignment_token);
5485 For the case of dr_explicit_realign:
5487 msq = *(floor(p)); # load in loop
5489 lsq = *(floor(p')); # load in loop
5490 result = realign_load (msq, lsq, realignment_token);
5493 STMT - (scalar) load stmt to be vectorized. This load accesses
5494 a memory location that may be unaligned.
5495 BSI - place where new code is to be inserted.
5496 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
5500 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
5501 target hook, if defined.
5502 Return value - the result of the loop-header phi node. */
5505 vect_setup_realignment (gimple stmt, gimple_stmt_iterator *gsi,
5506 tree *realignment_token,
5507 enum dr_alignment_support alignment_support_scheme, tree init_addr,
5509 struct loop **at_loop)
5511 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5512 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5513 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5514 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5516 tree scalar_dest = gimple_assign_lhs (stmt);
5523 tree msq_init = NULL_TREE;
5526 tree msq = NULL_TREE;
5527 gimple_seq stmts = NULL;
5529 bool compute_in_loop = false;
5530 bool nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
5531 struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
5532 struct loop *loop_for_initial_load;
5534 gcc_assert (alignment_support_scheme == dr_explicit_realign
5535 || alignment_support_scheme == dr_explicit_realign_optimized);
5537 /* We need to generate three things:
5538 1. the misalignment computation
5539 2. the extra vector load (for the optimized realignment scheme).
5540 3. the phi node for the two vectors from which the realignment is
5541 done (for the optimized realignment scheme).
5544 /* 1. Determine where to generate the misalignment computation.
5546 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
5547 calculation will be generated by this function, outside the loop (in the
5548 preheader). Otherwise, INIT_ADDR had already been computed for us by the
5549 caller, inside the loop.
5551 Background: If the misalignment remains fixed throughout the iterations of
5552 the loop, then both realignment schemes are applicable, and also the
5553 misalignment computation can be done outside LOOP. This is because we are
5554 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
5555 are a multiple of VS (the Vector Size), and therefore the misalignment in
5556 different vectorized LOOP iterations is always the same.
5557 The problem arises only if the memory access is in an inner-loop nested
5558 inside LOOP, which is now being vectorized using outer-loop vectorization.
5559 This is the only case when the misalignment of the memory access may not
5560 remain fixed throughout the iterations of the inner-loop (as explained in
5561 detail in vect_supportable_dr_alignment). In this case, not only is the
5562 optimized realignment scheme not applicable, but also the misalignment
5563 computation (and generation of the realignment token that is passed to
5564 REALIGN_LOAD) have to be done inside the loop.
5566 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
5567 or not, which in turn determines if the misalignment is computed inside
5568 the inner-loop, or outside LOOP. */
5570 if (init_addr != NULL_TREE)
5572 compute_in_loop = true;
5573 gcc_assert (alignment_support_scheme == dr_explicit_realign);
5577 /* 2. Determine where to generate the extra vector load.
5579 For the optimized realignment scheme, instead of generating two vector
5580 loads in each iteration, we generate a single extra vector load in the
5581 preheader of the loop, and in each iteration reuse the result of the
5582 vector load from the previous iteration. In case the memory access is in
5583 an inner-loop nested inside LOOP, which is now being vectorized using
5584 outer-loop vectorization, we need to determine whether this initial vector
5585 load should be generated at the preheader of the inner-loop, or can be
5586 generated at the preheader of LOOP. If the memory access has no evolution
5587 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
5588 to be generated inside LOOP (in the preheader of the inner-loop). */
5590 if (nested_in_vect_loop)
5592 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
5593 bool invariant_in_outerloop =
5594 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
5595 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
5598 loop_for_initial_load = loop;
5600 *at_loop = loop_for_initial_load;
5602 /* 3. For the case of the optimized realignment, create the first vector
5603 load at the loop preheader. */
5605 if (alignment_support_scheme == dr_explicit_realign_optimized)
5607 /* Create msq_init = *(floor(p1)) in the loop preheader */
5609 gcc_assert (!compute_in_loop);
5610 pe = loop_preheader_edge (loop_for_initial_load);
5611 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5612 ptr = vect_create_data_ref_ptr (stmt, loop_for_initial_load, NULL_TREE,
5613 &init_addr, &inc, true, &inv_p);
5614 data_ref = build1 (ALIGN_INDIRECT_REF, vectype, ptr);
5615 new_stmt = gimple_build_assign (vec_dest, data_ref);
5616 new_temp = make_ssa_name (vec_dest, new_stmt);
5617 gimple_assign_set_lhs (new_stmt, new_temp);
5618 mark_symbols_for_renaming (new_stmt);
5619 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5620 gcc_assert (!new_bb);
5621 msq_init = gimple_assign_lhs (new_stmt);
5624 /* 4. Create realignment token using a target builtin, if available.
5625 It is done either inside the containing loop, or before LOOP (as
5626 determined above). */
5628 if (targetm.vectorize.builtin_mask_for_load)
5632 /* Compute INIT_ADDR - the initial address accessed by this memref. */
5633 if (compute_in_loop)
5634 gcc_assert (init_addr); /* already computed by the caller. */
5637 /* Generate the INIT_ADDR computation outside LOOP. */
5638 init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
5640 pe = loop_preheader_edge (loop);
5641 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
5642 gcc_assert (!new_bb);
5645 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5646 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
5648 vec_dest = vect_create_destination_var (scalar_dest,
5649 gimple_call_return_type (new_stmt));
5650 new_temp = make_ssa_name (vec_dest, new_stmt);
5651 gimple_call_set_lhs (new_stmt, new_temp);
5653 if (compute_in_loop)
5654 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5657 /* Generate the misalignment computation outside LOOP. */
5658 pe = loop_preheader_edge (loop);
5659 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5660 gcc_assert (!new_bb);
5663 *realignment_token = gimple_call_lhs (new_stmt);
5665 /* The result of the CALL_EXPR to this builtin is determined from
5666 the value of the parameter and no global variables are touched
5667 which makes the builtin a "const" function. Requiring the
5668 builtin to have the "const" attribute makes it unnecessary
5669 to call mark_call_clobbered. */
5670 gcc_assert (TREE_READONLY (builtin_decl));
5673 if (alignment_support_scheme == dr_explicit_realign)
5676 gcc_assert (!compute_in_loop);
5677 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5680 /* 5. Create msq = phi <msq_init, lsq> in loop */
5682 pe = loop_preheader_edge (containing_loop);
5683 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5684 msq = make_ssa_name (vec_dest, NULL);
5685 phi_stmt = create_phi_node (msq, containing_loop->header);
5686 SSA_NAME_DEF_STMT (msq) = phi_stmt;
5687 add_phi_arg (phi_stmt, msq_init, pe);
5693 /* Function vect_strided_load_supported.
5695 Returns TRUE if EXTRACT_EVEN and EXTRACT_ODD operations are supported,
5696 and FALSE otherwise. */
5699 vect_strided_load_supported (tree vectype)
5701 optab perm_even_optab, perm_odd_optab;
5704 mode = (int) TYPE_MODE (vectype);
5706 perm_even_optab = optab_for_tree_code (VEC_EXTRACT_EVEN_EXPR, vectype,
5708 if (!perm_even_optab)
5710 if (vect_print_dump_info (REPORT_DETAILS))
5711 fprintf (vect_dump, "no optab for perm_even.");
5715 if (optab_handler (perm_even_optab, mode)->insn_code == CODE_FOR_nothing)
5717 if (vect_print_dump_info (REPORT_DETAILS))
5718 fprintf (vect_dump, "perm_even op not supported by target.");
5722 perm_odd_optab = optab_for_tree_code (VEC_EXTRACT_ODD_EXPR, vectype,
5724 if (!perm_odd_optab)
5726 if (vect_print_dump_info (REPORT_DETAILS))
5727 fprintf (vect_dump, "no optab for perm_odd.");
5731 if (optab_handler (perm_odd_optab, mode)->insn_code == CODE_FOR_nothing)
5733 if (vect_print_dump_info (REPORT_DETAILS))
5734 fprintf (vect_dump, "perm_odd op not supported by target.");
5741 /* Function vect_permute_load_chain.
5743 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5744 a power of 2, generate extract_even/odd stmts to reorder the input data
5745 correctly. Return the final references for loads in RESULT_CHAIN.
5747 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5748 The input is 4 vectors each containing 8 elements. We assign a number to each
5749 element; the input sequence is:
5751 1st vec: 0 1 2 3 4 5 6 7
5752 2nd vec: 8 9 10 11 12 13 14 15
5753 3rd vec: 16 17 18 19 20 21 22 23
5754 4th vec: 24 25 26 27 28 29 30 31
5756 The output sequence should be:
5758 1st vec: 0 4 8 12 16 20 24 28
5759 2nd vec: 1 5 9 13 17 21 25 29
5760 3rd vec: 2 6 10 14 18 22 26 30
5761 4th vec: 3 7 11 15 19 23 27 31
5763 i.e., the first output vector should contain the first elements of each
5764 interleaving group, etc.
5766 We use extract_even/odd instructions to create such output. The input of each
5767 extract_even/odd operation is two vectors, say 0 1 2 3 and 4 5 6 7,
5771 and the output is the vector of extracted even/odd elements. The output of
5772 extract_even will be: 0 2 4 6
5773 and of extract_odd: 1 3 5 7
5776 The permutation is done in log LENGTH stages. In each stage extract_even and
5777 extract_odd stmts are created for each pair of vectors in DR_CHAIN in their
5778 order. In our example,
5780 E1: extract_even (1st vec, 2nd vec)
5781 E2: extract_odd (1st vec, 2nd vec)
5782 E3: extract_even (3rd vec, 4th vec)
5783 E4: extract_odd (3rd vec, 4th vec)
5785 The output for the first stage will be:
5787 E1: 0 2 4 6 8 10 12 14
5788 E2: 1 3 5 7 9 11 13 15
5789 E3: 16 18 20 22 24 26 28 30
5790 E4: 17 19 21 23 25 27 29 31
5792 In order to proceed and create the correct sequence for the next stage (or
5793 for the correct output, if the second stage is the last one, as in our
5794 example), we first put the output of extract_even operation and then the
5795 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5796 The input for the second stage is:
5798 1st vec (E1): 0 2 4 6 8 10 12 14
5799 2nd vec (E3): 16 18 20 22 24 26 28 30
5800 3rd vec (E2): 1 3 5 7 9 11 13 15
5801 4th vec (E4): 17 19 21 23 25 27 29 31
5803 The output of the second stage:
5805 E1: 0 4 8 12 16 20 24 28
5806 E2: 2 6 10 14 18 22 26 30
5807 E3: 1 5 9 13 17 21 25 29
5808 E4: 3 7 11 15 19 23 27 31
5810 And RESULT_CHAIN after reordering:
5812 1st vec (E1): 0 4 8 12 16 20 24 28
5813 2nd vec (E3): 1 5 9 13 17 21 25 29
5814 3rd vec (E2): 2 6 10 14 18 22 26 30
5815 4th vec (E4): 3 7 11 15 19 23 27 31. */
5818 vect_permute_load_chain (VEC(tree,heap) *dr_chain,
5819 unsigned int length,
5821 gimple_stmt_iterator *gsi,
5822 VEC(tree,heap) **result_chain)
5824 tree perm_dest, data_ref, first_vect, second_vect;
5826 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5830 /* Check that the operation is supported. */
5831 if (!vect_strided_load_supported (vectype))
5834 *result_chain = VEC_copy (tree, heap, dr_chain);
5835 for (i = 0; i < exact_log2 (length); i++)
5837 for (j = 0; j < length; j += 2)
5839 first_vect = VEC_index (tree, dr_chain, j);
5840 second_vect = VEC_index (tree, dr_chain, j+1);
5842 /* data_ref = permute_even (first_data_ref, second_data_ref); */
5843 perm_dest = create_tmp_var (vectype, "vect_perm_even");
5844 DECL_GIMPLE_REG_P (perm_dest) = 1;
5845 add_referenced_var (perm_dest);
5847 perm_stmt = gimple_build_assign_with_ops (VEC_EXTRACT_EVEN_EXPR,
5848 perm_dest, first_vect,
5851 data_ref = make_ssa_name (perm_dest, perm_stmt);
5852 gimple_assign_set_lhs (perm_stmt, data_ref);
5853 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5854 mark_symbols_for_renaming (perm_stmt);
5856 VEC_replace (tree, *result_chain, j/2, data_ref);
5858 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
5859 perm_dest = create_tmp_var (vectype, "vect_perm_odd");
5860 DECL_GIMPLE_REG_P (perm_dest) = 1;
5861 add_referenced_var (perm_dest);
5863 perm_stmt = gimple_build_assign_with_ops (VEC_EXTRACT_ODD_EXPR,
5864 perm_dest, first_vect,
5866 data_ref = make_ssa_name (perm_dest, perm_stmt);
5867 gimple_assign_set_lhs (perm_stmt, data_ref);
5868 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5869 mark_symbols_for_renaming (perm_stmt);
5871 VEC_replace (tree, *result_chain, j/2+length/2, data_ref);
5873 dr_chain = VEC_copy (tree, heap, *result_chain);
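
/* Illustration only - a sketch, assuming vectors modeled as int arrays:
   the permutation implemented above for LENGTH = 4 vectors of 8
   elements. Each of the log2(LENGTH) stages pairs up the vectors,
   writes the extract_even results into the first half of the chain and
   the extract_odd results into the second half, matching how
   RESULT_CHAIN is filled in the loop above. Feeding it the numbered
   elements 0..31 from the comment above reproduces the intermediate
   and final chains shown there. The helper name is hypothetical. */

static void
permute_load_chain_model (int chain[4][8])
{
  int next[4][8];
  int stage, j, k;

  for (stage = 0; stage < 2; stage++)  /* log2 (4) stages. */
    {
      for (j = 0; j < 4; j += 2)
        {
          for (k = 0; k < 8; k++)
            {
              int even = 2 * k, odd = 2 * k + 1;

              /* Even/odd elements of the 16-element concatenation of
                 the pair (chain[j], chain[j+1]). */
              next[j / 2][k]
                = even < 8 ? chain[j][even] : chain[j + 1][even - 8];
              next[j / 2 + 2][k]
                = odd < 8 ? chain[j][odd] : chain[j + 1][odd - 8];
            }
        }
      for (j = 0; j < 4; j++)
        for (k = 0; k < 8; k++)
          chain[j][k] = next[j][k];
    }
}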
5879 /* Function vect_transform_strided_load.
5881 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
5882 to perform their permutation and ascribe the result vectorized statements to
5883 the scalar statements.
5887 vect_transform_strided_load (gimple stmt, VEC(tree,heap) *dr_chain, int size,
5888 gimple_stmt_iterator *gsi)
5890 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5891 gimple first_stmt = DR_GROUP_FIRST_DR (stmt_info);
5892 gimple next_stmt, new_stmt;
5893 VEC(tree,heap) *result_chain = NULL;
5894 unsigned int i, gap_count;
5897 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
5898 RESULT_CHAIN is the output of vect_permute_load_chain; it contains permuted
5899 vectors that are ready for vector computation. */
5900 result_chain = VEC_alloc (tree, heap, size);
5902 if (!vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain))
5905 /* Put a permuted data-ref in the VECTORIZED_STMT field.
5906 Since we scan the chain starting from its first node, their order
5907 corresponds to the order of data-refs in RESULT_CHAIN. */
5908 next_stmt = first_stmt;
5910 for (i = 0; VEC_iterate (tree, result_chain, i, tmp_data_ref); i++)
5915 /* Skip the gaps. Loads created for the gaps will be removed by the dead
5916 code elimination pass later. No need to check for the first stmt in
5917 the group, since it always exists.
5918 DR_GROUP_GAP is the number of steps in elements from the previous
5919 access (if there is no gap DR_GROUP_GAP is 1). We skip loads that
5920 correspond to the gaps.
5922 if (next_stmt != first_stmt
5923 && gap_count < DR_GROUP_GAP (vinfo_for_stmt (next_stmt)))
5931 new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
5932 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
5933 copies, and we put the new vector statement in the first available
RELATED_STMT field. */
5935 if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
5936 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
gimple prev_stmt =
5940 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
gimple rel_stmt =
5942 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
5945 prev_stmt = rel_stmt;
5946 rel_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
5948 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) = new_stmt;
5950 next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt));
5952 /* If NEXT_STMT accesses the same DR as the previous statement,
5953 put the same TMP_DATA_REF as its vectorized statement; otherwise
5954 get the next data-ref from RESULT_CHAIN. */
5955 if (!next_stmt || !DR_GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5960 VEC_free (tree, heap, result_chain);
5965 /* Create NCOPIES permutation statements using the mask MASK_ARRAY (by
5966 building a vector of type MASK_TYPE from it) and two input vectors placed in
5967 DR_CHAIN at FIRST_VEC_INDX and SECOND_VEC_INDX for the first copy and
5968 shifting by STRIDE elements of DR_CHAIN for every copy.
5969 (STRIDE is the number of vectorized stmts for NODE divided by the number of copies).
5971 VECT_STMTS_COUNTER specifies the index in the vectorized stmts of NODE, where
5972 the created stmts must be inserted. */
5975 vect_create_mask_and_perm (gimple stmt, gimple next_scalar_stmt,
5976 int *mask_array, int mask_nunits,
5977 tree mask_element_type, tree mask_type,
5978 int first_vec_indx, int second_vec_indx,
5979 gimple_stmt_iterator *gsi, slp_tree node,
5980 tree builtin_decl, tree vectype,
5981 VEC(tree,heap) *dr_chain,
5982 int ncopies, int vect_stmts_counter)
5984 tree t = NULL_TREE, mask_vec, mask, perm_dest;
5985 gimple perm_stmt = NULL;
5986 stmt_vec_info next_stmt_info;
5987 int i, group_size, stride, dr_chain_size;
5988 tree first_vec, second_vec, data_ref;
5991 VEC (tree, heap) *params = NULL;
5993 /* Create a vector mask. */
5994 for (i = mask_nunits - 1; i >= 0; --i)
5995 t = tree_cons (NULL_TREE, build_int_cst (mask_element_type, mask_array[i]),
5998 mask_vec = build_vector (mask_type, t);
5999 mask = vect_init_vector (stmt, mask_vec, mask_type, NULL);
6001 group_size = VEC_length (gimple, SLP_TREE_SCALAR_STMTS (node));
6002 stride = SLP_TREE_NUMBER_OF_VEC_STMTS (node) / ncopies;
6003 dr_chain_size = VEC_length (tree, dr_chain);
6005 /* Initialize the vect stmts of NODE to properly insert the generated stmts. */
6007 for (i = VEC_length (gimple, SLP_TREE_VEC_STMTS (node));
6008 i < (int) SLP_TREE_NUMBER_OF_VEC_STMTS (node); i++)
6009 VEC_quick_push (gimple, SLP_TREE_VEC_STMTS (node), NULL);
6011 perm_dest = vect_create_destination_var (gimple_assign_lhs (stmt), vectype);
6012 for (i = 0; i < ncopies; i++)
6014 first_vec = VEC_index (tree, dr_chain, first_vec_indx);
6015 second_vec = VEC_index (tree, dr_chain, second_vec_indx);
6017 /* Build argument list for the vectorized call. */
6018 VEC_free (tree, heap, params);
6019 params = VEC_alloc (tree, heap, 3);
6020 VEC_quick_push (tree, params, first_vec);
6021 VEC_quick_push (tree, params, second_vec);
6022 VEC_quick_push (tree, params, mask);
6024 /* Generate the permute statement. */
6025 perm_stmt = gimple_build_call_vec (builtin_decl, params);
6026 data_ref = make_ssa_name (perm_dest, perm_stmt);
6027 gimple_call_set_lhs (perm_stmt, data_ref);
6028 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6029 FOR_EACH_SSA_TREE_OPERAND (sym, perm_stmt, iter, SSA_OP_ALL_VIRTUALS)
6031 if (TREE_CODE (sym) == SSA_NAME)
6032 sym = SSA_NAME_VAR (sym);
6033 mark_sym_for_renaming (sym);
6036 /* Store the vector statement in NODE. */
6037 VEC_replace (gimple, SLP_TREE_VEC_STMTS (node),
6038 stride * i + vect_stmts_counter, perm_stmt);
6040 first_vec_indx += stride;
6041 second_vec_indx += stride;
6044 /* Mark the scalar stmt as vectorized. */
6045 next_stmt_info = vinfo_for_stmt (next_scalar_stmt);
6046 STMT_VINFO_VEC_STMT (next_stmt_info) = perm_stmt;
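
/* Illustration only - a sketch of the element-wise semantics this code
   assumes of the target's vec_perm builtin (targets may instead encode
   the mask in bytes, as the comment below notes for AltiVec): each mask
   entry selects one element from the concatenation of the two input
   vectors. The helper name is hypothetical. */

static void
vec_perm_model (const int *first, const int *second, const int *mask,
                int nunits, int *out)
{
  int i;

  for (i = 0; i < nunits; i++)
    out[i] = mask[i] < nunits ? first[mask[i]] : second[mask[i] - nunits];
}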
6050 /* Given FIRST_MASK_ELEMENT - the mask element in element representation,
6051 return in CURRENT_MASK_ELEMENT its equivalent in target specific
6052 representation. Check that the mask is valid and return FALSE if not.
6053 Return TRUE in NEED_NEXT_VECTOR if the permutation requires moving to
6054 the next vector, i.e., the current first vector is not needed. */
6057 vect_get_mask_element (gimple stmt, int first_mask_element, int m,
6058 int mask_nunits, bool only_one_vec, int index,
6059 int *mask, int *current_mask_element,
6060 bool *need_next_vector)
6063 static int number_of_mask_fixes = 1;
6064 static bool mask_fixed = false;
6065 static bool needs_first_vector = false;
6067 /* Convert to target specific representation. */
6068 *current_mask_element = first_mask_element + m;
6069 /* Adjust the value in case it's a mask for second and third vectors. */
6070 *current_mask_element -= mask_nunits * (number_of_mask_fixes - 1);
6072 if (*current_mask_element < mask_nunits)
6073 needs_first_vector = true;
6075 /* We have only one input vector to permute but the mask accesses values in
6076 the next vector as well. */
6077 if (only_one_vec && *current_mask_element >= mask_nunits)
6079 if (vect_print_dump_info (REPORT_DETAILS))
6081 fprintf (vect_dump, "permutation requires at least two vectors ");
6082 print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
6088 /* The mask requires the next vector. */
6089 if (*current_mask_element >= mask_nunits * 2)
6091 if (needs_first_vector || mask_fixed)
6093 /* We either need the first vector too or have already moved to the
6094 next vector. In both cases, this permutation needs three vectors. */
6096 if (vect_print_dump_info (REPORT_DETAILS))
6098 fprintf (vect_dump, "permutation requires at "
6099 "least three vectors ");
6100 print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
6106 /* We move to the next vector, dropping the first one and working with
6107 the second and the third - we need to adjust the values of the mask accordingly. */
6109 *current_mask_element -= mask_nunits * number_of_mask_fixes;
6111 for (i = 0; i < index; i++)
6112 mask[i] -= mask_nunits * number_of_mask_fixes;
6114 (number_of_mask_fixes)++;
6118 *need_next_vector = mask_fixed;
6120 /* This was the last element of this mask. Start a new one. */
6121 if (index == mask_nunits - 1)
6123 number_of_mask_fixes = 1;
6125 needs_first_vector = false;
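
/* Illustration only - a simplified sketch of the window shift performed
   above (the real code tracks repeated shifts via NUMBER_OF_MASK_FIXES;
   the helper name here is hypothetical): mask elements index the
   concatenation of two NUNITS-element vectors, so they must lie in
   [0, 2*NUNITS). When ELEMENT reaches the third input vector, the
   window advances by one vector and every index gathered so far drops
   by NUNITS. */

static int
shift_mask_window (int *mask, int index, int nunits, int element)
{
  int i;

  if (element >= 2 * nunits)
    {
      /* Re-express ELEMENT and the indices collected so far relative
         to the next pair of input vectors. */
      element -= nunits;
      for (i = 0; i < index; i++)
        mask[i] -= nunits;
    }
  return element;
}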
6132 /* Generate vector permute statements from a list of loads in DR_CHAIN.
6133 If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
6134 permute statements for SLP_NODE_INSTANCE. */
6136 vect_transform_slp_perm_load (gimple stmt, VEC (tree, heap) *dr_chain,
6137 gimple_stmt_iterator *gsi, int vf,
6138 slp_instance slp_node_instance, bool analyze_only)
6140 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6141 tree mask_element_type = NULL_TREE, mask_type;
6142 int i, j, k, m, scale, mask_nunits, nunits, vec_index = 0, scalar_index;
6144 tree vectype = STMT_VINFO_VECTYPE (stmt_info), builtin_decl;
6145 gimple next_scalar_stmt;
6146 int group_size = SLP_INSTANCE_GROUP_SIZE (slp_node_instance);
6147 int first_mask_element;
6148 int index, unroll_factor, *mask, current_mask_element, ncopies;
6149 bool only_one_vec = false, need_next_vector = false;
6150 int first_vec_index, second_vec_index, orig_vec_stmts_num, vect_stmts_counter;
6152 if (!targetm.vectorize.builtin_vec_perm)
6154 if (vect_print_dump_info (REPORT_DETAILS))
6156 fprintf (vect_dump, "no builtin for vect permute for ");
6157 print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
6163 builtin_decl = targetm.vectorize.builtin_vec_perm (vectype,
6164 &mask_element_type);
6165 if (!builtin_decl || !mask_element_type)
6167 if (vect_print_dump_info (REPORT_DETAILS))
6169 fprintf (vect_dump, "no builtin for vect permute for ");
6170 print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
6176 mask_type = get_vectype_for_scalar_type (mask_element_type);
6177 mask_nunits = TYPE_VECTOR_SUBPARTS (mask_type);
6178 mask = (int *) xmalloc (sizeof (int) * mask_nunits);
6179 nunits = TYPE_VECTOR_SUBPARTS (vectype);
6180 scale = mask_nunits / nunits;
6181 unroll_factor = SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance);
6183 /* The number of vector stmts to generate based only on SLP_NODE_INSTANCE
6184 unrolling factor. */
6185 orig_vec_stmts_num = group_size *
6186 SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance) / nunits;
6187 if (orig_vec_stmts_num == 1)
6188 only_one_vec = true;
6190 /* Number of copies is determined by the final vectorization factor
6191 relative to the SLP_NODE_INSTANCE unrolling factor. */
6192 ncopies = vf / SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance);
6194 /* Generate permutation masks for every NODE. Number of masks for each NODE
6195 is equal to GROUP_SIZE.
6196 E.g., we have a group of three nodes with three loads from the same
6197 location in each node, and the vector size is 4. I.e., we have an
6198 a0b0c0a1b1c1... sequence and we need to create the following vectors:
6199 for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
6200 for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
6203 The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9} (in target
6204 specific type, e.g., in bytes for AltiVec).
6205 The last mask is illegal since we assume two operands for permute
6206 operation, and the mask element values can't be outside that range. Hence,
6207 the last mask must be converted into {2,5,5,5}.
6208 For the first two permutations we need the first and the second input
6209 vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
6210 we need the second and the third vectors: {b1,c1,a2,b2} and {c2,a3,b3,c3}. */
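
/* Illustration only - how the indices above arise: for node I of
   GROUP_SIZE interleaved nodes, the element needed in unrolled
   iteration J sits at index (I + J * GROUP_SIZE) * SCALE of the
   flattened input; this is the FIRST_MASK_ELEMENT computed below.
   For the a's above (I = 0, GROUP_SIZE = 3, SCALE = 1) that gives
   0, 3, 6, 9, each requested once per load in the node, i.e. the
   sequence 0,0,0,3,3,3,6,6,6,9,9,9; packed into masks of 4 this
   yields {0,0,0,3} {3,3,6,6} {6,9,9,9}, and the out-of-range fix in
   vect_get_mask_element rewrites the last one as {2,5,5,5} while
   advancing the input pair. */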
6214 VEC_iterate (slp_tree, SLP_INSTANCE_LOADS (slp_node_instance),
6220 vect_stmts_counter = 0;
6222 first_vec_index = vec_index++;
6224 second_vec_index = first_vec_index;
6226 second_vec_index = vec_index++;
6228 for (j = 0; j < unroll_factor; j++)
6230 for (k = 0; k < group_size; k++)
6232 first_mask_element = (i + j * group_size) * scale;
6233 for (m = 0; m < scale; m++)
6235 if (!vect_get_mask_element (stmt, first_mask_element, m,
6236 mask_nunits, only_one_vec, index, mask,
6237 &current_mask_element, &need_next_vector))
6240 mask[index++] = current_mask_element;
6243 if (index == mask_nunits)
6248 if (need_next_vector)
6250 first_vec_index = second_vec_index;
6251 second_vec_index = vec_index;
6254 next_scalar_stmt = VEC_index (gimple,
6255 SLP_TREE_SCALAR_STMTS (node), scalar_index++);
6257 vect_create_mask_and_perm (stmt, next_scalar_stmt,
6258 mask, mask_nunits, mask_element_type, mask_type,
6259 first_vec_index, second_vec_index, gsi, node,
6260 builtin_decl, vectype, dr_chain, ncopies,
6261 vect_stmts_counter++);
6272 /* vectorizable_load.
6274 Check if STMT reads a non-scalar data-ref (array/pointer/structure) that can be vectorized.
6276 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
6277 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
6278 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
6281 vectorizable_load (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt,
6282 slp_tree slp_node, slp_instance slp_node_instance)
6285 tree vec_dest = NULL;
6286 tree data_ref = NULL;
6287 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6288 stmt_vec_info prev_stmt_info;
6289 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6290 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6291 struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
6292 bool nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
6293 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info), *first_dr;
6294 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6297 gimple new_stmt = NULL;
6299 enum dr_alignment_support alignment_support_scheme;
6300 tree dataref_ptr = NULL_TREE;
6302 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
6304 int i, j, group_size;
6305 tree msq = NULL_TREE, lsq;
6306 tree offset = NULL_TREE;
6307 tree realignment_token = NULL_TREE;
6309 VEC(tree,heap) *dr_chain = NULL;
6310 bool strided_load = false;
6314 bool compute_in_loop = false;
6315 struct loop *at_loop;
6317 bool slp = (slp_node != NULL);
6318 bool slp_perm = false;
6319 enum tree_code code;
6321 /* Multiple types in SLP are handled by creating the appropriate number of
6322 vectorized stmts for each SLP node. Hence, NCOPIES is always 1 in case of SLP. */
6327 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
6329 gcc_assert (ncopies >= 1);
6331 /* FORNOW. This restriction should be relaxed. */
6332 if (nested_in_vect_loop && ncopies > 1)
6334 if (vect_print_dump_info (REPORT_DETAILS))
6335 fprintf (vect_dump, "multiple types in nested loop.");
6339 if (slp && SLP_INSTANCE_LOAD_PERMUTATION (slp_node_instance))
slp_perm = true;
6342 if (!STMT_VINFO_RELEVANT_P (stmt_info))
6345 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
6348 /* Is vectorizable load? */
6349 if (!is_gimple_assign (stmt))
6352 scalar_dest = gimple_assign_lhs (stmt);
6353 if (TREE_CODE (scalar_dest) != SSA_NAME)
6356 code = gimple_assign_rhs_code (stmt);
6357 if (code != ARRAY_REF
6358 && code != INDIRECT_REF
6359 && !STMT_VINFO_STRIDED_ACCESS (stmt_info))
6362 if (!STMT_VINFO_DATA_REF (stmt_info))
6365 scalar_type = TREE_TYPE (DR_REF (dr));
6366 mode = (int) TYPE_MODE (vectype);
6368 /* FORNOW. In some cases can vectorize even if data-type not supported
6369 (e.g. - data copies). */
6370 if (optab_handler (mov_optab, mode)->insn_code == CODE_FOR_nothing)
6372 if (vect_print_dump_info (REPORT_DETAILS))
6373 fprintf (vect_dump, "Aligned load, but unsupported type.");
6377 /* If accesses through a pointer to vectype do not alias the original
6378 memory reference we have a problem. */
6379 if (get_alias_set (vectype) != get_alias_set (scalar_type)
6380 && !alias_set_subset_of (get_alias_set (vectype),
6381 get_alias_set (scalar_type)))
6383 if (vect_print_dump_info (REPORT_DETAILS))
6384 fprintf (vect_dump, "vector type does not alias scalar type");
6388 /* Check if the load is a part of an interleaving chain. */
6389 if (STMT_VINFO_STRIDED_ACCESS (stmt_info))
6391 strided_load = true;
6393 gcc_assert (! nested_in_vect_loop);
6395 /* Check if interleaving is supported. */
6396 if (!vect_strided_load_supported (vectype)
6397 && !PURE_SLP_STMT (stmt_info) && !slp)
6401 if (!vec_stmt) /* transformation not required. */
6403 STMT_VINFO_TYPE (stmt_info) = load_vec_info_type;
6404 vect_model_load_cost (stmt_info, ncopies, NULL);
6408 if (vect_print_dump_info (REPORT_DETAILS))
6409 fprintf (vect_dump, "transform load.");
6415 first_stmt = DR_GROUP_FIRST_DR (stmt_info);
6416 /* Check if the chain of loads is already vectorized. */
6417 if (STMT_VINFO_VEC_STMT (vinfo_for_stmt (first_stmt)))
6419 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
6422 first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
6423 group_size = DR_GROUP_SIZE (vinfo_for_stmt (first_stmt));
6425 /* VEC_NUM is the number of vect stmts to be created for this group. */
6428 strided_load = false;
6429 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
6432 vec_num = group_size;
6434 dr_chain = VEC_alloc (tree, heap, vec_num);
6440 group_size = vec_num = 1;
6443 alignment_support_scheme = vect_supportable_dr_alignment (first_dr);
6444 gcc_assert (alignment_support_scheme);
6446 /* In case the vectorization factor (VF) is bigger than the number
6447 of elements that we can fit in a vectype (nunits), we have to generate
6448 more than one vector stmt - i.e - we need to "unroll" the
6449 vector stmt by a factor VF/nunits. In doing so, we record a pointer
6450 from one copy of the vector stmt to the next, in the field
6451 STMT_VINFO_RELATED_STMT. This is necessary in order to allow following
6452 stages to find the correct vector defs to be used when vectorizing
6453 stmts that use the defs of the current stmt. The example below illustrates
6454 the vectorization process when VF=16 and nunits=4 (i.e - we need to create
6455 4 vectorized stmts):
6457 before vectorization:
6458 RELATED_STMT VEC_STMT
6462 step 1: vectorize stmt S1:
6463 We first create the vector stmt VS1_0, and, as usual, record a
6464 pointer to it in the STMT_VINFO_VEC_STMT of the scalar stmt S1.
6465 Next, we create the vector stmt VS1_1, and record a pointer to
6466 it in the STMT_VINFO_RELATED_STMT of the vector stmt VS1_0.
6467 Similarly, for VS1_2 and VS1_3. This is the resulting chain of
6469 RELATED_STMT VEC_STMT
6470 VS1_0: vx0 = memref0 VS1_1 -
6471 VS1_1: vx1 = memref1 VS1_2 -
6472 VS1_2: vx2 = memref2 VS1_3 -
6473 VS1_3: vx3 = memref3 - -
6474 S1: x = load - VS1_0
6477 See in documentation in vect_get_vec_def_for_stmt_copy for how the
6478 information we recorded in RELATED_STMT field is used to vectorize
6481 /* In case of interleaving (non-unit strided access):
6488 Vectorized loads are created in the order of memory accesses
6489 starting from the access of the first stmt of the chain:
6492 VS2: vx1 = &base + vec_size*1
6493 VS3: vx3 = &base + vec_size*2
6494 VS4: vx4 = &base + vec_size*3
6496 Then permutation statements are generated:
6498 VS5: vx5 = VEC_EXTRACT_EVEN_EXPR < vx0, vx1 >
6499 VS6: vx6 = VEC_EXTRACT_ODD_EXPR < vx0, vx1 >
6502 And they are put in STMT_VINFO_VEC_STMT of the corresponding scalar stmts
6503 (the order of the data-refs in the output of vect_permute_load_chain
6504 corresponds to the order of scalar stmts in the interleaving chain - see
6505 the documentation of vect_permute_load_chain()).
6506 The generation of permutation stmts and recording them in
6507 STMT_VINFO_VEC_STMT is done in vect_transform_strided_load().
6509 In case of both multiple types and interleaving, the vector loads and
6510 permutation stmts above are created for every copy. The result vector stmts
6511 are put in STMT_VINFO_VEC_STMT for the first copy and in the corresponding
6512 STMT_VINFO_RELATED_STMT for the next copies. */
6514 /* If the data reference is aligned (dr_aligned) or potentially unaligned
6515 on a target that supports unaligned accesses (dr_unaligned_supported)
6516 we generate the following code:
6520 p = p + indx * vectype_size;
6525 Otherwise, the data reference is potentially unaligned on a target that
6526 does not support unaligned accesses (dr_explicit_realign_optimized) -
6527 then generate the following code, in which the data in each iteration is
6528 obtained by two vector loads, one from the previous iteration, and one
6529 from the current iteration:
6531 msq_init = *(floor(p1))
6532 p2 = initial_addr + VS - 1;
6533 realignment_token = call target_builtin;
6536 p2 = p2 + indx * vectype_size
6538 vec_dest = realign_load (msq, lsq, realignment_token)
6543 /* If the misalignment remains the same throughout the execution of the
6544 loop, we can create the init_addr and permutation mask at the loop
6545 preheader. Otherwise, it needs to be created inside the loop.
6546 This can only occur when vectorizing memory accesses in the inner-loop
6547 nested within an outer-loop that is being vectorized. */
6549 if (nested_in_vect_loop_p (loop, stmt)
6550 && (TREE_INT_CST_LOW (DR_STEP (dr))
6551 % GET_MODE_SIZE (TYPE_MODE (vectype)) != 0))
6553 gcc_assert (alignment_support_scheme != dr_explicit_realign_optimized);
6554 compute_in_loop = true;
6557 if ((alignment_support_scheme == dr_explicit_realign_optimized
6558 || alignment_support_scheme == dr_explicit_realign)
6559 && !compute_in_loop)
6561 msq = vect_setup_realignment (first_stmt, gsi, &realignment_token,
6562 alignment_support_scheme, NULL_TREE,
6564 if (alignment_support_scheme == dr_explicit_realign_optimized)
6566 phi = SSA_NAME_DEF_STMT (msq);
6567 offset = size_int (TYPE_VECTOR_SUBPARTS (vectype) - 1);
6573 prev_stmt_info = NULL;
6574 for (j = 0; j < ncopies; j++)
6576 /* 1. Create the vector pointer update chain. */
6578 dataref_ptr = vect_create_data_ref_ptr (first_stmt,
6580 &dummy, &ptr_incr, false,
6584 bump_vector_ptr (dataref_ptr, ptr_incr, gsi, stmt, NULL_TREE);
6586 for (i = 0; i < vec_num; i++)
6589 dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, gsi, stmt,
6592 /* 2. Create the vector-load in the loop. */
6593 switch (alignment_support_scheme)
6596 gcc_assert (aligned_access_p (first_dr));
6597 data_ref = build_fold_indirect_ref (dataref_ptr);
6599 case dr_unaligned_supported:
6601 int mis = DR_MISALIGNMENT (first_dr);
6602 tree tmis = (mis == -1 ? size_zero_node : size_int (mis));
6604 tmis = size_binop (MULT_EXPR, tmis, size_int (BITS_PER_UNIT));
6606 build2 (MISALIGNED_INDIRECT_REF, vectype, dataref_ptr, tmis);
6609 case dr_explicit_realign:
6612 tree vs_minus_1 = size_int (TYPE_VECTOR_SUBPARTS (vectype) - 1);
6614 if (compute_in_loop)
6615 msq = vect_setup_realignment (first_stmt, gsi,
6617 dr_explicit_realign,
6620 data_ref = build1 (ALIGN_INDIRECT_REF, vectype, dataref_ptr);
6621 vec_dest = vect_create_destination_var (scalar_dest, vectype);
6622 new_stmt = gimple_build_assign (vec_dest, data_ref);
6623 new_temp = make_ssa_name (vec_dest, new_stmt);
6624 gimple_assign_set_lhs (new_stmt, new_temp);
6625 vect_finish_stmt_generation (stmt, new_stmt, gsi);
6626 copy_virtual_operands (new_stmt, stmt);
6627 mark_symbols_for_renaming (new_stmt);
6630 bump = size_binop (MULT_EXPR, vs_minus_1,
6631 TYPE_SIZE_UNIT (scalar_type));
6632 ptr = bump_vector_ptr (dataref_ptr, NULL, gsi, stmt, bump);
6633 data_ref = build1 (ALIGN_INDIRECT_REF, vectype, ptr);
6636 case dr_explicit_realign_optimized:
6637 data_ref = build1 (ALIGN_INDIRECT_REF, vectype, dataref_ptr);
6642 vec_dest = vect_create_destination_var (scalar_dest, vectype);
6643 new_stmt = gimple_build_assign (vec_dest, data_ref);
6644 new_temp = make_ssa_name (vec_dest, new_stmt);
6645 gimple_assign_set_lhs (new_stmt, new_temp);
6646 vect_finish_stmt_generation (stmt, new_stmt, gsi);
6647 mark_symbols_for_renaming (new_stmt);
6649 /* 3. Handle explicit realignment if necessary/supported. Create in
6650 loop: vec_dest = realign_load (msq, lsq, realignment_token) */
6651 if (alignment_support_scheme == dr_explicit_realign_optimized
6652 || alignment_support_scheme == dr_explicit_realign)
6656 lsq = gimple_assign_lhs (new_stmt);
6657 if (!realignment_token)
6658 realignment_token = dataref_ptr;
6659 vec_dest = vect_create_destination_var (scalar_dest, vectype);
6660 tmp = build3 (REALIGN_LOAD_EXPR, vectype, msq, lsq,
6662 new_stmt = gimple_build_assign (vec_dest, tmp);
6663 new_temp = make_ssa_name (vec_dest, new_stmt);
6664 gimple_assign_set_lhs (new_stmt, new_temp);
6665 vect_finish_stmt_generation (stmt, new_stmt, gsi);
6667 if (alignment_support_scheme == dr_explicit_realign_optimized)
6670 if (i == vec_num - 1 && j == ncopies - 1)
6671 add_phi_arg (phi, lsq, loop_latch_edge (containing_loop));
6676 /* 4. Handle invariant-load. */
6679 gcc_assert (!strided_load);
6680 gcc_assert (nested_in_vect_loop_p (loop, stmt));
6685 tree vec_inv, bitpos, bitsize = TYPE_SIZE (scalar_type);
6687 /* CHECKME: bitpos depends on endianness? */
6688 bitpos = bitsize_zero_node;
6689 vec_inv = build3 (BIT_FIELD_REF, scalar_type, new_temp,
6692 vect_create_destination_var (scalar_dest, NULL_TREE);
6693 new_stmt = gimple_build_assign (vec_dest, vec_inv);
6694 new_temp = make_ssa_name (vec_dest, new_stmt);
6695 gimple_assign_set_lhs (new_stmt, new_temp);
6696 vect_finish_stmt_generation (stmt, new_stmt, gsi);
6698 for (k = nunits - 1; k >= 0; --k)
6699 t = tree_cons (NULL_TREE, new_temp, t);
6700 /* FIXME: use build_constructor directly. */
6701 vec_inv = build_constructor_from_list (vectype, t);
6702 new_temp = vect_init_vector (stmt, vec_inv, vectype, gsi);
6703 new_stmt = SSA_NAME_DEF_STMT (new_temp);
6706 gcc_unreachable (); /* FORNOW. */
6709 /* Collect vector loads and later create their permutation in
6710 vect_transform_strided_load (). */
6711 if (strided_load || slp_perm)
6712 VEC_quick_push (tree, dr_chain, new_temp);
6714 /* Store vector loads in the corresponding SLP_NODE. */
6715 if (slp && !slp_perm)
6716 VEC_quick_push (gimple, SLP_TREE_VEC_STMTS (slp_node), new_stmt);
6719 if (slp && !slp_perm)
6724 if (!vect_transform_slp_perm_load (stmt, dr_chain, gsi,
6725 LOOP_VINFO_VECT_FACTOR (loop_vinfo),
6726 slp_node_instance, false))
6728 VEC_free (tree, heap, dr_chain);
6736 if (!vect_transform_strided_load (stmt, dr_chain, group_size, gsi))
6739 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
6740 VEC_free (tree, heap, dr_chain);
6741 dr_chain = VEC_alloc (tree, heap, group_size);
6746 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
6748 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
6749 prev_stmt_info = vinfo_for_stmt (new_stmt);
6755 VEC_free (tree, heap, dr_chain);
6761 /* Function vectorizable_live_operation.
6763 STMT computes a value that is used outside the loop. Check if
6764 it can be supported. */
6767 vectorizable_live_operation (gimple stmt,
6768 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
6769 gimple *vec_stmt ATTRIBUTE_UNUSED)
6771 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6772 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6773 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6779 enum vect_def_type dt;
6780 enum tree_code code;
6781 enum gimple_rhs_class rhs_class;
6783 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
6785 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
6788 if (!is_gimple_assign (stmt))
6791 if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
6794 /* FORNOW. CHECKME. */
6795 if (nested_in_vect_loop_p (loop, stmt))
6798 code = gimple_assign_rhs_code (stmt);
6799 op_type = TREE_CODE_LENGTH (code);
6800 rhs_class = get_gimple_rhs_class (code);
6801 gcc_assert (rhs_class != GIMPLE_UNARY_RHS || op_type == unary_op);
6802 gcc_assert (rhs_class != GIMPLE_BINARY_RHS || op_type == binary_op);
6804 /* FORNOW: support only if all uses are invariant. This means
6805 that the scalar operations can remain in place, unvectorized.
6806 The original last scalar value that they compute will be used. */
6808 for (i = 0; i < op_type; i++)
6810 if (rhs_class == GIMPLE_SINGLE_RHS)
6811 op = TREE_OPERAND (gimple_op (stmt, 1), i);
6813 op = gimple_op (stmt, i + 1);
6814 if (op && !vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt))
6816 if (vect_print_dump_info (REPORT_DETAILS))
6817 fprintf (vect_dump, "use not simple.");
6821 if (dt != vect_invariant_def && dt != vect_constant_def)
6825 /* No transformation is required for the cases we currently support. */
6830 /* Function vect_is_simple_cond.
6833 LOOP - the loop that is being vectorized.
6834 COND - Condition that is checked for simple use.
6836 Returns whether a COND can be vectorized. Checks whether
6837 condition operands are supportable using vect_is_simple_use. */
6840 vect_is_simple_cond (tree cond, loop_vec_info loop_vinfo)
6844 enum vect_def_type dt;
6846 if (!COMPARISON_CLASS_P (cond))
6849 lhs = TREE_OPERAND (cond, 0);
6850 rhs = TREE_OPERAND (cond, 1);
6852 if (TREE_CODE (lhs) == SSA_NAME)
6854 gimple lhs_def_stmt = SSA_NAME_DEF_STMT (lhs);
6855 if (!vect_is_simple_use (lhs, loop_vinfo, &lhs_def_stmt, &def, &dt))
6858 else if (TREE_CODE (lhs) != INTEGER_CST && TREE_CODE (lhs) != REAL_CST
6859 && TREE_CODE (lhs) != FIXED_CST)
6862 if (TREE_CODE (rhs) == SSA_NAME)
6864 gimple rhs_def_stmt = SSA_NAME_DEF_STMT (rhs);
6865 if (!vect_is_simple_use (rhs, loop_vinfo, &rhs_def_stmt, &def, &dt))
6868 else if (TREE_CODE (rhs) != INTEGER_CST && TREE_CODE (rhs) != REAL_CST
6869 && TREE_CODE (rhs) != FIXED_CST)
6875 /* vectorizable_condition.
6877 Check if STMT is a conditional modify expression that can be vectorized.
6878 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
6879 stmt using VEC_COND_EXPR to replace it, put it in VEC_STMT, and insert it
6882 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
6885 vectorizable_condition (gimple stmt, gimple_stmt_iterator *gsi,
6888 tree scalar_dest = NULL_TREE;
6889 tree vec_dest = NULL_TREE;
6890 tree op = NULL_TREE;
6891 tree cond_expr, then_clause, else_clause;
6892 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6893 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6894 tree vec_cond_lhs, vec_cond_rhs, vec_then_clause, vec_else_clause;
6895 tree vec_compare, vec_cond_expr;
6897 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6898 enum machine_mode vec_mode;
6900 enum vect_def_type dt;
6901 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
6902 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
6903 enum tree_code code;
6905 gcc_assert (ncopies >= 1);
if (ncopies > 1)
6907 return false; /* FORNOW */
6909 if (!STMT_VINFO_RELEVANT_P (stmt_info))
6912 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
6915 /* FORNOW: SLP not supported. */
6916 if (STMT_SLP_TYPE (stmt_info))
6919 /* FORNOW: not yet supported. */
6920 if (STMT_VINFO_LIVE_P (stmt_info))
6922 if (vect_print_dump_info (REPORT_DETAILS))
6923 fprintf (vect_dump, "value used after loop.");
6927 /* Is vectorizable conditional operation? */
6928 if (!is_gimple_assign (stmt))
6931 code = gimple_assign_rhs_code (stmt);
6933 if (code != COND_EXPR)
6936 gcc_assert (gimple_assign_single_p (stmt));
6937 op = gimple_assign_rhs1 (stmt);
6938 cond_expr = TREE_OPERAND (op, 0);
6939 then_clause = TREE_OPERAND (op, 1);
6940 else_clause = TREE_OPERAND (op, 2);
6942 if (!vect_is_simple_cond (cond_expr, loop_vinfo))
6945 /* We do not handle two different vector types for the condition and the values. */
6947 if (TREE_TYPE (TREE_OPERAND (cond_expr, 0)) != TREE_TYPE (vectype))
6950 if (TREE_CODE (then_clause) == SSA_NAME)
6952 gimple then_def_stmt = SSA_NAME_DEF_STMT (then_clause);
6953 if (!vect_is_simple_use (then_clause, loop_vinfo,
6954 &then_def_stmt, &def, &dt))
6957 else if (TREE_CODE (then_clause) != INTEGER_CST
6958 && TREE_CODE (then_clause) != REAL_CST
6959 && TREE_CODE (then_clause) != FIXED_CST)
6962 if (TREE_CODE (else_clause) == SSA_NAME)
6964 gimple else_def_stmt = SSA_NAME_DEF_STMT (else_clause);
6965 if (!vect_is_simple_use (else_clause, loop_vinfo,
6966 &else_def_stmt, &def, &dt))
6969 else if (TREE_CODE (else_clause) != INTEGER_CST
6970 && TREE_CODE (else_clause) != REAL_CST
6971 && TREE_CODE (else_clause) != FIXED_CST)
6975 vec_mode = TYPE_MODE (vectype);
6979 STMT_VINFO_TYPE (stmt_info) = condition_vec_info_type;
6980 return expand_vec_cond_expr_p (op, vec_mode);
6986 scalar_dest = gimple_assign_lhs (stmt);
6987 vec_dest = vect_create_destination_var (scalar_dest, vectype);
6989 /* Handle cond expr. */
6991 vect_get_vec_def_for_operand (TREE_OPERAND (cond_expr, 0), stmt, NULL);
6993 vect_get_vec_def_for_operand (TREE_OPERAND (cond_expr, 1), stmt, NULL);
6994 vec_then_clause = vect_get_vec_def_for_operand (then_clause, stmt, NULL);
6995 vec_else_clause = vect_get_vec_def_for_operand (else_clause, stmt, NULL);
6997 /* Arguments are ready. Create the new vector stmt. */
6998 vec_compare = build2 (TREE_CODE (cond_expr), vectype,
6999 vec_cond_lhs, vec_cond_rhs);
7000 vec_cond_expr = build3 (VEC_COND_EXPR, vectype,
7001 vec_compare, vec_then_clause, vec_else_clause);
7003 *vec_stmt = gimple_build_assign (vec_dest, vec_cond_expr);
7004 new_temp = make_ssa_name (vec_dest, *vec_stmt);
7005 gimple_assign_set_lhs (*vec_stmt, new_temp);
7006 vect_finish_stmt_generation (stmt, *vec_stmt, gsi);
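
/* Illustration only - a sketch of the element-wise semantics of the
   VEC_COND_EXPR built above, shown for int lanes with LT_EXPR standing
   in for the comparison code; the helper name is hypothetical. */

static void
vec_cond_model (const int *cond_lhs, const int *cond_rhs,
                const int *then_vals, const int *else_vals,
                int nunits, int *out)
{
  int i;

  for (i = 0; i < nunits; i++)
    out[i] = cond_lhs[i] < cond_rhs[i] ? then_vals[i] : else_vals[i];
}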
7012 /* Function vect_transform_stmt.
7014 Create a vectorized stmt to replace STMT, and insert it at BSI. */
7017 vect_transform_stmt (gimple stmt, gimple_stmt_iterator *gsi,
7018 bool *strided_store, slp_tree slp_node,
7019 slp_instance slp_node_instance)
7021 bool is_store = false;
7022 gimple vec_stmt = NULL;
7023 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
7024 gimple orig_stmt_in_pattern;
7027 switch (STMT_VINFO_TYPE (stmt_info))
7029 case type_demotion_vec_info_type:
7030 done = vectorizable_type_demotion (stmt, gsi, &vec_stmt, slp_node);
7034 case type_promotion_vec_info_type:
7035 done = vectorizable_type_promotion (stmt, gsi, &vec_stmt, slp_node);
7039 case type_conversion_vec_info_type:
7040 done = vectorizable_conversion (stmt, gsi, &vec_stmt, slp_node);
7044 case induc_vec_info_type:
7045 gcc_assert (!slp_node);
7046 done = vectorizable_induction (stmt, gsi, &vec_stmt);
7050 case op_vec_info_type:
7051 done = vectorizable_operation (stmt, gsi, &vec_stmt, slp_node);
7055 case assignment_vec_info_type:
7056 done = vectorizable_assignment (stmt, gsi, &vec_stmt, slp_node);
7060 case load_vec_info_type:
7061 done = vectorizable_load (stmt, gsi, &vec_stmt, slp_node,
7066 case store_vec_info_type:
7067 done = vectorizable_store (stmt, gsi, &vec_stmt, slp_node);
7069 if (STMT_VINFO_STRIDED_ACCESS (stmt_info) && !slp_node)
7071 /* In case of interleaving, the whole chain is vectorized when the
7072 last store in the chain is reached. Store stmts before the last
7073 one are skipped, and their vec_stmt_info shouldn't be freed meanwhile. */
7075 *strided_store = true;
7076 if (STMT_VINFO_VEC_STMT (stmt_info))
7083 case condition_vec_info_type:
7084 gcc_assert (!slp_node);
7085 done = vectorizable_condition (stmt, gsi, &vec_stmt);
7089 case call_vec_info_type:
7090 gcc_assert (!slp_node);
7091 done = vectorizable_call (stmt, gsi, &vec_stmt);
7094 case reduc_vec_info_type:
7095 gcc_assert (!slp_node);
7096 done = vectorizable_reduction (stmt, gsi, &vec_stmt);
7101 if (!STMT_VINFO_LIVE_P (stmt_info))
7103 if (vect_print_dump_info (REPORT_DETAILS))
7104 fprintf (vect_dump, "stmt not supported.");
7109 if (STMT_VINFO_LIVE_P (stmt_info)
7110 && STMT_VINFO_TYPE (stmt_info) != reduc_vec_info_type)
7112 done = vectorizable_live_operation (stmt, gsi, &vec_stmt);
7118 STMT_VINFO_VEC_STMT (stmt_info) = vec_stmt;
7119 orig_stmt_in_pattern = STMT_VINFO_RELATED_STMT (stmt_info);
7120 if (orig_stmt_in_pattern)
7122 stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt_in_pattern);
7123 /* STMT was inserted by the vectorizer to replace a computation idiom.
7124 ORIG_STMT_IN_PATTERN is a stmt in the original sequence that
7125 computed this idiom. We need to record a pointer to VEC_STMT in
7126 the stmt_info of ORIG_STMT_IN_PATTERN. See more details in the
7127 documentation of vect_pattern_recog. */
7128 if (STMT_VINFO_IN_PATTERN_P (stmt_vinfo))
7130 gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
7131 STMT_VINFO_VEC_STMT (stmt_vinfo) = vec_stmt;
7140 /* This function builds ni_name = the number of iterations the loop
7141 executes, inserting the computation on the loop preheader edge. */
7144 vect_build_loop_niters (loop_vec_info loop_vinfo)
7147 gimple_seq stmts = NULL;
7149 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7150 tree ni = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo));
7152 var = create_tmp_var (TREE_TYPE (ni), "niters");
7153 add_referenced_var (var);
7154 ni_name = force_gimple_operand (ni, &stmts, false, var);
7156 pe = loop_preheader_edge (loop);
7159 basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7160 gcc_assert (!new_bb);
7167 /* This function generates the following statements:
7169 ni_name = number of iterations loop executes
7170 ratio = ni_name / vf
7171 ratio_mult_vf_name = ratio * vf
7173 and places them at the loop preheader edge. */
7176 vect_generate_tmps_on_preheader (loop_vec_info loop_vinfo,
7178 tree *ratio_mult_vf_name_ptr,
7179 tree *ratio_name_ptr)
7188 tree ratio_mult_vf_name;
7189 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7190 tree ni = LOOP_VINFO_NITERS (loop_vinfo);
7191 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
7194 pe = loop_preheader_edge (loop);
7196 /* Generate temporary variable that contains
7197 number of iterations loop executes. */
7199 ni_name = vect_build_loop_niters (loop_vinfo);
7200 log_vf = build_int_cst (TREE_TYPE (ni), exact_log2 (vf));
7202 /* Create: ratio = ni >> log2(vf) */
7204 ratio_name = fold_build2 (RSHIFT_EXPR, TREE_TYPE (ni_name), ni_name, log_vf);
7205 if (!is_gimple_val (ratio_name))
7207 var = create_tmp_var (TREE_TYPE (ni), "bnd");
7208 add_referenced_var (var);
7211 ratio_name = force_gimple_operand (ratio_name, &stmts, true, var);
7212 pe = loop_preheader_edge (loop);
7213 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7214 gcc_assert (!new_bb);
7217 /* Create: ratio_mult_vf = ratio << log2 (vf). */
7219 ratio_mult_vf_name = fold_build2 (LSHIFT_EXPR, TREE_TYPE (ratio_name),
7220 ratio_name, log_vf);
7221 if (!is_gimple_val (ratio_mult_vf_name))
7223 var = create_tmp_var (TREE_TYPE (ni), "ratio_mult_vf");
7224 add_referenced_var (var);
7227 ratio_mult_vf_name = force_gimple_operand (ratio_mult_vf_name, &stmts,
7229 pe = loop_preheader_edge (loop);
7230 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7231 gcc_assert (!new_bb);
7234 *ni_name_ptr = ni_name;
7235 *ratio_mult_vf_name_ptr = ratio_mult_vf_name;
7236 *ratio_name_ptr = ratio_name;
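
/* Illustration only - a sketch of the two values computed above,
   assuming VF is a power of two (which the shifts require); e.g. for
   ni = 1003 and vf = 4: ratio = 250, ratio_mult_vf = 1000. The helper
   name is hypothetical. */

static void
preheader_tmps_model (unsigned int ni, unsigned int log_vf,
                      unsigned int *ratio, unsigned int *ratio_mult_vf)
{
  *ratio = ni >> log_vf;              /* ni / vf  */
  *ratio_mult_vf = *ratio << log_vf;  /* iterations executed by the
                                         vectorized loop.  */
}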
7242 /* Function vect_update_ivs_after_vectorizer.
7244 "Advance" the induction variables of LOOP to the value they should take
7245 after the execution of LOOP. This is currently necessary because the
7246 vectorizer does not handle induction variables that are used after the
7247 loop. Such a situation occurs when the last iterations of LOOP are peeled, because:
7249 1. We introduced new uses after LOOP for IVs that were not originally used
7250 after LOOP: the IVs of LOOP are now used by an epilog loop.
7251 2. LOOP is going to be vectorized; this means that it will iterate N/VF
7252 times, whereas the loop IVs should be bumped N times.
7255 - LOOP - a loop that is going to be vectorized. The last few iterations
7256 of LOOP were peeled.
7257 - NITERS - the number of iterations that LOOP executes (before it is
7258 vectorized). i.e, the number of times the ivs should be bumped.
7259 - UPDATE_E - a successor edge of LOOP->exit that is on the (only) path
7260 coming out from LOOP on which there are uses of the LOOP ivs
7261 (this is the path from LOOP->exit to epilog_loop->preheader).
7263 The new definitions of the ivs are placed in LOOP->exit.
7264 The phi args associated with the edge UPDATE_E in the bb
7265 UPDATE_E->dest are updated accordingly.
7267 Assumption 1: Like the rest of the vectorizer, this function assumes
7268 a single loop exit that has a single predecessor.
7270 Assumption 2: The phi nodes in the LOOP header and in update_bb are
7271 organized in the same order.
7273 Assumption 3: The access function of the ivs is simple enough (see
7274 vect_can_advance_ivs_p). This assumption will be relaxed in the future.
7276 Assumption 4: Exactly one of the successors of LOOP exit-bb is on a path
7277 coming out of LOOP on which the ivs of LOOP are used (this is the path
7278 that leads to the epilog loop; other paths skip the epilog loop). This
7279 path starts with the edge UPDATE_E, and its destination (denoted update_bb)
7280 needs to have its phis updated.
7284 vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo, tree niters,
7287 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7288 basic_block exit_bb = single_exit (loop)->dest;
7290 gimple_stmt_iterator gsi, gsi1;
7291 basic_block update_bb = update_e->dest;
7293 /* gcc_assert (vect_can_advance_ivs_p (loop_vinfo)); */
7295 /* Make sure there exists a single-predecessor exit bb: */
7296 gcc_assert (single_pred_p (exit_bb));
7298 for (gsi = gsi_start_phis (loop->header), gsi1 = gsi_start_phis (update_bb);
7299 !gsi_end_p (gsi) && !gsi_end_p (gsi1);
7300 gsi_next (&gsi), gsi_next (&gsi1))
7302 tree access_fn = NULL;
7303 tree evolution_part;
7306 tree var, ni, ni_name;
7307 gimple_stmt_iterator last_gsi;
7309 phi = gsi_stmt (gsi);
7310 phi1 = gsi_stmt (gsi1);
7311 if (vect_print_dump_info (REPORT_DETAILS))
7313 fprintf (vect_dump, "vect_update_ivs_after_vectorizer: phi: ");
7314 print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
7317 /* Skip virtual phi's. */
7318 if (!is_gimple_reg (SSA_NAME_VAR (PHI_RESULT (phi))))
7320 if (vect_print_dump_info (REPORT_DETAILS))
7321 fprintf (vect_dump, "virtual phi. skip.");
7325 /* Skip reduction phis. */
7326 if (STMT_VINFO_DEF_TYPE (vinfo_for_stmt (phi)) == vect_reduction_def)
7328 if (vect_print_dump_info (REPORT_DETAILS))
7329 fprintf (vect_dump, "reduc phi. skip.");
7333 access_fn = analyze_scalar_evolution (loop, PHI_RESULT (phi));
7334 gcc_assert (access_fn);
7335 STRIP_NOPS (access_fn);
evolution_part =
7337 unshare_expr (evolution_part_in_loop_num (access_fn, loop->num));
7338 gcc_assert (evolution_part != NULL_TREE);
7340 /* FORNOW: We do not support IVs whose evolution function is a polynomial
7341 of degree >= 2 or exponential. */
7342 gcc_assert (!tree_is_chrec (evolution_part));
7344 step_expr = evolution_part;
7345 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn,
7348 if (POINTER_TYPE_P (TREE_TYPE (init_expr)))
7349 ni = fold_build2 (POINTER_PLUS_EXPR, TREE_TYPE (init_expr),
7351 fold_convert (sizetype,
7352 fold_build2 (MULT_EXPR, TREE_TYPE (niters),
7353 niters, step_expr)));
7355 ni = fold_build2 (PLUS_EXPR, TREE_TYPE (init_expr),
7356 fold_build2 (MULT_EXPR, TREE_TYPE (init_expr),
7357 fold_convert (TREE_TYPE (init_expr),
7364 var = create_tmp_var (TREE_TYPE (init_expr), "tmp");
7365 add_referenced_var (var);
7367 last_gsi = gsi_last_bb (exit_bb);
7368 ni_name = force_gimple_operand_gsi (&last_gsi, ni, false, var,
7369 true, GSI_SAME_STMT);
7371 /* Fix phi expressions in the successor bb. */
7372 SET_PHI_ARG_DEF (phi1, update_e->dest_idx, ni_name);
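
/* Illustration only - a sketch for a simple affine IV: after NITERS
   scalar iterations, an IV with access function INIT + i * STEP must
   read as below at the loop exit; this is the value NI that the code
   above materializes and wires into the epilog loop's phis. The helper
   name is hypothetical. */

static int
advance_iv_model (int init, int step, int niters)
{
  return init + niters * step;
}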
7376 /* Return the more conservative threshold between the
7377 min_profitable_iters returned by the cost model and the
7378 user-specified threshold, if provided. */
7381 conservative_cost_threshold (loop_vec_info loop_vinfo,
7382 int min_profitable_iters)
7385 int min_scalar_loop_bound;
7387 min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
7388 * LOOP_VINFO_VECT_FACTOR (loop_vinfo)) - 1);
7390 /* Use the cost model only if it is more conservative than the user-specified threshold. */
7392 th = (unsigned) min_scalar_loop_bound;
7393 if (min_profitable_iters
7394 && (!min_scalar_loop_bound
7395 || min_profitable_iters > min_scalar_loop_bound))
7396 th = (unsigned) min_profitable_iters;
7398 if (th && vect_print_dump_info (REPORT_COST))
7399 fprintf (vect_dump, "Vectorization may not be profitable.");
7404 /* Function vect_do_peeling_for_loop_bound
7406 Peel the last iterations of the loop represented by LOOP_VINFO.
7407 The peeled iterations form a new epilog loop. Given that the loop now
7408 iterates NITERS times, the new epilog loop iterates
7409 NITERS % VECTORIZATION_FACTOR times.
7411 The original loop will later be made to iterate
7412 NITERS / VECTORIZATION_FACTOR times (this value is placed into RATIO). */
7415 vect_do_peeling_for_loop_bound (loop_vec_info loop_vinfo, tree *ratio)
7417 tree ni_name, ratio_mult_vf_name;
7418 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7419 struct loop *new_loop;
7421 basic_block preheader;
7423 bool check_profitability = false;
7424 unsigned int th = 0;
7425 int min_profitable_iters;
7427 if (vect_print_dump_info (REPORT_DETAILS))
7428 fprintf (vect_dump, "=== vect_do_peeling_for_loop_bound ===");
7430 initialize_original_copy_tables ();
7432 /* Generate the following variables on the preheader of original loop:
7434 ni_name = number of iteration the original loop executes
7435 ratio = ni_name / vf
7436 ratio_mult_vf_name = ratio * vf */
7437 vect_generate_tmps_on_preheader (loop_vinfo, &ni_name,
7438 &ratio_mult_vf_name, ratio);
7440 loop_num = loop->num;
7442 /* If cost model check not done during versioning and
7443 peeling for alignment. */
7444 if (!VEC_length (gimple, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo))
7445 && !VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo))
7446 && !LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
7448 check_profitability = true;
7450 /* Get profitability threshold for vectorized loop. */
7451 min_profitable_iters = LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo);
7453 th = conservative_cost_threshold (loop_vinfo,
7454 min_profitable_iters);
7457 new_loop = slpeel_tree_peel_loop_to_edge (loop, single_exit (loop),
7458 ratio_mult_vf_name, ni_name, false,
7459 th, check_profitability);
7460 gcc_assert (new_loop);
7461 gcc_assert (loop_num == loop->num);
7462 #ifdef ENABLE_CHECKING
7463 slpeel_verify_cfg_after_peeling (loop, new_loop);
#endif
7466 /* A guard that controls whether the new_loop is to be executed or skipped
7467 is placed in LOOP->exit. LOOP->exit therefore has two successors - one
7468 is the preheader of NEW_LOOP, where the IVs from LOOP are used. The other
7469 is a bb after NEW_LOOP, where these IVs are not used. Find the edge that
7470 is on the path where the LOOP IVs are used and need to be updated. */
7472 preheader = loop_preheader_edge (new_loop)->src;
7473 if (EDGE_PRED (preheader, 0)->src == single_exit (loop)->dest)
7474 update_e = EDGE_PRED (preheader, 0);
7476 update_e = EDGE_PRED (preheader, 1);
7478 /* Update IVs of original loop as if they were advanced
7479 by ratio_mult_vf_name steps. */
7480 vect_update_ivs_after_vectorizer (loop_vinfo, ratio_mult_vf_name, update_e);
7482 /* After peeling we have to reset scalar evolution analyzer. */
7485 free_original_copy_tables ();
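
/* Illustration only - a sketch of the iteration split produced by this
   peeling: for NITERS original iterations and vectorization factor VF,
   the vectorized loop runs NITERS / VF times (RATIO above) and the
   epilog loop runs the remaining NITERS % VF scalar iterations. The
   helper name is hypothetical. */

static void
peel_for_loop_bound_model (unsigned int niters, unsigned int vf,
                           unsigned int *vector_iters,
                           unsigned int *epilog_iters)
{
  *vector_iters = niters / vf;
  *epilog_iters = niters % vf;
}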
7489 /* Function vect_gen_niters_for_prolog_loop
7491 Set the number of iterations for the loop represented by LOOP_VINFO
7492 to the minimum between LOOP_NITERS (the original iteration count of the loop)
7493 and the misalignment of DR - the data reference recorded in
7494 LOOP_VINFO_UNALIGNED_DR (LOOP_VINFO). As a result, after the execution of
7495 this loop, the data reference DR will refer to an aligned location.
7497 The following computation is generated:
7499 If the misalignment of DR is known at compile time:
7500 addr_mis = DR_MISALIGNMENT (dr);
7501 Else, compute address misalignment in bytes:
7502 addr_mis = addr & (vectype_size - 1)
7504 prolog_niters = min (LOOP_NITERS, ((VF - addr_mis/elem_size)&(VF-1))/step)
7506 (elem_size = element type size; an element is the scalar element whose type
7507 is the inner type of the vectype)
7509 When the step of the data-ref in the loop is not 1 (as in interleaved data
7510 and SLP), the number of iterations of the prolog must be divided by the step
7511 (which is equal to the size of the interleaved group).
7513 The above formulas assume that VF == number of elements in the vector. This
7514 may not hold when there are multiple-types in the loop.
7515 In this case, for some data-references in the loop the VF does not represent
7516 the number of elements that fit in the vector. Therefore, instead of VF we
7517 use TYPE_VECTOR_SUBPARTS. */
7520 vect_gen_niters_for_prolog_loop (loop_vec_info loop_vinfo, tree loop_niters)
7522 struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
7523 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7526 tree iters, iters_name;
7529 gimple dr_stmt = DR_STMT (dr);
7530 stmt_vec_info stmt_info = vinfo_for_stmt (dr_stmt);
7531 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7532 int vectype_align = TYPE_ALIGN (vectype) / BITS_PER_UNIT;
7533 tree niters_type = TREE_TYPE (loop_niters);
7535 int element_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
7536 int nelements = TYPE_VECTOR_SUBPARTS (vectype);
7538 if (STMT_VINFO_STRIDED_ACCESS (stmt_info))
7539 step = DR_GROUP_SIZE (vinfo_for_stmt (DR_GROUP_FIRST_DR (stmt_info)));
7541 pe = loop_preheader_edge (loop);
7543 if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
7545 int byte_misalign = LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo);
7546 int elem_misalign = byte_misalign / element_size;
7548 if (vect_print_dump_info (REPORT_DETAILS))
7549 fprintf (vect_dump, "known alignment = %d.", byte_misalign);
7551 iters = build_int_cst (niters_type,
7552 (((nelements - elem_misalign) & (nelements - 1)) / step));
7556 gimple_seq new_stmts = NULL;
7557 tree start_addr = vect_create_addr_base_for_vector_ref (dr_stmt,
7558 &new_stmts, NULL_TREE, loop);
7559 tree ptr_type = TREE_TYPE (start_addr);
7560 tree size = TYPE_SIZE (ptr_type);
7561 tree type = lang_hooks.types.type_for_size (tree_low_cst (size, 1), 1);
7562 tree vectype_size_minus_1 = build_int_cst (type, vectype_align - 1);
7563 tree elem_size_log =
7564 build_int_cst (type, exact_log2 (vectype_align/nelements));
7565 tree nelements_minus_1 = build_int_cst (type, nelements - 1);
7566 tree nelements_tree = build_int_cst (type, nelements);
7570 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmts);
7571 gcc_assert (!new_bb);
7573 /* Create: byte_misalign = addr & (vectype_size - 1) */
byte_misalign =
7575 fold_build2 (BIT_AND_EXPR, type, fold_convert (type, start_addr), vectype_size_minus_1);
7577 /* Create: elem_misalign = byte_misalign / element_size */
elem_misalign =
7579 fold_build2 (RSHIFT_EXPR, type, byte_misalign, elem_size_log);
7581 /* Create: (niters_type) (nelements - elem_misalign)&(nelements - 1) */
7582 iters = fold_build2 (MINUS_EXPR, type, nelements_tree, elem_misalign);
7583 iters = fold_build2 (BIT_AND_EXPR, type, iters, nelements_minus_1);
7584 iters = fold_convert (niters_type, iters);
7587 /* Create: prolog_loop_niters = min (iters, loop_niters) */
7588 /* If the loop bound is known at compile time we already verified that it is
7589 greater than vf; since the misalignment ('iters') is at most vf, there's
7590 no need to generate the MIN_EXPR in this case. */
7591 if (TREE_CODE (loop_niters) != INTEGER_CST)
7592 iters = fold_build2 (MIN_EXPR, niters_type, iters, loop_niters);
7594 if (vect_print_dump_info (REPORT_DETAILS))
7596 fprintf (vect_dump, "niters for prolog loop: ");
7597 print_generic_expr (vect_dump, iters, TDF_SLIM);
7600 var = create_tmp_var (niters_type, "prolog_loop_niters");
7601 add_referenced_var (var);
7603 iters_name = force_gimple_operand (iters, &stmts, false, var);
7605 /* Insert stmt on loop preheader edge. */
7608 basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7609 gcc_assert (!new_bb);
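
/* Illustration only - a sketch of the formula above, assuming the
   address misalignment ADDR_MIS is already available in bytes: the
   number of scalar iterations to peel so that the data reference
   becomes aligned. NELEMENTS plays the role of VF and must be a power
   of two for the mask to implement the modulo. The helper name is
   hypothetical. */

static int
prolog_niters_model (int addr_mis, int elem_size, int nelements, int step)
{
  int elem_misalign = addr_mis / elem_size;

  /* ((VF - addr_mis/elem_size) & (VF - 1)) / step  */
  return ((nelements - elem_misalign) & (nelements - 1)) / step;
}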
7616 /* Function vect_update_init_of_dr
7618 NITERS iterations were peeled from LOOP. DR represents a data reference
7619 in LOOP. This function updates the information recorded in DR to
7620 account for the fact that the first NITERS iterations had already been
7621 executed. Specifically, it updates the OFFSET field of DR. */
7624 vect_update_init_of_dr (struct data_reference *dr, tree niters)
7626 tree offset = DR_OFFSET (dr);
7628 niters = fold_build2 (MULT_EXPR, TREE_TYPE (niters), niters, DR_STEP (dr));
7629 offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset), offset, niters);
7630 DR_OFFSET (dr) = offset;
/* Function vect_update_inits_of_drs

   NITERS iterations were peeled from the loop represented by LOOP_VINFO.
   This function updates the information recorded for the data references in
   the loop to account for the fact that the first NITERS iterations had
   already been executed.  Specifically, it updates the initial_condition of
   the access_function of all the data_references in the loop.  */

static void
vect_update_inits_of_drs (loop_vec_info loop_vinfo, tree niters)
{
  unsigned int i;
  VEC (data_reference_p, heap) *datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
  struct data_reference *dr;

  if (vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump, "=== vect_update_inits_of_drs ===");

  for (i = 0; VEC_iterate (data_reference_p, datarefs, i, dr); i++)
    vect_update_init_of_dr (dr, niters);
}
/* Function vect_do_peeling_for_alignment

   Peel the first 'niters' iterations of the loop represented by LOOP_VINFO.
   'niters' is set to the misalignment of one of the data references in the
   loop, thereby forcing it to refer to an aligned location at the beginning
   of the execution of this loop.  The data reference for which we are
   peeling is recorded in LOOP_VINFO_UNALIGNED_DR.  */

static void
vect_do_peeling_for_alignment (loop_vec_info loop_vinfo)
{
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  tree niters_of_prolog_loop, ni_name;
  tree n_iters;
  struct loop *new_loop;
  bool check_profitability = false;
  unsigned int th = 0;
  int min_profitable_iters;

  if (vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump, "=== vect_do_peeling_for_alignment ===");

  initialize_original_copy_tables ();

  ni_name = vect_build_loop_niters (loop_vinfo);
  niters_of_prolog_loop = vect_gen_niters_for_prolog_loop (loop_vinfo, ni_name);

  /* If the cost model check was not done during versioning, do it here.  */
  if (!VEC_length (gimple, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo))
      && !VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
    {
      check_profitability = true;

      /* Get profitability threshold for vectorized loop.  */
      min_profitable_iters = LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo);

      th = conservative_cost_threshold (loop_vinfo,
                                        min_profitable_iters);
    }

  /* Peel the prolog loop and iterate it niters_of_prolog_loop times.  */
  new_loop =
    slpeel_tree_peel_loop_to_edge (loop, loop_preheader_edge (loop),
                                   niters_of_prolog_loop, ni_name, true,
                                   th, check_profitability);

  gcc_assert (new_loop);
#ifdef ENABLE_CHECKING
  slpeel_verify_cfg_after_peeling (new_loop, loop);
#endif

  /* Update number of times loop executes.  */
  n_iters = LOOP_VINFO_NITERS (loop_vinfo);
  LOOP_VINFO_NITERS (loop_vinfo) = fold_build2 (MINUS_EXPR,
        TREE_TYPE (n_iters), n_iters, niters_of_prolog_loop);

  /* Update the init conditions of the access functions of all data refs.  */
  vect_update_inits_of_drs (loop_vinfo, niters_of_prolog_loop);

  /* After peeling we have to reset the scalar evolution analyzer.  */
  scev_reset ();

  free_original_copy_tables ();
}
/* Function vect_create_cond_for_align_checks.

   Create a conditional expression that represents the alignment checks for
   all of the data references (array element references) whose alignment must
   be checked at runtime.

   Input:
   COND_EXPR  - input conditional expression.  New conditions will be chained
                with logical AND operation.
   LOOP_VINFO - two fields of the loop information are used.
                LOOP_VINFO_PTR_MASK is the mask used to check the alignment.
                LOOP_VINFO_MAY_MISALIGN_STMTS contains the refs to be checked.

   Output:
   COND_EXPR_STMT_LIST - statements needed to construct the conditional
                         expression.
   The returned value is the conditional expression to be used in the if
   statement that controls which version of the loop gets executed at runtime.

   The algorithm makes two assumptions:
     1) The number of bytes "n" in a vector is a power of 2.
     2) An address "a" is aligned if a%n is zero, and this test can be done
        as a&(n-1) == 0.  For example, for 16-byte vectors the test is
        a&0xf == 0.  */
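/* For two data references and 16-byte vectors (mask = 0xf), the sequence
   built below looks roughly as follows (an illustrative sketch, using the
   temporary names this function creates):

       addr2int0 = (signed long) addr_of_first_vector_of_dr_0;
       addr2int1 = (signed long) addr_of_first_vector_of_dr_1;
       orptrs1   = addr2int0 | addr2int1;
       andmask   = orptrs1 & 0xf;
       cond      = (andmask == 0);

   The condition is true only if both addresses are 16-byte aligned.  */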
static void
vect_create_cond_for_align_checks (loop_vec_info loop_vinfo,
                                   tree *cond_expr,
                                   gimple_seq *cond_expr_stmt_list)
{
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  VEC(gimple,heap) *may_misalign_stmts
    = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
  gimple ref_stmt;
  int mask = LOOP_VINFO_PTR_MASK (loop_vinfo);
  tree mask_cst;
  unsigned int i;
  tree psize;
  tree int_ptrsize_type;
  char tmp_name[20];
  tree or_tmp_name = NULL_TREE;
  tree and_tmp, and_tmp_name;
  gimple and_stmt;
  tree ptrsize_zero;
  tree part_cond_expr;

  /* Check that mask is one less than a power of 2, i.e., mask is
     all zeros followed by all ones.  */
  gcc_assert ((mask != 0) && ((mask & (mask + 1)) == 0));
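  /* E.g. (illustrative): mask = 0xf passes the check, since
     0xf & 0x10 == 0; a value such as 0x6 would fail it, since
     0x6 & 0x7 == 0x6.  */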
  /* CHECKME: what is the best integer or unsigned type to use to hold a
     cast from a pointer value?  */
  psize = TYPE_SIZE (ptr_type_node);
  int_ptrsize_type
    = lang_hooks.types.type_for_size (tree_low_cst (psize, 1), 0);

  /* Create expression (mask & (dr_1 || ... || dr_n)) where dr_i is the
     address of the first vector of the i'th data reference.  */

  for (i = 0; VEC_iterate (gimple, may_misalign_stmts, i, ref_stmt); i++)
    {
      gimple_seq new_stmt_list = NULL;
      tree addr_base;
      tree addr_tmp, addr_tmp_name;
      tree or_tmp, new_or_tmp_name;
      gimple addr_stmt, or_stmt;

      /* Create: addr_tmp = (int)(address_of_first_vector)  */
      addr_base =
        vect_create_addr_base_for_vector_ref (ref_stmt, &new_stmt_list,
                                              NULL_TREE, loop);
      if (new_stmt_list != NULL)
        gimple_seq_add_seq (cond_expr_stmt_list, new_stmt_list);

      sprintf (tmp_name, "%s%d", "addr2int", i);
      addr_tmp = create_tmp_var (int_ptrsize_type, tmp_name);
      add_referenced_var (addr_tmp);
      addr_tmp_name = make_ssa_name (addr_tmp, NULL);
      addr_stmt = gimple_build_assign_with_ops (NOP_EXPR, addr_tmp_name,
                                                addr_base, NULL_TREE);
      SSA_NAME_DEF_STMT (addr_tmp_name) = addr_stmt;
      gimple_seq_add_stmt (cond_expr_stmt_list, addr_stmt);

      /* The addresses are ORed together.  */

      if (or_tmp_name != NULL_TREE)
        {
          /* Create: or_tmp = or_tmp | addr_tmp  */
          sprintf (tmp_name, "%s%d", "orptrs", i);
          or_tmp = create_tmp_var (int_ptrsize_type, tmp_name);
          add_referenced_var (or_tmp);
          new_or_tmp_name = make_ssa_name (or_tmp, NULL);
          or_stmt = gimple_build_assign_with_ops (BIT_IOR_EXPR,
                                                  new_or_tmp_name,
                                                  or_tmp_name, addr_tmp_name);
          SSA_NAME_DEF_STMT (new_or_tmp_name) = or_stmt;
          gimple_seq_add_stmt (cond_expr_stmt_list, or_stmt);
          or_tmp_name = new_or_tmp_name;
        }
      else
        or_tmp_name = addr_tmp_name;
    }

  mask_cst = build_int_cst (int_ptrsize_type, mask);

  /* Create: and_tmp = or_tmp & mask  */
  and_tmp = create_tmp_var (int_ptrsize_type, "andmask");
  add_referenced_var (and_tmp);
  and_tmp_name = make_ssa_name (and_tmp, NULL);

  and_stmt = gimple_build_assign_with_ops (BIT_AND_EXPR, and_tmp_name,
                                           or_tmp_name, mask_cst);
  SSA_NAME_DEF_STMT (and_tmp_name) = and_stmt;
  gimple_seq_add_stmt (cond_expr_stmt_list, and_stmt);

  /* Make and_tmp the left operand of the conditional test against zero.
     If and_tmp has a nonzero bit then some address is unaligned.  */
  ptrsize_zero = build_int_cst (int_ptrsize_type, 0);
  part_cond_expr = fold_build2 (EQ_EXPR, boolean_type_node,
                                and_tmp_name, ptrsize_zero);
  if (*cond_expr)
    *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
                              *cond_expr, part_cond_expr);
  else
    *cond_expr = part_cond_expr;
}
/* Function vect_vfa_segment_size.

   Create an expression that computes the size of the segment that will be
   accessed for a data reference.  The function takes into account that
   realignment loads may access one additional vector.

   Input:
     DR: The data reference.
     VECT_FACTOR: vectorization factor.

   Return an expression whose value is the size of the segment which will
   be accessed by DR.  */

static tree
vect_vfa_segment_size (struct data_reference *dr, tree vect_factor)
{
  tree segment_length = fold_build2 (MULT_EXPR, integer_type_node,
                                     DR_STEP (dr), vect_factor);

  if (vect_supportable_dr_alignment (dr) == dr_explicit_realign_optimized)
    {
      tree vector_size = TYPE_SIZE_UNIT
                          (STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr))));

      segment_length = fold_build2 (PLUS_EXPR, integer_type_node,
                                    segment_length, vector_size);
    }

  return fold_convert (sizetype, segment_length);
}
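/* For example (illustrative): a data reference with a 4-byte DR_STEP and
   VECT_FACTOR 4 covers 4 * 4 = 16 bytes per vector iteration; with the
   optimized explicit-realignment scheme one extra 16-byte vector may be
   read, so the segment size becomes 16 + 16 = 32 bytes.  */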
/* Function vect_create_cond_for_alias_checks.

   Create a conditional expression that represents the run-time checks for
   overlapping of the address ranges represented by a list of data reference
   relations passed as input.

   Input:
   COND_EXPR  - input conditional expression.  New conditions will be chained
                with logical AND operation.
   LOOP_VINFO - field LOOP_VINFO_MAY_ALIAS_DDRS contains the list of ddrs
                to be checked.

   Output:
   COND_EXPR - conditional expression.
   COND_EXPR_STMT_LIST - statements needed to construct the conditional
                         expression.
   The returned value is the conditional expression to be used in the if
   statement that controls which version of the loop gets executed at
   runtime.  */

static void
vect_create_cond_for_alias_checks (loop_vec_info loop_vinfo,
                                   tree *cond_expr,
                                   gimple_seq *cond_expr_stmt_list)
{
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  VEC (ddr_p, heap) *may_alias_ddrs =
    LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
  tree vect_factor =
    build_int_cst (integer_type_node, LOOP_VINFO_VECT_FACTOR (loop_vinfo));
  ddr_p ddr;
  unsigned int i;
  tree part_cond_expr;

  /* Create expression
     ((store_ptr_0 + store_segment_length_0) < load_ptr_0)
     || (load_ptr_0 + load_segment_length_0) < store_ptr_0))
     &&
     ...
     &&
     ((store_ptr_n + store_segment_length_n) < load_ptr_n)
     || (load_ptr_n + load_segment_length_n) < store_ptr_n))  */
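  /* For instance (an illustrative example): for a loop with one store
     a[i] and one load b[i], 4-byte elements, and VF = 4, the generated
     check is roughly

         (&a[0] + 16 < &b[0]) || (&b[0] + 16 < &a[0])

     i.e., the vectorized version runs only if the two 16-byte segments
     do not overlap.  */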
  if (VEC_empty (ddr_p, may_alias_ddrs))
    return;

  for (i = 0; VEC_iterate (ddr_p, may_alias_ddrs, i, ddr); i++)
    {
      struct data_reference *dr_a, *dr_b;
      gimple dr_group_first_a, dr_group_first_b;
      tree addr_base_a, addr_base_b;
      tree segment_length_a, segment_length_b;
      gimple stmt_a, stmt_b;

      dr_a = DDR_A (ddr);
      stmt_a = DR_STMT (DDR_A (ddr));
      dr_group_first_a = DR_GROUP_FIRST_DR (vinfo_for_stmt (stmt_a));
      if (dr_group_first_a)
        {
          stmt_a = dr_group_first_a;
          dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
        }

      dr_b = DDR_B (ddr);
      stmt_b = DR_STMT (DDR_B (ddr));
      dr_group_first_b = DR_GROUP_FIRST_DR (vinfo_for_stmt (stmt_b));
      if (dr_group_first_b)
        {
          stmt_b = dr_group_first_b;
          dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
        }

      addr_base_a =
        vect_create_addr_base_for_vector_ref (stmt_a, cond_expr_stmt_list,
                                              NULL_TREE, loop);
      addr_base_b =
        vect_create_addr_base_for_vector_ref (stmt_b, cond_expr_stmt_list,
                                              NULL_TREE, loop);

      segment_length_a = vect_vfa_segment_size (dr_a, vect_factor);
      segment_length_b = vect_vfa_segment_size (dr_b, vect_factor);

      if (vect_print_dump_info (REPORT_DR_DETAILS))
        {
          fprintf (vect_dump,
                   "create runtime check for data references ");
          print_generic_expr (vect_dump, DR_REF (dr_a), TDF_SLIM);
          fprintf (vect_dump, " and ");
          print_generic_expr (vect_dump, DR_REF (dr_b), TDF_SLIM);
        }

      part_cond_expr =
        fold_build2 (TRUTH_OR_EXPR, boolean_type_node,
          fold_build2 (LT_EXPR, boolean_type_node,
            fold_build2 (POINTER_PLUS_EXPR, TREE_TYPE (addr_base_a),
              addr_base_a,
              segment_length_a),
            addr_base_b),
          fold_build2 (LT_EXPR, boolean_type_node,
            fold_build2 (POINTER_PLUS_EXPR, TREE_TYPE (addr_base_b),
              addr_base_b,
              segment_length_b),
            addr_base_a));

      if (*cond_expr)
        *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
                                  *cond_expr, part_cond_expr);
      else
        *cond_expr = part_cond_expr;
    }

  if (vect_print_dump_info (REPORT_VECTORIZED_LOOPS))
    fprintf (vect_dump, "created %u versioning for alias checks.\n",
             VEC_length (ddr_p, may_alias_ddrs));
}
/* Function vect_loop_versioning.

   If the loop has data references that may or may not be aligned, and/or
   has data reference relations whose independence was not proven, then
   two versions of the loop need to be generated: one which is vectorized
   and one which isn't.  A test is then generated to control which of the
   loops is executed.  The test checks the alignment of all of the
   data references that may or may not be aligned.  An additional
   sequence of runtime tests is generated for each pair of DDRs whose
   independence was not proven.  The vectorized version of the loop is
   executed only if both the alias and the alignment tests pass.

   The test generated to check which version of the loop is executed
   is modified to also check for profitability, as indicated by the
   cost model.  */
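/* Schematically (an illustrative sketch, not code generated verbatim),
   the resulting control flow is:

       if (scalar_loop_iters > th && addresses_aligned && segments_disjoint)
         vectorized loop
       else
         scalar loop
*/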
static void
vect_loop_versioning (loop_vec_info loop_vinfo)
{
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  struct loop *nloop;
  tree cond_expr = NULL_TREE;
  gimple_seq cond_expr_stmt_list = NULL;
  basic_block condition_bb;
  gimple_stmt_iterator gsi, cond_exp_gsi;
  basic_block merge_bb;
  basic_block new_exit_bb;
  edge new_exit_e, e;
  gimple orig_phi, new_phi;
  tree arg;
  unsigned prob = 4 * REG_BR_PROB_BASE / 5;
  gimple_seq gimplify_stmt_list = NULL;
  tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
  int min_profitable_iters = 0;
  unsigned int th;

  /* Get profitability threshold for vectorized loop.  */
  min_profitable_iters = LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo);

  th = conservative_cost_threshold (loop_vinfo,
                                    min_profitable_iters);

  cond_expr =
    build2 (GT_EXPR, boolean_type_node, scalar_loop_iters,
            build_int_cst (TREE_TYPE (scalar_loop_iters), th));

  cond_expr = force_gimple_operand (cond_expr, &cond_expr_stmt_list,
                                    false, NULL_TREE);

  if (VEC_length (gimple, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo)))
    vect_create_cond_for_align_checks (loop_vinfo, &cond_expr,
                                       &cond_expr_stmt_list);

  if (VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
    vect_create_cond_for_alias_checks (loop_vinfo, &cond_expr,
                                       &cond_expr_stmt_list);

  cond_expr =
    fold_build2 (NE_EXPR, boolean_type_node, cond_expr, integer_zero_node);
  cond_expr =
    force_gimple_operand (cond_expr, &gimplify_stmt_list, true, NULL_TREE);
  gimple_seq_add_seq (&cond_expr_stmt_list, gimplify_stmt_list);

  initialize_original_copy_tables ();
  nloop = loop_version (loop, cond_expr, &condition_bb,
                        prob, prob, REG_BR_PROB_BASE - prob, true);
  free_original_copy_tables ();

  /* Loop versioning violates an assumption we try to maintain during
     vectorization - that the loop exit block has a single predecessor.
     After versioning, the exit block of both loop versions is the same
     basic block (i.e. it has two predecessors).  Just in order to simplify
     following transformations in the vectorizer, we fix this situation
     here by adding a new (empty) block on the exit-edge of the loop,
     with the proper loop-exit phis to maintain loop-closed-form.  */

  merge_bb = single_exit (loop)->dest;
  gcc_assert (EDGE_COUNT (merge_bb->preds) == 2);
  new_exit_bb = split_edge (single_exit (loop));
  new_exit_e = single_exit (loop);
  e = EDGE_SUCC (new_exit_bb, 0);

  for (gsi = gsi_start_phis (merge_bb); !gsi_end_p (gsi); gsi_next (&gsi))
    {
      orig_phi = gsi_stmt (gsi);
      new_phi = create_phi_node (SSA_NAME_VAR (PHI_RESULT (orig_phi)),
                                 new_exit_bb);
      arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, e);
      add_phi_arg (new_phi, arg, new_exit_e);
      SET_PHI_ARG_DEF (orig_phi, e->dest_idx, PHI_RESULT (new_phi));
    }

  /* End loop-exit-fixes after versioning.  */

  update_ssa (TODO_update_ssa);
  if (cond_expr_stmt_list)
    {
      cond_exp_gsi = gsi_last_bb (condition_bb);
      gsi_insert_seq_before (&cond_exp_gsi, cond_expr_stmt_list, GSI_SAME_STMT);
    }
}
/* Remove a group of stores (for SLP or interleaving), free their
   stmt_vec_info.  */

static void
vect_remove_stores (gimple first_stmt)
{
  gimple next = first_stmt;
  gimple tmp;
  gimple_stmt_iterator next_si;

  while (next)
    {
      /* Free the attached stmt_vec_info and remove the stmt.  */
      next_si = gsi_for_stmt (next);
      gsi_remove (&next_si, true);
      tmp = DR_GROUP_NEXT_DR (vinfo_for_stmt (next));
      free_stmt_vec_info (next);
      next = tmp;
    }
}
/* Vectorize SLP instance tree in postorder.  */

static bool
vect_schedule_slp_instance (slp_tree node, slp_instance instance,
                            unsigned int vectorization_factor)
{
  gimple stmt;
  bool strided_store, is_store;
  gimple_stmt_iterator si;
  stmt_vec_info stmt_info;
  unsigned int vec_stmts_size, nunits, group_size;
  tree vectype;
  int i;
  slp_tree loads_node;

  if (!node)
    return false;

  vect_schedule_slp_instance (SLP_TREE_LEFT (node), instance,
                              vectorization_factor);
  vect_schedule_slp_instance (SLP_TREE_RIGHT (node), instance,
                              vectorization_factor);

  stmt = VEC_index (gimple, SLP_TREE_SCALAR_STMTS (node), 0);
  stmt_info = vinfo_for_stmt (stmt);

  /* VECTYPE is the type of the destination.  */
  vectype = get_vectype_for_scalar_type (TREE_TYPE (gimple_assign_lhs (stmt)));
  nunits = (unsigned int) TYPE_VECTOR_SUBPARTS (vectype);
  group_size = SLP_INSTANCE_GROUP_SIZE (instance);

  /* For each SLP instance calculate the number of vector stmts to be created
     for the scalar stmts in each node of the SLP tree.  The number of vector
     elements in one vector iteration is the number of scalar elements in
     one scalar iteration (GROUP_SIZE) multiplied by VF divided by the vector
     size.  */
  vec_stmts_size = (vectorization_factor * group_size) / nunits;
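  /* E.g. (illustrative): for a group of 2 scalar stmts, VF = 4, and
     4-element vectors (nunits = 4), (4 * 2) / 4 = 2 vector stmts are
     created per SLP node.  */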
  /* In case of load permutation we have to allocate vectorized statements for
     all the nodes that participate in that permutation.  */
  if (SLP_INSTANCE_LOAD_PERMUTATION (instance))
    {
      for (i = 0;
           VEC_iterate (slp_tree, SLP_INSTANCE_LOADS (instance), i, loads_node);
           i++)
        {
          if (!SLP_TREE_VEC_STMTS (loads_node))
            {
              SLP_TREE_VEC_STMTS (loads_node) = VEC_alloc (gimple, heap,
                                                           vec_stmts_size);
              SLP_TREE_NUMBER_OF_VEC_STMTS (loads_node) = vec_stmts_size;
            }
        }
    }

  if (!SLP_TREE_VEC_STMTS (node))
    {
      SLP_TREE_VEC_STMTS (node) = VEC_alloc (gimple, heap, vec_stmts_size);
      SLP_TREE_NUMBER_OF_VEC_STMTS (node) = vec_stmts_size;
    }

  if (vect_print_dump_info (REPORT_DETAILS))
    {
      fprintf (vect_dump, "------>vectorizing SLP node starting from: ");
      print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
    }

  si = gsi_for_stmt (stmt);
  is_store = vect_transform_stmt (stmt, &si, &strided_store, node, instance);
  if (is_store)
    {
      if (DR_GROUP_FIRST_DR (stmt_info))
        /* If IS_STORE is TRUE, the vectorization of the
           interleaving chain was completed - free all the stores in
           the chain.  */
        vect_remove_stores (DR_GROUP_FIRST_DR (stmt_info));
      else
        /* FORNOW: SLP originates only from strided stores.  */
        gcc_unreachable ();

      return true;
    }

  /* FORNOW: SLP originates only from strided stores.  */
  return false;
}
/* Schedule all the SLP instances recorded for the loop.  */

static bool
vect_schedule_slp (loop_vec_info loop_vinfo)
{
  VEC (slp_instance, heap) *slp_instances =
    LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
  slp_instance instance;
  unsigned int i;
  bool is_store = false;

  for (i = 0; VEC_iterate (slp_instance, slp_instances, i, instance); i++)
    {
      /* Schedule the tree of INSTANCE.  */
      is_store = vect_schedule_slp_instance (SLP_INSTANCE_TREE (instance),
                                             instance,
                                             LOOP_VINFO_VECT_FACTOR (loop_vinfo));

      if (vect_print_dump_info (REPORT_VECTORIZED_LOOPS)
          || vect_print_dump_info (REPORT_UNVECTORIZED_LOOPS))
        fprintf (vect_dump, "vectorizing stmts using SLP.");
    }

  return is_store;
}
/* Function vect_transform_loop.

   The analysis phase has determined that the loop is vectorizable.
   Vectorize the loop - create vectorized stmts to replace the scalar
   stmts in the loop, and update the loop exit condition.  */

void
vect_transform_loop (loop_vec_info loop_vinfo)
{
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
  int nbbs = loop->num_nodes;
  gimple_stmt_iterator si;
  int i;
  tree ratio = NULL;
  int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
  bool strided_store;
  bool slp_scheduled = false;
  unsigned int nunits;

  if (vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump, "=== vect_transform_loop ===");

  if (VEC_length (gimple, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo))
      || VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
    vect_loop_versioning (loop_vinfo);

  /* CHECKME: we wouldn't need this if we called update_ssa once
     for all loops.  */
  bitmap_zero (vect_memsyms_to_rename);

  /* Peel the loop if there are data refs with unknown alignment.
     Only one data ref with unknown alignment is allowed.  */

  if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
    vect_do_peeling_for_alignment (loop_vinfo);

  /* If the loop has a symbolic number of iterations 'n' (i.e. it's not a
     compile time constant), or it is a constant that doesn't divide by the
     vectorization factor, then an epilog loop needs to be created.
     We therefore duplicate the loop: the original loop will be vectorized,
     and will compute the first (n/VF) iterations.  The second copy of the
     loop will remain scalar and will compute the remaining (n%VF) iterations.
     (VF is the vectorization factor.)  */
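  /* E.g. (illustrative): for n = 10 and VF = 4, the vectorized loop
     executes 10/4 = 2 iterations (covering 8 scalar iterations) and the
     scalar epilog loop executes the remaining 10%4 = 2 iterations.  */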
  if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
      || (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
          && LOOP_VINFO_INT_NITERS (loop_vinfo) % vectorization_factor != 0))
    vect_do_peeling_for_loop_bound (loop_vinfo, &ratio);
  else
    ratio = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
                LOOP_VINFO_INT_NITERS (loop_vinfo) / vectorization_factor);

  /* 1) Make sure the loop header has exactly two entries
     2) Make sure we have a preheader basic block.  */

  gcc_assert (EDGE_COUNT (loop->header->preds) == 2);

  split_edge (loop_preheader_edge (loop));

  /* FORNOW: the vectorizer supports only loops whose body consists of one
     basic block (header + empty latch).  When the vectorizer supports more
     involved loop forms, the order in which the BBs are traversed will need
     to be reconsidered.  */

  for (i = 0; i < nbbs; i++)
    {
      basic_block bb = bbs[i];
      stmt_vec_info stmt_info;
      gimple phi;

      for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
        {
          phi = gsi_stmt (si);
          if (vect_print_dump_info (REPORT_DETAILS))
            {
              fprintf (vect_dump, "------>vectorizing phi: ");
              print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
            }
          stmt_info = vinfo_for_stmt (phi);
          if (!stmt_info)
            continue;

          if (!STMT_VINFO_RELEVANT_P (stmt_info)
              && !STMT_VINFO_LIVE_P (stmt_info))
            continue;

          if ((TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
               != (unsigned HOST_WIDE_INT) vectorization_factor)
              && vect_print_dump_info (REPORT_DETAILS))
            fprintf (vect_dump, "multiple-types.");

          if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
            {
              if (vect_print_dump_info (REPORT_DETAILS))
                fprintf (vect_dump, "transform phi.");
              vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
            }
        }

      for (si = gsi_start_bb (bb); !gsi_end_p (si);)
        {
          gimple stmt = gsi_stmt (si);
          bool is_store;

          if (vect_print_dump_info (REPORT_DETAILS))
            {
              fprintf (vect_dump, "------>vectorizing statement: ");
              print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
            }

          stmt_info = vinfo_for_stmt (stmt);

          /* Vector stmts created in the outer-loop during vectorization of
             stmts in an inner-loop may not have a stmt_info, and do not
             need to be vectorized.  */
          if (!stmt_info)
            {
              gsi_next (&si);
              continue;
            }

          if (!STMT_VINFO_RELEVANT_P (stmt_info)
              && !STMT_VINFO_LIVE_P (stmt_info))
            {
              gsi_next (&si);
              continue;
            }

          gcc_assert (STMT_VINFO_VECTYPE (stmt_info));
          nunits =
            (unsigned int) TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
          if (!STMT_SLP_TYPE (stmt_info)
              && nunits != (unsigned int) vectorization_factor
              && vect_print_dump_info (REPORT_DETAILS))
            /* For SLP, VF is set according to the unrolling factor, and not
               to the vector size; hence for SLP this print is not valid.  */
            fprintf (vect_dump, "multiple-types.");

          /* SLP.  Schedule all the SLP instances when the first SLP stmt is
             reached.  */
          if (STMT_SLP_TYPE (stmt_info))
            {
              if (!slp_scheduled)
                {
                  slp_scheduled = true;

                  if (vect_print_dump_info (REPORT_DETAILS))
                    fprintf (vect_dump, "=== scheduling SLP instances ===");

                  is_store = vect_schedule_slp (loop_vinfo);

                  /* IS_STORE is true if STMT is a store.  Stores cannot be
                     of hybrid SLP type.  They are removed in
                     vect_schedule_slp_instance and their vinfo is
                     destroyed.  */
                  if (is_store)
                    {
                      gsi_next (&si);
                      continue;
                    }
                }

              /* Hybrid SLP stmts must be vectorized in addition to SLP.  */
              if (PURE_SLP_STMT (stmt_info))
                {
                  gsi_next (&si);
                  continue;
                }
            }

          /* -------- vectorize statement ------------ */
          if (vect_print_dump_info (REPORT_DETAILS))
            fprintf (vect_dump, "transform statement.");

          strided_store = false;
          is_store = vect_transform_stmt (stmt, &si, &strided_store, NULL, NULL);
          if (is_store)
            {
              if (STMT_VINFO_STRIDED_ACCESS (stmt_info))
                {
                  /* Interleaving.  If IS_STORE is TRUE, the vectorization of
                     the interleaving chain was completed - free all the
                     stores in the chain.  */
                  vect_remove_stores (DR_GROUP_FIRST_DR (stmt_info));
                  gsi_remove (&si, true);
                  continue;
                }
              else
                {
                  /* Free the attached stmt_vec_info and remove the stmt.  */
                  free_stmt_vec_info (stmt);
                  gsi_remove (&si, true);
                  continue;
                }
            }
          gsi_next (&si);
        }                       /* stmts in BB */
    }                           /* BBs in loop */

  slpeel_make_loop_iterate_ntimes (loop, ratio);

  mark_set_for_renaming (vect_memsyms_to_rename);

  /* The memory tags and pointers in vectorized statements need to
     have their SSA forms updated.  FIXME, why can't this be delayed
     until all the loops have been transformed?  */
  update_ssa (TODO_update_ssa);

  if (vect_print_dump_info (REPORT_VECTORIZED_LOOPS))
    fprintf (vect_dump, "LOOP VECTORIZED.");
  if (loop->inner && vect_print_dump_info (REPORT_VECTORIZED_LOOPS))
    fprintf (vect_dump, "OUTER LOOP VECTORIZED.");
}