1 /* Transformation Utilities for Loop Vectorization.
2 Copyright (C) 2003, 2004, 2005, 2006, 2007 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
5 This file is part of GCC.
7 GCC is free software; you can redistribute it and/or modify it under
8 the terms of the GNU General Public License as published by the Free
9 Software Foundation; either version 3, or (at your option) any later
12 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
13 WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
23 #include "coretypes.h"
29 #include "basic-block.h"
30 #include "diagnostic.h"
31 #include "tree-flow.h"
32 #include "tree-dump.h"
39 #include "tree-data-ref.h"
40 #include "tree-chrec.h"
41 #include "tree-scalar-evolution.h"
42 #include "tree-vectorizer.h"
43 #include "langhooks.h"
44 #include "tree-pass.h"
48 /* Forward declarations of the file-local (static) utility functions used
   by the code transformation. */
49 static bool vect_transform_stmt (tree, block_stmt_iterator *, bool *, slp_tree);
50 static tree vect_create_destination_var (tree, tree);
51 static tree vect_create_data_ref_ptr
52 (tree, struct loop*, tree, tree *, tree *, bool, tree, bool *);
53 static tree vect_create_addr_base_for_vector_ref
54 (tree, tree *, tree, struct loop *);
55 static tree vect_get_new_vect_var (tree, enum vect_var_kind, const char *);
56 static tree vect_get_vec_def_for_operand (tree, tree, tree *);
57 static tree vect_init_vector (tree, tree, tree, block_stmt_iterator *);
58 static void vect_finish_stmt_generation
59 (tree stmt, tree vec_stmt, block_stmt_iterator *);
60 static bool vect_is_simple_cond (tree, loop_vec_info);
61 static void vect_create_epilog_for_reduction (tree, tree, enum tree_code, tree);
62 static tree get_initial_def_for_reduction (tree, tree, tree *);
64 /* Forward declarations of the file-local utility functions dealing with
   loop peeling (not the peeling itself). */
65 static void vect_generate_tmps_on_preheader
66 (loop_vec_info, tree *, tree *, tree *);
67 static tree vect_build_loop_niters (loop_vec_info);
68 static void vect_update_ivs_after_vectorizer (loop_vec_info, tree, edge);
69 static tree vect_gen_niters_for_prolog_loop (loop_vec_info, tree);
70 static void vect_update_init_of_dr (struct data_reference *, tree niters);
71 static void vect_update_inits_of_drs (loop_vec_info, tree);
72 static int vect_min_worthwhile_factor (enum tree_code);
/* Function cost_for_stmt.

   Return the cost charged for one execution of STMT in the scalar loop,
   selected by the vectorization type recorded in STMT's stmt_vec_info:
   loads cost TARG_SCALAR_LOAD_COST, stores cost TARG_SCALAR_STORE_COST, and
   every other kind listed below (plain ops, conditions, assignments,
   reductions, inductions, type promotions/demotions/conversions and calls)
   costs TARG_SCALAR_STMT_COST.

   NOTE(review): the handling of undef_vec_info_type is not visible in this
   excerpt -- confirm against the full file. */
76 cost_for_stmt (tree stmt)
78 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
80 switch (STMT_VINFO_TYPE (stmt_info))
82 case load_vec_info_type:
83 return TARG_SCALAR_LOAD_COST;
84 case store_vec_info_type:
85 return TARG_SCALAR_STORE_COST;
86 case op_vec_info_type:
87 case condition_vec_info_type:
88 case assignment_vec_info_type:
89 case reduc_vec_info_type:
90 case induc_vec_info_type:
91 case type_promotion_vec_info_type:
92 case type_demotion_vec_info_type:
93 case type_conversion_vec_info_type:
94 case call_vec_info_type:
95 return TARG_SCALAR_STMT_COST;
96 case undef_vec_info_type:
103 /* Function vect_estimate_min_profitable_iters

   Return the number of iterations required for the vector version of the
   loop to be profitable relative to the cost of the scalar version of the
   loop.

   LOOP_VINFO supplies the vectorization factor, the basic blocks of the
   loop, the peeling-for-alignment amount, the lists of versioning checks
   and the SLP instances whose costs are accumulated here.

   The computed minimum is raised to at least the vectorization factor and
   then decremented by one before being returned, because the guard built
   from it has the form "if (niters <= threshold) skip the vectorized loop".

   NOTE(review): the values returned on the "cost model disabled" and
   "never profitable" paths are not visible in this excerpt -- confirm
   against the full file.

   TODO: Take profile info into account before making vectorization
   decisions, if available. */
113 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo)
116 int min_profitable_iters;
117 int peel_iters_prologue;
118 int peel_iters_epilogue;
119 int vec_inside_cost = 0;
120 int vec_outside_cost = 0;
121 int scalar_single_iter_cost = 0;
122 int scalar_outside_cost = 0;
123 bool runtime_test = false;
124 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
125 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
126 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
127 int nbbs = loop->num_nodes;
128 int byte_misalign = LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo);
129 int peel_guard_costs = 0;
130 int innerloop_iters = 0, factor;
131 VEC (slp_instance, heap) *slp_instances;
132 slp_instance instance;
134 /* Cost model disabled. */
135 if (!flag_vect_cost_model)
137 if (vect_print_dump_info (REPORT_DETAILS))
138 fprintf (vect_dump, "cost model disabled.");
142 /* If the number of iterations is unknown, or the
143 peeling-for-misalignment amount is unknown, we will have to generate
144 a runtime test to test the loop count against the threshold. */
145 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
146 || (byte_misalign < 0))
149 /* Requires loop versioning tests to handle misalignment. */
151 if (VEC_length (tree, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo)))
153 /* FIXME: Make cost depend on complexity of individual check. */
155 VEC_length (tree, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));
156 if (vect_print_dump_info (REPORT_DETAILS))
157 fprintf (vect_dump, "cost model: Adding cost of checks for loop "
158 "versioning to treat misalignment.\n");
161 if (VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
163 /* FIXME: Make cost depend on complexity of individual check. */
165 VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo));
166 if (vect_print_dump_info (REPORT_DETAILS))
167 fprintf (vect_dump, "cost model: Adding cost of checks for loop "
168 "versioning aliasing.\n");
171 if (VEC_length (tree, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo))
172 || VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
174 vec_outside_cost += TARG_COND_TAKEN_BRANCH_COST;
177 /* Count statements in scalar loop. Using this as scalar cost for a single
180 TODO: Add outer loop support.
182 TODO: Consider assigning different costs to different scalar
187 innerloop_iters = 50; /* FIXME */
/* Walk every statement of every block of the loop.  The scalar and
   vector-inside costs of statements whose block belongs to LOOP->inner are
   scaled by INNERLOOP_ITERS; the vector-outside cost is added unscaled. */
189 for (i = 0; i < nbbs; i++)
191 block_stmt_iterator si;
192 basic_block bb = bbs[i];
194 if (bb->loop_father == loop->inner)
195 factor = innerloop_iters;
199 for (si = bsi_start (bb); !bsi_end_p (si); bsi_next (&si))
201 tree stmt = bsi_stmt (si);
202 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
203 /* Skip stmts that are not vectorized inside the loop. */
204 if (!STMT_VINFO_RELEVANT_P (stmt_info)
205 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def)
207 scalar_single_iter_cost += cost_for_stmt (stmt) * factor;
208 vec_inside_cost += STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) * factor;
209 /* FIXME: for stmts in the inner-loop in outer-loop vectorization,
210 some of the "outside" costs are generated inside the outer-loop. */
211 vec_outside_cost += STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info);
215 /* Add additional cost for the peeled instructions in prologue and epilogue
218 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
219 at compile-time - we assume it's vf/2 (the worst would be vf-1).
221 TODO: Build an expression that represents peel_iters for prologue and
222 epilogue to be used in a run-time test. */
224 if (byte_misalign < 0)
226 peel_iters_prologue = vf/2;
227 if (vect_print_dump_info (REPORT_DETAILS))
228 fprintf (vect_dump, "cost model: "
229 "prologue peel iters set to vf/2.");
231 /* If peeling for alignment is unknown, loop bound of main loop becomes
233 peel_iters_epilogue = vf/2;
234 if (vect_print_dump_info (REPORT_DETAILS))
235 fprintf (vect_dump, "cost model: "
236 "epilogue peel iters set to vf/2 because "
237 "peeling for alignment is unknown .");
239 /* If peeled iterations are unknown, count a taken branch and a not taken
240 branch per peeled loop. Even if scalar loop iterations are known,
241 vector iterations are not known since peeled prologue iterations are
242 not known. Hence guards remain the same. */
243 peel_guard_costs += 2 * (TARG_COND_TAKEN_BRANCH_COST
244 + TARG_COND_NOT_TAKEN_BRANCH_COST);
251 struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
252 int element_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
253 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr)));
254 int nelements = TYPE_VECTOR_SUBPARTS (vectype);
256 peel_iters_prologue = nelements - (byte_misalign / element_size);
259 peel_iters_prologue = 0;
261 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
263 peel_iters_epilogue = vf/2;
264 if (vect_print_dump_info (REPORT_DETAILS))
265 fprintf (vect_dump, "cost model: "
266 "epilogue peel iters set to vf/2 because "
267 "loop iterations are unknown .");
269 /* If peeled iterations are known but number of scalar loop
270 iterations are unknown, count a taken branch per peeled loop. */
271 peel_guard_costs += 2 * TARG_COND_TAKEN_BRANCH_COST;
276 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
277 peel_iters_prologue = niters < peel_iters_prologue ?
278 niters : peel_iters_prologue;
279 peel_iters_epilogue = (niters - peel_iters_prologue) % vf;
283 vec_outside_cost += (peel_iters_prologue * scalar_single_iter_cost)
284 + (peel_iters_epilogue * scalar_single_iter_cost)
287 /* FORNOW: The scalar outside cost is incremented in one of the
290 1. The vectorizer checks for alignment and aliasing and generates
291 a condition that allows dynamic vectorization. A cost model
292 check is ANDED with the versioning condition. Hence scalar code
293 path now has the added cost of the versioning check.
295 if (cost > th & versioning_check)
298 Hence run-time scalar is incremented by not-taken branch cost.
300 2. The vectorizer then checks if a prologue is required. If the
301 cost model check was not done before during versioning, it has to
302 be done before the prologue check.
305 prologue = scalar_iters
310 if (prologue == num_iters)
313 Hence the run-time scalar cost is incremented by a taken branch,
314 plus a not-taken branch, plus a taken branch cost.
316 3. The vectorizer then checks if an epilogue is required. If the
317 cost model check was not done before during prologue check, it
318 has to be done with the epilogue check.
324 if (prologue == num_iters)
327 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
330 Hence the run-time scalar cost should be incremented by 2 taken
333 TODO: The back end may reorder the BBS's differently and reverse
334 conditions/branch directions. Change the estimates below to
335 something more reasonable. */
339 /* Cost model check occurs at versioning. */
340 if (VEC_length (tree, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo))
341 || VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
342 scalar_outside_cost += TARG_COND_NOT_TAKEN_BRANCH_COST;
345 /* Cost model occurs at prologue generation. */
346 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
347 scalar_outside_cost += 2 * TARG_COND_TAKEN_BRANCH_COST
348 + TARG_COND_NOT_TAKEN_BRANCH_COST;
349 /* Cost model check occurs at epilogue generation. */
351 scalar_outside_cost += 2 * TARG_COND_TAKEN_BRANCH_COST;
/* Add the inside and outside costs recorded for each SLP instance. */
356 slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
357 for (i = 0; VEC_iterate (slp_instance, slp_instances, i, instance); i++)
359 vec_outside_cost += SLP_INSTANCE_OUTSIDE_OF_LOOP_COST (instance);
360 vec_inside_cost += SLP_INSTANCE_INSIDE_OF_LOOP_COST (instance);
363 /* Calculate number of iterations required to make the vector version
364 profitable, relative to the loop bodies only. The following condition
366 SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
368 SIC = scalar iteration cost, VIC = vector iteration cost,
369 VOC = vector outside cost, VF = vectorization factor,
370 PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
371 SOC = scalar outside cost for run time cost model check. */
373 if ((scalar_single_iter_cost * vf) > vec_inside_cost)
375 if (vec_outside_cost <= 0)
376 min_profitable_iters = 1;
379 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost) * vf
380 - vec_inside_cost * peel_iters_prologue
381 - vec_inside_cost * peel_iters_epilogue)
382 / ((scalar_single_iter_cost * vf)
385 if ((scalar_single_iter_cost * vf * min_profitable_iters)
386 <= ((vec_inside_cost * min_profitable_iters)
387 + ((vec_outside_cost - scalar_outside_cost) * vf)))
388 min_profitable_iters++;
391 /* vector version will never be profitable. */
394 if (vect_print_dump_info (REPORT_DETAILS))
395 fprintf (vect_dump, "cost model: vector iteration cost = %d "
396 "is divisible by scalar iteration cost = %d by a factor "
397 "greater than or equal to the vectorization factor = %d .",
398 vec_inside_cost, scalar_single_iter_cost, vf);
402 if (vect_print_dump_info (REPORT_DETAILS))
404 fprintf (vect_dump, "Cost model analysis: \n");
405 fprintf (vect_dump, " Vector inside of loop cost: %d\n",
407 fprintf (vect_dump, " Vector outside of loop cost: %d\n",
409 fprintf (vect_dump, " Scalar iteration cost: %d\n",
410 scalar_single_iter_cost);
411 fprintf (vect_dump, " Scalar outside cost: %d\n", scalar_outside_cost);
412 fprintf (vect_dump, " prologue iterations: %d\n",
413 peel_iters_prologue);
414 fprintf (vect_dump, " epilogue iterations: %d\n",
415 peel_iters_epilogue);
416 fprintf (vect_dump, " Calculated minimum iters for profitability: %d\n",
417 min_profitable_iters);
/* Never report a minimum below the vectorization factor. */
420 min_profitable_iters =
421 min_profitable_iters < vf ? vf : min_profitable_iters;
423 /* Because the condition we create is:
424 if (niters <= min_profitable_iters)
425 then skip the vectorized loop. */
426 min_profitable_iters--;
428 if (vect_print_dump_info (REPORT_DETAILS))
429 fprintf (vect_dump, " Profitability threshold = %d\n",
430 min_profitable_iters);
432 return min_profitable_iters;
436 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
437 functions. Design better to avoid maintenance issues. */
439 /* Function vect_model_reduction_cost.

   Models cost for a reduction operation, including the vector ops
   generated within the strip-mine loop, the initial definition before
   the loop, and the epilogue code that must be generated.

   The inside-of-loop cost of STMT_INFO is increased by ncopies vector
   statements.  Its outside-of-loop cost is set to the cost of the initial
   definition plus, only when nested_in_vect_loop_p is false for the
   original statement, the cost of the epilogue: one vector statement and a
   vector-to-scalar extract when REDUC_CODE is below NUM_TREE_CODES,
   otherwise either a shift-based or an extract-based final reduction.

   NOTE(review): the action taken when no vector type is found for the
   reduction operand (the "unsupported data-type" path) is not visible in
   this excerpt -- confirm against the full file. */
446 vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
455 enum machine_mode mode;
456 tree operation = GIMPLE_STMT_OPERAND (STMT_VINFO_STMT (stmt_info), 1);
457 int op_type = TREE_CODE_LENGTH (TREE_CODE (operation));
458 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
459 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
461 /* Cost of reduction op inside loop. */
462 STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) += ncopies * TARG_VEC_STMT_COST;
464 reduction_op = TREE_OPERAND (operation, op_type-1);
465 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
468 if (vect_print_dump_info (REPORT_DETAILS))
470 fprintf (vect_dump, "unsupported data-type ");
471 print_generic_expr (vect_dump, TREE_TYPE (reduction_op), TDF_SLIM);
476 mode = TYPE_MODE (vectype);
477 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
480 orig_stmt = STMT_VINFO_STMT (stmt_info);
482 code = TREE_CODE (GIMPLE_STMT_OPERAND (orig_stmt, 1));
484 /* Add in cost for initial definition. */
485 outer_cost += TARG_SCALAR_TO_VEC_COST;
487 /* Determine cost of epilogue code.
489 We have a reduction operator that will reduce the vector in one statement.
490 Also requires scalar extract. */
492 if (!nested_in_vect_loop_p (loop, orig_stmt))
494 if (reduc_code < NUM_TREE_CODES)
495 outer_cost += TARG_VEC_STMT_COST + TARG_VEC_TO_SCALAR_COST;
498 int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
500 TYPE_SIZE (TREE_TYPE ( GIMPLE_STMT_OPERAND (orig_stmt, 0)));
501 int element_bitsize = tree_low_cst (bitsize, 1);
502 int nelements = vec_size_in_bits / element_bitsize;
504 optab = optab_for_tree_code (code, vectype);
506 /* We have a whole vector shift available. */
507 if (VECTOR_MODE_P (mode)
508 && optab_handler (optab, mode)->insn_code != CODE_FOR_nothing
509 && optab_handler (vec_shr_optab, mode)->insn_code != CODE_FOR_nothing)
510 /* Final reduction via vector shifts and the reduction operator. Also
511 requires scalar extract. */
512 outer_cost += ((exact_log2(nelements) * 2) * TARG_VEC_STMT_COST
513 + TARG_VEC_TO_SCALAR_COST);
515 /* Use extracts and reduction op for final reduction. For N elements,
516 we have N extracts and N-1 reduction ops. */
517 outer_cost += ((nelements + nelements - 1) * TARG_VEC_STMT_COST);
521 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info) = outer_cost;
523 if (vect_print_dump_info (REPORT_DETAILS))
524 fprintf (vect_dump, "vect_model_reduction_cost: inside_cost = %d, "
525 "outside_cost = %d .", STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info),
526 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info));
532 /* Function vect_model_induction_cost.

   Models cost for induction operations: NCOPIES vector statements inside
   the loop, plus two scalar-to-vector operations outside the loop (for
   vec_init and vec_step).  Both cost fields of STMT_INFO are overwritten,
   and the resulting costs are written to the dump file when detailed
   reporting is enabled. */
537 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
539 /* loop cost for vec_loop. */
540 STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) = ncopies * TARG_VEC_STMT_COST;
541 /* prologue cost for vec_init and vec_step. */
542 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info) = 2 * TARG_SCALAR_TO_VEC_COST;
544 if (vect_print_dump_info (REPORT_DETAILS))
545 fprintf (vect_dump, "vect_model_induction_cost: inside_cost = %d, "
546 "outside_cost = %d .", STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info),
547 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info));
551 /* Function vect_model_simple_cost.

   Models cost for simple operations, i.e. those that only emit ncopies of a
   single op.  Right now, this does not account for multiple insns that could
   be generated for the single vector op.  We will handle that shortly.

   The inside-of-loop cost is NCOPIES vector statements.  The outside-of-loop
   cost adds one scalar-to-vector operation for each of the first two entries
   of DT that is a constant or invariant definition; DT[0] and DT[1] are both
   read, so DT must provide at least two entries.  The costs are recorded
   through the stmt_vinfo_set_*_of_loop_cost helpers, which receive both
   STMT_INFO and SLP_NODE. */
558 vect_model_simple_cost (stmt_vec_info stmt_info, int ncopies,
559 enum vect_def_type *dt, slp_tree slp_node)
562 int inside_cost = 0, outside_cost = 0;
564 inside_cost = ncopies * TARG_VEC_STMT_COST;
566 /* FORNOW: Assuming maximum 2 args per stmts. */
567 for (i = 0; i < 2; i++)
569 if (dt[i] == vect_constant_def || dt[i] == vect_invariant_def)
570 outside_cost += TARG_SCALAR_TO_VEC_COST;
573 if (vect_print_dump_info (REPORT_DETAILS))
574 fprintf (vect_dump, "vect_model_simple_cost: inside_cost = %d, "
575 "outside_cost = %d .", inside_cost, outside_cost);
577 /* Set the costs either in STMT_INFO or SLP_NODE (if exists). */
578 stmt_vinfo_set_inside_of_loop_cost (stmt_info, slp_node, inside_cost);
579 stmt_vinfo_set_outside_of_loop_cost (stmt_info, slp_node, outside_cost);
583 /* Function vect_cost_strided_group_size

   For strided load or store, return the group_size only if it is the first
   load or store of a group, else return 1.  This ensures that group size is
   only returned once per group.

   "First" is decided by comparing DR_GROUP_FIRST_DR (STMT_INFO) with the
   statement that STMT_INFO itself describes. */
590 vect_cost_strided_group_size (stmt_vec_info stmt_info)
592 tree first_stmt = DR_GROUP_FIRST_DR (stmt_info);
594 if (first_stmt == STMT_VINFO_STMT (stmt_info))
595 return DR_GROUP_SIZE (stmt_info);
601 /* Function vect_model_store_cost

   Models cost for stores.  In the case of strided accesses, one access
   has the overhead of the strided access attributed to it.

   When DT is a constant or invariant definition, one scalar-to-vector
   operation is charged outside the loop.  For a strided group the permute
   cost is NCOPIES * exact_log2 (group_size) * group_size vector statements;
   the stores themselves add NCOPIES * TARG_VEC_STORE_COST.  The costs are
   recorded through the stmt_vinfo_set_*_of_loop_cost helpers, which receive
   both STMT_INFO and SLP_NODE. */
607 vect_model_store_cost (stmt_vec_info stmt_info, int ncopies,
608 enum vect_def_type dt, slp_tree slp_node)
611 int inside_cost = 0, outside_cost = 0;
613 if (dt == vect_constant_def || dt == vect_invariant_def)
614 outside_cost = TARG_SCALAR_TO_VEC_COST;
616 /* Strided access? */
617 if (DR_GROUP_FIRST_DR (stmt_info))
618 group_size = vect_cost_strided_group_size (stmt_info);
619 /* Not a strided access. */
623 /* Is this an access in a group of stores, which provide strided access?
624 If so, add in the cost of the permutes. */
627 /* Uses a high and low interleave operation for each needed permute. */
628 inside_cost = ncopies * exact_log2(group_size) * group_size
629 * TARG_VEC_STMT_COST;
631 if (vect_print_dump_info (REPORT_DETAILS))
632 fprintf (vect_dump, "vect_model_store_cost: strided group_size = %d .",
637 /* Costs of the stores. */
638 inside_cost += ncopies * TARG_VEC_STORE_COST;
640 if (vect_print_dump_info (REPORT_DETAILS))
641 fprintf (vect_dump, "vect_model_store_cost: inside_cost = %d, "
642 "outside_cost = %d .", inside_cost, outside_cost);
644 /* Set the costs either in STMT_INFO or SLP_NODE (if exists). */
645 stmt_vinfo_set_inside_of_loop_cost (stmt_info, slp_node, inside_cost);
646 stmt_vinfo_set_outside_of_loop_cost (stmt_info, slp_node, outside_cost);
650 /* Function vect_model_load_cost

   Models cost for loads.  In the case of strided accesses, the last access
   has the overhead of the strided access attributed to it.  Since unaligned
   accesses are supported for loads, we also account for the costs of the
   access scheme chosen.

   The scheme is obtained from vect_supportable_dr_alignment on FIRST_DR.
   The strided path (group size and first data reference taken from the
   group) is used only when STMT_INFO belongs to a group and SLP_NODE is
   null.  The costs are recorded through the stmt_vinfo_set_*_of_loop_cost
   helpers, which receive both STMT_INFO and SLP_NODE. */
658 vect_model_load_cost (stmt_vec_info stmt_info, int ncopies, slp_tree slp_node)
662 int alignment_support_cheme;
664 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info), *first_dr;
665 int inside_cost = 0, outside_cost = 0;
667 /* Strided accesses? */
668 first_stmt = DR_GROUP_FIRST_DR (stmt_info);
669 if (first_stmt && !slp_node)
671 group_size = vect_cost_strided_group_size (stmt_info);
672 first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
674 /* Not a strided access. */
681 alignment_support_cheme = vect_supportable_dr_alignment (first_dr);
683 /* Is this an access in a group of loads providing strided access?
684 If so, add in the cost of the permutes. */
687 /* Uses an even and odd extract operations for each needed permute. */
688 inside_cost = ncopies * exact_log2(group_size) * group_size
689 * TARG_VEC_STMT_COST;
691 if (vect_print_dump_info (REPORT_DETAILS))
692 fprintf (vect_dump, "vect_model_load_cost: strided group_size = %d .",
697 /* The loads themselves. */
698 switch (alignment_support_cheme)
702 inside_cost += ncopies * TARG_VEC_LOAD_COST;
704 if (vect_print_dump_info (REPORT_DETAILS))
705 fprintf (vect_dump, "vect_model_load_cost: aligned.");
709 case dr_unaligned_supported:
711 /* Here, we assign an additional cost for the unaligned load. */
712 inside_cost += ncopies * TARG_VEC_UNALIGNED_LOAD_COST;
714 if (vect_print_dump_info (REPORT_DETAILS))
715 fprintf (vect_dump, "vect_model_load_cost: unaligned supported by "
720 case dr_explicit_realign:
722 inside_cost += ncopies * (2*TARG_VEC_LOAD_COST + TARG_VEC_STMT_COST);
724 /* FIXME: If the misalignment remains fixed across the iterations of
725 the containing loop, the following cost should be added to the
727 if (targetm.vectorize.builtin_mask_for_load)
728 inside_cost += TARG_VEC_STMT_COST;
732 case dr_explicit_realign_optimized:
734 if (vect_print_dump_info (REPORT_DETAILS))
735 fprintf (vect_dump, "vect_model_load_cost: unaligned software "
738 /* Unaligned software pipeline has a load of an address, an initial
739 load, and possibly a mask operation to "prime" the loop. However,
740 if this is an access in a group of loads, which provide strided
741 access, then the above cost should only be considered for one
742 access in the group. Inside the loop, there is a load op
743 and a realignment op. */
745 if ((!DR_GROUP_FIRST_DR (stmt_info)) || group_size > 1 || slp_node)
747 outside_cost = 2*TARG_VEC_STMT_COST;
748 if (targetm.vectorize.builtin_mask_for_load)
749 outside_cost += TARG_VEC_STMT_COST;
752 inside_cost += ncopies * (TARG_VEC_LOAD_COST + TARG_VEC_STMT_COST);
761 if (vect_print_dump_info (REPORT_DETAILS))
762 fprintf (vect_dump, "vect_model_load_cost: inside_cost = %d, "
763 "outside_cost = %d .", inside_cost, outside_cost);
765 /* Set the costs either in STMT_INFO or SLP_NODE (if exists). */
766 stmt_vinfo_set_inside_of_loop_cost (stmt_info, slp_node, inside_cost);
767 stmt_vinfo_set_outside_of_loop_cost (stmt_info, slp_node, outside_cost);
/* vect_get_new_vect_var creates a temporary of TYPE through create_tmp_var,
   named either by concatenating a VAR_KIND-dependent prefix with NAME or by
   the prefix alone, and sets DECL_GIMPLE_REG_P on it when TYPE is a
   VECTOR_TYPE.

   NOTE(review): the condition choosing between the two naming forms, the
   prefix values, and the release of the concat result are not visible in
   this excerpt; the header comment that follows also has no visible
   terminator before the signature in this copy -- confirm against the full
   file. */
771 /* Function vect_get_new_vect_var.
773 Returns a name for a new variable. The current naming scheme appends the
774 prefix "vect_" or "vect_p" (depending on the value of VAR_KIND) to
775 the name of vectorizer generated variables, and appends that to NAME if
779 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
786 case vect_simple_var:
789 case vect_scalar_var:
792 case vect_pointer_var:
801 char* tmp = concat (prefix, name, NULL);
802 new_vect_var = create_tmp_var (type, tmp);
806 new_vect_var = create_tmp_var (type, prefix);
808 /* Mark vector typed variable as a gimple register variable. */
809 if (TREE_CODE (type) == VECTOR_TYPE)
810 DECL_GIMPLE_REG_P (new_vect_var) = true;
816 /* Function vect_create_addr_base_for_vector_ref.

   Create an expression that computes the address of the first memory location
   that will be accessed for a data reference.

   STMT: The statement containing the data reference.
   NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
   OFFSET: Optional.  If supplied, it is added to the initial address after
	   being multiplied by TYPE_SIZE_UNIT of the referenced type.
   LOOP: Specify relative to which loop-nest should the address be computed.
	 For example, when the dataref is in an inner-loop nested in an
	 outer-loop that is now being vectorized, LOOP can be either the
	 outer-loop, or the inner-loop.  The first memory location accessed
	 by the following dataref ('in' points to short):

	 if LOOP=i_loop: &in (relative to i_loop)
	 if LOOP=j_loop: &in+i*2B (relative to j_loop)

	 When LOOP differs from the loop containing STMT, the base address,
	 offset and init are taken from the STMT_VINFO_DR_* fields of STMT's
	 stmt_vec_info instead of from the data reference itself.

   1. Return an SSA_NAME whose value is the address of the memory location of
      the first vector of the data reference.
   2. If new_stmt_list is not NULL_TREE after return then the caller must insert
      these statement(s) which define the returned SSA_NAME.

   FORNOW: We are only handling array accesses with step 1. */
848 vect_create_addr_base_for_vector_ref (tree stmt,
853 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
854 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
855 struct loop *containing_loop = (bb_for_stmt (stmt))->loop_father;
856 tree data_ref_base = unshare_expr (DR_BASE_ADDRESS (dr));
858 tree data_ref_base_var;
861 tree addr_base, addr_expr;
863 tree base_offset = unshare_expr (DR_OFFSET (dr));
864 tree init = unshare_expr (DR_INIT (dr));
865 tree vect_ptr_type, addr_expr2;
866 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
869 if (loop != containing_loop)
871 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
/* NOTE(review): this local declaration shadows the LOOP argument described
   in the header, so the assertion below checks against the loop recorded in
   LOOP_VINFO -- confirm this is intended. */
872 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
874 gcc_assert (nested_in_vect_loop_p (loop, stmt));
876 data_ref_base = unshare_expr (STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
877 base_offset = unshare_expr (STMT_VINFO_DR_OFFSET (stmt_info));
878 init = unshare_expr (STMT_VINFO_DR_INIT (stmt_info));
881 /* Create data_ref_base */
882 base_name = build_fold_indirect_ref (data_ref_base);
883 data_ref_base_var = create_tmp_var (TREE_TYPE (data_ref_base), "batmp");
884 add_referenced_var (data_ref_base_var);
885 data_ref_base = force_gimple_operand (data_ref_base, &new_base_stmt,
886 true, data_ref_base_var);
887 append_to_statement_list_force(new_base_stmt, new_stmt_list);
889 /* Create base_offset */
890 base_offset = size_binop (PLUS_EXPR, base_offset, init);
891 base_offset = fold_convert (sizetype, base_offset);
892 dest = create_tmp_var (TREE_TYPE (base_offset), "base_off");
893 add_referenced_var (dest);
894 base_offset = force_gimple_operand (base_offset, &new_stmt, true, dest);
895 append_to_statement_list_force (new_stmt, new_stmt_list);
899 tree tmp = create_tmp_var (sizetype, "offset");
901 add_referenced_var (tmp);
902 offset = fold_build2 (MULT_EXPR, TREE_TYPE (offset), offset, step);
903 base_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (base_offset),
904 base_offset, offset);
905 base_offset = force_gimple_operand (base_offset, &new_stmt, false, tmp);
906 append_to_statement_list_force (new_stmt, new_stmt_list);
909 /* base + base_offset */
910 addr_base = fold_build2 (POINTER_PLUS_EXPR, TREE_TYPE (data_ref_base),
911 data_ref_base, base_offset);
913 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
915 /* addr_expr = addr_base */
/* NOTE(review): ADDR_EXPR is created and registered here, but the visible
   code below gimplifies the converted address into ADDR_EXPR2 -- confirm
   whether ADDR_EXPR is still needed. */
916 addr_expr = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var,
917 get_name (base_name));
918 add_referenced_var (addr_expr);
919 vec_stmt = fold_convert (vect_ptr_type, addr_base);
920 addr_expr2 = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var,
921 get_name (base_name));
922 add_referenced_var (addr_expr2);
923 vec_stmt = force_gimple_operand (vec_stmt, &new_stmt, false, addr_expr2);
924 append_to_statement_list_force (new_stmt, new_stmt_list);
926 if (vect_print_dump_info (REPORT_DETAILS))
928 fprintf (vect_dump, "created ");
929 print_generic_expr (vect_dump, vec_stmt, TDF_SLIM);
935 /* Function vect_create_data_ref_ptr.
937 Create a new pointer to vector type (vp), that points to the first location
938 accessed in the loop by STMT, along with the def-use update chain to
939 appropriately advance the pointer through the loop iterations. Also set
940 aliasing information for the pointer. This vector pointer is used by the
941 callers to this function to create a memory reference expression for vector
945 1. STMT: a stmt that references memory. Expected to be of the form
946 GIMPLE_MODIFY_STMT <name, data-ref> or
947 GIMPLE_MODIFY_STMT <data-ref, name>.
948 2. AT_LOOP: the loop where the vector memref is to be created.
949 3. OFFSET (optional): an offset to be added to the initial address accessed
950 by the data-ref in STMT.
951 4. ONLY_INIT: indicate if vp is to be updated in the loop, or remain
952 pointing to the initial address.
953 5. TYPE: if not NULL indicates the required type of the data-ref
956 1. Declare a new ptr to vector_type, and have it point to the base of the
957 data reference (initial address accessed by the data reference).
958 For example, for vector of type V8HI, the following code is generated:
961 vp = (v8hi *)initial_address;
963 if OFFSET is not supplied:
964 initial_address = &a[init];
965 if OFFSET is supplied:
966 initial_address = &a[init + OFFSET];
968 Return the initial_address in INITIAL_ADDRESS.
970 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
971 update the pointer in each iteration of the loop.
973 Return the increment stmt that updates the pointer in PTR_INCR.
975 3. Set INV_P to true if the access pattern of the data reference in the
976 vectorized loop is invariant. Set it to false otherwise.
978 4. Return the pointer. */
981 vect_create_data_ref_ptr (tree stmt, struct loop *at_loop,
982 tree offset, tree *initial_address, tree *ptr_incr,
983 bool only_init, tree type, bool *inv_p)
986 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
987 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
988 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
989 bool nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
990 struct loop *containing_loop = (bb_for_stmt (stmt))->loop_father;
991 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
997 tree new_stmt_list = NULL_TREE;
1001 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
1003 block_stmt_iterator incr_bsi;
1005 tree indx_before_incr, indx_after_incr;
1009 /* Check the step (evolution) of the load in LOOP, and record
1010 whether it's invariant. */
1011 if (nested_in_vect_loop)
1012 step = STMT_VINFO_DR_STEP (stmt_info);
1014 step = DR_STEP (STMT_VINFO_DATA_REF (stmt_info));
1016 if (tree_int_cst_compare (step, size_zero_node) == 0)
1021 /* Create an expression for the first address accessed by this load
1023 base_name = build_fold_indirect_ref (unshare_expr (DR_BASE_ADDRESS (dr)));
1025 if (vect_print_dump_info (REPORT_DETAILS))
1027 tree data_ref_base = base_name;
1028 fprintf (vect_dump, "create vector-pointer variable to type: ");
1029 print_generic_expr (vect_dump, vectype, TDF_SLIM);
1030 if (TREE_CODE (data_ref_base) == VAR_DECL)
1031 fprintf (vect_dump, " vectorizing a one dimensional array ref: ");
1032 else if (TREE_CODE (data_ref_base) == ARRAY_REF)
1033 fprintf (vect_dump, " vectorizing a multidimensional array ref: ");
1034 else if (TREE_CODE (data_ref_base) == COMPONENT_REF)
1035 fprintf (vect_dump, " vectorizing a record based array ref: ");
1036 else if (TREE_CODE (data_ref_base) == SSA_NAME)
1037 fprintf (vect_dump, " vectorizing a pointer ref: ");
1038 print_generic_expr (vect_dump, base_name, TDF_SLIM);
1041 /** (1) Create the new vector-pointer variable: **/
1043 vect_ptr_type = build_pointer_type (type);
1045 vect_ptr_type = build_pointer_type (vectype);
1046 vect_ptr = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var,
1047 get_name (base_name));
1048 add_referenced_var (vect_ptr);
1050 /** (2) Add aliasing information to the new vector-pointer:
1051 (The points-to info (DR_PTR_INFO) may be defined later.) **/
1053 tag = DR_SYMBOL_TAG (dr);
1056 /* If tag is a variable (and NOT_A_TAG) then a new symbol memory
1057 tag must be created with tag added to its may alias list. */
1059 new_type_alias (vect_ptr, tag, DR_REF (dr));
1061 set_symbol_mem_tag (vect_ptr, tag);
1063 var_ann (vect_ptr)->subvars = DR_SUBVARS (dr);
1065 /** Note: If the dataref is in an inner-loop nested in LOOP, and we are
1066 vectorizing LOOP (i.e. outer-loop vectorization), we need to create two
1067 def-use update cycles for the pointer: One relative to the outer-loop
1068 (LOOP), which is what steps (3) and (4) below do. The other is relative
1069 to the inner-loop (which is the inner-most loop containing the dataref),
1070 and this is done by step (5) below.
1072 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
1073 inner-most loop, and so steps (3),(4) work the same, and step (5) is
1074 redundant. Steps (3),(4) create the following:
1077 LOOP: vp1 = phi(vp0,vp2)
1083 If there is an inner-loop nested in loop, then step (5) will also be
1084 applied, and an additional update in the inner-loop will be created:
1087 LOOP: vp1 = phi(vp0,vp2)
1089 inner: vp3 = phi(vp1,vp4)
1090 vp4 = vp3 + inner_step
1096 /** (3) Calculate the initial address of the vector-pointer, and set
1097 the vector-pointer to point to it before the loop: **/
1099 /* Create: (&(base[init_val+offset]) in the loop preheader. */
1101 new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
1103 pe = loop_preheader_edge (loop);
1104 new_bb = bsi_insert_on_edge_immediate (pe, new_stmt_list);
1105 gcc_assert (!new_bb);
1106 *initial_address = new_temp;
1108 /* Create: p = (vectype *) initial_base */
1109 vec_stmt = fold_convert (vect_ptr_type, new_temp);
1110 vec_stmt = build_gimple_modify_stmt (vect_ptr, vec_stmt);
1111 vect_ptr_init = make_ssa_name (vect_ptr, vec_stmt);
1112 GIMPLE_STMT_OPERAND (vec_stmt, 0) = vect_ptr_init;
1113 new_bb = bsi_insert_on_edge_immediate (pe, vec_stmt);
1114 gcc_assert (!new_bb);
1117 /** (4) Handle the updating of the vector-pointer inside the loop.
1118 This is needed when ONLY_INIT is false, and also when AT_LOOP
1119 is the inner-loop nested in LOOP (during outer-loop vectorization).
1122 if (only_init && at_loop == loop) /* No update in loop is required. */
1124 /* Copy the points-to information if it exists. */
1125 if (DR_PTR_INFO (dr))
1126 duplicate_ssa_name_ptr_info (vect_ptr_init, DR_PTR_INFO (dr));
1127 vptr = vect_ptr_init;
1131 /* The step of the vector pointer is the Vector Size. */
1132 tree step = TYPE_SIZE_UNIT (vectype);
1133 /* One exception to the above is when the scalar step of the load in
1134 LOOP is zero. In this case the step here is also zero. */
1136 step = size_zero_node;
1138 standard_iv_increment_position (loop, &incr_bsi, &insert_after);
1140 create_iv (vect_ptr_init,
1141 fold_convert (vect_ptr_type, step),
1142 NULL_TREE, loop, &incr_bsi, insert_after,
1143 &indx_before_incr, &indx_after_incr);
1144 incr = bsi_stmt (incr_bsi);
1145 set_stmt_info (stmt_ann (incr),
1146 new_stmt_vec_info (incr, loop_vinfo));
1148 /* Copy the points-to information if it exists. */
1149 if (DR_PTR_INFO (dr))
1151 duplicate_ssa_name_ptr_info (indx_before_incr, DR_PTR_INFO (dr));
1152 duplicate_ssa_name_ptr_info (indx_after_incr, DR_PTR_INFO (dr));
1154 merge_alias_info (vect_ptr_init, indx_before_incr);
1155 merge_alias_info (vect_ptr_init, indx_after_incr);
1159 vptr = indx_before_incr;
1162 if (!nested_in_vect_loop || only_init)
1166 /** (5) Handle the updating of the vector-pointer inside the inner-loop
1167 nested in LOOP, if exists: **/
1169 gcc_assert (nested_in_vect_loop);
1172 standard_iv_increment_position (containing_loop, &incr_bsi,
1174 create_iv (vptr, fold_convert (vect_ptr_type, DR_STEP (dr)), NULL_TREE,
1175 containing_loop, &incr_bsi, insert_after, &indx_before_incr,
1177 incr = bsi_stmt (incr_bsi);
1178 set_stmt_info (stmt_ann (incr), new_stmt_vec_info (incr, loop_vinfo));
1180 /* Copy the points-to information if it exists. */
1181 if (DR_PTR_INFO (dr))
1183 duplicate_ssa_name_ptr_info (indx_before_incr, DR_PTR_INFO (dr));
1184 duplicate_ssa_name_ptr_info (indx_after_incr, DR_PTR_INFO (dr));
1186 merge_alias_info (vect_ptr_init, indx_before_incr);
1187 merge_alias_info (vect_ptr_init, indx_after_incr);
1191 return indx_before_incr;
1198 /* Function bump_vector_ptr
1200 Increment a pointer (to a vector type) by vector-size. If requested,
1201 i.e. if PTR_INCR is given, then also connect the new increment stmt
1202 to the existing def-use update-chain of the pointer, by modifying
1203 the PTR_INCR as illustrated below:
1205 The pointer def-use update-chain before this function:
1206 DATAREF_PTR = phi (p_0, p_2)
1208 PTR_INCR: p_2 = DATAREF_PTR + step
1210 The pointer def-use update-chain after this function:
1211 DATAREF_PTR = phi (p_0, p_2)
1213 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
1215 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
1218 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
1220 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
1221 the loop. The increment amount across iterations is expected
1223 BSI - location where the new update stmt is to be placed.
1224 STMT - the original scalar memory-access stmt that is being vectorized.
1225 BUMP - optional. The offset by which to bump the pointer. If not given,
1226 the offset is assumed to be vector_size.
1228 Output: Return NEW_DATAREF_PTR as illustrated above.
1233 bump_vector_ptr (tree dataref_ptr, tree ptr_incr, block_stmt_iterator *bsi,
1234 tree stmt, tree bump)
1236 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1237 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
1238 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1239 tree vptr_type = TREE_TYPE (dataref_ptr);
1240 tree ptr_var = SSA_NAME_VAR (dataref_ptr);
1241 tree update = TYPE_SIZE_UNIT (vectype);
1244 use_operand_p use_p;
1245 tree new_dataref_ptr;
1250 incr_stmt = build_gimple_modify_stmt (ptr_var,
1251 build2 (POINTER_PLUS_EXPR, vptr_type,
1252 dataref_ptr, update));
1253 new_dataref_ptr = make_ssa_name (ptr_var, incr_stmt);
1254 GIMPLE_STMT_OPERAND (incr_stmt, 0) = new_dataref_ptr;
1255 vect_finish_stmt_generation (stmt, incr_stmt, bsi);
1257 /* Copy the points-to information if it exists. */
1258 if (DR_PTR_INFO (dr))
1259 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
1260 merge_alias_info (new_dataref_ptr, dataref_ptr);
1263 return new_dataref_ptr;
1265 /* Update the vector-pointer's cross-iteration increment. */
1266 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
1268 tree use = USE_FROM_PTR (use_p);
1270 if (use == dataref_ptr)
1271 SET_USE (use_p, new_dataref_ptr);
/* USE is expected to compare equal to UPDATE, the amount that was added to
   DATAREF_PTR above. */
1273 gcc_assert (tree_int_cst_compare (use, update) == 0);
1276 return new_dataref_ptr;
1280 /* Function vect_create_destination_var.
1282 Create a new temporary of type VECTYPE for SCALAR_DEST, which must be an
SSA_NAME. If VECTYPE is NULL, the temporary is a scalar variable
(vect_scalar_var) of SCALAR_DEST's own type; otherwise it is a vector
variable (vect_simple_var) of type VECTYPE. The name obtained from
SCALAR_DEST is handed to vect_get_new_vect_var, and the new variable is
registered with add_referenced_var. */
1285 vect_create_destination_var (tree scalar_dest, tree vectype)
1288 const char *new_name;
1290 enum vect_var_kind kind;
1292 kind = vectype ? vect_simple_var : vect_scalar_var;
1293 type = vectype ? vectype : TREE_TYPE (scalar_dest);
1295 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
1297 new_name = get_name (scalar_dest);
1300 vec_dest = vect_get_new_vect_var (type, kind, new_name);
1301 add_referenced_var (vec_dest);
1307 /* Function vect_init_vector.
1309 Insert a new stmt (INIT_STMT) that initializes a new vector variable with
1310 the vector elements of VECTOR_VAR. Place the initialization at BSI if it
1311 is not NULL. Otherwise, place the initialization at the loop preheader.
1312 Return the DEF of INIT_STMT.
1313 It will be used in the vectorization of STMT. */
1316 vect_init_vector (tree stmt, tree vector_var, tree vector_type,
1317 block_stmt_iterator *bsi)
1319 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
/* Build INIT_STMT: NEW_TEMP = VECTOR_VAR, where NEW_TEMP is an SSA name of a
   fresh variable of type VECTOR_TYPE. */
1327 new_var = vect_get_new_vect_var (vector_type, vect_simple_var, "cst_");
1328 add_referenced_var (new_var);
1329 init_stmt = build_gimple_modify_stmt (new_var, vector_var);
1330 new_temp = make_ssa_name (new_var, init_stmt);
1331 GIMPLE_STMT_OPERAND (init_stmt, 0) = new_temp;
1334 vect_finish_stmt_generation (stmt, init_stmt, bsi);
1337 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
1338 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1340 if (nested_in_vect_loop_p (loop, stmt))
/* Insert INIT_STMT on the preheader edge of LOOP; the insertion is asserted
   not to create a new basic block. */
1342 pe = loop_preheader_edge (loop);
1343 new_bb = bsi_insert_on_edge_immediate (pe, init_stmt);
1344 gcc_assert (!new_bb);
1347 if (vect_print_dump_info (REPORT_DETAILS))
1349 fprintf (vect_dump, "created new init_stmt: ");
1350 print_generic_expr (vect_dump, init_stmt, TDF_SLIM);
/* The def of INIT_STMT is its operand 0 (NEW_TEMP). */
1353 vec_oprnd = GIMPLE_STMT_OPERAND (init_stmt, 0);
1358 /* For constant and loop invariant defs of SLP_NODE this function returns
1359 (vector) defs (VEC_OPRNDS) that will be used in the vectorized stmts.
1360 OP_NUM determines if we gather defs for operand 0 or operand 1 of the scalar
1364 vect_get_constant_vectors (slp_tree slp_node, VEC(tree,heap) **vec_oprnds,
1365 unsigned int op_num)
1367 VEC (tree, heap) *stmts = SLP_TREE_SCALAR_STMTS (slp_node);
1368 tree stmt = VEC_index (tree, stmts, 0);
1369 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
1370 tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
1371 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
1374 int j, number_of_places_left_in_vector;
1376 tree op, vop, operation;
1377 int group_size = VEC_length (tree, stmts);
1378 unsigned int vec_num, i;
1379 int number_of_copies = 1;
1380 bool is_store = false;
1381 unsigned int number_of_vectors = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
1382 VEC (tree, heap) *voprnds = VEC_alloc (tree, heap, number_of_vectors);
1385 if (STMT_VINFO_DATA_REF (stmt_vinfo))
1388 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
1389 created vectors. It is greater than 1 if unrolling is performed.
1391 For example, we have two scalar operands, s1 and s2 (e.g., group of
1392 strided accesses of size two), while NUNITS is four (i.e., four scalars
1393 of this type can be packed in a vector). The output vector will contain
1394 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
1397 If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
1398 containing the operands.
1400 For example, NUNITS is four as before, and the group size is 8
1401 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
1402 {s5, s6, s7, s8}. */
1404 number_of_copies = least_common_multiple (nunits, group_size) / group_size;
1406 number_of_places_left_in_vector = nunits;
/* Walk the group NUMBER_OF_COPIES times, each time from its last stmt to its
   first, consing the selected operand of every stmt onto T. */
1408 for (j = 0; j < number_of_copies; j++)
1410 for (i = group_size - 1; VEC_iterate (tree, stmts, i, stmt); i--)
1412 operation = GIMPLE_STMT_OPERAND (stmt, 1);
1416 op = TREE_OPERAND (operation, op_num);
1417 if (!CONSTANT_CLASS_P (op))
1420 /* Create 'vect_ = {op0,op1,...,opn}'. */
1421 t = tree_cons (NULL_TREE, op, t);
1423 number_of_places_left_in_vector--;
/* Once NUNITS operands have been collected, build a vector from T and record
   the def of its initialization stmt in VOPRNDS. */
1425 if (number_of_places_left_in_vector == 0)
1427 number_of_places_left_in_vector = nunits;
1429 vector_type = get_vectype_for_scalar_type (TREE_TYPE (op));
1430 gcc_assert (vector_type);
1432 vec_cst = build_vector (vector_type, t);
1434 vec_cst = build_constructor_from_list (vector_type, t);
1436 VEC_quick_push (tree, voprnds,
1437 vect_init_vector (stmt, vec_cst, vector_type,
1444 /* Since the vectors are created in the reverse order, we should invert
1446 vec_num = VEC_length (tree, voprnds);
1447 for (j = vec_num - 1; j >= 0; j--)
1449 vop = VEC_index (tree, voprnds, j);
1450 VEC_quick_push (tree, *vec_oprnds, vop);
1453 VEC_free (tree, heap, voprnds);
1455 /* In case that VF is greater than the unrolling factor needed for the SLP
1456 group of stmts, NUMBER_OF_VECTORS to be created is greater than
1457 NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
1458 to replicate the vectors. */
1459 while (number_of_vectors > VEC_length (tree, *vec_oprnds))
1461 for (i = 0; VEC_iterate (tree, *vec_oprnds, i, vop) && i < vec_num; i++)
1462 VEC_quick_push (tree, *vec_oprnds, vop);
1467 /* Get vectorized definitions from SLP_NODE that contains corresponding
1468 vectorized def-stmts. SLP_TREE_VEC_STMTS (SLP_NODE) must be non-empty.
For each vectorized stmt recorded there, its operand 0 (the def) is
appended to *VEC_OPRNDS with VEC_quick_push; the caller provides
*VEC_OPRNDS (vect_get_slp_defs allocates it before calling here). */
1471 vect_get_slp_vect_defs (slp_tree slp_node, VEC (tree,heap) **vec_oprnds)
1477 gcc_assert (SLP_TREE_VEC_STMTS (slp_node));
1480 VEC_iterate (tree, SLP_TREE_VEC_STMTS (slp_node), i, vec_def_stmt);
1483 gcc_assert (vec_def_stmt);
1484 vec_oprnd = GIMPLE_STMT_OPERAND (vec_def_stmt, 0);
1485 VEC_quick_push (tree, *vec_oprnds, vec_oprnd);
1490 /* Get vectorized definitions for SLP_NODE.
1491 If the scalar definitions are loop invariants or constants, collect them and
1492 call vect_get_constant_vectors() to create vector stmts.
1493 Otherwise, the def-stmts must be already vectorized and the vectorized stmts
1494 must be stored in the LEFT/RIGHT node of SLP_NODE, and we call
1495 vect_get_slp_vect_defs() to retrieve them.
1496 If VEC_OPRNDS1 is NULL, don't get vector defs for the second operand (from
1497 the right node). This is used when the second operand must remain scalar. */
1500 vect_get_slp_defs (slp_tree slp_node, VEC (tree,heap) **vec_oprnds0,
1501 VEC (tree,heap) **vec_oprnds1)
1503 tree operation, first_stmt;
1505 /* Allocate memory for vectorized defs. */
1506 *vec_oprnds0 = VEC_alloc (tree, heap,
1507 SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node));
1509 /* SLP_NODE corresponds either to a group of stores or to a group of
1510 unary/binary operations. We don't call this function for loads. */
1511 if (SLP_TREE_LEFT (slp_node))
1512 /* The defs are already vectorized. */
1513 vect_get_slp_vect_defs (SLP_TREE_LEFT (slp_node), vec_oprnds0);
1515 /* Build vectors from scalar defs. */
1516 vect_get_constant_vectors (slp_node, vec_oprnds0, 0);
/* The first scalar stmt of the group is inspected to classify the node
   (data reference, or unary vs. binary operation). */
1518 first_stmt = VEC_index (tree, SLP_TREE_SCALAR_STMTS (slp_node), 0);
1519 if (STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt)))
1520 /* Since we don't call this function with loads, this is a group of
1524 operation = GIMPLE_STMT_OPERAND (first_stmt, 1);
1525 if (TREE_OPERAND_LENGTH (operation) == unary_op || !vec_oprnds1)
1528 *vec_oprnds1 = VEC_alloc (tree, heap,
1529 SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node));
1531 if (SLP_TREE_RIGHT (slp_node))
1532 /* The defs are already vectorized. */
1533 vect_get_slp_vect_defs (SLP_TREE_RIGHT (slp_node), vec_oprnds1);
1535 /* Build vectors from scalar defs. */
1536 vect_get_constant_vectors (slp_node, vec_oprnds1, 1);
1540 /* Function get_initial_def_for_induction
1544 IV_PHI - the phi node of the induction variable (the only argument of
this function). Its result is the scalar iv, and its arguments on the
preheader and latch edges of the iv's loop are read below.
1547 Return a vector variable, initialized with the first VF values of
1548 the induction variable. E.g., for an iv with IV_PHI='X' and
1549 evolution S, for a vector of 4 units, we want to return:
1550 [X, X + S, X + 2*S, X + 3*S]. */
1553 get_initial_def_for_induction (tree iv_phi)
1555 stmt_vec_info stmt_vinfo = vinfo_for_stmt (iv_phi);
1556 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
1557 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1558 tree scalar_type = TREE_TYPE (PHI_RESULT_TREE (iv_phi));
1561 edge pe = loop_preheader_edge (loop);
1562 struct loop *iv_loop;
1564 tree vec, vec_init, vec_step, t;
1569 tree induction_phi, induc_def, new_stmt, vec_def, vec_dest;
1570 tree init_expr, step_expr;
1571 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1576 stmt_vec_info phi_info = vinfo_for_stmt (iv_phi);
1577 bool nested_in_vect_loop = false;
1579 imm_use_iterator imm_iter;
1580 use_operand_p use_p;
1584 block_stmt_iterator si;
1585 basic_block bb = bb_for_stmt (iv_phi);
1587 vectype = get_vectype_for_scalar_type (scalar_type);
1588 gcc_assert (vectype);
1589 nunits = TYPE_VECTOR_SUBPARTS (vectype);
/* Number of vector stmts needed to cover VF scalar values with vectors of
   NUNITS elements each. */
1590 ncopies = vf / nunits;
1592 gcc_assert (phi_info);
1593 gcc_assert (ncopies >= 1);
1595 /* Find the first insertion point in the BB. */
1596 si = bsi_after_labels (bb);
/* Start STEP_EXPR off as a zero of SCALAR_TYPE; its address is passed to
   vect_is_simple_iv_evolution below. */
1598 if (INTEGRAL_TYPE_P (scalar_type))
1599 step_expr = build_int_cst (scalar_type, 0);
1601 step_expr = build_real (scalar_type, dconst0);
1603 /* Is phi in an inner-loop, while vectorizing an enclosing outer-loop? */
1604 if (nested_in_vect_loop_p (loop, iv_phi))
1606 nested_in_vect_loop = true;
1607 iv_loop = loop->inner;
1611 gcc_assert (iv_loop == (bb_for_stmt (iv_phi))->loop_father);
/* LOOP_ARG is the argument of IV_PHI on the latch edge of IV_LOOP. */
1613 latch_e = loop_latch_edge (iv_loop);
1614 loop_arg = PHI_ARG_DEF_FROM_EDGE (iv_phi, latch_e);
1616 access_fn = analyze_scalar_evolution (iv_loop, PHI_RESULT (iv_phi));
1617 gcc_assert (access_fn);
1618 ok = vect_is_simple_iv_evolution (iv_loop->num, access_fn,
1619 &init_expr, &step_expr);
1621 pe = loop_preheader_edge (iv_loop);
1623 /* Create the vector that holds the initial_value of the induction. */
1624 if (nested_in_vect_loop)
1626 /* iv_loop is nested in the loop to be vectorized. init_expr had already
1627 been created during vectorization of previous stmts; We obtain it from
1628 the STMT_VINFO_VEC_STMT of the defining stmt. */
1629 tree iv_def = PHI_ARG_DEF_FROM_EDGE (iv_phi, loop_preheader_edge (iv_loop));
1630 vec_init = vect_get_vec_def_for_operand (iv_def, iv_phi, NULL);
1634 /* iv_loop is the loop to be vectorized. Create:
1635 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
1636 new_var = vect_get_new_vect_var (scalar_type, vect_scalar_var, "var_");
1637 add_referenced_var (new_var);
1639 new_name = force_gimple_operand (init_expr, &stmts, false, new_var);
1642 new_bb = bsi_insert_on_edge_immediate (pe, stmts);
1643 gcc_assert (!new_bb);
1647 t = tree_cons (NULL_TREE, init_expr, t);
1648 for (i = 1; i < nunits; i++)
1652 /* Create: new_name_i = new_name + step_expr */
1653 tmp = fold_build2 (PLUS_EXPR, scalar_type, new_name, step_expr);
1654 init_stmt = build_gimple_modify_stmt (new_var, tmp);
1655 new_name = make_ssa_name (new_var, init_stmt);
1656 GIMPLE_STMT_OPERAND (init_stmt, 0) = new_name;
1658 new_bb = bsi_insert_on_edge_immediate (pe, init_stmt);
1659 gcc_assert (!new_bb);
1661 if (vect_print_dump_info (REPORT_DETAILS))
1663 fprintf (vect_dump, "created new init_stmt: ");
1664 print_generic_expr (vect_dump, init_stmt, TDF_SLIM);
1666 t = tree_cons (NULL_TREE, new_name, t);
1668 /* Create a vector from [new_name_0, new_name_1, ..., new_name_nunits-1] */
1669 vec = build_constructor_from_list (vectype, nreverse (t));
1670 vec_init = vect_init_vector (iv_phi, vec, vectype, NULL);
1674 /* Create the vector that holds the step of the induction. */
1675 if (nested_in_vect_loop)
1676 /* iv_loop is nested in the loop to be vectorized. Generate:
1677 vec_step = [S, S, S, S] */
1678 new_name = step_expr;
1681 /* iv_loop is the loop to be vectorized. Generate:
1682 vec_step = [VF*S, VF*S, VF*S, VF*S] */
1683 expr = build_int_cst (scalar_type, vf);
1684 new_name = fold_build2 (MULT_EXPR, scalar_type, expr, step_expr);
1688 for (i = 0; i < nunits; i++)
1689 t = tree_cons (NULL_TREE, unshare_expr (new_name), t);
1690 gcc_assert (CONSTANT_CLASS_P (new_name));
1691 vec = build_vector (vectype, t);
1692 vec_step = vect_init_vector (iv_phi, vec, vectype, NULL);
1695 /* Create the following def-use cycle:
1700 vec_iv = PHI <vec_init, vec_loop>
1704 vec_loop = vec_iv + vec_step; */
1706 /* Create the induction-phi that defines the induction-operand. */
1707 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
1708 add_referenced_var (vec_dest);
1709 induction_phi = create_phi_node (vec_dest, iv_loop->header);
1710 set_stmt_info (get_stmt_ann (induction_phi),
1711 new_stmt_vec_info (induction_phi, loop_vinfo));
1712 induc_def = PHI_RESULT (induction_phi);
1714 /* Create the iv update inside the loop */
1715 new_stmt = build_gimple_modify_stmt (NULL_TREE,
1716 build2 (PLUS_EXPR, vectype,
1717 induc_def, vec_step));
1718 vec_def = make_ssa_name (vec_dest, new_stmt);
1719 GIMPLE_STMT_OPERAND (new_stmt, 0) = vec_def;
1720 bsi_insert_before (&si, new_stmt, BSI_SAME_STMT);
1721 set_stmt_info (get_stmt_ann (new_stmt),
1722 new_stmt_vec_info (new_stmt, loop_vinfo));
1724 /* Set the arguments of the phi node: */
1725 add_phi_arg (induction_phi, vec_init, pe);
1726 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop));
1729 /* In case that vectorization factor (VF) is bigger than the number
1730 of elements that we can fit in a vectype (nunits), we have to generate
1731 more than one vector stmt - i.e - we need to "unroll" the
1732 vector stmt by a factor VF/nunits. For more details see documentation
1733 in vectorizable_operation. */
1737 stmt_vec_info prev_stmt_vinfo;
1738 /* FORNOW. This restriction should be relaxed. */
1739 gcc_assert (!nested_in_vect_loop);
1741 /* Create the vector that holds the step of the induction. */
1742 expr = build_int_cst (scalar_type, nunits);
1743 new_name = fold_build2 (MULT_EXPR, scalar_type, expr, step_expr);
1745 for (i = 0; i < nunits; i++)
1746 t = tree_cons (NULL_TREE, unshare_expr (new_name), t);
1747 gcc_assert (CONSTANT_CLASS_P (new_name));
1748 vec = build_vector (vectype, t);
1749 vec_step = vect_init_vector (iv_phi, vec, vectype, NULL);
1751 vec_def = induc_def;
1752 prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
1753 for (i = 1; i < ncopies; i++)
1757 /* vec_i = vec_prev + vec_step */
1758 tmp = build2 (PLUS_EXPR, vectype, vec_def, vec_step);
1759 new_stmt = build_gimple_modify_stmt (NULL_TREE, tmp);
1760 vec_def = make_ssa_name (vec_dest, new_stmt);
1761 GIMPLE_STMT_OPERAND (new_stmt, 0) = vec_def;
1762 bsi_insert_before (&si, new_stmt, BSI_SAME_STMT);
1763 set_stmt_info (get_stmt_ann (new_stmt),
1764 new_stmt_vec_info (new_stmt, loop_vinfo));
/* Chain the copies: each stmt records the next one in its
   STMT_VINFO_RELATED_STMT, starting from the induction phi. */
1765 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
1766 prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
1770 if (nested_in_vect_loop)
1772 /* Find the loop-closed exit-phi of the induction, and record
1773 the final vector of induction results: */
1775 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
1777 if (!flow_bb_inside_loop_p (iv_loop, bb_for_stmt (USE_STMT (use_p))))
1779 exit_phi = USE_STMT (use_p);
1785 stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
1786 /* FORNOW. Currently not supporting the case that an inner-loop induction
1787 is not used in the outer-loop (i.e. only outside the outer-loop). */
1788 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
1789 && !STMT_VINFO_LIVE_P (stmt_vinfo));
1791 STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
1792 if (vect_print_dump_info (REPORT_DETAILS))
1794 fprintf (vect_dump, "vector of inductions after inner-loop:");
1795 print_generic_expr (vect_dump, new_stmt, TDF_SLIM);
1801 if (vect_print_dump_info (REPORT_DETAILS))
1803 fprintf (vect_dump, "transform induction: created def-use cycle:");
1804 print_generic_expr (vect_dump, induction_phi, TDF_SLIM);
1805 fprintf (vect_dump, "\n");
1806 print_generic_expr (vect_dump, SSA_NAME_DEF_STMT (vec_def), TDF_SLIM);
/* Record the new vector phi as the vectorized stmt of IV_PHI. */
1809 STMT_VINFO_VEC_STMT (phi_info) = induction_phi;
1814 /* Function vect_get_vec_def_for_operand.
1816 OP is an operand in STMT. This function returns a (vector) def that will be
1817 used in the vectorized stmt for STMT.
1819 In the case that OP is an SSA_NAME which is defined in the loop, then
1820 STMT_VINFO_VEC_STMT of the defining stmt holds the relevant def.
1822 In case OP is an invariant or constant, a new stmt that creates a vector def
1823 needs to be introduced.
SCALAR_DEF receives the scalar def-stmt of OP in Case 3 below, and is
forwarded to get_initial_def_for_reduction in Case 4. Callers in this
file pass NULL for it. */
1826 vect_get_vec_def_for_operand (tree op, tree stmt, tree *scalar_def)
1831 stmt_vec_info def_stmt_info = NULL;
1832 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
1833 tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
1834 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
1835 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
1841 enum vect_def_type dt;
1845 if (vect_print_dump_info (REPORT_DETAILS))
1847 fprintf (vect_dump, "vect_get_vec_def_for_operand: ");
1848 print_generic_expr (vect_dump, op, TDF_SLIM);
/* Classify OP: DT receives the kind of def, DEF_STMT and DEF its scalar
   definition. OP is required to be a simple use. */
1851 is_simple_use = vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt);
1852 gcc_assert (is_simple_use);
1853 if (vect_print_dump_info (REPORT_DETAILS))
1857 fprintf (vect_dump, "def = ");
1858 print_generic_expr (vect_dump, def, TDF_SLIM);
1862 fprintf (vect_dump, " def_stmt = ");
1863 print_generic_expr (vect_dump, def_stmt, TDF_SLIM);
1869 /* Case 1: operand is a constant. */
1870 case vect_constant_def:
1875 /* Create 'vect_cst_ = {cst,cst,...,cst}' */
1876 if (vect_print_dump_info (REPORT_DETAILS))
1877 fprintf (vect_dump, "Create vector_cst. nunits = %d", nunits);
1879 for (i = nunits - 1; i >= 0; --i)
1881 t = tree_cons (NULL_TREE, op, t);
1883 vector_type = get_vectype_for_scalar_type (TREE_TYPE (op));
1884 gcc_assert (vector_type);
1885 vec_cst = build_vector (vector_type, t);
1887 return vect_init_vector (stmt, vec_cst, vector_type, NULL);
1890 /* Case 2: operand is defined outside the loop - loop invariant. */
1891 case vect_invariant_def:
1896 /* Create 'vec_inv = {inv,inv,..,inv}' */
1897 if (vect_print_dump_info (REPORT_DETAILS))
1898 fprintf (vect_dump, "Create vector_inv.");
1900 for (i = nunits - 1; i >= 0; --i)
1902 t = tree_cons (NULL_TREE, def, t);
1905 /* FIXME: use build_constructor directly. */
1906 vector_type = get_vectype_for_scalar_type (TREE_TYPE (def));
1907 gcc_assert (vector_type);
1908 vec_inv = build_constructor_from_list (vector_type, t);
1909 return vect_init_vector (stmt, vec_inv, vector_type, NULL);
1912 /* Case 3: operand is defined inside the loop. */
1916 *scalar_def = def_stmt;
1918 /* Get the def from the vectorized stmt. */
1919 def_stmt_info = vinfo_for_stmt (def_stmt);
1920 vec_stmt = STMT_VINFO_VEC_STMT (def_stmt_info);
1921 gcc_assert (vec_stmt);
1922 if (TREE_CODE (vec_stmt) == PHI_NODE)
1923 vec_oprnd = PHI_RESULT (vec_stmt);
1925 vec_oprnd = GIMPLE_STMT_OPERAND (vec_stmt, 0);
1929 /* Case 4: operand is defined by a loop header phi - reduction */
1930 case vect_reduction_def:
1934 gcc_assert (TREE_CODE (def_stmt) == PHI_NODE);
1935 loop = (bb_for_stmt (def_stmt))->loop_father;
1937 /* Get the def before the loop */
1938 op = PHI_ARG_DEF_FROM_EDGE (def_stmt, loop_preheader_edge (loop));
1939 return get_initial_def_for_reduction (stmt, op, scalar_def);
1942 /* Case 5: operand is defined by loop-header phi - induction. */
1943 case vect_induction_def:
1945 gcc_assert (TREE_CODE (def_stmt) == PHI_NODE);
1947 /* Get the def from the vectorized stmt. */
1948 def_stmt_info = vinfo_for_stmt (def_stmt);
1949 vec_stmt = STMT_VINFO_VEC_STMT (def_stmt_info);
1950 gcc_assert (vec_stmt && (TREE_CODE (vec_stmt) == PHI_NODE));
1951 vec_oprnd = PHI_RESULT (vec_stmt);
1961 /* Function vect_get_vec_def_for_stmt_copy
1963 Return a vector-def for an operand. This function is used when the
1964 vectorized stmt to be created (by the caller to this function) is a "copy"
1965 created in case the vectorized result cannot fit in one vector, and several
1966 copies of the vector-stmt are required. In this case the vector-def is
1967 retrieved from the vector stmt recorded in the STMT_VINFO_RELATED_STMT field
1968 of the stmt that defines VEC_OPRND.
1969 DT is the type of the vector def VEC_OPRND.
1972 In case the vectorization factor (VF) is bigger than the number
1973 of elements that can fit in a vectype (nunits), we have to generate
1974 more than one vector stmt to vectorize the scalar stmt. This situation
1975 arises when there are multiple data-types operated upon in the loop; the
1976 smallest data-type determines the VF, and as a result, when vectorizing
1977 stmts operating on wider types we need to create 'VF/nunits' "copies" of the
1978 vector stmt (each computing a vector of 'nunits' results, and together
1979 computing 'VF' results in each iteration). This function is called when
1980 vectorizing such a stmt (e.g. vectorizing S2 in the illustration below, in
1981 which VF=16 and nunits=4, so the number of copies required is 4):
1983 scalar stmt: vectorized into: STMT_VINFO_RELATED_STMT
1985 S1: x = load VS1.0: vx.0 = memref0 VS1.1
1986 VS1.1: vx.1 = memref1 VS1.2
1987 VS1.2: vx.2 = memref2 VS1.3
1988 VS1.3: vx.3 = memref3
1990 S2: z = x + ... VSnew.0: vz0 = vx.0 + ... VSnew.1
1991 VSnew.1: vz1 = vx.1 + ... VSnew.2
1992 VSnew.2: vz2 = vx.2 + ... VSnew.3
1993 VSnew.3: vz3 = vx.3 + ...
1995 The vectorization of S1 is explained in vectorizable_load.
1996 The vectorization of S2:
1997 To create the first vector-stmt out of the 4 copies - VSnew.0 -
1998 the function 'vect_get_vec_def_for_operand' is called to
1999 get the relevant vector-def for each operand of S2. For operand x it
2000 returns the vector-def 'vx.0'.
2002 To create the remaining copies of the vector-stmt (VSnew.j), this
2003 function is called to get the relevant vector-def for each operand. It is
2004 obtained from the respective VS1.j stmt, which is recorded in the
2005 STMT_VINFO_RELATED_STMT field of the stmt that defines VEC_OPRND.
2007 For example, to obtain the vector-def 'vx.1' in order to create the
2008 vector stmt 'VSnew.1', this function is called with VEC_OPRND='vx.0'.
2009 Given 'vx.0' we obtain the stmt that defines it ('VS1.0'); from the
2010 STMT_VINFO_RELATED_STMT field of 'VS1.0' we obtain the next copy - 'VS1.1',
2011 and return its def ('vx.1').
2012 Overall, to create the above sequence this function will be called 3 times:
2013 vx.1 = vect_get_vec_def_for_stmt_copy (dt, vx.0);
2014 vx.2 = vect_get_vec_def_for_stmt_copy (dt, vx.1);
2015 vx.3 = vect_get_vec_def_for_stmt_copy (dt, vx.2); */
2018 vect_get_vec_def_for_stmt_copy (enum vect_def_type dt, tree vec_oprnd)
2020 tree vec_stmt_for_operand;
2021 stmt_vec_info def_stmt_info;
2023 /* Do nothing; can reuse same def. */
2024 if (dt == vect_invariant_def || dt == vect_constant_def )
/* Go from VEC_OPRND's defining stmt to the next copy recorded in its
   STMT_VINFO_RELATED_STMT, and take operand 0 (the def) of that copy. Both
   the stmt info and the related stmt are required to exist. */
2027 vec_stmt_for_operand = SSA_NAME_DEF_STMT (vec_oprnd);
2028 def_stmt_info = vinfo_for_stmt (vec_stmt_for_operand);
2029 gcc_assert (def_stmt_info);
2030 vec_stmt_for_operand = STMT_VINFO_RELATED_STMT (def_stmt_info);
2031 gcc_assert (vec_stmt_for_operand);
2032 vec_oprnd = GIMPLE_STMT_OPERAND (vec_stmt_for_operand, 0);
2037 /* Get vectorized definitions for the operands to create a copy of an original
2038 stmt. See vect_get_vec_def_for_stmt_copy() for details.
The last entry of *VEC_OPRNDS0 is popped, replaced by the def of its next
copy, and pushed back. The same is done for *VEC_OPRNDS1 when both
VEC_OPRNDS1 and *VEC_OPRNDS1 are non-NULL. DT[0] and DT[1] are the def
types passed along for the first and second operand respectively. */
2041 vect_get_vec_defs_for_stmt_copy (enum vect_def_type *dt,
2042 VEC(tree,heap) **vec_oprnds0,
2043 VEC(tree,heap) **vec_oprnds1)
2045 tree vec_oprnd = VEC_pop (tree, *vec_oprnds0);
2047 vec_oprnd = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd);
2048 VEC_quick_push (tree, *vec_oprnds0, vec_oprnd);
2050 if (vec_oprnds1 && *vec_oprnds1)
2052 vec_oprnd = VEC_pop (tree, *vec_oprnds1);
2053 vec_oprnd = vect_get_vec_def_for_stmt_copy (dt[1], vec_oprnd);
2054 VEC_quick_push (tree, *vec_oprnds1, vec_oprnd);
2059 /* Get vectorized definitions for OP0 and OP1, or SLP_NODE if it is not NULL. */
2062 vect_get_vec_defs (tree op0, tree op1, tree stmt, VEC(tree,heap) **vec_oprnds0,
2063 VEC(tree,heap) **vec_oprnds1, slp_tree slp_node)
2066 vect_get_slp_defs (slp_node, vec_oprnds0, vec_oprnds1);
/* *VEC_OPRNDS0 is allocated with room for one element: the vector def of
   OP0 in STMT. */
2071 *vec_oprnds0 = VEC_alloc (tree, heap, 1);
2072 vec_oprnd = vect_get_vec_def_for_operand (op0, stmt, NULL);
2073 VEC_quick_push (tree, *vec_oprnds0, vec_oprnd);
/* Likewise *VEC_OPRNDS1 for OP1. */
2077 *vec_oprnds1 = VEC_alloc (tree, heap, 1);
2078 vec_oprnd = vect_get_vec_def_for_operand (op1, stmt, NULL);
2079 VEC_quick_push (tree, *vec_oprnds1, vec_oprnd);
2085 /* Function vect_finish_stmt_generation.
2087 Insert a new stmt. VEC_STMT is inserted before STMT, which BSI must point
to and which must not be a LABEL_EXPR; BSI is left pointing at STMT.
VEC_STMT gets a fresh stmt_vec_info for STMT's loop_vec_info, and the
source location of STMT is copied onto it. */
2090 vect_finish_stmt_generation (tree stmt, tree vec_stmt,
2091 block_stmt_iterator *bsi)
2093 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2094 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2096 gcc_assert (stmt == bsi_stmt (*bsi));
2097 gcc_assert (TREE_CODE (stmt) != LABEL_EXPR);
2099 bsi_insert_before (bsi, vec_stmt, BSI_SAME_STMT);
2101 set_stmt_info (get_stmt_ann (vec_stmt),
2102 new_stmt_vec_info (vec_stmt, loop_vinfo));
2104 if (vect_print_dump_info (REPORT_DETAILS))
2106 fprintf (vect_dump, "add new stmt: ");
2107 print_generic_expr (vect_dump, vec_stmt, TDF_SLIM);
2110 /* Make sure bsi points to the stmt that is being vectorized. */
2111 gcc_assert (stmt == bsi_stmt (*bsi));
/* Give VEC_STMT the source location of STMT, using whichever location
   representation the compiler was configured with. */
2113 #ifdef USE_MAPPED_LOCATION
2114 SET_EXPR_LOCATION (vec_stmt, EXPR_LOCATION (stmt));
2116 SET_EXPR_LOCUS (vec_stmt, EXPR_LOCUS (stmt));
2121 /* Function get_initial_def_for_reduction
2124 STMT - a stmt that performs a reduction operation in the loop.
2125 INIT_VAL - the initial value of the reduction variable
2128 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
2129 of the reduction (used for adjusting the epilog - see below).
2130 Return a vector variable, initialized according to the operation that STMT
2131 performs. This vector will be used as the initial value of the
2132 vector of partial results.
2134 Option1 (adjust in epilog): Initialize the vector as follows:
2137 min/max: [init_val,init_val,..,init_val,init_val]
2138 bit and/or: [init_val,init_val,..,init_val,init_val]
2139 and when necessary (e.g. add/mult case) let the caller know
2140 that it needs to adjust the result by init_val.
2142 Option2: Initialize the vector as follows:
2143 add: [0,0,...,0,init_val]
2144 mult: [1,1,...,1,init_val]
2145 min/max: [init_val,init_val,...,init_val]
2146 bit and/or: [init_val,init_val,...,init_val]
2147 and no adjustments are needed.
2149 For example, for the following code:
2155 STMT is 's = s + a[i]', and the reduction variable is 's'.
2156 For a vector of 4 units, we want to return either [0,0,0,init_val],
2157 or [0,0,0,0] and let the caller know that it needs to adjust
2158 the result at the end by 'init_val'.
2160 FORNOW, we are using the 'adjust in epilog' scheme, because this way the
2161 initialization vector is simpler (same element in all entries).
2162 A cost model should help decide between these two schemes. */
2165 get_initial_def_for_reduction (tree stmt, tree init_val, tree *adjustment_def)
2167 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
2168 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
2169 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2170 tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
2171 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
2172 enum tree_code code = TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 1));
2173 tree type = TREE_TYPE (init_val);
2180 bool nested_in_vect_loop = false;
/* Only pointer, integral and scalar floating-point reduction variables are
   handled here. */
2182 gcc_assert (POINTER_TYPE_P (type) || INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type));
2183 if (nested_in_vect_loop_p (loop, stmt))
2184 nested_in_vect_loop = true;
2186 gcc_assert (loop == (bb_for_stmt (stmt))->loop_father);
/* Vectorized def of INIT_VAL; it is used below as the adjustment value in
   the nested case. */
2188 vecdef = vect_get_vec_def_for_operand (init_val, stmt, NULL);
2192 case WIDEN_SUM_EXPR:
/* The caller adjusts the final result by INIT_VAL: with VECDEF when STMT is
   nested in the loop being vectorized, with the scalar INIT_VAL otherwise. */
2195 if (nested_in_vect_loop)
2196 *adjustment_def = vecdef;
2198 *adjustment_def = init_val;
2199 /* Create a vector of zeros for init_def. */
2200 if (SCALAR_FLOAT_TYPE_P (type))
2201 def_for_init = build_real (type, dconst0);
2203 def_for_init = build_int_cst (type, 0);
/* Build a list of NUNITS copies of DEF_FOR_INIT, one element per
   iteration, and turn it into a vector constant. */
2204 for (i = nunits - 1; i >= 0; --i)
2205 t = tree_cons (NULL_TREE, def_for_init, t);
2206 vector_type = get_vectype_for_scalar_type (TREE_TYPE (def_for_init));
2207 gcc_assert (vector_type);
2208 init_def = build_vector (vector_type, t);
/* NOTE(review): the case labels this assignment belongs to are not visible
   in this excerpt; per the function comment it should be the codes (e.g.
   min/max) that need no adjustment in the epilog - confirm. */
2213 *adjustment_def = NULL_TREE;
2225 /* Function vect_create_epilog_for_reduction
2227 Create code at the loop-epilog to finalize the result of a reduction
2230 VECT_DEF is a vector of partial results.
2231 REDUC_CODE is the tree-code for the epilog reduction.
2232 STMT is the scalar reduction stmt that is being vectorized.
2233 REDUCTION_PHI is the phi-node that carries the reduction computation.
2236 1. Creates the reduction def-use cycle: sets the arguments for
2238 The loop-entry argument is the vectorized initial-value of the reduction.
2239 The loop-latch argument is VECT_DEF - the vector of partial sums.
2240 2. "Reduces" the vector of partial results VECT_DEF into a single result,
2241 by applying the operation specified by REDUC_CODE if available, or by
2242 other means (whole-vector shifts or a scalar loop).
2243 The function also creates a new phi node at the loop exit to preserve
2244 loop-closed form, as illustrated below.
2246 The flow at the entry to this function:
2249 vec_def = phi <null, null> # REDUCTION_PHI
2250 VECT_DEF = vector_stmt # vectorized form of STMT
2251 s_loop = scalar_stmt # (scalar) STMT
2253 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
2257 The above is transformed by this function into:
2260 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
2261 VECT_DEF = vector_stmt # vectorized form of STMT
2262 s_loop = scalar_stmt # (scalar) STMT
2264 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
2265 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
2266 v_out2 = reduce <v_out1>
2267 s_out3 = extract_field <v_out2, 0>
2268 s_out4 = adjust_result <s_out3>
2274 vect_create_epilog_for_reduction (tree vect_def, tree stmt,
2275 enum tree_code reduc_code, tree reduction_phi)
2277 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2279 enum machine_mode mode;
2280 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2281 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2282 basic_block exit_bb;
2286 block_stmt_iterator exit_bsi;
2288 tree new_temp = NULL_TREE;
2290 tree epilog_stmt = NULL_TREE;
2291 tree new_scalar_dest, exit_phi, new_dest;
2292 tree bitsize, bitpos, bytesize;
2293 enum tree_code code = TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 1));
2294 tree adjustment_def;
2295 tree vec_initial_def;
2297 imm_use_iterator imm_iter;
2298 use_operand_p use_p;
2299 bool extract_scalar_result = false;
2300 tree reduction_op, expr;
2303 tree operation = GIMPLE_STMT_OPERAND (stmt, 1);
2304 bool nested_in_vect_loop = false;
2306 VEC(tree,heap) *phis = NULL;
2309 if (nested_in_vect_loop_p (loop, stmt))
2312 nested_in_vect_loop = true;
2315 op_type = TREE_OPERAND_LENGTH (operation);
2316 reduction_op = TREE_OPERAND (operation, op_type-1);
2317 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
2318 gcc_assert (vectype);
2319 mode = TYPE_MODE (vectype);
2321 /*** 1. Create the reduction def-use cycle ***/
2323 /* 1.1 set the loop-entry arg of the reduction-phi: */
2324 /* For the case of reduction, vect_get_vec_def_for_operand returns
2325 the scalar def before the loop, that defines the initial value
2326 of the reduction variable. */
2327 vec_initial_def = vect_get_vec_def_for_operand (reduction_op, stmt,
2329 add_phi_arg (reduction_phi, vec_initial_def, loop_preheader_edge (loop));
2331 /* 1.2 set the loop-latch arg for the reduction-phi: */
2332 add_phi_arg (reduction_phi, vect_def, loop_latch_edge (loop));
2334 if (vect_print_dump_info (REPORT_DETAILS))
2336 fprintf (vect_dump, "transform reduction: created def-use cycle:");
2337 print_generic_expr (vect_dump, reduction_phi, TDF_SLIM);
2338 fprintf (vect_dump, "\n");
2339 print_generic_expr (vect_dump, SSA_NAME_DEF_STMT (vect_def), TDF_SLIM);
2343 /*** 2. Create epilog code
2344 The reduction epilog code operates across the elements of the vector
2345 of partial results computed by the vectorized loop.
2346 The reduction epilog code consists of:
2347 step 1: compute the scalar result in a vector (v_out2)
2348 step 2: extract the scalar result (s_out3) from the vector (v_out2)
2349 step 3: adjust the scalar result (s_out3) if needed.
2351 Step 1 can be accomplished using one of the following three schemes:
2352 (scheme 1) using reduc_code, if available.
2353 (scheme 2) using whole-vector shifts, if available.
2354 (scheme 3) using a scalar loop. In this case steps 1+2 above are
2357 The overall epilog code looks like this:
2359 s_out0 = phi <s_loop> # original EXIT_PHI
2360 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
2361 v_out2 = reduce <v_out1> # step 1
2362 s_out3 = extract_field <v_out2, 0> # step 2
2363 s_out4 = adjust_result <s_out3> # step 3
2365 (step 3 is optional, and steps 1 and 2 may be combined).
2366 Lastly, the uses of s_out0 are replaced by s_out4.
2370 /* 2.1 Create new loop-exit-phi to preserve loop-closed form:
2371 v_out1 = phi <v_loop> */
2373 exit_bb = single_exit (loop)->dest;
2374 new_phi = create_phi_node (SSA_NAME_VAR (vect_def), exit_bb);
/* The new phi's argument on the loop's single exit edge is VECT_DEF. */
2375 SET_PHI_ARG_DEF (new_phi, single_exit (loop)->dest_idx, vect_def);
/* All epilog stmts below are inserted at EXIT_BSI, after the labels of the
   exit block. */
2376 exit_bsi = bsi_after_labels (exit_bb);
2378 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
2379 (i.e. when reduc_code is not available) and in the final adjustment
2380 code (if needed). Also get the original scalar reduction variable as
2381 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
2382 represents a reduction pattern), the tree-code and scalar-def are
2383 taken from the original stmt that the pattern-stmt (STMT) replaces.
2384 Otherwise (it is a regular reduction) - the tree-code and scalar-def
2385 are taken from STMT. */
2387 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
2390 /* Regular reduction */
2395 /* Reduction pattern */
2396 stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
2397 gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
2398 gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
2400 code = TREE_CODE (GIMPLE_STMT_OPERAND (orig_stmt, 1));
2401 scalar_dest = GIMPLE_STMT_OPERAND (orig_stmt, 0);
2402 scalar_type = TREE_TYPE (scalar_dest);
2403 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
2404 bitsize = TYPE_SIZE (scalar_type);
2405 bytesize = TYPE_SIZE_UNIT (scalar_type);
2408 /* In case this is a reduction in an inner-loop while vectorizing an outer
2409 loop - we don't need to extract a single scalar result at the end of the
2410 inner-loop. The final vector of partial results will be used in the
2411 vectorized outer-loop, or reduced to a scalar result at the end of the
2413 if (nested_in_vect_loop)
2414 goto vect_finalize_reduction;
2416 /* 2.3 Create the reduction code, using one of the three schemes described
2419 if (reduc_code < NUM_TREE_CODES)
2423 /*** Case 1: Create:
2424 v_out2 = reduc_expr <v_out1> */
2426 if (vect_print_dump_info (REPORT_DETAILS))
2427 fprintf (vect_dump, "Reduce using direct vector reduction.");
2429 vec_dest = vect_create_destination_var (scalar_dest, vectype);
2430 tmp = build1 (reduc_code, vectype, PHI_RESULT (new_phi));
2431 epilog_stmt = build_gimple_modify_stmt (vec_dest, tmp);
2432 new_temp = make_ssa_name (vec_dest, epilog_stmt);
2433 GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_temp;
2434 bsi_insert_before (&exit_bsi, epilog_stmt, BSI_SAME_STMT);
2436 extract_scalar_result = true;
2440 enum tree_code shift_code = 0;
2441 bool have_whole_vector_shift = true;
2443 int element_bitsize = tree_low_cst (bitsize, 1);
2444 int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
/* Whole-vector shifts are usable only if the target provides a vec_shr
   instruction for MODE. */
2447 if (optab_handler (vec_shr_optab, mode)->insn_code != CODE_FOR_nothing)
2448 shift_code = VEC_RSHIFT_EXPR;
2450 have_whole_vector_shift = false;
2452 /* Regardless of whether we have a whole vector shift, if we're
2453 emulating the operation via tree-vect-generic, we don't want
2454 to use it. Only the first round of the reduction is likely
2455 to still be profitable via emulation. */
2456 /* ??? It might be better to emit a reduction tree code here, so that
2457 tree-vect-generic can expand the first round via bit tricks. */
2458 if (!VECTOR_MODE_P (mode))
2459 have_whole_vector_shift = false;
/* The reduction operation CODE itself must also be supported by the target
   on VECTYPE for the shift scheme to be used. */
2462 optab optab = optab_for_tree_code (code, vectype);
2463 if (optab_handler (optab, mode)->insn_code == CODE_FOR_nothing)
2464 have_whole_vector_shift = false;
2467 if (have_whole_vector_shift)
2469 /*** Case 2: Create:
2470 for (offset = VS/2; offset >= element_size; offset/=2)
2472 Create: va' = vec_shift <va, offset>
2473 Create: va = vop <va, va'>
2476 if (vect_print_dump_info (REPORT_DETAILS))
2477 fprintf (vect_dump, "Reduce using vector shifts");
2479 vec_dest = vect_create_destination_var (scalar_dest, vectype);
2480 new_temp = PHI_RESULT (new_phi);
2482 for (bit_offset = vec_size_in_bits/2;
2483 bit_offset >= element_bitsize;
2486 tree bitpos = size_int (bit_offset);
2487 tree tmp = build2 (shift_code, vectype, new_temp, bitpos);
2488 epilog_stmt = build_gimple_modify_stmt (vec_dest, tmp);
2489 new_name = make_ssa_name (vec_dest, epilog_stmt);
2490 GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_name;
2491 bsi_insert_before (&exit_bsi, epilog_stmt, BSI_SAME_STMT);
2493 tmp = build2 (code, vectype, new_name, new_temp);
2494 epilog_stmt = build_gimple_modify_stmt (vec_dest, tmp);
2495 new_temp = make_ssa_name (vec_dest, epilog_stmt);
2496 GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_temp;
2497 bsi_insert_before (&exit_bsi, epilog_stmt, BSI_SAME_STMT);
2500 extract_scalar_result = true;
2506 /*** Case 3: Create:
2507 s = extract_field <v_out2, 0>
2508 for (offset = element_size;
2509 offset < vector_size;
2510 offset += element_size;)
2512 Create: s' = extract_field <v_out2, offset>
2513 Create: s = op <s, s'>
2516 if (vect_print_dump_info (REPORT_DETAILS))
2517 fprintf (vect_dump, "Reduce using scalar code. ");
2519 vec_temp = PHI_RESULT (new_phi);
2520 vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
2521 rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
2523 BIT_FIELD_REF_UNSIGNED (rhs) = TYPE_UNSIGNED (scalar_type);
2524 epilog_stmt = build_gimple_modify_stmt (new_scalar_dest, rhs);
2525 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
2526 GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_temp;
2527 bsi_insert_before (&exit_bsi, epilog_stmt, BSI_SAME_STMT);
2529 for (bit_offset = element_bitsize;
2530 bit_offset < vec_size_in_bits;
2531 bit_offset += element_bitsize)
2534 tree bitpos = bitsize_int (bit_offset);
2535 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
2538 BIT_FIELD_REF_UNSIGNED (rhs) = TYPE_UNSIGNED (scalar_type);
2539 epilog_stmt = build_gimple_modify_stmt (new_scalar_dest, rhs);
2540 new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
2541 GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_name;
2542 bsi_insert_before (&exit_bsi, epilog_stmt, BSI_SAME_STMT);
2544 tmp = build2 (code, scalar_type, new_name, new_temp);
2545 epilog_stmt = build_gimple_modify_stmt (new_scalar_dest, tmp);
2546 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
2547 GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_temp;
2548 bsi_insert_before (&exit_bsi, epilog_stmt, BSI_SAME_STMT);
2551 extract_scalar_result = false;
2555 /* 2.4 Extract the final scalar result. Create:
2556 s_out3 = extract_field <v_out2, bitpos> */
2558 if (extract_scalar_result)
2562 gcc_assert (!nested_in_vect_loop);
2563 if (vect_print_dump_info (REPORT_DETAILS))
2564 fprintf (vect_dump, "extract scalar result");
/* The bit position of the extracted element depends on endianness:
   (number of vector elements - 1) * element size on big-endian targets,
   bit position zero otherwise. */
2566 if (BYTES_BIG_ENDIAN)
2567 bitpos = size_binop (MULT_EXPR,
2568 bitsize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1),
2569 TYPE_SIZE (scalar_type));
2571 bitpos = bitsize_zero_node;
2573 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp, bitsize, bitpos);
2574 BIT_FIELD_REF_UNSIGNED (rhs) = TYPE_UNSIGNED (scalar_type);
2575 epilog_stmt = build_gimple_modify_stmt (new_scalar_dest, rhs);
2576 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
2577 GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_temp;
2578 bsi_insert_before (&exit_bsi, epilog_stmt, BSI_SAME_STMT);
2581 vect_finalize_reduction:
2583 /* 2.5 Adjust the final result by the initial value of the reduction
2584 variable. (When such adjustment is not needed, then
2585 'adjustment_def' is zero). For example, if code is PLUS we create:
2586 new_temp = loop_exit_def + adjustment_def */
/* In the nested case the adjustment is a vector and is applied to the
   vector of partial results (the result of NEW_PHI); otherwise it is a
   scalar and is applied to the scalar result NEW_TEMP. */
2590 if (nested_in_vect_loop)
2592 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
2593 expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
2594 new_dest = vect_create_destination_var (scalar_dest, vectype);
2598 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
2599 expr = build2 (code, scalar_type, new_temp, adjustment_def);
2600 new_dest = vect_create_destination_var (scalar_dest, scalar_type);
2602 epilog_stmt = build_gimple_modify_stmt (new_dest, expr);
2603 new_temp = make_ssa_name (new_dest, epilog_stmt);
2604 GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_temp;
2605 bsi_insert_before (&exit_bsi, epilog_stmt, BSI_SAME_STMT);
2609 /* 2.6 Handle the loop-exit phi */
2611 /* Replace uses of s_out0 with uses of the final result NEW_TEMP (s_out3,
2612 or s_out4 when an adjustment was applied). Find the loop-closed-use at the loop exit of the original scalar result.
2613 (The reduction result is expected to have two immediate uses - one at the
2614 latch block, and one at the loop exit). */
/* NOTE(review): PHIS is allocated with room for 10 entries and filled with
   VEC_quick_push, which does not grow the vector - confirm that SCALAR_DEST
   cannot have more than 10 uses outside the loop. */
2615 phis = VEC_alloc (tree, heap, 10);
2616 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
2618 if (!flow_bb_inside_loop_p (loop, bb_for_stmt (USE_STMT (use_p))))
2620 exit_phi = USE_STMT (use_p);
2621 VEC_quick_push (tree, phis, exit_phi);
2624 /* We expect to have found an exit_phi because of loop-closed-ssa form. */
2625 gcc_assert (!VEC_empty (tree, phis));
2627 for (i = 0; VEC_iterate (tree, phis, i, exit_phi); i++)
2629 if (nested_in_vect_loop)
2631 stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
2633 /* FORNOW. Currently not supporting the case that an inner-loop reduction
2634 is not used in the outer-loop (but only outside the outer-loop). */
2635 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
2636 && !STMT_VINFO_LIVE_P (stmt_vinfo));
/* When there is no adjustment, NEW_PHI is recorded as the vectorized stmt
   of EXIT_PHI instead of an epilog stmt. */
2638 epilog_stmt = adjustment_def ? epilog_stmt : new_phi;
2639 STMT_VINFO_VEC_STMT (stmt_vinfo) = epilog_stmt;
2640 set_stmt_info (get_stmt_ann (epilog_stmt),
2641 new_stmt_vec_info (epilog_stmt, loop_vinfo));
2645 /* Replace the uses: */
2646 orig_name = PHI_RESULT (exit_phi);
2647 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
2648 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
2649 SET_USE (use_p, new_temp);
/* The temporary list of exit phis is no longer needed. */
2651 VEC_free (tree, heap, phis);
2655 /* Function vectorizable_reduction.
2657 Check if STMT performs a reduction operation that can be vectorized.
2658 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
2659 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
2660 Return FALSE if not a vectorizable STMT, TRUE otherwise.
2662 This function also handles reduction idioms (patterns) that have been
2663 recognized in advance during vect_pattern_recog. In this case, STMT may be
2665 X = pattern_expr (arg0, arg1, ..., X)
2666 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
2667 sequence that had been detected and replaced by the pattern-stmt (STMT).
2669 In some cases of reduction patterns, the type of the reduction variable X is
2670 different than the type of the other arguments of STMT.
2671 In such cases, the vectype that is used when transforming STMT into a vector
2672 stmt is different than the vectype that is used to determine the
2673 vectorization factor, because it consists of a different number of elements
2674 than the actual number of elements that are being operated upon in parallel.
2676 For example, consider an accumulation of shorts into an int accumulator.
2677 On some targets it's possible to vectorize this pattern operating on 8
2678 shorts at a time (hence, the vectype for purposes of determining the
2679 vectorization factor should be V8HI); on the other hand, the vectype that
2680 is used to create the vector form is actually V4SI (the type of the result).
2682 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
2683 indicates what is the actual level of parallelism (V8HI in the example), so
2684 that the right vectorization factor would be derived. This vectype
2685 corresponds to the type of arguments to the reduction stmt, and should *NOT*
2686 be used to create the vectorized stmt. The right vectype for the vectorized
2687 stmt is obtained from the type of the result X:
2688 get_vectype_for_scalar_type (TREE_TYPE (X))
2690 This means that, contrary to "regular" reductions (or "regular" stmts in
2691 general), the following equation:
2692 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
2693 does *NOT* necessarily hold for reduction patterns. */
2696 vectorizable_reduction (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
2701 tree loop_vec_def0 = NULL_TREE, loop_vec_def1 = NULL_TREE;
2702 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2703 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2704 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2705 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2707 enum tree_code code, orig_code, epilog_reduc_code = 0;
2708 enum machine_mode vec_mode;
2710 optab optab, reduc_optab;
2711 tree new_temp = NULL_TREE;
2713 enum vect_def_type dt;
2718 stmt_vec_info orig_stmt_info;
2719 tree expr = NULL_TREE;
2721 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
/* Number of vector stmts to generate for STMT: the vectorization factor
   divided by the number of elements in VECTYPE. */
2722 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
2723 stmt_vec_info prev_stmt_info;
2725 tree new_stmt = NULL_TREE;
2728 if (nested_in_vect_loop_p (loop, stmt))
2731 /* FORNOW. This restriction should be relaxed. */
2734 if (vect_print_dump_info (REPORT_DETAILS))
2735 fprintf (vect_dump, "multiple types in nested loop.");
2740 gcc_assert (ncopies >= 1);
2742 /* FORNOW: SLP not supported. */
2743 if (STMT_SLP_TYPE (stmt_info))
2746 /* 1. Is vectorizable reduction? */
2748 /* Not supportable if the reduction variable is used in the loop. */
2749 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer)
2752 /* Reductions that are not used even in an enclosing outer-loop,
2753 are expected to be "live" (used out of the loop). */
2754 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_loop
2755 && !STMT_VINFO_LIVE_P (stmt_info))
2758 /* Make sure it was already recognized as a reduction computation. */
2759 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def)
2762 /* 2. Has this been recognized as a reduction pattern?
2764 Check if STMT represents a pattern that has been recognized
2765 in earlier analysis stages. For stmts that represent a pattern,
2766 the STMT_VINFO_RELATED_STMT field records the last stmt in
2767 the original sequence that constitutes the pattern. */
2769 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
2772 orig_stmt_info = vinfo_for_stmt (orig_stmt);
2773 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt);
2774 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
2775 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
2778 /* 3. Check the operands of the operation. The first operands are defined
2779 inside the loop body. The last operand is the reduction variable,
2780 which is defined by the loop-header-phi. */
2782 gcc_assert (TREE_CODE (stmt) == GIMPLE_MODIFY_STMT);
2784 operation = GIMPLE_STMT_OPERAND (stmt, 1);
2785 code = TREE_CODE (operation);
2786 op_type = TREE_OPERAND_LENGTH (operation);
2787 if (op_type != binary_op && op_type != ternary_op)
2789 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
2790 scalar_type = TREE_TYPE (scalar_dest);
2791 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
2792 && !SCALAR_FLOAT_TYPE_P (scalar_type))
2795 /* All uses but the last are expected to be defined in the loop.
2796 The last use is the reduction variable. */
2797 for (i = 0; i < op_type-1; i++)
2799 op = TREE_OPERAND (operation, i);
2800 is_simple_use = vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt);
2801 gcc_assert (is_simple_use);
2802 if (dt != vect_loop_def
2803 && dt != vect_invariant_def
2804 && dt != vect_constant_def
2805 && dt != vect_induction_def)
/* When the loop above runs to completion, I indexes the last operand, i.e.
   the reduction variable; its defining stmt must be a phi node. */
2809 op = TREE_OPERAND (operation, i);
2810 is_simple_use = vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt);
2811 gcc_assert (is_simple_use);
2812 gcc_assert (dt == vect_reduction_def);
2813 gcc_assert (TREE_CODE (def_stmt) == PHI_NODE);
/* NOTE(review): the condition selecting between the next two assertions is
   not visible in this excerpt; presumably the first applies to reduction
   patterns (ORIG_STMT set) and the second to regular reductions - confirm. */
2815 gcc_assert (orig_stmt == vect_is_simple_reduction (loop_vinfo, def_stmt));
2817 gcc_assert (stmt == vect_is_simple_reduction (loop_vinfo, def_stmt));
2819 if (STMT_VINFO_LIVE_P (vinfo_for_stmt (def_stmt)))
2822 /* 4. Supportable by target? */
2824 /* 4.1. check support for the operation in the loop */
2825 optab = optab_for_tree_code (code, vectype);
2828 if (vect_print_dump_info (REPORT_DETAILS))
2829 fprintf (vect_dump, "no optab.");
2832 vec_mode = TYPE_MODE (vectype);
2833 if (optab_handler (optab, vec_mode)->insn_code == CODE_FOR_nothing)
2835 if (vect_print_dump_info (REPORT_DETAILS))
2836 fprintf (vect_dump, "op not supported by target.");
2837 if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
2838 || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
2839 < vect_min_worthwhile_factor (code))
2841 if (vect_print_dump_info (REPORT_DETAILS))
2842 fprintf (vect_dump, "proceeding using word mode.");
2845 /* Worthwhile without SIMD support? */
2846 if (!VECTOR_MODE_P (TYPE_MODE (vectype))
2847 && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
2848 < vect_min_worthwhile_factor (code))
2850 if (vect_print_dump_info (REPORT_DETAILS))
2851 fprintf (vect_dump, "not worthwhile without SIMD support.");
2855 /* 4.2. Check support for the epilog operation.
2857 If STMT represents a reduction pattern, then the type of the
2858 reduction variable may be different than the type of the rest
2859 of the arguments. For example, consider the case of accumulation
2860 of shorts into an int accumulator; The original code:
2861 S1: int_a = (int) short_a;
2862 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
2865 STMT: int_acc = widen_sum <short_a, int_acc>
2868 1. The tree-code that is used to create the vector operation in the
2869 epilog code (that reduces the partial results) is not the
2870 tree-code of STMT, but is rather the tree-code of the original
2871 stmt from the pattern that STMT is replacing. I.e, in the example
2872 above we want to use 'widen_sum' in the loop, but 'plus' in the
2874 2. The type (mode) we use to check available target support
2875 for the vector operation to be created in the *epilog*, is
2876 determined by the type of the reduction variable (in the example
2877 above we'd check this: plus_optab[vect_int_mode]).
2878 However the type (mode) we use to check available target support
2879 for the vector operation to be created *inside the loop*, is
2880 determined by the type of the other arguments to STMT (in the
2881 example we'd check this: widen_sum_optab[vect_short_mode]).
2883 This is contrary to "regular" reductions, in which the types of all
2884 the arguments are the same as the type of the reduction variable.
2885 For "regular" reductions we can therefore use the same vector type
2886 (and also the same tree-code) when generating the epilog code and
2887 when generating the code inside the loop. */
2891 /* This is a reduction pattern: get the vectype from the type of the
2892 reduction variable, and get the tree-code from orig_stmt. */
2893 orig_code = TREE_CODE (GIMPLE_STMT_OPERAND (orig_stmt, 1));
2894 vectype = get_vectype_for_scalar_type (TREE_TYPE (def));
2897 if (vect_print_dump_info (REPORT_DETAILS))
2899 fprintf (vect_dump, "unsupported data-type ");
2900 print_generic_expr (vect_dump, TREE_TYPE (def), TDF_SLIM);
2905 vec_mode = TYPE_MODE (vectype);
2909 /* Regular reduction: the same vectype and tree-code as used for
2910 the vector code inside the loop can be used for the epilog code. */
2914 if (!reduction_code_for_scalar_code (orig_code, &epilog_reduc_code))
2916 reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype);
2919 if (vect_print_dump_info (REPORT_DETAILS))
2920 fprintf (vect_dump, "no optab for reduction.");
/* NUM_TREE_CODES serves as the "no direct reduction code" marker:
   vect_create_epilog_for_reduction uses the direct scheme only when
   reduc_code < NUM_TREE_CODES. */
2921 epilog_reduc_code = NUM_TREE_CODES;
2923 if (optab_handler (reduc_optab, vec_mode)->insn_code == CODE_FOR_nothing)
2925 if (vect_print_dump_info (REPORT_DETAILS))
2926 fprintf (vect_dump, "reduc op not supported by target.");
2927 epilog_reduc_code = NUM_TREE_CODES;
2930 if (!vec_stmt) /* transformation not required. */
2932 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
2933 if (!vect_model_reduction_cost (stmt_info, epilog_reduc_code, ncopies))
2940 if (vect_print_dump_info (REPORT_DETAILS))
2941 fprintf (vect_dump, "transform reduction.");
2943 /* Create the destination vector */
2944 vec_dest = vect_create_destination_var (scalar_dest, vectype);
2946 /* Create the reduction-phi that defines the reduction-operand. */
2947 new_phi = create_phi_node (vec_dest, loop->header);
2949 /* In case the vectorization factor (VF) is bigger than the number
2950 of elements that we can fit in a vectype (nunits), we have to generate
2951 more than one vector stmt - i.e - we need to "unroll" the
2952 vector stmt by a factor VF/nunits. For more details see documentation
2953 in vectorizable_operation. */
2955 prev_stmt_info = NULL;
2956 for (j = 0; j < ncopies; j++)
2961 op = TREE_OPERAND (operation, 0);
2962 loop_vec_def0 = vect_get_vec_def_for_operand (op, stmt, NULL);
2963 if (op_type == ternary_op)
2965 op = TREE_OPERAND (operation, 1);
2966 loop_vec_def1 = vect_get_vec_def_for_operand (op, stmt, NULL);
2969 /* Get the vector def for the reduction variable from the phi node */
2970 reduc_def = PHI_RESULT (new_phi);
2974 enum vect_def_type dt = vect_unknown_def_type; /* Dummy */
2975 loop_vec_def0 = vect_get_vec_def_for_stmt_copy (dt, loop_vec_def0);
2976 if (op_type == ternary_op)
2977 loop_vec_def1 = vect_get_vec_def_for_stmt_copy (dt, loop_vec_def1);
2979 /* Get the vector def for the reduction variable from the vectorized
2980 reduction operation generated in the previous iteration (j-1) */
2981 reduc_def = GIMPLE_STMT_OPERAND (new_stmt ,0);
2984 /* Arguments are ready. Create the new vector stmt. */
2985 if (op_type == binary_op)
2986 expr = build2 (code, vectype, loop_vec_def0, reduc_def);
2988 expr = build3 (code, vectype, loop_vec_def0, loop_vec_def1,
2990 new_stmt = build_gimple_modify_stmt (vec_dest, expr);
2991 new_temp = make_ssa_name (vec_dest, new_stmt);
2992 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
2993 vect_finish_stmt_generation (stmt, new_stmt, bsi);
/* One vector stmt is recorded as the vectorized form of STMT; another copy
   is linked from the previous copy through STMT_VINFO_RELATED_STMT.
   NOTE(review): the condition choosing between the two assignments is not
   visible in this excerpt (presumably first copy versus later copies). */
2996 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
2998 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
2999 prev_stmt_info = vinfo_for_stmt (new_stmt);
3002 /* Finalize the reduction-phi (set its arguments) and create the
3003 epilog reduction code. */
3004 vect_create_epilog_for_reduction (new_temp, stmt, epilog_reduc_code, new_phi);
3008 /* Checks if CALL can be vectorized with result vector type VECTYPE_OUT and
3009 argument vector type VECTYPE_IN. Returns a function declaration if the target has a vectorized version
3010 of the function, or NULL_TREE if the function cannot be vectorized. */
3013 vectorizable_function (tree call, tree vectype_out, tree vectype_in)
3015 tree fndecl = get_callee_fndecl (call);
3016 enum built_in_function code;
3018 /* We only handle functions that do not read or clobber memory -- i.e.
3019 const or novops ones. */
3020 if (!(call_expr_flags (call) & (ECF_CONST | ECF_NOVOPS)))
3024 || TREE_CODE (fndecl) != FUNCTION_DECL
3025 || !DECL_BUILT_IN (fndecl))
/* FNDECL has passed the built-in check above; its function code, together
   with the two vector types, is what the target hook is queried with. */
3028 code = DECL_FUNCTION_CODE (fndecl);
3029 return targetm.vectorize.builtin_vectorized_function (code, vectype_out,
3033 /* Function vectorizable_call.
3035 Check if STMT performs a function call that can be vectorized.
3036 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
3037 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
3038 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
3041 vectorizable_call (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
/* The checks below run on every invocation.  When VEC_STMT is null only
   the analysis is performed: the stmt kind is recorded and its cost is
   modelled.  Otherwise vector calls to the function returned by
   vectorizable_function are generated at BSI, and the rhs of the scalar
   STMT is finally replaced by a harmless value.  */
3047 tree vec_oprnd0 = NULL_TREE, vec_oprnd1 = NULL_TREE;
3048 stmt_vec_info stmt_info = vinfo_for_stmt (stmt), prev_stmt_info;
3049 tree vectype_out, vectype_in;
3052 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3053 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3054 tree fndecl, rhs, new_temp, def, def_stmt, rhs_type, lhs_type;
3055 enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
3057 int ncopies, j, nargs;
3058 call_expr_arg_iterator iter;
3060 enum { NARROW, NONE, WIDEN } modifier;
3062 if (!STMT_VINFO_RELEVANT_P (stmt_info))
3065 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
3068 /* FORNOW: SLP not supported. */
3069 if (STMT_SLP_TYPE (stmt_info))
3072 /* Is STMT a vectorizable call? */
3073 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
3076 if (TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 0)) != SSA_NAME)
3079 operation = GIMPLE_STMT_OPERAND (stmt, 1);
3080 if (TREE_CODE (operation) != CALL_EXPR)
3083 /* Process function arguments. */
3084 rhs_type = NULL_TREE;
3086 FOR_EACH_CALL_EXPR_ARG (op, iter, operation)
3088 /* Bail out if the function has more than two arguments, we
3089 do not have interesting builtin functions to vectorize with
3090 more than two arguments. */
3094 /* We can only handle calls with arguments of the same type. */
3096 && rhs_type != TREE_TYPE (op))
3098 if (vect_print_dump_info (REPORT_DETAILS))
3099 fprintf (vect_dump, "argument types differ.");
3102 rhs_type = TREE_TYPE (op);
/* Each argument must be a simple use; the kind of its definition is kept
   in DT, indexed by argument position, for use by the transformation.  */
3104 if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt[nargs]))
3106 if (vect_print_dump_info (REPORT_DETAILS))
3107 fprintf (vect_dump, "use not simple.");
3114 /* No arguments is also not good. */
/* RHS_TYPE is the type shared by the arguments; it determines the input
   vector type.  */
3118 vectype_in = get_vectype_for_scalar_type (rhs_type);
3121 nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
/* The output vector type is derived from the type of the lhs of STMT.  */
3123 lhs_type = TREE_TYPE (GIMPLE_STMT_OPERAND (stmt, 0));
3124 vectype_out = get_vectype_for_scalar_type (lhs_type);
3127 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
/* Relate the number of elements of the input and output vector types.
   MODIFIER (NARROW, NONE or WIDEN) is consulted below when computing
   NCOPIES.  */
3130 if (nunits_in == nunits_out / 2)
3132 else if (nunits_out == nunits_in)
3134 else if (nunits_out == nunits_in / 2)
3139 /* For now, we only vectorize functions if a target specific builtin
3140 is available. TODO -- in some cases, it might be profitable to
3141 insert the calls for pieces of the vector, in order to be able
3142 to vectorize other operations in the loop. */
3143 fndecl = vectorizable_function (operation, vectype_out, vectype_in);
3144 if (fndecl == NULL_TREE)
3146 if (vect_print_dump_info (REPORT_DETAILS))
3147 fprintf (vect_dump, "function is not vectorizable.");
/* vectorizable_function only accepts const or novops calls, so STMT is
   expected to carry no virtual operands.  */
3152 gcc_assert (ZERO_SSA_OPERANDS (stmt, SSA_OP_ALL_VIRTUALS));
/* Number of vector stmts to generate: the vectorization factor divided by
   the element count of the output vector type when narrowing, and of the
   input vector type otherwise.  */
3154 if (modifier == NARROW)
3155 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_out;
3157 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in;
3159 /* Sanity check: make sure that at least one copy of the vectorized stmt
3160 needs to be generated. */
3161 gcc_assert (ncopies >= 1);
3163 /* FORNOW. This restriction should be relaxed. */
3164 if (nested_in_vect_loop_p (loop, stmt) && ncopies > 1)
3166 if (vect_print_dump_info (REPORT_DETAILS))
3167 fprintf (vect_dump, "multiple types in nested loop.");
3171 if (!vec_stmt) /* transformation not required. */
3173 STMT_VINFO_TYPE (stmt_info) = call_vec_info_type;
3174 if (vect_print_dump_info (REPORT_DETAILS))
3175 fprintf (vect_dump, "=== vectorizable_call ===");
3176 vect_model_simple_cost (stmt_info, ncopies, dt, NULL);
3182 if (vect_print_dump_info (REPORT_DETAILS))
3183 fprintf (vect_dump, "transform operation.");
/* NOTE(review): this is the same condition as the check made before the
   analysis-only path above; the second test looks redundant -- confirm.  */
3185 /* FORNOW. This restriction should be relaxed. */
3186 if (nested_in_vect_loop_p (loop, stmt) && ncopies > 1)
3188 if (vect_print_dump_info (REPORT_DETAILS))
3189 fprintf (vect_dump, "multiple types in nested loop.");
3194 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
3195 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
3197 prev_stmt_info = NULL;
/* In this loop every copy passes one vector operand per scalar
   argument.  */
3201 for (j = 0; j < ncopies; ++j)
3203 /* Build argument list for the vectorized call. */
3204 /* FIXME: Rewrite this so that it doesn't
3205 construct a temporary list. */
3208 FOR_EACH_CALL_EXPR_ARG (op, iter, operation)
3212 = vect_get_vec_def_for_operand (op, stmt, NULL);
3215 = vect_get_vec_def_for_stmt_copy (dt[nargs], vec_oprnd0);
3217 vargs = tree_cons (NULL_TREE, vec_oprnd0, vargs);
3221 vargs = nreverse (vargs);
3223 rhs = build_function_call_expr (fndecl, vargs);
3224 new_stmt = build_gimple_modify_stmt (vec_dest, rhs);
3225 new_temp = make_ssa_name (vec_dest, new_stmt);
3226 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
/* Emit the new stmt and link the copies: STMT_VINFO_VEC_STMT of the scalar
   stmt records a copy, and STMT_VINFO_RELATED_STMT of the previous copy
   points to the next one.  */
3228 vect_finish_stmt_generation (stmt, new_stmt, bsi);
3231 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
3233 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3235 prev_stmt_info = vinfo_for_stmt (new_stmt);
/* In this loop every copy passes two vector operands per scalar
   argument.  */
3241 for (j = 0; j < ncopies; ++j)
3243 /* Build argument list for the vectorized call. */
3244 /* FIXME: Rewrite this so that it doesn't
3245 construct a temporary list. */
3248 FOR_EACH_CALL_EXPR_ARG (op, iter, operation)
3253 = vect_get_vec_def_for_operand (op, stmt, NULL);
3255 = vect_get_vec_def_for_stmt_copy (dt[nargs], vec_oprnd0);
3260 = vect_get_vec_def_for_stmt_copy (dt[nargs], vec_oprnd1);
3262 = vect_get_vec_def_for_stmt_copy (dt[nargs], vec_oprnd0);
3265 vargs = tree_cons (NULL_TREE, vec_oprnd0, vargs);
3266 vargs = tree_cons (NULL_TREE, vec_oprnd1, vargs);
3270 vargs = nreverse (vargs);
3272 rhs = build_function_call_expr (fndecl, vargs);
3273 new_stmt = build_gimple_modify_stmt (vec_dest, rhs);
3274 new_temp = make_ssa_name (vec_dest, new_stmt);
3275 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
3277 vect_finish_stmt_generation (stmt, new_stmt, bsi);
3280 STMT_VINFO_VEC_STMT (stmt_info) = new_stmt;
3282 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3284 prev_stmt_info = vinfo_for_stmt (new_stmt);
3287 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
3292 /* No current target implements this case. */
3296 /* The call in STMT might prevent it from being removed in dce.
3297 We however cannot remove it here, due to the way the ssa name
3298 it defines is mapped to the new definition. So just replace
3299 rhs of the statement with something harmless. */
3300 type = TREE_TYPE (scalar_dest);
3301 GIMPLE_STMT_OPERAND (stmt, 1) = fold_convert (type, integer_zero_node);
3308 /* Function vect_gen_widened_results_half
3310 Create a vector stmt whose code, type, number of arguments, and result
3311 variable are CODE, VECTYPE, OP_TYPE, and VEC_DEST, and its arguments are
3312 VEC_OPRND0 and VEC_OPRND1. The new vector stmt is to be inserted at BSI.
3313 In the case that CODE is a CALL_EXPR, this means that a call to DECL
3314 needs to be created (DECL is a function-decl of a target-builtin).
3315 STMT is the original scalar stmt that we are vectorizing. */
3318 vect_gen_widened_results_half (enum tree_code code, tree vectype, tree decl,
3319 tree vec_oprnd0, tree vec_oprnd1, int op_type,
3320 tree vec_dest, block_stmt_iterator *bsi,
/* The rhs is either a call to DECL (when CODE is CALL_EXPR) or an
   expression with tree code CODE and type VECTYPE.  OP_TYPE selects
   between the one-operand and the two-operand form; VEC_OPRND1 is only
   used by the latter.  */
3329 /* Generate half of the widened result: */
3330 if (code == CALL_EXPR)
3332 /* Target specific support */
3333 if (op_type == binary_op)
3334 expr = build_call_expr (decl, 2, vec_oprnd0, vec_oprnd1);
3336 expr = build_call_expr (decl, 1, vec_oprnd0);
3340 /* Generic support */
/* OP_TYPE must agree with the number of operands that CODE takes.  */
3341 gcc_assert (op_type == TREE_CODE_LENGTH (code));
3342 if (op_type == binary_op)
3343 expr = build2 (code, vectype, vec_oprnd0, vec_oprnd1);
3345 expr = build1 (code, vectype, vec_oprnd0);
/* Assign the expression to a fresh SSA name based on VEC_DEST and hand the
   new stmt to vect_finish_stmt_generation together with BSI.  */
3347 new_stmt = build_gimple_modify_stmt (vec_dest, expr);
3348 new_temp = make_ssa_name (vec_dest, new_stmt);
3349 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
3350 vect_finish_stmt_generation (stmt, new_stmt, bsi);
/* For a call, mark the symbols behind the virtual operands of the new stmt
   for renaming.  */
3352 if (code == CALL_EXPR)
3354 FOR_EACH_SSA_TREE_OPERAND (sym, new_stmt, iter, SSA_OP_ALL_VIRTUALS)
3356 if (TREE_CODE (sym) == SSA_NAME)
3357 sym = SSA_NAME_VAR (sym);
3358 mark_sym_for_renaming (sym);
3366 /* Check if STMT performs a conversion operation that can be vectorized.
3367 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
3368 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
3369 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
3372 vectorizable_conversion (tree stmt, block_stmt_iterator *bsi,
3373 tree *vec_stmt, slp_tree slp_node)
/* The checks below run on every invocation; when VEC_STMT is null only the
   analysis is performed.  Three code generation schemes follow, selected
   by how the element counts of the input and output vector types relate
   (MODIFIER): a call to the target's conversion builtin, a widening
   scheme and a narrowing scheme.  */
3379 tree vec_oprnd0 = NULL_TREE, vec_oprnd1 = NULL_TREE;
3380 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3381 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3382 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3383 enum tree_code code, code1 = ERROR_MARK, code2 = ERROR_MARK;
3384 tree decl1 = NULL_TREE, decl2 = NULL_TREE;
3387 enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
3388 tree new_stmt = NULL_TREE;
3389 stmt_vec_info prev_stmt_info;
3392 tree vectype_out, vectype_in;
3395 tree rhs_type, lhs_type;
3397 enum { NARROW, NONE, WIDEN } modifier;
3399 VEC(tree,heap) *vec_oprnds0 = NULL;
3402 /* Is STMT a vectorizable conversion? */
3404 if (!STMT_VINFO_RELEVANT_P (stmt_info))
3407 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
3410 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
3413 if (TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 0)) != SSA_NAME)
3416 operation = GIMPLE_STMT_OPERAND (stmt, 1);
3417 code = TREE_CODE (operation);
/* Only FIX_TRUNC_EXPR and FLOAT_EXPR are considered here.  */
3418 if (code != FIX_TRUNC_EXPR && code != FLOAT_EXPR)
3421 /* Check types of lhs and rhs. */
3422 op0 = TREE_OPERAND (operation, 0);
3423 rhs_type = TREE_TYPE (op0);
3424 vectype_in = get_vectype_for_scalar_type (rhs_type);
3427 nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
3429 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
3430 lhs_type = TREE_TYPE (scalar_dest);
3431 vectype_out = get_vectype_for_scalar_type (lhs_type);
3434 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
/* Relate the number of elements of the input and output vector types;
   the outcome is kept in MODIFIER.  */
3437 if (nunits_in == nunits_out / 2)
3439 else if (nunits_out == nunits_in)
3441 else if (nunits_out == nunits_in / 2)
3446 if (modifier == NONE)
3447 gcc_assert (STMT_VINFO_VECTYPE (stmt_info) == vectype_out);
3449 /* Bail out if the types are both integral or non-integral. */
3450 if ((INTEGRAL_TYPE_P (rhs_type) && INTEGRAL_TYPE_P (lhs_type))
3451 || (!INTEGRAL_TYPE_P (rhs_type) && !INTEGRAL_TYPE_P (lhs_type)))
/* Number of vector stmts to generate: the vectorization factor divided by
   the element count of the output vector type when narrowing, and of the
   input vector type otherwise.  */
3454 if (modifier == NARROW)
3455 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_out;
3457 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in;
3459 /* FORNOW: SLP with multiple types is not supported. The SLP analysis verifies
3460 this, so we can safely override NCOPIES with 1 here. */
3464 /* Sanity check: make sure that at least one copy of the vectorized stmt
3465 needs to be generated. */
3466 gcc_assert (ncopies >= 1);
3468 /* FORNOW. This restriction should be relaxed. */
3469 if (nested_in_vect_loop_p (loop, stmt) && ncopies > 1)
3471 if (vect_print_dump_info (REPORT_DETAILS))
3472 fprintf (vect_dump, "multiple types in nested loop.");
3476 /* Check the operands of the operation. */
3477 if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt[0]))
3479 if (vect_print_dump_info (REPORT_DETAILS))
3480 fprintf (vect_dump, "use not simple.");
3484 /* Supportable by target? */
3485 if ((modifier == NONE
3486 && !targetm.vectorize.builtin_conversion (code, vectype_in))
3487 || (modifier == WIDEN
3488 && !supportable_widening_operation (code, stmt, vectype_in,
3491 || (modifier == NARROW
3492 && !supportable_narrowing_operation (code, stmt, vectype_in,
3495 if (vect_print_dump_info (REPORT_DETAILS))
3496 fprintf (vect_dump, "op not supported by target.");
/* When the element counts differ, the vectype recorded for STMT is switched
   to the input vector type.  */
3500 if (modifier != NONE)
3502 STMT_VINFO_VECTYPE (stmt_info) = vectype_in;
3503 /* FORNOW: SLP not supported. */
3504 if (STMT_SLP_TYPE (stmt_info))
3508 if (!vec_stmt) /* transformation not required. */
3510 STMT_VINFO_TYPE (stmt_info) = type_conversion_vec_info_type;
3515 if (vect_print_dump_info (REPORT_DETAILS))
3516 fprintf (vect_dump, "transform conversion.");
3519 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
/* NOTE(review): no VEC_free of VEC_OPRNDS0 is visible in this function,
   unlike vectorizable_assignment and vectorizable_operation -- confirm that
   the VEC allocated here is released.  */
3521 if (modifier == NONE && !slp_node)
3522 vec_oprnds0 = VEC_alloc (tree, heap, 1);
3524 prev_stmt_info = NULL;
/* Scheme using the target's conversion builtin: one call is generated for
   every operand vector in VEC_OPRNDS0.  */
3528 for (j = 0; j < ncopies; j++)
3534 vect_get_vec_defs (op0, NULL, stmt, &vec_oprnds0, NULL, slp_node);
3536 vect_get_vec_defs_for_stmt_copy (dt, &vec_oprnds0, NULL);
3539 targetm.vectorize.builtin_conversion (code, vectype_in);
3540 for (i = 0; VEC_iterate (tree, vec_oprnds0, i, vop0); i++)
3542 new_stmt = build_call_expr (builtin_decl, 1, vop0);
3544 /* Arguments are ready. Create the new vector stmt. */
3545 new_stmt = build_gimple_modify_stmt (vec_dest, new_stmt);
3546 new_temp = make_ssa_name (vec_dest, new_stmt);
3547 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
3548 vect_finish_stmt_generation (stmt, new_stmt, bsi);
3549 FOR_EACH_SSA_TREE_OPERAND (sym, new_stmt, iter,
3550 SSA_OP_ALL_VIRTUALS)
3552 if (TREE_CODE (sym) == SSA_NAME)
3553 sym = SSA_NAME_VAR (sym);
3554 mark_sym_for_renaming (sym);
3557 VEC_quick_push (tree, SLP_TREE_VEC_STMTS (slp_node), new_stmt);
3561 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
3563 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3564 prev_stmt_info = vinfo_for_stmt (new_stmt);
/* Widening scheme: every operand vector produces two result stmts, built
   with CODE1/DECL1 and CODE2/DECL2 respectively.  */
3569 /* In case the vectorization factor (VF) is bigger than the number
3570 of elements that we can fit in a vectype (nunits), we have to
3571 generate more than one vector stmt - i.e - we need to "unroll"
3572 the vector stmt by a factor VF/nunits. */
3573 for (j = 0; j < ncopies; j++)
3576 vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
3578 vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd0);
3580 STMT_VINFO_VECTYPE (stmt_info) = vectype_in;
3582 /* Generate first half of the widened result: */
3584 = vect_gen_widened_results_half (code1, vectype_out, decl1,
3585 vec_oprnd0, vec_oprnd1,
3586 unary_op, vec_dest, bsi, stmt);
3588 STMT_VINFO_VEC_STMT (stmt_info) = new_stmt;
3590 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3591 prev_stmt_info = vinfo_for_stmt (new_stmt);
3593 /* Generate second half of the widened result: */
3595 = vect_gen_widened_results_half (code2, vectype_out, decl2,
3596 vec_oprnd0, vec_oprnd1,
3597 unary_op, vec_dest, bsi, stmt);
3598 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3599 prev_stmt_info = vinfo_for_stmt (new_stmt);
/* Narrowing scheme: every result stmt combines two operand vectors with
   CODE1.  */
3604 /* In case the vectorization factor (VF) is bigger than the number
3605 of elements that we can fit in a vectype (nunits), we have to
3606 generate more than one vector stmt - i.e - we need to "unroll"
3607 the vector stmt by a factor VF/nunits. */
3608 for (j = 0; j < ncopies; j++)
3613 vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
3614 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd0);
3618 vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd1);
3619 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd0);
3622 /* Arguments are ready. Create the new vector stmt. */
3623 expr = build2 (code1, vectype_out, vec_oprnd0, vec_oprnd1);
3624 new_stmt = build_gimple_modify_stmt (vec_dest, expr);
3625 new_temp = make_ssa_name (vec_dest, new_stmt);
3626 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
3627 vect_finish_stmt_generation (stmt, new_stmt, bsi);
3630 STMT_VINFO_VEC_STMT (stmt_info) = new_stmt;
3632 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3634 prev_stmt_info = vinfo_for_stmt (new_stmt);
3637 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
3644 /* Function vectorizable_assignment.
3646 Check if STMT performs an assignment (copy) that can be vectorized.
3647 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
3648 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
3649 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
3652 vectorizable_assignment (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt,
/* The checks below run on every invocation; when VEC_STMT is null only the
   analysis is performed (the stmt kind is recorded and its cost modelled).
   Otherwise one vector copy stmt is generated for every vector operand
   obtained for the rhs of STMT.  */
3658 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3659 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3660 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3663 enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
/* NCOPIES is the vectorization factor divided by the number of elements
   in VECTYPE.  */
3664 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
3665 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
3667 VEC(tree,heap) *vec_oprnds = NULL;
3670 gcc_assert (ncopies >= 1);
3672 return false; /* FORNOW */
3674 if (!STMT_VINFO_RELEVANT_P (stmt_info))
3677 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
3680 /* Is vectorizable assignment? */
3681 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
3684 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
3685 if (TREE_CODE (scalar_dest) != SSA_NAME)
/* The rhs must be accepted by vect_is_simple_use, which also reports the
   kind of its definition in DT[0].  */
3688 op = GIMPLE_STMT_OPERAND (stmt, 1);
3689 if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt[0]))
3691 if (vect_print_dump_info (REPORT_DETAILS))
3692 fprintf (vect_dump, "use not simple.");
3696 if (!vec_stmt) /* transformation not required. */
3698 STMT_VINFO_TYPE (stmt_info) = assignment_vec_info_type;
3699 if (vect_print_dump_info (REPORT_DETAILS))
3700 fprintf (vect_dump, "=== vectorizable_assignment ===");
3701 vect_model_simple_cost (stmt_info, ncopies, dt, NULL);
3706 if (vect_print_dump_info (REPORT_DETAILS))
3707 fprintf (vect_dump, "transform assignment.");
3710 vec_dest = vect_create_destination_var (scalar_dest, vectype);
/* Collect the vector definitions of the rhs in VEC_OPRNDS.  */
3713 vect_get_vec_defs (op, NULL, stmt, &vec_oprnds, NULL, slp_node);
3715 /* Arguments are ready. Create the new vector stmt. */
3716 for (i = 0; VEC_iterate (tree, vec_oprnds, i, vop); i++)
3718 *vec_stmt = build_gimple_modify_stmt (vec_dest, vop);
3719 new_temp = make_ssa_name (vec_dest, *vec_stmt);
3720 GIMPLE_STMT_OPERAND (*vec_stmt, 0) = new_temp;
3721 vect_finish_stmt_generation (stmt, *vec_stmt, bsi);
3722 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt;
3725 VEC_quick_push (tree, SLP_TREE_VEC_STMTS (slp_node), *vec_stmt);
/* The operand vector is only needed while generating the stmts.  */
3728 VEC_free (tree, heap, vec_oprnds);
3733 /* Function vect_min_worthwhile_factor.
3735 For a loop where we could vectorize the operation indicated by CODE,
3736 return the minimum vectorization factor that makes it worthwhile
3737 to use generic vectors. */
3739 vect_min_worthwhile_factor (enum tree_code code)
3760 /* Function vectorizable_induction
3762 Check if PHI performs an induction computation that can be vectorized.
3763 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
3764 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
3765 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
3768 vectorizable_induction (tree phi, block_stmt_iterator *bsi ATTRIBUTE_UNUSED,
/* BSI is unused.  When VEC_STMT is null only the analysis is performed
   (the stmt kind is recorded and the induction cost is modelled);
   otherwise the vectorized induction is created through
   get_initial_def_for_induction.  */
3771 stmt_vec_info stmt_info = vinfo_for_stmt (phi);
3772 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3773 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
/* NCOPIES is the vectorization factor divided by the number of elements
   in VECTYPE.  */
3774 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
3775 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
3778 gcc_assert (ncopies >= 1);
3780 if (!STMT_VINFO_RELEVANT_P (stmt_info))
3783 /* FORNOW: SLP not supported. */
3784 if (STMT_SLP_TYPE (stmt_info))
/* A relevant stmt reaching this point is expected to have been classified
   as an induction.  */
3787 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def);
3789 if (TREE_CODE (phi) != PHI_NODE)
3792 if (!vec_stmt) /* transformation not required. */
3794 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
3795 if (vect_print_dump_info (REPORT_DETAILS))
3796 fprintf (vect_dump, "=== vectorizable_induction ===");
3797 vect_model_induction_cost (stmt_info, ncopies);
3803 if (vect_print_dump_info (REPORT_DETAILS))
3804 fprintf (vect_dump, "transform induction phi.");
/* VEC_STMT is set to the stmt that defines the SSA name returned by
   get_initial_def_for_induction.  */
3806 vec_def = get_initial_def_for_induction (phi);
3807 *vec_stmt = SSA_NAME_DEF_STMT (vec_def);
3812 /* Function vectorizable_operation.
3814 Check if STMT performs a binary or unary operation that can be vectorized.
3815 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
3816 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
3817 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
3820 vectorizable_operation (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt,
/* The checks below run on every invocation; when VEC_STMT is null only the
   analysis is performed (the stmt kind is recorded and its cost modelled).
   Otherwise NCOPIES groups of vector stmts are generated, one stmt per
   operand vector, and chained as described in the long comment further
   down.  */
3826 tree op0, op1 = NULL;
3827 tree vec_oprnd1 = NULL_TREE;
3828 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3829 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3830 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3831 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3832 enum tree_code code;
3833 enum machine_mode vec_mode;
3838 enum machine_mode optab_op2_mode;
3840 enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
3841 tree new_stmt = NULL_TREE;
3842 stmt_vec_info prev_stmt_info;
3843 int nunits_in = TYPE_VECTOR_SUBPARTS (vectype);
3846 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in;
3848 VEC(tree,heap) *vec_oprnds0 = NULL, *vec_oprnds1 = NULL;
3851 bool scalar_shift_arg = false;
3853 /* FORNOW: SLP with multiple types is not supported. The SLP analysis verifies
3854 this, so we can safely override NCOPIES with 1 here. */
3857 gcc_assert (ncopies >= 1);
3858 /* FORNOW. This restriction should be relaxed. */
3859 if (nested_in_vect_loop_p (loop, stmt) && ncopies > 1)
3861 if (vect_print_dump_info (REPORT_DETAILS))
3862 fprintf (vect_dump, "multiple types in nested loop.");
3866 if (!STMT_VINFO_RELEVANT_P (stmt_info))
3869 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
3872 /* Is STMT a vectorizable binary/unary operation? */
3873 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
3876 if (TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 0)) != SSA_NAME)
/* The vector type of the result is compared, by number of elements, with
   VECTYPE.  */
3879 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
3880 vectype_out = get_vectype_for_scalar_type (TREE_TYPE (scalar_dest));
3883 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
3884 if (nunits_out != nunits_in)
3887 operation = GIMPLE_STMT_OPERAND (stmt, 1);
3888 code = TREE_CODE (operation);
3890 /* For pointer addition, we should use the normal plus for
3891 the vector addition. */
3892 if (code == POINTER_PLUS_EXPR)
/* Look up the optab that corresponds to CODE for VECTYPE.  */
3895 optab = optab_for_tree_code (code, vectype);
3897 /* Support only unary or binary operations. */
3898 op_type = TREE_OPERAND_LENGTH (operation);
3899 if (op_type != unary_op && op_type != binary_op)
3901 if (vect_print_dump_info (REPORT_DETAILS))
3902 fprintf (vect_dump, "num. args = %d (not unary/binary op).", op_type);
/* Every operand must be accepted by vect_is_simple_use; the kinds of their
   definitions are kept in DT.  */
3906 op0 = TREE_OPERAND (operation, 0);
3907 if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt[0]))
3909 if (vect_print_dump_info (REPORT_DETAILS))
3910 fprintf (vect_dump, "use not simple.");
3914 if (op_type == binary_op)
3916 op1 = TREE_OPERAND (operation, 1);
3917 if (!vect_is_simple_use (op1, loop_vinfo, &def_stmt, &def, &dt[1]))
3919 if (vect_print_dump_info (REPORT_DETAILS))
3920 fprintf (vect_dump, "use not simple.");
3925 /* Supportable by target? */
3928 if (vect_print_dump_info (REPORT_DETAILS))
3929 fprintf (vect_dump, "no optab.");
/* Fetch the insn code registered for the optab in the machine mode of
   VECTYPE.  */
3932 vec_mode = TYPE_MODE (vectype);
3933 icode = (int) optab_handler (optab, vec_mode)->insn_code;
3934 if (icode == CODE_FOR_nothing)
3936 if (vect_print_dump_info (REPORT_DETAILS))
3937 fprintf (vect_dump, "op not supported by target.");
3938 /* Check only during analysis. */
3939 if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
3940 || (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
3941 < vect_min_worthwhile_factor (code)
3944 if (vect_print_dump_info (REPORT_DETAILS))
3945 fprintf (vect_dump, "proceeding using word mode.");
3948 /* Worthwhile without SIMD support? Check only during analysis. */
3949 if (!VECTOR_MODE_P (TYPE_MODE (vectype))
3950 && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
3951 < vect_min_worthwhile_factor (code)
3954 if (vect_print_dump_info (REPORT_DETAILS))
3955 fprintf (vect_dump, "not worthwhile without SIMD support.");
3959 if (code == LSHIFT_EXPR || code == RSHIFT_EXPR)
3961 /* FORNOW: not yet supported. */
3962 if (!VECTOR_MODE_P (vec_mode))
3965 /* Invariant argument is needed for a vector shift
3966 by a scalar shift operand. */
3967 optab_op2_mode = insn_data[icode].operand[2].mode;
3968 if (!VECTOR_MODE_P (optab_op2_mode))
3970 if (dt[1] != vect_constant_def && dt[1] != vect_invariant_def)
3972 if (vect_print_dump_info (REPORT_DETAILS))
3973 fprintf (vect_dump, "operand mode requires invariant"
3978 scalar_shift_arg = true;
3982 if (!vec_stmt) /* transformation not required. */
3984 STMT_VINFO_TYPE (stmt_info) = op_vec_info_type;
3985 if (vect_print_dump_info (REPORT_DETAILS))
3986 fprintf (vect_dump, "=== vectorizable_operation ===");
3987 vect_model_simple_cost (stmt_info, ncopies, dt, NULL);
3993 if (vect_print_dump_info (REPORT_DETAILS))
3994 fprintf (vect_dump, "transform binary/unary operation.");
3997 vec_dest = vect_create_destination_var (scalar_dest, vectype);
3999 /* Allocate VECs for vector operands. In case of SLP, vector operands are
4000 created in the previous stages of the recursion, so no allocation is
4001 needed, except for the case of shift with scalar shift argument. In that
4002 case we store the scalar operand in VEC_OPRNDS1 for every vector stmt to
4003 be created to vectorize the SLP group, i.e., SLP_NODE->VEC_STMTS_SIZE.
4004 In case of loop-based vectorization we allocate VECs of size 1. We
4005 allocate VEC_OPRNDS1 only in case of binary operation. */
4008 vec_oprnds0 = VEC_alloc (tree, heap, 1);
4009 if (op_type == binary_op)
4010 vec_oprnds1 = VEC_alloc (tree, heap, 1);
4012 else if (scalar_shift_arg)
4013 vec_oprnds1 = VEC_alloc (tree, heap, slp_node->vec_stmts_size);
4015 /* In case the vectorization factor (VF) is bigger than the number
4016 of elements that we can fit in a vectype (nunits), we have to generate
4017 more than one vector stmt - i.e - we need to "unroll" the
4018 vector stmt by a factor VF/nunits. In doing so, we record a pointer
4019 from one copy of the vector stmt to the next, in the field
4020 STMT_VINFO_RELATED_STMT. This is necessary in order to allow following
4021 stages to find the correct vector defs to be used when vectorizing
4022 stmts that use the defs of the current stmt. The example below illustrates
4023 the vectorization process when VF=16 and nunits=4 (i.e - we need to create
4024 4 vectorized stmts):
4026 before vectorization:
4027 RELATED_STMT VEC_STMT
4031 step 1: vectorize stmt S1 (done in vectorizable_load. See more details
4033 RELATED_STMT VEC_STMT
4034 VS1_0: vx0 = memref0 VS1_1 -
4035 VS1_1: vx1 = memref1 VS1_2 -
4036 VS1_2: vx2 = memref2 VS1_3 -
4037 VS1_3: vx3 = memref3 - -
4038 S1: x = load - VS1_0
4041 step2: vectorize stmt S2 (done here):
4042 To vectorize stmt S2 we first need to find the relevant vector
4043 def for the first operand 'x'. This is, as usual, obtained from
4044 the vector stmt recorded in the STMT_VINFO_VEC_STMT of the stmt
4045 that defines 'x' (S1). This way we find the stmt VS1_0, and the
4046 relevant vector def 'vx0'. Having found 'vx0' we can generate
4047 the vector stmt VS2_0, and as usual, record it in the
4048 STMT_VINFO_VEC_STMT of stmt S2.
4049 When creating the second copy (VS2_1), we obtain the relevant vector
4050 def from the vector stmt recorded in the STMT_VINFO_RELATED_STMT of
4051 stmt VS1_0. This way we find the stmt VS1_1 and the relevant
4052 vector def 'vx1'. Using 'vx1' we create stmt VS2_1 and record a
4053 pointer to it in the STMT_VINFO_RELATED_STMT of the vector stmt VS2_0.
4054 Similarly when creating stmts VS2_2 and VS2_3. This is the resulting
4055 chain of stmts and pointers:
4056 RELATED_STMT VEC_STMT
4057 VS1_0: vx0 = memref0 VS1_1 -
4058 VS1_1: vx1 = memref1 VS1_2 -
4059 VS1_2: vx2 = memref2 VS1_3 -
4060 VS1_3: vx3 = memref3 - -
4061 S1: x = load - VS1_0
4062 VS2_0: vz0 = vx0 + v1 VS2_1 -
4063 VS2_1: vz1 = vx1 + v1 VS2_2 -
4064 VS2_2: vz2 = vx2 + v1 VS2_3 -
4065 VS2_3: vz3 = vx3 + v1 - -
4066 S2: z = x + 1 - VS2_0 */
4068 prev_stmt_info = NULL;
4069 for (j = 0; j < ncopies; j++)
4074 if (op_type == binary_op
4075 && (code == LSHIFT_EXPR || code == RSHIFT_EXPR))
4077 /* Vector shl and shr insn patterns can be defined with scalar
4078 operand 2 (shift operand). In this case, use constant or loop
4079 invariant op1 directly, without extending it to vector mode
4081 optab_op2_mode = insn_data[icode].operand[2].mode;
4082 if (!VECTOR_MODE_P (optab_op2_mode))
4084 if (vect_print_dump_info (REPORT_DETAILS))
4085 fprintf (vect_dump, "operand 1 using scalar mode.");
4087 VEC_quick_push (tree, vec_oprnds1, vec_oprnd1);
4090 /* Store vec_oprnd1 for every vector stmt to be created
4091 for SLP_NODE. We check during the analysis that all the
4092 shift arguments are the same.
4093 TODO: Allow different constants for different vector
4094 stmts generated for an SLP instance. */
4095 for (k = 0; k < slp_node->vec_stmts_size - 1; k++)
4096 VEC_quick_push (tree, vec_oprnds1, vec_oprnd1);
4101 /* vec_oprnd1 is available if operand 1 should be of a scalar-type
4102 (a special case for certain kind of vector shifts); otherwise,
4103 operand 1 should be of a vector type (the usual case). */
4104 if (op_type == binary_op && !vec_oprnd1)
4105 vect_get_vec_defs (op0, op1, stmt, &vec_oprnds0, &vec_oprnds1,
4108 vect_get_vec_defs (op0, NULL_TREE, stmt, &vec_oprnds0, NULL,
4112 vect_get_vec_defs_for_stmt_copy (dt, &vec_oprnds0, &vec_oprnds1);
4114 /* Arguments are ready. Create the new vector stmt. */
4115 for (i = 0; VEC_iterate (tree, vec_oprnds0, i, vop0); i++)
4117 if (op_type == binary_op)
4119 vop1 = VEC_index (tree, vec_oprnds1, i);
4120 new_stmt = build_gimple_modify_stmt (vec_dest,
4121 build2 (code, vectype, vop0, vop1));
4124 new_stmt = build_gimple_modify_stmt (vec_dest,
4125 build1 (code, vectype, vop0));
4127 new_temp = make_ssa_name (vec_dest, new_stmt);
4128 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
4129 vect_finish_stmt_generation (stmt, new_stmt, bsi);
4131 VEC_quick_push (tree, SLP_TREE_VEC_STMTS (slp_node), new_stmt);
4135 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
4137 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
4138 prev_stmt_info = vinfo_for_stmt (new_stmt);
/* The operand vectors are only needed while generating the stmts.  */
4141 VEC_free (tree, heap, vec_oprnds0);
4143 VEC_free (tree, heap, vec_oprnds1);
4149 /* Function vectorizable_type_demotion
4151 Check if STMT performs a binary or unary operation that involves
4152 type demotion, and if it can be vectorized.
4153 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
4154 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
4155 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
4158 vectorizable_type_demotion (tree stmt, block_stmt_iterator *bsi,
/* The checks below run on every invocation; when VEC_STMT is null only the
   analysis is performed (the stmt kind is recorded and its cost modelled).
   Otherwise NCOPIES vector stmts are generated, each combining two operand
   vectors with the tree code CODE1 obtained from
   supportable_narrowing_operation.  */
4165 tree vec_oprnd0=NULL, vec_oprnd1=NULL;
4166 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4167 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4168 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4169 enum tree_code code, code1 = ERROR_MARK;
4172 enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
4174 stmt_vec_info prev_stmt_info;
4183 if (!STMT_VINFO_RELEVANT_P (stmt_info))
4186 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
4189 /* Is STMT a vectorizable type-demotion operation? */
4190 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
4193 if (TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 0)) != SSA_NAME)
4196 operation = GIMPLE_STMT_OPERAND (stmt, 1);
4197 code = TREE_CODE (operation);
/* Only NOP_EXPR and CONVERT_EXPR are considered here.  */
4198 if (code != NOP_EXPR && code != CONVERT_EXPR)
4201 op0 = TREE_OPERAND (operation, 0);
4202 vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op0));
4205 nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
4207 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
4208 vectype_out = get_vectype_for_scalar_type (TREE_TYPE (scalar_dest));
4211 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
/* The output vector type must hold twice as many elements as the input
   vector type.  */
4212 if (nunits_in != nunits_out / 2) /* FORNOW */
/* NCOPIES is the vectorization factor divided by the number of elements
   in the output vector type.  */
4215 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_out;
4216 gcc_assert (ncopies >= 1);
4217 /* FORNOW. This restriction should be relaxed. */
4218 if (nested_in_vect_loop_p (loop, stmt) && ncopies > 1)
4220 if (vect_print_dump_info (REPORT_DETAILS))
4221 fprintf (vect_dump, "multiple types in nested loop.");
/* Source and destination must be both integral or both scalar
   floating-point.
   NOTE(review): the CODE test inside this condition repeats the filter
   applied right after CODE was read above, so it appears to be always true
   here -- confirm.  */
4225 if (! ((INTEGRAL_TYPE_P (TREE_TYPE (scalar_dest))
4226 && INTEGRAL_TYPE_P (TREE_TYPE (op0)))
4227 || (SCALAR_FLOAT_TYPE_P (TREE_TYPE (scalar_dest))
4228 && SCALAR_FLOAT_TYPE_P (TREE_TYPE (op0))
4229 && (code == NOP_EXPR || code == CONVERT_EXPR))))
4232 /* Check the operands of the operation. */
4233 if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt[0]))
4235 if (vect_print_dump_info (REPORT_DETAILS))
4236 fprintf (vect_dump, "use not simple.");
4240 /* Supportable by target? */
4241 if (!supportable_narrowing_operation (code, stmt, vectype_in, &code1))
/* The vectype recorded for STMT becomes the input vector type.  */
4244 STMT_VINFO_VECTYPE (stmt_info) = vectype_in;
4246 if (!vec_stmt) /* transformation not required. */
4248 STMT_VINFO_TYPE (stmt_info) = type_demotion_vec_info_type;
4249 if (vect_print_dump_info (REPORT_DETAILS))
4250 fprintf (vect_dump, "=== vectorizable_demotion ===");
4251 vect_model_simple_cost (stmt_info, ncopies, dt, NULL);
4256 if (vect_print_dump_info (REPORT_DETAILS))
4257 fprintf (vect_dump, "transform type demotion operation. ncopies = %d.",
4261 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
4263 /* In case the vectorization factor (VF) is bigger than the number
4264 of elements that we can fit in a vectype (nunits), we have to generate
4265 more than one vector stmt - i.e - we need to "unroll" the
4266 vector stmt by a factor VF/nunits. */
4267 prev_stmt_info = NULL;
4268 for (j = 0; j < ncopies; j++)
4273 vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
4274 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd0);
4278 vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd1);
4279 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd0);
4282 /* Arguments are ready. Create the new vector stmt. */
4283 expr = build2 (code1, vectype_out, vec_oprnd0, vec_oprnd1);
4284 new_stmt = build_gimple_modify_stmt (vec_dest, expr);
4285 new_temp = make_ssa_name (vec_dest, new_stmt);
4286 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
4287 vect_finish_stmt_generation (stmt, new_stmt, bsi);
4290 STMT_VINFO_VEC_STMT (stmt_info) = new_stmt;
4292 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
4294 prev_stmt_info = vinfo_for_stmt (new_stmt);
4297 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
4302 /* Function vectorizable_type_promotion
4304 Check if STMT performs a binary or unary operation that involves
4305 type promotion, and if it can be vectorized.
4306 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
4307 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
   The transformation generates two vector stmts per copy (one for each
   half of the widened result), links generated stmts through
   STMT_VINFO_RELATED_STMT, and finally sets *VEC_STMT from
   STMT_VINFO_VEC_STMT of STMT.
4308 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
4311 vectorizable_type_promotion (tree stmt, block_stmt_iterator *bsi,
4317 tree op0, op1 = NULL;
4318 tree vec_oprnd0=NULL, vec_oprnd1=NULL;
4319 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4320 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4321 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
/* CODE1/CODE2 and DECL1/DECL2 are passed by address to
   supportable_widening_operation below; CODE1/DECL1 are then used for the
   first half of the widened result and CODE2/DECL2 for the second half.  */
4322 enum tree_code code, code1 = ERROR_MARK, code2 = ERROR_MARK;
4323 tree decl1 = NULL_TREE, decl2 = NULL_TREE;
4326 enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
4328 stmt_vec_info prev_stmt_info;
4336 if (!STMT_VINFO_RELEVANT_P (stmt_info))
4339 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
4342 /* Is STMT a vectorizable type-promotion operation? */
4343 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
4346 if (TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 0)) != SSA_NAME)
4349 operation = GIMPLE_STMT_OPERAND (stmt, 1);
4350 code = TREE_CODE (operation);
/* Only conversions (NOP_EXPR, CONVERT_EXPR) and WIDEN_MULT_EXPR are
   considered here.  */
4351 if (code != NOP_EXPR && code != CONVERT_EXPR
4352 && code != WIDEN_MULT_EXPR)
4355 op0 = TREE_OPERAND (operation, 0);
4356 vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op0));
4359 nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
4361 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
4362 vectype_out = get_vectype_for_scalar_type (TREE_TYPE (scalar_dest));
4365 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
/* The test below singles out every case in which the output vector type
   does not hold exactly half as many elements as the input vector type.  */
4366 if (nunits_out != nunits_in / 2) /* FORNOW */
/* NCOPIES is the vectorization factor divided by the number of elements
   of the input vector type.  */
4369 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in;
4370 gcc_assert (ncopies >= 1);
4371 /* FORNOW. This restriction should be relaxed. */
4372 if (nested_in_vect_loop_p (loop, stmt) && ncopies > 1)
4374 if (vect_print_dump_info (REPORT_DETAILS))
4375 fprintf (vect_dump, "multiple types in nested loop.");
/* The condition below is false only when source and destination scalar
   types are both integral, or both scalar floating-point with CODE being
   a conversion (CONVERT_EXPR or NOP_EXPR).  */
4379 if (! ((INTEGRAL_TYPE_P (TREE_TYPE (scalar_dest))
4380 && INTEGRAL_TYPE_P (TREE_TYPE (op0)))
4381 || (SCALAR_FLOAT_TYPE_P (TREE_TYPE (scalar_dest))
4382 && SCALAR_FLOAT_TYPE_P (TREE_TYPE (op0))
4383 && (code == CONVERT_EXPR || code == NOP_EXPR))))
4386 /* Check the operands of the operation. */
4387 if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt[0]))
4389 if (vect_print_dump_info (REPORT_DETAILS))
4390 fprintf (vect_dump, "use not simple.");
/* OP_TYPE is the operand count of CODE; the second operand is examined
   only for binary operations, and its def type is recorded in DT[1].  */
4394 op_type = TREE_CODE_LENGTH (code);
4395 if (op_type == binary_op)
4397 op1 = TREE_OPERAND (operation, 1);
4398 if (!vect_is_simple_use (op1, loop_vinfo, &def_stmt, &def, &dt[1]))
4400 if (vect_print_dump_info (REPORT_DETAILS))
4401 fprintf (vect_dump, "use not simple.");
4406 /* Supportable by target? */
4407 if (!supportable_widening_operation (code, stmt, vectype_in,
4408 &decl1, &decl2, &code1, &code2))
4411 STMT_VINFO_VECTYPE (stmt_info) = vectype_in;
4413 if (!vec_stmt) /* transformation not required. */
4415 STMT_VINFO_TYPE (stmt_info) = type_promotion_vec_info_type;
4416 if (vect_print_dump_info (REPORT_DETAILS))
4417 fprintf (vect_dump, "=== vectorizable_promotion ===");
/* The cost is modelled with 2*NCOPIES, matching the two vector stmts that
   the transformation loop below generates for each copy.  */
4418 vect_model_simple_cost (stmt_info, 2*ncopies, dt, NULL);
4424 if (vect_print_dump_info (REPORT_DETAILS))
4425 fprintf (vect_dump, "transform type promotion operation. ncopies = %d.",
4429 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
4431 /* In case the vectorization factor (VF) is bigger than the number
4432 of elements that we can fit in a vectype (nunits), we have to generate
4433 more than one vector stmt - i.e - we need to "unroll" the
4434 vector stmt by a factor VF/nunits. */
4436 prev_stmt_info = NULL;
4437 for (j = 0; j < ncopies; j++)
4442 vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
4443 if (op_type == binary_op)
4444 vec_oprnd1 = vect_get_vec_def_for_operand (op1, stmt, NULL);
4448 vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd0);
4449 if (op_type == binary_op)
4450 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt[1], vec_oprnd1);
4453 /* Arguments are ready. Create the new vector stmt. We are creating
4454 two vector defs because the widened result does not fit in one vector.
4455 The vectorized stmt can be expressed as a call to a target builtin,
4456 or using a tree-code. */
4457 /* Generate first half of the widened result: */
4458 new_stmt = vect_gen_widened_results_half (code1, vectype_out, decl1,
4459 vec_oprnd0, vec_oprnd1, op_type, vec_dest, bsi, stmt);
4461 STMT_VINFO_VEC_STMT (stmt_info) = new_stmt;
4463 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
4464 prev_stmt_info = vinfo_for_stmt (new_stmt);
4466 /* Generate second half of the widened result: */
4467 new_stmt = vect_gen_widened_results_half (code2, vectype_out, decl2,
4468 vec_oprnd0, vec_oprnd1, op_type, vec_dest, bsi, stmt);
4469 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
4470 prev_stmt_info = vinfo_for_stmt (new_stmt);
4474 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
4479 /* Function vect_strided_store_supported.
4481 Returns TRUE if INTERLEAVE_HIGH and INTERLEAVE_LOW operations are supported,
4482 and FALSE otherwise. */
4485 vect_strided_store_supported (tree vectype)
4487 optab interleave_high_optab, interleave_low_optab;
/* MODE is the machine mode of VECTYPE; it is the mode the optab handlers
   are queried with below.  */
4490 mode = (int) TYPE_MODE (vectype);
4492 /* Check that the operation is supported. */
4493 interleave_high_optab = optab_for_tree_code (VEC_INTERLEAVE_HIGH_EXPR,
4495 interleave_low_optab = optab_for_tree_code (VEC_INTERLEAVE_LOW_EXPR,
/* First check: an optab must exist for both interleave tree codes.  */
4497 if (!interleave_high_optab || !interleave_low_optab)
4499 if (vect_print_dump_info (REPORT_DETAILS))
4500 fprintf (vect_dump, "no optab for interleave.");
/* Second check: the handlers of both optabs for MODE are inspected for
   CODE_FOR_nothing.  */
4504 if (optab_handler (interleave_high_optab, mode)->insn_code
4506 || optab_handler (interleave_low_optab, mode)->insn_code
4507 == CODE_FOR_nothing)
4509 if (vect_print_dump_info (REPORT_DETAILS))
4510 fprintf (vect_dump, "interleave op not supported by target.");
4518 /* Function vect_permute_store_chain.
4520 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
4521 a power of 2, generate interleave_high/low stmts to reorder the data
4522 correctly for the stores. Return the final references for stores in
4525 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4526 The input is 4 vectors each containing 8 elements. We assign a number to each
4527 element, the input sequence is:
4529 1st vec: 0 1 2 3 4 5 6 7
4530 2nd vec: 8 9 10 11 12 13 14 15
4531 3rd vec: 16 17 18 19 20 21 22 23
4532 4th vec: 24 25 26 27 28 29 30 31
4534 The output sequence should be:
4536 1st vec: 0 8 16 24 1 9 17 25
4537 2nd vec: 2 10 18 26 3 11 19 27
4538 3rd vec: 4 12 20 28 5 13 21 29
4539 4th vec: 6 14 22 30 7 15 23 31
4541 i.e., we interleave the contents of the four vectors in their order.
4543 We use interleave_high/low instructions to create such output. The input of
4544 each interleave_high/low operation is two vectors:
4547 the even elements of the result vector are obtained left-to-right from the
4548 high/low elements of the first vector. The odd elements of the result are
4549 obtained left-to-right from the high/low elements of the second vector.
4550 The output of interleave_high will be: 0 4 1 5
4551 and of interleave_low: 2 6 3 7
4554 The permutation is done in log LENGTH stages. In each stage interleave_high
4555 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
4556 where the first argument is taken from the first half of DR_CHAIN and the
4557 second argument from its second half.
4560 I1: interleave_high (1st vec, 3rd vec)
4561 I2: interleave_low (1st vec, 3rd vec)
4562 I3: interleave_high (2nd vec, 4th vec)
4563 I4: interleave_low (2nd vec, 4th vec)
4565 The output for the first stage is:
4567 I1: 0 16 1 17 2 18 3 19
4568 I2: 4 20 5 21 6 22 7 23
4569 I3: 8 24 9 25 10 26 11 27
4570 I4: 12 28 13 29 14 30 15 31
4572 The output of the second stage, i.e. the final result is:
4574 I1: 0 8 16 24 1 9 17 25
4575 I2: 2 10 18 26 3 11 19 27
4576 I3: 4 12 20 28 5 13 21 29
4577 I4: 6 14 22 30 7 15 23 31. */
4580 vect_permute_store_chain (VEC(tree,heap) *dr_chain,
4581 unsigned int length,
4583 block_stmt_iterator *bsi,
4584 VEC(tree,heap) **result_chain)
4586 tree perm_dest, perm_stmt, vect1, vect2, high, low;
4587 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
4588 tree scalar_dest, tmp;
4591 VEC(tree,heap) *first, *second;
4593 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
/* NOTE(review): FIRST and SECOND are heap-allocated here but are not
   referenced again in the visible part of this function - confirm whether
   they are needed, and where they are released.  */
4594 first = VEC_alloc (tree, heap, length/2);
4595 second = VEC_alloc (tree, heap, length/2);
4597 /* Check that the operation is supported. */
4598 if (!vect_strided_store_supported (vectype))
/* Start RESULT_CHAIN as a copy of DR_CHAIN so that it already has an entry
   for every slot that VEC_replace overwrites below.  */
4601 *result_chain = VEC_copy (tree, heap, dr_chain);
/* One stage per power of two in LENGTH.  */
4603 for (i = 0; i < exact_log2 (length); i++)
4605 for (j = 0; j < length/2; j++)
/* Pair element J of the first half of DR_CHAIN with element J of its
   second half.  */
4607 vect1 = VEC_index (tree, dr_chain, j);
4608 vect2 = VEC_index (tree, dr_chain, j+length/2);
4610 /* Create interleaving stmt:
4611 in the case of big endian:
4612 high = interleave_high (vect1, vect2)
4613 and in the case of little endian:
4614 high = interleave_low (vect1, vect2). */
4615 perm_dest = create_tmp_var (vectype, "vect_inter_high");
4616 DECL_GIMPLE_REG_P (perm_dest) = 1;
4617 add_referenced_var (perm_dest);
4618 if (BYTES_BIG_ENDIAN)
4619 tmp = build2 (VEC_INTERLEAVE_HIGH_EXPR, vectype, vect1, vect2);
4621 tmp = build2 (VEC_INTERLEAVE_LOW_EXPR, vectype, vect1, vect2);
4622 perm_stmt = build_gimple_modify_stmt (perm_dest, tmp);
4623 high = make_ssa_name (perm_dest, perm_stmt);
4624 GIMPLE_STMT_OPERAND (perm_stmt, 0) = high;
4625 vect_finish_stmt_generation (stmt, perm_stmt, bsi);
/* The two results for pair J occupy the adjacent slots 2*J and 2*J+1.  */
4626 VEC_replace (tree, *result_chain, 2*j, high);
4628 /* Create interleaving stmt:
4629 in the case of big endian:
4630 low = interleave_low (vect1, vect2)
4631 and in the case of little endian:
4632 low = interleave_high (vect1, vect2). */
4633 perm_dest = create_tmp_var (vectype, "vect_inter_low");
4634 DECL_GIMPLE_REG_P (perm_dest) = 1;
4635 add_referenced_var (perm_dest);
4636 if (BYTES_BIG_ENDIAN)
4637 tmp = build2 (VEC_INTERLEAVE_LOW_EXPR, vectype, vect1, vect2);
4639 tmp = build2 (VEC_INTERLEAVE_HIGH_EXPR, vectype, vect1, vect2);
4640 perm_stmt = build_gimple_modify_stmt (perm_dest, tmp);
4641 low = make_ssa_name (perm_dest, perm_stmt);
4642 GIMPLE_STMT_OPERAND (perm_stmt, 0) = low;
4643 vect_finish_stmt_generation (stmt, perm_stmt, bsi);
4644 VEC_replace (tree, *result_chain, 2*j+1, low);
/* The output of this stage becomes the input of the next one.
   NOTE(review): this makes a fresh heap copy and overwrites the local
   DR_CHAIN pointer; no matching release is visible here - confirm.  */
4646 dr_chain = VEC_copy (tree, heap, *result_chain);
4652 /* Function vectorizable_store.
4654 Check if STMT defines a non scalar data-ref (array/pointer/structure) that
can be vectorized.
4656 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
4657 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
4658 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
4661 vectorizable_store (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt,
4667 tree vec_oprnd = NULL_TREE;
4668 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4669 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info), *first_dr = NULL;
4670 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4671 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4672 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4673 enum machine_mode vec_mode;
4675 enum dr_alignment_support alignment_support_scheme;
4677 enum vect_def_type dt;
4678 stmt_vec_info prev_stmt_info = NULL;
4679 tree dataref_ptr = NULL_TREE;
/* NCOPIES is the vectorization factor divided by the number of elements
   of VECTYPE.  */
4680 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
4681 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
4683 tree next_stmt, first_stmt = NULL_TREE;
4684 bool strided_store = false;
4685 unsigned int group_size, i;
4686 VEC(tree,heap) *dr_chain = NULL, *oprnds = NULL, *result_chain = NULL;
4688 VEC(tree,heap) *vec_oprnds = NULL;
4689 bool slp = (slp_node != NULL);
4690 stmt_vec_info first_stmt_vinfo;
4691 unsigned int vec_num;
4693 /* FORNOW: SLP with multiple types is not supported. The SLP analysis verifies
4694 this, so we can safely override NCOPIES with 1 here. */
4698 gcc_assert (ncopies >= 1);
4700 /* FORNOW. This restriction should be relaxed. */
4701 if (nested_in_vect_loop_p (loop, stmt) && ncopies > 1)
4703 if (vect_print_dump_info (REPORT_DETAILS))
4704 fprintf (vect_dump, "multiple types in nested loop.");
4708 if (!STMT_VINFO_RELEVANT_P (stmt_info))
4711 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
4714 /* Is vectorizable store? */
4716 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
4719 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
4720 if (TREE_CODE (scalar_dest) != ARRAY_REF
4721 && TREE_CODE (scalar_dest) != INDIRECT_REF
4722 && !STMT_VINFO_STRIDED_ACCESS (stmt_info))
4725 op = GIMPLE_STMT_OPERAND (stmt, 1);
4726 if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt))
4728 if (vect_print_dump_info (REPORT_DETAILS))
4729 fprintf (vect_dump, "use not simple.");
4733 vec_mode = TYPE_MODE (vectype);
4734 /* FORNOW. In some cases can vectorize even if data-type not supported
4735 (e.g. - array initialization with 0). */
4736 if (optab_handler (mov_optab, (int)vec_mode)->insn_code == CODE_FOR_nothing)
4739 if (!STMT_VINFO_DATA_REF (stmt_info))
4742 if (STMT_VINFO_STRIDED_ACCESS (stmt_info))
4744 strided_store = true;
4745 first_stmt = DR_GROUP_FIRST_DR (stmt_info);
/* Missing interleave support matters only when the stmt is neither a
   pure SLP stmt nor being vectorized through an SLP node.  */
4746 if (!vect_strided_store_supported (vectype)
4747 && !PURE_SLP_STMT (stmt_info) && !slp)
4750 if (first_stmt == stmt)
4752 /* STMT is the leader of the group. Check the operands of all the
4753 stmts of the group. */
4754 next_stmt = DR_GROUP_NEXT_DR (stmt_info);
4757 op = GIMPLE_STMT_OPERAND (next_stmt, 1);
4758 if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt))
4760 if (vect_print_dump_info (REPORT_DETAILS))
4761 fprintf (vect_dump, "use not simple.");
4764 next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt));
4769 if (!vec_stmt) /* transformation not required. */
4771 STMT_VINFO_TYPE (stmt_info) = store_vec_info_type;
4772 if (!PURE_SLP_STMT (stmt_info))
4773 vect_model_store_cost (stmt_info, ncopies, dt, NULL);
4781 first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
4782 group_size = DR_GROUP_SIZE (vinfo_for_stmt (first_stmt));
/* Count this store towards its interleaving group; the counter is kept
   on the first stmt of the group.  */
4784 DR_GROUP_STORE_COUNT (vinfo_for_stmt (first_stmt))++;
4787 gcc_assert (!nested_in_vect_loop_p (loop, stmt));
4789 /* We vectorize all the stmts of the interleaving group when we
4790 reach the last stmt in the group. */
4791 if (DR_GROUP_STORE_COUNT (vinfo_for_stmt (first_stmt))
4792 < DR_GROUP_SIZE (vinfo_for_stmt (first_stmt))
4795 *vec_stmt = NULL_TREE;
4800 strided_store = false;
4802 /* VEC_NUM is the number of vect stmts to be created for this group. */
4803 if (slp && SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node) < group_size)
4804 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
4806 vec_num = group_size;
4812 group_size = vec_num = 1;
4813 first_stmt_vinfo = stmt_info;
4816 if (vect_print_dump_info (REPORT_DETAILS))
4817 fprintf (vect_dump, "transform store. ncopies = %d",ncopies);
4819 dr_chain = VEC_alloc (tree, heap, group_size);
4820 oprnds = VEC_alloc (tree, heap, group_size);
/* FORNOW only aligned stores are handled: any scheme other than
   dr_aligned trips the second assert.  */
4822 alignment_support_scheme = vect_supportable_dr_alignment (first_dr);
4823 gcc_assert (alignment_support_scheme);
4824 gcc_assert (alignment_support_scheme == dr_aligned); /* FORNOW */
4826 /* In case the vectorization factor (VF) is bigger than the number
4827 of elements that we can fit in a vectype (nunits), we have to generate
4828 more than one vector stmt - i.e - we need to "unroll" the
4829 vector stmt by a factor VF/nunits. For more details see documentation in
4830 vect_get_vec_def_for_stmt_copy. */
4832 /* In case of interleaving (non-unit strided access):
4839 We create vectorized stores starting from base address (the access of the
4840 first stmt in the chain (S2 in the above example), when the last store stmt
4841 of the chain (S4) is reached:
4844 VS2: &base + vec_size*1 = vx0
4845 VS3: &base + vec_size*2 = vx1
4846 VS4: &base + vec_size*3 = vx3
4848 Then permutation statements are generated:
4850 VS5: vx5 = VEC_INTERLEAVE_HIGH_EXPR < vx0, vx3 >
4851 VS6: vx6 = VEC_INTERLEAVE_LOW_EXPR < vx0, vx3 >
4854 And they are put in STMT_VINFO_VEC_STMT of the corresponding scalar stmts
4855 (the order of the data-refs in the output of vect_permute_store_chain
4856 corresponds to the order of scalar stmts in the interleaving chain - see
4857 the documentation of vect_permute_store_chain()).
4859 In case of both multiple types and interleaving, above vector stores and
4860 permutation stmts are created for every copy. The result vector stmts are
4861 put in STMT_VINFO_VEC_STMT for the first copy and in the corresponding
4862 STMT_VINFO_RELATED_STMT for the next copies.
4865 prev_stmt_info = NULL;
4866 for (j = 0; j < ncopies; j++)
4875 /* Get vectorized arguments for SLP_NODE. */
4876 vect_get_slp_defs (slp_node, &vec_oprnds, NULL);
4878 vec_oprnd = VEC_index (tree, vec_oprnds, 0);
4882 /* For interleaved stores we collect vectorized defs for all the
4883 stores in the group in DR_CHAIN and OPRNDS. DR_CHAIN is then
4884 used as an input to vect_permute_store_chain(), and OPRNDS as
4885 an input to vect_get_vec_def_for_stmt_copy() for the next copy.
4887 If the store is not strided, GROUP_SIZE is 1, and DR_CHAIN and
4888 OPRNDS are of size 1. */
4889 next_stmt = first_stmt;
4890 for (i = 0; i < group_size; i++)
4892 /* Since gaps are not supported for interleaved stores,
4893 GROUP_SIZE is the exact number of stmts in the chain.
4894 Therefore, NEXT_STMT can't be NULL_TREE. In case that
4895 there is no interleaving, GROUP_SIZE is 1, and only one
4896 iteration of the loop will be executed. */
4897 gcc_assert (next_stmt);
4898 op = GIMPLE_STMT_OPERAND (next_stmt, 1);
4900 vec_oprnd = vect_get_vec_def_for_operand (op, next_stmt,
4902 VEC_quick_push(tree, dr_chain, vec_oprnd);
4903 VEC_quick_push(tree, oprnds, vec_oprnd);
4904 next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt));
/* The vector pointer is created for the first stmt of the group, with
   the type of the vectorized operand; an invariant address is rejected
   by the assert that follows.  */
4907 dataref_ptr = vect_create_data_ref_ptr (first_stmt, NULL, NULL_TREE,
4908 &dummy, &ptr_incr, false,
4909 TREE_TYPE (vec_oprnd), &inv_p);
4910 gcc_assert (!inv_p);
4914 /* FORNOW SLP doesn't work for multiple types. */
4917 /* For interleaved stores we created vectorized defs for all the
4918 defs stored in OPRNDS in the previous iteration (previous copy).
4919 DR_CHAIN is then used as an input to vect_permute_store_chain(),
4920 and OPRNDS as an input to vect_get_vec_def_for_stmt_copy() for the
next copy.
4922 If the store is not strided, GROUP_SIZE is 1, and DR_CHAIN and
4923 OPRNDS are of size 1. */
4924 for (i = 0; i < group_size; i++)
4926 op = VEC_index (tree, oprnds, i);
/* Called only to refresh DT for OP; the boolean result is not used.  */
4927 vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt);
4928 vec_oprnd = vect_get_vec_def_for_stmt_copy (dt, op);
4929 VEC_replace(tree, dr_chain, i, vec_oprnd);
4930 VEC_replace(tree, oprnds, i, vec_oprnd);
4933 bump_vector_ptr (dataref_ptr, ptr_incr, bsi, stmt, NULL_TREE);
4938 result_chain = VEC_alloc (tree, heap, group_size);
4940 if (!vect_permute_store_chain (dr_chain, group_size, stmt, bsi,
4945 next_stmt = first_stmt;
4946 for (i = 0; i < vec_num; i++)
4949 /* Bump the vector pointer. */
4950 dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, bsi, stmt,
4954 vec_oprnd = VEC_index (tree, vec_oprnds, i);
4955 else if (strided_store)
4956 /* For strided stores vectorized defs are interleaved in
4957 vect_permute_store_chain(). */
4958 vec_oprnd = VEC_index (tree, result_chain, i);
/* The vector store is an assignment of VEC_OPRND through DATAREF_PTR.  */
4960 data_ref = build_fold_indirect_ref (dataref_ptr);
4961 /* Arguments are ready. Create the new vector stmt. */
4962 new_stmt = build_gimple_modify_stmt (data_ref, vec_oprnd);
4963 vect_finish_stmt_generation (stmt, new_stmt, bsi);
4964 mark_symbols_for_renaming (new_stmt);
4967 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
4969 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
4971 prev_stmt_info = vinfo_for_stmt (new_stmt);
4972 next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt));
4982 /* Function vect_setup_realignment
4984 This function is called when vectorizing an unaligned load using
4985 the dr_explicit_realign[_optimized] scheme.
4986 This function generates the following code at the loop prolog:
4989 x msq_init = *(floor(p)); # prolog load
4990 realignment_token = call target_builtin;
4992 x msq = phi (msq_init, ---)
4994 The stmts marked with x are generated only for the case of
4995 dr_explicit_realign_optimized.
4997 The code above sets up a new (vector) pointer, pointing to the first
4998 location accessed by STMT, and a "floor-aligned" load using that pointer.
4999 It also generates code to compute the "realignment-token" (if the relevant
5000 target hook was defined), and creates a phi-node at the loop-header bb
5001 whose arguments are the result of the prolog-load (created by this
5002 function) and the result of a load that takes place in the loop (to be
5003 created by the caller to this function).
5005 For the case of dr_explicit_realign_optimized:
5006 The caller to this function uses the phi-result (msq) to create the
5007 realignment code inside the loop, and sets up the missing phi argument,
5010 msq = phi (msq_init, lsq)
5011 lsq = *(floor(p')); # load in loop
5012 result = realign_load (msq, lsq, realignment_token);
5014 For the case of dr_explicit_realign:
5016 msq = *(floor(p)); # load in loop
5018 lsq = *(floor(p')); # load in loop
5019 result = realign_load (msq, lsq, realignment_token);
5022 STMT - (scalar) load stmt to be vectorized. This load accesses
5023 a memory location that may be unaligned.
5024 BSI - place where new code is to be inserted.
5025 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
to use (dr_explicit_realign or dr_explicit_realign_optimized).
5029 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
5030 target hook, if defined.
5031 Return value - the result of the loop-header phi node. */
5034 vect_setup_realignment (tree stmt, block_stmt_iterator *bsi,
5035 tree *realignment_token,
5036 enum dr_alignment_support alignment_support_scheme,
5038 struct loop **at_loop)
5040 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5041 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5042 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5043 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5045 tree scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
5052 tree msq_init = NULL_TREE;
5055 tree msq = NULL_TREE;
5056 tree stmts = NULL_TREE;
5058 bool compute_in_loop = false;
5059 bool nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
5060 struct loop *containing_loop = (bb_for_stmt (stmt))->loop_father;
5061 struct loop *loop_for_initial_load;
5063 gcc_assert (alignment_support_scheme == dr_explicit_realign
5064 || alignment_support_scheme == dr_explicit_realign_optimized);
5066 /* We need to generate three things:
5067 1. the misalignment computation
5068 2. the extra vector load (for the optimized realignment scheme).
5069 3. the phi node for the two vectors from which the realignment is
5070 done (for the optimized realignment scheme).
5073 /* 1. Determine where to generate the misalignment computation.
5075 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
5076 calculation will be generated by this function, outside the loop (in the
5077 preheader). Otherwise, INIT_ADDR had already been computed for us by the
5078 caller, inside the loop.
5080 Background: If the misalignment remains fixed throughout the iterations of
5081 the loop, then both realignment schemes are applicable, and also the
5082 misalignment computation can be done outside LOOP. This is because we are
5083 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
5084 are a multiple of VS (the Vector Size), and therefore the misalignment in
5085 different vectorized LOOP iterations is always the same.
5086 The problem arises only if the memory access is in an inner-loop nested
5087 inside LOOP, which is now being vectorized using outer-loop vectorization.
5088 This is the only case when the misalignment of the memory access may not
5089 remain fixed throughout the iterations of the inner-loop (as explained in
5090 detail in vect_supportable_dr_alignment). In this case, not only is the
5091 optimized realignment scheme not applicable, but also the misalignment
5092 computation (and generation of the realignment token that is passed to
5093 REALIGN_LOAD) have to be done inside the loop.
5095 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
5096 or not, which in turn determines if the misalignment is computed inside
5097 the inner-loop, or outside LOOP. */
5099 if (init_addr != NULL_TREE)
5101 compute_in_loop = true;
5102 gcc_assert (alignment_support_scheme == dr_explicit_realign);
5106 /* 2. Determine where to generate the extra vector load.
5108 For the optimized realignment scheme, instead of generating two vector
5109 loads in each iteration, we generate a single extra vector load in the
5110 preheader of the loop, and in each iteration reuse the result of the
5111 vector load from the previous iteration. In case the memory access is in
5112 an inner-loop nested inside LOOP, which is now being vectorized using
5113 outer-loop vectorization, we need to determine whether this initial vector
5114 load should be generated at the preheader of the inner-loop, or can be
5115 generated at the preheader of LOOP. If the memory access has no evolution
5116 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
5117 to be generated inside LOOP (in the preheader of the inner-loop). */
5119 if (nested_in_vect_loop)
/* The access is treated as invariant in the outer loop when its recorded
   step compares equal to zero; only then is LOOP itself chosen for the
   initial load, otherwise LOOP->inner is.  */
5121 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
5122 bool invariant_in_outerloop =
5123 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
5124 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
5127 loop_for_initial_load = loop;
/* Report the chosen loop back to the caller through AT_LOOP.  */
5129 *at_loop = loop_for_initial_load;
5131 /* 3. For the case of the optimized realignment, create the first vector
5132 load at the loop preheader. */
5134 if (alignment_support_scheme == dr_explicit_realign_optimized)
5136 /* Create msq_init = *(floor(p1)) in the loop preheader */
5138 gcc_assert (!compute_in_loop);
5139 pe = loop_preheader_edge (loop_for_initial_load);
5140 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5141 ptr = vect_create_data_ref_ptr (stmt, loop_for_initial_load, NULL_TREE,
5142 &init_addr, &inc, true, NULL_TREE, &inv_p);
5143 data_ref = build1 (ALIGN_INDIRECT_REF, vectype, ptr);
5144 new_stmt = build_gimple_modify_stmt (vec_dest, data_ref);
5145 new_temp = make_ssa_name (vec_dest, new_stmt);
5146 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
5147 mark_symbols_for_renaming (new_stmt);
/* The load is inserted on the preheader edge; the assert requires that
   the insertion did not hand back a new basic block.  */
5148 new_bb = bsi_insert_on_edge_immediate (pe, new_stmt);
5149 gcc_assert (!new_bb);
5150 msq_init = GIMPLE_STMT_OPERAND (new_stmt, 0);
5153 /* 4. Create realignment token using a target builtin, if available.
5154 It is done either inside the containing loop, or before LOOP (as
5155 determined above). */
5157 if (targetm.vectorize.builtin_mask_for_load)
5161 /* Compute INIT_ADDR - the initial address accessed by this memref. */
5162 if (compute_in_loop)
5163 gcc_assert (init_addr); /* already computed by the caller. */
5166 /* Generate the INIT_ADDR computation outside LOOP. */
5167 init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
5169 pe = loop_preheader_edge (loop);
5170 new_bb = bsi_insert_on_edge_immediate (pe, stmts);
5171 gcc_assert (!new_bb);
/* The token is the result of calling the target builtin on INIT_ADDR;
   the destination variable takes the type of that call.  */
5174 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5175 new_stmt = build_call_expr (builtin_decl, 1, init_addr);
5176 vec_dest = vect_create_destination_var (scalar_dest,
5177 TREE_TYPE (new_stmt));
5178 new_stmt = build_gimple_modify_stmt (vec_dest, new_stmt);
5179 new_temp = make_ssa_name (vec_dest, new_stmt);
5180 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
5182 if (compute_in_loop)
5183 bsi_insert_before (bsi, new_stmt, BSI_SAME_STMT);
5186 /* Generate the misalignment computation outside LOOP. */
5187 pe = loop_preheader_edge (loop);
5188 new_bb = bsi_insert_on_edge_immediate (pe, new_stmt);
5189 gcc_assert (!new_bb);
5192 *realignment_token = GIMPLE_STMT_OPERAND (new_stmt, 0);
5194 /* The result of the CALL_EXPR to this builtin is determined from
5195 the value of the parameter and no global variables are touched
5196 which makes the builtin a "const" function. Requiring the
5197 builtin to have the "const" attribute makes it unnecessary
5198 to call mark_call_clobbered. */
5199 gcc_assert (TREE_READONLY (builtin_decl));
5202 if (alignment_support_scheme == dr_explicit_realign)
5205 gcc_assert (!compute_in_loop);
5206 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5209 /* 5. Create msq = phi <msq_init, lsq> in loop */
5211 pe = loop_preheader_edge (containing_loop);
5212 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5213 msq = make_ssa_name (vec_dest, NULL_TREE);
5214 phi_stmt = create_phi_node (msq, containing_loop->header);
5215 SSA_NAME_DEF_STMT (msq) = phi_stmt;
/* Only the preheader argument (MSQ_INIT) is added here; according to the
   function comment the caller sets up the remaining phi argument.  */
5216 add_phi_arg (phi_stmt, msq_init, pe);
5222 /* Function vect_strided_load_supported.
5224 Returns TRUE if EXTRACT_EVEN and EXTRACT_ODD operations are supported,
5225 and FALSE otherwise. */
5228 vect_strided_load_supported (tree vectype)
5230 optab perm_even_optab, perm_odd_optab;
/* MODE is the machine mode of VECTYPE; it is the mode the optab handlers
   are queried with below.  */
5233 mode = (int) TYPE_MODE (vectype);
/* VEC_EXTRACT_EVEN_EXPR is checked in two steps: an optab must exist for
   the tree code, and its handler for MODE is compared against
   CODE_FOR_nothing.  */
5235 perm_even_optab = optab_for_tree_code (VEC_EXTRACT_EVEN_EXPR, vectype);
5236 if (!perm_even_optab)
5238 if (vect_print_dump_info (REPORT_DETAILS))
5239 fprintf (vect_dump, "no optab for perm_even.");
5243 if (optab_handler (perm_even_optab, mode)->insn_code == CODE_FOR_nothing)
5245 if (vect_print_dump_info (REPORT_DETAILS))
5246 fprintf (vect_dump, "perm_even op not supported by target.");
/* The same two-step check is repeated for VEC_EXTRACT_ODD_EXPR.  */
5250 perm_odd_optab = optab_for_tree_code (VEC_EXTRACT_ODD_EXPR, vectype);
5251 if (!perm_odd_optab)
5253 if (vect_print_dump_info (REPORT_DETAILS))
5254 fprintf (vect_dump, "no optab for perm_odd.");
5258 if (optab_handler (perm_odd_optab, mode)->insn_code == CODE_FOR_nothing)
5260 if (vect_print_dump_info (REPORT_DETAILS))
5261 fprintf (vect_dump, "perm_odd op not supported by target.");
5268 /* Function vect_permute_load_chain.
5270 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5271 a power of 2, generate extract_even/odd stmts to reorder the input data
5272 correctly. Return the final references for loads in RESULT_CHAIN.
5274 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5275 The input is 4 vectors each containing 8 elements. We assign a number to each
5276 element, the input sequence is:
5278 1st vec: 0 1 2 3 4 5 6 7
5279 2nd vec: 8 9 10 11 12 13 14 15
5280 3rd vec: 16 17 18 19 20 21 22 23
5281 4th vec: 24 25 26 27 28 29 30 31
5283 The output sequence should be:
5285 1st vec: 0 4 8 12 16 20 24 28
5286 2nd vec: 1 5 9 13 17 21 25 29
5287 3rd vec: 2 6 10 14 18 22 26 30
5288 4th vec: 3 7 11 15 19 23 27 31
5290 i.e., the first output vector should contain the first elements of each
5291 interleaving group, etc.
5293 We use extract_even/odd instructions to create such output. The input of each
5294 extract_even/odd operation is two vectors
5298 and the output is the vector of extracted even/odd elements. The output of
5299 extract_even will be: 0 2 4 6
5300 and of extract_odd: 1 3 5 7
5303 The permutation is done in log LENGTH stages. In each stage extract_even and
5304 extract_odd stmts are created for each pair of vectors in DR_CHAIN in their
5305 order. In our example,
5307 E1: extract_even (1st vec, 2nd vec)
5308 E2: extract_odd (1st vec, 2nd vec)
5309 E3: extract_even (3rd vec, 4th vec)
5310 E4: extract_odd (3rd vec, 4th vec)
5312 The output for the first stage will be:
5314 E1: 0 2 4 6 8 10 12 14
5315 E2: 1 3 5 7 9 11 13 15
5316 E3: 16 18 20 22 24 26 28 30
5317 E4: 17 19 21 23 25 27 29 31
5319 In order to proceed and create the correct sequence for the next stage (or
5320 for the correct output, if the second stage is the last one, as in our
5321 example), we first put the output of extract_even operation and then the
5322 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5323 The input for the second stage is:
5325 1st vec (E1): 0 2 4 6 8 10 12 14
5326 2nd vec (E3): 16 18 20 22 24 26 28 30
5327 3rd vec (E2): 1 3 5 7 9 11 13 15
5328 4th vec (E4): 17 19 21 23 25 27 29 31
5330 The output of the second stage:
5332 E1: 0 4 8 12 16 20 24 28
5333 E2: 2 6 10 14 18 22 26 30
5334 E3: 1 5 9 13 17 21 25 29
5335 E4: 3 7 11 15 19 23 27 31
5337 And RESULT_CHAIN after reordering:
5339 1st vec (E1): 0 4 8 12 16 20 24 28
5340 2nd vec (E3): 1 5 9 13 17 21 25 29
5341 3rd vec (E2): 2 6 10 14 18 22 26 30
5342 4th vec (E4): 3 7 11 15 19 23 27 31. */
/* Reorder the LENGTH interleaved vectors in DR_CHAIN with extract-even and
   extract-odd statements inserted through BSI; the SSA names of the permuted
   vectors are stored in *RESULT_CHAIN. */
5345 vect_permute_load_chain (VEC(tree,heap) *dr_chain,
5346 unsigned int length,
5348 block_stmt_iterator *bsi,
5349 VEC(tree,heap) **result_chain)
5351 tree perm_dest, perm_stmt, data_ref, first_vect, second_vect;
5352 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5357 /* Check that the operation is supported. */
5358 if (!vect_strided_load_supported (vectype))
/* Seed *RESULT_CHAIN with a copy of DR_CHAIN so that every slot written by
   VEC_replace below already exists. */
5361 *result_chain = VEC_copy (tree, heap, dr_chain);
/* One pass per permutation stage: exact_log2 (LENGTH) stages in total. */
5362 for (i = 0; i < exact_log2 (length); i++)
/* Each stage consumes the vectors of DR_CHAIN in consecutive pairs. */
5364 for (j = 0; j < length; j +=2)
5366 first_vect = VEC_index (tree, dr_chain, j);
5367 second_vect = VEC_index (tree, dr_chain, j+1);
5369 /* data_ref = permute_even (first_data_ref, second_data_ref); */
5370 perm_dest = create_tmp_var (vectype, "vect_perm_even");
5371 DECL_GIMPLE_REG_P (perm_dest) = 1;
5372 add_referenced_var (perm_dest);
5374 tmp = build2 (VEC_EXTRACT_EVEN_EXPR, vectype,
5375 first_vect, second_vect);
5376 perm_stmt = build_gimple_modify_stmt (perm_dest, tmp);
5378 data_ref = make_ssa_name (perm_dest, perm_stmt);
5379 GIMPLE_STMT_OPERAND (perm_stmt, 0) = data_ref;
5380 vect_finish_stmt_generation (stmt, perm_stmt, bsi);
5381 mark_symbols_for_renaming (perm_stmt);
/* The even result of pair J goes to slot J/2, i.e. the first half of
   *RESULT_CHAIN. */
5383 VEC_replace (tree, *result_chain, j/2, data_ref);
5385 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
5386 perm_dest = create_tmp_var (vectype, "vect_perm_odd");
5387 DECL_GIMPLE_REG_P (perm_dest) = 1;
5388 add_referenced_var (perm_dest);
5390 tmp = build2 (VEC_EXTRACT_ODD_EXPR, vectype,
5391 first_vect, second_vect);
5392 perm_stmt = build_gimple_modify_stmt (perm_dest, tmp);
5393 data_ref = make_ssa_name (perm_dest, perm_stmt);
5394 GIMPLE_STMT_OPERAND (perm_stmt, 0) = data_ref;
5395 vect_finish_stmt_generation (stmt, perm_stmt, bsi);
5396 mark_symbols_for_renaming (perm_stmt);
/* The odd result of pair J goes LENGTH/2 slots further on, i.e. into the
   second half of *RESULT_CHAIN. */
5398 VEC_replace (tree, *result_chain, j/2+length/2, data_ref);
/* The output of this stage becomes the input of the next one.
   NOTE(review): this takes a fresh heap copy on every stage and no release
   of the previous DR_CHAIN is visible in the lines shown here -- confirm
   whether the copies are freed elsewhere. */
5400 dr_chain = VEC_copy (tree, heap, *result_chain);
5406 /* Function vect_transform_strided_load.
5408 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
5409 to perform their permutation and ascribe the result vectorized statements to
5410 the scalar statements.
5414 vect_transform_strided_load (tree stmt, VEC(tree,heap) *dr_chain, int size,
5415 block_stmt_iterator *bsi)
5417 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5418 tree first_stmt = DR_GROUP_FIRST_DR (stmt_info);
5419 tree next_stmt, new_stmt;
5420 VEC(tree,heap) *result_chain = NULL;
5421 unsigned int i, gap_count;
5424 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
5425 RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
5426 vectors, that are ready for vector computation. */
5427 result_chain = VEC_alloc (tree, heap, size);
/* Generate the extract-even/odd statements at BSI; the permuted vectors are
   handed back in RESULT_CHAIN. */
5429 if (!vect_permute_load_chain (dr_chain, size, stmt, bsi, &result_chain))
5432 /* Put a permuted data-ref in the VECTORIZED_STMT field.
5433 Since we scan the chain starting from its first node, their order
5434 corresponds to the order of data-refs in RESULT_CHAIN. */
5435 next_stmt = first_stmt;
/* Walk RESULT_CHAIN in order; NEXT_STMT starts at the first statement of the
   interleaving group (DR_GROUP_FIRST_DR) and follows DR_GROUP_NEXT_DR. */
5437 for (i = 0; VEC_iterate (tree, result_chain, i, tmp_data_ref); i++)
5442 /* Skip the gaps. Loads created for the gaps will be removed by dead
5443 code elimination pass later.
5444 DR_GROUP_GAP is the number of steps in elements from the previous
5445 access (if there is no gap DR_GROUP_GAP is 1). We skip loads that
5446 correspond to the gaps.
5448 if (gap_count < DR_GROUP_GAP (vinfo_for_stmt (next_stmt)))
5456 new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
5457 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
5458 copies, and we put the new vector statement in the first available
5460 if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
5461 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
5464 tree prev_stmt = STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
5465 tree rel_stmt = STMT_VINFO_RELATED_STMT (
5466 vinfo_for_stmt (prev_stmt));
5469 prev_stmt = rel_stmt;
5470 rel_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
5472 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) = new_stmt;
5474 next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt));
5476 /* If NEXT_STMT accesses the same DR as the previous statement,
5477 put the same TMP_DATA_REF as its vectorized statement; otherwise
5478 get the next data-ref from RESULT_CHAIN. */
5479 if (!next_stmt || !DR_GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5487 /* vectorizable_load.
5489 Check if STMT reads a non-scalar data-ref (array/pointer/structure) that can be vectorized.
5491 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
5492 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
5493 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
/* One routine serves both analysis and transformation: when VEC_STMT is null
   only the checks and the cost modelling are performed (see the `!vec_stmt'
   test below); otherwise the vector loads are generated at BSI. */
5496 vectorizable_load (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt,
5500 tree vec_dest = NULL;
5501 tree data_ref = NULL;
5503 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5504 stmt_vec_info prev_stmt_info;
5505 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5506 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5507 struct loop *containing_loop = (bb_for_stmt (stmt))->loop_father;
5508 bool nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
5509 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info), *first_dr;
5510 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5513 tree new_stmt = NULL_TREE;
5515 enum dr_alignment_support alignment_support_scheme;
5516 tree dataref_ptr = NULL_TREE;
5518 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
/* Number of vector stmts needed per scalar stmt: the vectorization factor
   divided by the number of elements in one vector. */
5519 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
5520 int i, j, group_size;
5521 tree msq = NULL_TREE, lsq;
5522 tree offset = NULL_TREE;
5523 tree realignment_token = NULL_TREE;
5524 tree phi = NULL_TREE;
5525 VEC(tree,heap) *dr_chain = NULL;
5526 bool strided_load = false;
5530 bool compute_in_loop = false;
5531 struct loop *at_loop;
5533 bool slp = (slp_node != NULL);
5535 /* FORNOW: SLP with multiple types is not supported. The SLP analysis verifies
5536 this, so we can safely override NCOPIES with 1 here. */
5540 gcc_assert (ncopies >= 1);
5542 /* FORNOW. This restriction should be relaxed. */
5543 if (nested_in_vect_loop && ncopies > 1)
5545 if (vect_print_dump_info (REPORT_DETAILS))
5546 fprintf (vect_dump, "multiple types in nested loop.");
5550 if (!STMT_VINFO_RELEVANT_P (stmt_info))
5553 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
5556 /* Is vectorizable load? */
5557 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
5560 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
5561 if (TREE_CODE (scalar_dest) != SSA_NAME)
5564 op = GIMPLE_STMT_OPERAND (stmt, 1);
5565 if (TREE_CODE (op) != ARRAY_REF
5566 && TREE_CODE (op) != INDIRECT_REF
5567 && !STMT_VINFO_STRIDED_ACCESS (stmt_info))
5570 if (!STMT_VINFO_DATA_REF (stmt_info))
5573 scalar_type = TREE_TYPE (DR_REF (dr));
5574 mode = (int) TYPE_MODE (vectype);
5576 /* FORNOW. In some cases can vectorize even if data-type not supported
5577 (e.g. - data copies). */
5578 if (optab_handler (mov_optab, mode)->insn_code == CODE_FOR_nothing)
5580 if (vect_print_dump_info (REPORT_DETAILS))
5581 fprintf (vect_dump, "Aligned load, but unsupported type.");
5585 /* Check if the load is a part of an interleaving chain. */
5586 if (STMT_VINFO_STRIDED_ACCESS (stmt_info))
5588 strided_load = true;
5590 gcc_assert (! nested_in_vect_loop);
5592 /* Check if interleaving is supported. */
5593 if (!vect_strided_load_supported (vectype)
5594 && !PURE_SLP_STMT (stmt_info) && !slp)
5598 if (!vec_stmt) /* transformation not required. */
5600 STMT_VINFO_TYPE (stmt_info) = load_vec_info_type;
5601 vect_model_load_cost (stmt_info, ncopies, NULL);
5605 if (vect_print_dump_info (REPORT_DETAILS))
5606 fprintf (vect_dump, "transform load.");
5612 first_stmt = DR_GROUP_FIRST_DR (stmt_info);
5613 /* Check if the chain of loads is already vectorized. */
5614 if (STMT_VINFO_VEC_STMT (vinfo_for_stmt (first_stmt)))
5616 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
5619 first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
5620 group_size = DR_GROUP_SIZE (vinfo_for_stmt (first_stmt));
5621 dr_chain = VEC_alloc (tree, heap, group_size);
5623 /* VEC_NUM is the number of vect stmts to be created for this group. */
5626 strided_load = false;
5627 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
5630 vec_num = group_size;
5636 group_size = vec_num = 1;
/* The alignment scheme is chosen from FIRST_DR; a zero result is asserted
   not to occur at this point. */
5639 alignment_support_scheme = vect_supportable_dr_alignment (first_dr);
5640 gcc_assert (alignment_support_scheme);
5642 /* In case the vectorization factor (VF) is bigger than the number
5643 of elements that we can fit in a vectype (nunits), we have to generate
5644 more than one vector stmt - i.e - we need to "unroll" the
5645 vector stmt by a factor VF/nunits. In doing so, we record a pointer
5646 from one copy of the vector stmt to the next, in the field
5647 STMT_VINFO_RELATED_STMT. This is necessary in order to allow following
5648 stages to find the correct vector defs to be used when vectorizing
5649 stmts that use the defs of the current stmt. The example below illustrates
5650 the vectorization process when VF=16 and nunits=4 (i.e - we need to create
5651 4 vectorized stmts):
5653 before vectorization:
5654 RELATED_STMT VEC_STMT
5658 step 1: vectorize stmt S1:
5659 We first create the vector stmt VS1_0, and, as usual, record a
5660 pointer to it in the STMT_VINFO_VEC_STMT of the scalar stmt S1.
5661 Next, we create the vector stmt VS1_1, and record a pointer to
5662 it in the STMT_VINFO_RELATED_STMT of the vector stmt VS1_0.
5663 Similarly, for VS1_2 and VS1_3. This is the resulting chain of
5665 RELATED_STMT VEC_STMT
5666 VS1_0: vx0 = memref0 VS1_1 -
5667 VS1_1: vx1 = memref1 VS1_2 -
5668 VS1_2: vx2 = memref2 VS1_3 -
5669 VS1_3: vx3 = memref3 - -
5670 S1: x = load - VS1_0
5673 See the documentation of vect_get_vec_def_for_stmt_copy for how the
5674 information we recorded in RELATED_STMT field is used to vectorize
5677 /* In case of interleaving (non-unit strided access):
5684 Vectorized loads are created in the order of memory accesses
5685 starting from the access of the first stmt of the chain:
5688 VS2: vx1 = &base + vec_size*1
5689 VS3: vx3 = &base + vec_size*2
5690 VS4: vx4 = &base + vec_size*3
5692 Then permutation statements are generated:
5694 VS5: vx5 = VEC_EXTRACT_EVEN_EXPR < vx0, vx1 >
5695 VS6: vx6 = VEC_EXTRACT_ODD_EXPR < vx0, vx1 >
5698 And they are put in STMT_VINFO_VEC_STMT of the corresponding scalar stmts
5699 (the order of the data-refs in the output of vect_permute_load_chain
5700 corresponds to the order of scalar stmts in the interleaving chain - see
5701 the documentation of vect_permute_load_chain()).
5702 The generation of permutation stmts and recording them in
5703 STMT_VINFO_VEC_STMT is done in vect_transform_strided_load().
5705 In case of both multiple types and interleaving, the vector loads and
5706 permutation stmts above are created for every copy. The result vector stmts
5707 are put in STMT_VINFO_VEC_STMT for the first copy and in the corresponding
5708 STMT_VINFO_RELATED_STMT for the next copies. */
5710 /* If the data reference is aligned (dr_aligned) or potentially unaligned
5711 on a target that supports unaligned accesses (dr_unaligned_supported)
5712 we generate the following code:
5716 p = p + indx * vectype_size;
5721 Otherwise, the data reference is potentially unaligned on a target that
5722 does not support unaligned accesses (dr_explicit_realign_optimized) -
5723 then generate the following code, in which the data in each iteration is
5724 obtained by two vector loads, one from the previous iteration, and one
5725 from the current iteration:
5727 msq_init = *(floor(p1))
5728 p2 = initial_addr + VS - 1;
5729 realignment_token = call target_builtin;
5732 p2 = p2 + indx * vectype_size
5734 vec_dest = realign_load (msq, lsq, realignment_token)
5739 /* If the misalignment remains the same throughout the execution of the
5740 loop, we can create the init_addr and permutation mask at the loop
5741 preheader. Otherwise, it needs to be created inside the loop.
5742 This can only occur when vectorizing memory accesses in the inner-loop
5743 nested within an outer-loop that is being vectorized. */
5745 if (nested_in_vect_loop_p (loop, stmt)
5746 && (TREE_INT_CST_LOW (DR_STEP (dr)) % UNITS_PER_SIMD_WORD != 0))
5748 gcc_assert (alignment_support_scheme != dr_explicit_realign_optimized);
5749 compute_in_loop = true;
5752 if ((alignment_support_scheme == dr_explicit_realign_optimized
5753 || alignment_support_scheme == dr_explicit_realign)
5754 && !compute_in_loop)
5756 msq = vect_setup_realignment (first_stmt, bsi, &realignment_token,
5757 alignment_support_scheme, NULL_TREE,
5759 if (alignment_support_scheme == dr_explicit_realign_optimized)
5761 phi = SSA_NAME_DEF_STMT (msq);
5762 offset = size_int (TYPE_VECTOR_SUBPARTS (vectype) - 1);
5768 prev_stmt_info = NULL;
5769 for (j = 0; j < ncopies; j++)
5771 /* 1. Create the vector pointer update chain. */
5773 dataref_ptr = vect_create_data_ref_ptr (first_stmt,
5775 &dummy, &ptr_incr, false,
5779 bump_vector_ptr (dataref_ptr, ptr_incr, bsi, stmt, NULL_TREE);
5781 for (i = 0; i < vec_num; i++)
5784 dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, bsi, stmt,
5787 /* 2. Create the vector-load in the loop. */
5788 switch (alignment_support_scheme)
5791 gcc_assert (aligned_access_p (first_dr));
5792 data_ref = build_fold_indirect_ref (dataref_ptr);
5794 case dr_unaligned_supported:
5796 int mis = DR_MISALIGNMENT (first_dr);
5797 tree tmis = (mis == -1 ? size_zero_node : size_int (mis));
5799 tmis = size_binop (MULT_EXPR, tmis, size_int(BITS_PER_UNIT));
5801 build2 (MISALIGNED_INDIRECT_REF, vectype, dataref_ptr, tmis);
5804 case dr_explicit_realign:
5807 tree vs_minus_1 = size_int (TYPE_VECTOR_SUBPARTS (vectype) - 1);
5809 if (compute_in_loop)
5810 msq = vect_setup_realignment (first_stmt, bsi,
5812 dr_explicit_realign,
5815 data_ref = build1 (ALIGN_INDIRECT_REF, vectype, dataref_ptr);
5816 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5817 new_stmt = build_gimple_modify_stmt (vec_dest, data_ref);
5818 new_temp = make_ssa_name (vec_dest, new_stmt);
5819 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
5820 vect_finish_stmt_generation (stmt, new_stmt, bsi);
5821 copy_virtual_operands (new_stmt, stmt);
5822 mark_symbols_for_renaming (new_stmt);
5825 bump = size_binop (MULT_EXPR, vs_minus_1,
5826 TYPE_SIZE_UNIT (scalar_type));
5827 ptr = bump_vector_ptr (dataref_ptr, NULL_TREE, bsi, stmt, bump);
5828 data_ref = build1 (ALIGN_INDIRECT_REF, vectype, ptr);
5831 case dr_explicit_realign_optimized:
5832 data_ref = build1 (ALIGN_INDIRECT_REF, vectype, dataref_ptr);
5837 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5838 new_stmt = build_gimple_modify_stmt (vec_dest, data_ref);
5839 new_temp = make_ssa_name (vec_dest, new_stmt);
5840 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
5841 vect_finish_stmt_generation (stmt, new_stmt, bsi);
5842 mark_symbols_for_renaming (new_stmt);
5844 /* 3. Handle explicit realignment if necessary/supported. Create in
5845 loop: vec_dest = realign_load (msq, lsq, realignment_token) */
5846 if (alignment_support_scheme == dr_explicit_realign_optimized
5847 || alignment_support_scheme == dr_explicit_realign)
5849 lsq = GIMPLE_STMT_OPERAND (new_stmt, 0);
5850 if (!realignment_token)
5851 realignment_token = dataref_ptr;
5852 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5853 new_stmt = build3 (REALIGN_LOAD_EXPR, vectype, msq, lsq,
5855 new_stmt = build_gimple_modify_stmt (vec_dest, new_stmt);
5856 new_temp = make_ssa_name (vec_dest, new_stmt);
5857 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
5858 vect_finish_stmt_generation (stmt, new_stmt, bsi);
5860 if (alignment_support_scheme == dr_explicit_realign_optimized)
/* PHI (the def stmt of the MSQ set up before the loop) receives its
   latch-edge argument only from the last load generated, i.e. the last
   vector of the last copy. */
5862 if (i == vec_num - 1 && j == ncopies - 1)
5863 add_phi_arg (phi, lsq, loop_latch_edge (containing_loop));
5868 /* 4. Handle invariant-load. */
5871 gcc_assert (!strided_load);
5872 gcc_assert (nested_in_vect_loop_p (loop, stmt));
5877 tree vec_inv, bitpos, bitsize = TYPE_SIZE (scalar_type);
5879 /* CHECKME: bitpos depends on endianness? */
5880 bitpos = bitsize_zero_node;
5881 vec_inv = build3 (BIT_FIELD_REF, scalar_type, new_temp,
5883 BIT_FIELD_REF_UNSIGNED (vec_inv) =
5884 TYPE_UNSIGNED (scalar_type);
5886 vect_create_destination_var (scalar_dest, NULL_TREE);
5887 new_stmt = build_gimple_modify_stmt (vec_dest, vec_inv);
5888 new_temp = make_ssa_name (vec_dest, new_stmt);
5889 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
5890 vect_finish_stmt_generation (stmt, new_stmt, bsi);
/* Chain NUNITS references to the extracted scalar into a list, from which a
   vector constructor is built below. */
5892 for (k = nunits - 1; k >= 0; --k)
5893 t = tree_cons (NULL_TREE, new_temp, t);
5894 /* FIXME: use build_constructor directly. */
5895 vec_inv = build_constructor_from_list (vectype, t);
5896 new_temp = vect_init_vector (stmt, vec_inv, vectype, bsi);
5897 new_stmt = SSA_NAME_DEF_STMT (new_temp);
5900 gcc_unreachable (); /* FORNOW. */
5903 /* Collect vector loads and later create their permutation in
5904 vect_transform_strided_load (). */
5906 VEC_quick_push (tree, dr_chain, new_temp);
5908 /* Store vector loads in the corresponding SLP_NODE. */
5910 VEC_quick_push (tree, SLP_TREE_VEC_STMTS (slp_node), new_stmt);
5913 /* FORNOW: SLP with multiple types is unsupported. */
5919 if (!vect_transform_strided_load (stmt, dr_chain, group_size, bsi))
5921 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
/* A fresh DR_CHAIN is allocated to collect the loads of the next copy.
   NOTE(review): no release of the previous vector is visible in the lines
   shown here -- confirm whether it is freed elsewhere. */
5922 dr_chain = VEC_alloc (tree, heap, group_size);
5927 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
5929 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
5930 prev_stmt_info = vinfo_for_stmt (new_stmt);
5938 /* Function vectorizable_live_operation.
5940 STMT computes a value that is used outside the loop. Check if
5941 it can be supported. */
/* BSI and VEC_STMT are marked unused: as noted at the end of the function,
   the supported cases need no transformation, so this routine only checks. */
5944 vectorizable_live_operation (tree stmt,
5945 block_stmt_iterator *bsi ATTRIBUTE_UNUSED,
5946 tree *vec_stmt ATTRIBUTE_UNUSED)
5949 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5950 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5951 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5956 enum vect_def_type dt;
/* Callers must only pass stmts whose value is live after the loop. */
5958 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
5960 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
/* Only a modify stmt whose left-hand side is an SSA name is considered. */
5963 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
5966 if (TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 0)) != SSA_NAME)
5969 /* FORNOW. CHECKME. */
5970 if (nested_in_vect_loop_p (loop, stmt))
5973 operation = GIMPLE_STMT_OPERAND (stmt, 1);
5974 op_type = TREE_OPERAND_LENGTH (operation);
5976 /* FORNOW: support only if all uses are invariant. This means
5977 that the scalar operations can remain in place, unvectorized.
5978 The original last scalar value that they compute will be used. */
5980 for (i = 0; i < op_type; i++)
5982 op = TREE_OPERAND (operation, i);
5983 if (op && !vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt))
5985 if (vect_print_dump_info (REPORT_DETAILS))
5986 fprintf (vect_dump, "use not simple.");
/* DT was set by vect_is_simple_use above; an operand that is neither
   loop-invariant nor constant falls outside the FORNOW restriction. */
5990 if (dt != vect_invariant_def && dt != vect_constant_def)
5994 /* No transformation is required for the cases we currently support. */
5999 /* Function vect_is_simple_cond.
6002 LOOP - the loop that is being vectorized.
6003 COND - Condition that is checked for simple use.
6005 Returns whether a COND can be vectorized. Checks whether
6006 condition operands are supportable using vect_is_simple_use. */
/* The checks visible here: COND must be a comparison, and each of its two
   operands must be either an SSA name accepted by vect_is_simple_use or an
   integer, real or fixed-point constant. */
6009 vect_is_simple_cond (tree cond, loop_vec_info loop_vinfo)
6013 enum vect_def_type dt;
6015 if (!COMPARISON_CLASS_P (cond))
6018 lhs = TREE_OPERAND (cond, 0);
6019 rhs = TREE_OPERAND (cond, 1);
/* Left operand: SSA name with a simple use, or a constant. */
6021 if (TREE_CODE (lhs) == SSA_NAME)
6023 tree lhs_def_stmt = SSA_NAME_DEF_STMT (lhs);
6024 if (!vect_is_simple_use (lhs, loop_vinfo, &lhs_def_stmt, &def, &dt))
6027 else if (TREE_CODE (lhs) != INTEGER_CST && TREE_CODE (lhs) != REAL_CST
6028 && TREE_CODE (lhs) != FIXED_CST)
/* Right operand: same test as for the left one. */
6031 if (TREE_CODE (rhs) == SSA_NAME)
6033 tree rhs_def_stmt = SSA_NAME_DEF_STMT (rhs);
6034 if (!vect_is_simple_use (rhs, loop_vinfo, &rhs_def_stmt, &def, &dt))
6037 else if (TREE_CODE (rhs) != INTEGER_CST && TREE_CODE (rhs) != REAL_CST
6038 && TREE_CODE (rhs) != FIXED_CST)
6044 /* vectorizable_condition.
6046 Check if STMT is a conditional modify expression that can be vectorized.
6047 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
6048 stmt using VEC_COND_EXPR to replace it, put it in VEC_STMT, and insert it at BSI.
6051 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
/* The operand checks come first; the statements from `Handle cond expr'
   onwards build the VEC_COND_EXPR replacement, store it in *VEC_STMT and
   insert it at BSI. */
6054 vectorizable_condition (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
6056 tree scalar_dest = NULL_TREE;
6057 tree vec_dest = NULL_TREE;
6058 tree op = NULL_TREE;
6059 tree cond_expr, then_clause, else_clause;
6060 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6061 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6062 tree vec_cond_lhs, vec_cond_rhs, vec_then_clause, vec_else_clause;
6063 tree vec_compare, vec_cond_expr;
6065 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6066 enum machine_mode vec_mode;
6068 enum vect_def_type dt;
6069 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
6070 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
6072 gcc_assert (ncopies >= 1);
/* NOTE(review): the condition guarding this early return is not among the
   lines shown; presumably it rejects NCOPIES > 1 -- confirm. */
6074 return false; /* FORNOW */
6076 if (!STMT_VINFO_RELEVANT_P (stmt_info))
6079 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
6082 /* FORNOW: SLP not supported. */
6083 if (STMT_SLP_TYPE (stmt_info))
6086 /* FORNOW: not yet supported. */
6087 if (STMT_VINFO_LIVE_P (stmt_info))
6089 if (vect_print_dump_info (REPORT_DETAILS))
6090 fprintf (vect_dump, "value used after loop.");
6094 /* Is vectorizable conditional operation? */
6095 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
6098 op = GIMPLE_STMT_OPERAND (stmt, 1);
6100 if (TREE_CODE (op) != COND_EXPR)
/* Operand 0 of the COND_EXPR is the comparison; operands 1 and 2 are the
   values selected by it. */
6103 cond_expr = TREE_OPERAND (op, 0);
6104 then_clause = TREE_OPERAND (op, 1);
6105 else_clause = TREE_OPERAND (op, 2);
6107 if (!vect_is_simple_cond (cond_expr, loop_vinfo))
6110 /* We do not handle two different vector types for the condition
6112 if (TREE_TYPE (TREE_OPERAND (cond_expr, 0)) != TREE_TYPE (vectype))
6115 if (TREE_CODE (then_clause) == SSA_NAME)
6117 tree then_def_stmt = SSA_NAME_DEF_STMT (then_clause);
6118 if (!vect_is_simple_use (then_clause, loop_vinfo,
6119 &then_def_stmt, &def, &dt))
6122 else if (TREE_CODE (then_clause) != INTEGER_CST
6123 && TREE_CODE (then_clause) != REAL_CST
6124 && TREE_CODE (then_clause) != FIXED_CST)
6127 if (TREE_CODE (else_clause) == SSA_NAME)
6129 tree else_def_stmt = SSA_NAME_DEF_STMT (else_clause);
6130 if (!vect_is_simple_use (else_clause, loop_vinfo,
6131 &else_def_stmt, &def, &dt))
6134 else if (TREE_CODE (else_clause) != INTEGER_CST
6135 && TREE_CODE (else_clause) != REAL_CST
6136 && TREE_CODE (else_clause) != FIXED_CST)
6140 vec_mode = TYPE_MODE (vectype);
6144 STMT_VINFO_TYPE (stmt_info) = condition_vec_info_type;
6145 return expand_vec_cond_expr_p (op, vec_mode);
6151 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
6152 vec_dest = vect_create_destination_var (scalar_dest, vectype);
6154 /* Handle cond expr. */
6156 vect_get_vec_def_for_operand (TREE_OPERAND (cond_expr, 0), stmt, NULL);
6158 vect_get_vec_def_for_operand (TREE_OPERAND (cond_expr, 1), stmt, NULL);
6159 vec_then_clause = vect_get_vec_def_for_operand (then_clause, stmt, NULL);
6160 vec_else_clause = vect_get_vec_def_for_operand (else_clause, stmt, NULL);
6162 /* Arguments are ready. create the new vector stmt. */
6163 vec_compare = build2 (TREE_CODE (cond_expr), vectype,
6164 vec_cond_lhs, vec_cond_rhs);
6165 vec_cond_expr = build3 (VEC_COND_EXPR, vectype,
6166 vec_compare, vec_then_clause, vec_else_clause);
/* The new statement is handed back through VEC_STMT, given an SSA name as
   its left-hand side, and inserted at BSI. */
6168 *vec_stmt = build_gimple_modify_stmt (vec_dest, vec_cond_expr);
6169 new_temp = make_ssa_name (vec_dest, *vec_stmt);
6170 GIMPLE_STMT_OPERAND (*vec_stmt, 0) = new_temp;
6171 vect_finish_stmt_generation (stmt, *vec_stmt, bsi);
6177 /* Function vect_transform_stmt.
6179 Create a vectorized stmt to replace STMT, and insert it at BSI. */
/* Dispatch on the kind recorded in STMT_VINFO_TYPE to the matching
   vectorizable_* routine, then record the resulting vector stmt in the
   stmt_info of STMT (and of the original pattern stmt, if any). */
6182 vect_transform_stmt (tree stmt, block_stmt_iterator *bsi, bool *strided_store,
6185 bool is_store = false;
6186 tree vec_stmt = NULL_TREE;
6187 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6188 tree orig_stmt_in_pattern;
/* The cases that assert !slp_node call routines that take no SLP node
   argument; the others forward SLP_NODE. */
6191 switch (STMT_VINFO_TYPE (stmt_info))
6193 case type_demotion_vec_info_type:
6194 gcc_assert (!slp_node);
6195 done = vectorizable_type_demotion (stmt, bsi, &vec_stmt);
6199 case type_promotion_vec_info_type:
6200 gcc_assert (!slp_node);
6201 done = vectorizable_type_promotion (stmt, bsi, &vec_stmt);
6205 case type_conversion_vec_info_type:
6206 done = vectorizable_conversion (stmt, bsi, &vec_stmt, slp_node);
6210 case induc_vec_info_type:
6211 gcc_assert (!slp_node);
6212 done = vectorizable_induction (stmt, bsi, &vec_stmt);
6216 case op_vec_info_type:
6217 done = vectorizable_operation (stmt, bsi, &vec_stmt, slp_node);
6221 case assignment_vec_info_type:
6222 done = vectorizable_assignment (stmt, bsi, &vec_stmt, slp_node);
6226 case load_vec_info_type:
6227 done = vectorizable_load (stmt, bsi, &vec_stmt, slp_node);
6231 case store_vec_info_type:
6232 done = vectorizable_store (stmt, bsi, &vec_stmt, slp_node);
6234 if (STMT_VINFO_STRIDED_ACCESS (stmt_info))
6236 /* In case of interleaving, the whole chain is vectorized when the
6237 last store in the chain is reached. Store stmts before the last
6238 one are skipped, and their vec_stmt_info shouldn't be freed
6240 *strided_store = true;
6241 if (STMT_VINFO_VEC_STMT (stmt_info))
6248 case condition_vec_info_type:
6249 gcc_assert (!slp_node);
6250 done = vectorizable_condition (stmt, bsi, &vec_stmt);
6254 case call_vec_info_type:
6255 gcc_assert (!slp_node);
6256 done = vectorizable_call (stmt, bsi, &vec_stmt);
6259 case reduc_vec_info_type:
6260 gcc_assert (!slp_node);
6261 done = vectorizable_reduction (stmt, bsi, &vec_stmt);
6266 if (!STMT_VINFO_LIVE_P (stmt_info))
6268 if (vect_print_dump_info (REPORT_DETAILS))
6269 fprintf (vect_dump, "stmt not supported.");
6274 if (STMT_VINFO_LIVE_P (stmt_info)
6275 && STMT_VINFO_TYPE (stmt_info) != reduc_vec_info_type)
6277 done = vectorizable_live_operation (stmt, bsi, &vec_stmt);
6283 STMT_VINFO_VEC_STMT (stmt_info) = vec_stmt;
6284 orig_stmt_in_pattern = STMT_VINFO_RELATED_STMT (stmt_info);
6285 if (orig_stmt_in_pattern)
6287 stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt_in_pattern);
6288 /* STMT was inserted by the vectorizer to replace a computation idiom.
6289 ORIG_STMT_IN_PATTERN is a stmt in the original sequence that
6290 computed this idiom. We need to record a pointer to VEC_STMT in
6291 the stmt_info of ORIG_STMT_IN_PATTERN. See more details in the
6292 documentation of vect_pattern_recog. */
6293 if (STMT_VINFO_IN_PATTERN_P (stmt_vinfo))
6295 gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
6296 STMT_VINFO_VEC_STMT (stmt_vinfo) = vec_stmt;
6305 /* This function builds ni_name = number of iterations loop executes
6306 on the loop preheader. */
/* Gimplify the iteration-count expression recorded in LOOP_VINFO and emit
   any statements this needs on the preheader edge of the loop. */
6309 vect_build_loop_niters (loop_vec_info loop_vinfo)
6311 tree ni_name, stmt, var;
6313 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
/* Work on an unshared copy of the expression; presumably so that the tree
   stored in LOOP_VINFO is not modified by gimplification -- confirm. */
6314 tree ni = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo));
6316 var = create_tmp_var (TREE_TYPE (ni), "niters");
6317 add_referenced_var (var);
/* NI_NAME is the gimple operand for NI; statements needed to compute it, if
   any, are returned in STMT. */
6318 ni_name = force_gimple_operand (ni, &stmt, false, var);
6320 pe = loop_preheader_edge (loop);
/* Inserting on the preheader edge is asserted not to create a new block. */
6323 basic_block new_bb = bsi_insert_on_edge_immediate (pe, stmt);
6324 gcc_assert (!new_bb);
6331 /* This function generates the following statements:
6333 ni_name = number of iterations loop executes
6334 ratio = ni_name / vf
6335 ratio_mult_vf_name = ratio * vf
6337 and places them at the loop preheader edge. */
/* Compute the iteration count, the count divided by VF, and that quotient
   multiplied back by VF, and return them through the three pointer
   arguments. */
6340 vect_generate_tmps_on_preheader (loop_vec_info loop_vinfo,
6342 tree *ratio_mult_vf_name_ptr,
6343 tree *ratio_name_ptr)
6351 tree ratio_mult_vf_name;
6352 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6353 tree ni = LOOP_VINFO_NITERS (loop_vinfo);
6354 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
6357 pe = loop_preheader_edge (loop);
6359 /* Generate temporary variable that contains
6360 number of iterations loop executes. */
6362 ni_name = vect_build_loop_niters (loop_vinfo);
/* Both the division and the multiplication by VF below are done as shifts
   by log2 (VF). NOTE(review): this presumes VF is a power of two --
   confirm. */
6363 log_vf = build_int_cst (TREE_TYPE (ni), exact_log2 (vf));
6365 /* Create: ratio = ni >> log2(vf) */
6367 ratio_name = fold_build2 (RSHIFT_EXPR, TREE_TYPE (ni_name), ni_name, log_vf);
/* A temporary on the preheader edge is needed only when folding did not
   already produce a gimple value. */
6368 if (!is_gimple_val (ratio_name))
6370 var = create_tmp_var (TREE_TYPE (ni), "bnd");
6371 add_referenced_var (var);
6373 ratio_name = force_gimple_operand (ratio_name, &stmt, true, var);
6374 pe = loop_preheader_edge (loop);
6375 new_bb = bsi_insert_on_edge_immediate (pe, stmt);
6376 gcc_assert (!new_bb);
6379 /* Create: ratio_mult_vf = ratio << log2 (vf). */
6381 ratio_mult_vf_name = fold_build2 (LSHIFT_EXPR, TREE_TYPE (ratio_name),
6382 ratio_name, log_vf);
6383 if (!is_gimple_val (ratio_mult_vf_name))
6385 var = create_tmp_var (TREE_TYPE (ni), "ratio_mult_vf");
6386 add_referenced_var (var);
6388 ratio_mult_vf_name = force_gimple_operand (ratio_mult_vf_name, &stmt,
6390 pe = loop_preheader_edge (loop);
6391 new_bb = bsi_insert_on_edge_immediate (pe, stmt);
6392 gcc_assert (!new_bb);
/* Hand the three values back through the out-parameters. */
6395 *ni_name_ptr = ni_name;
6396 *ratio_mult_vf_name_ptr = ratio_mult_vf_name;
6397 *ratio_name_ptr = ratio_name;
6403 /* Function vect_update_ivs_after_vectorizer.
6405 "Advance" the induction variables of LOOP to the value they should take
6406 after the execution of LOOP. This is currently necessary because the
6407 vectorizer does not handle induction variables that are used after the
6408 loop. Such a situation occurs when the last iterations of LOOP are peeled, because:
6410 1. We introduced new uses after LOOP for IVs that were not originally used
6411 after LOOP: the IVs of LOOP are now used by an epilog loop.
6412 2. LOOP is going to be vectorized; this means that it will iterate N/VF
6413 times, whereas the loop IVs should be bumped N times.
6416 - LOOP - a loop that is going to be vectorized. The last few iterations
6417 of LOOP were peeled.
6418 - NITERS - the number of iterations that LOOP executes (before it is
6419 vectorized). i.e, the number of times the ivs should be bumped.
6420 - UPDATE_E - a successor edge of LOOP->exit that is on the (only) path
6421 coming out from LOOP on which there are uses of the LOOP ivs
6422 (this is the path from LOOP->exit to epilog_loop->preheader).
6424 The new definitions of the ivs are placed in LOOP->exit.
6425 The phi args associated with the edge UPDATE_E in the bb
6426 UPDATE_E->dest are updated accordingly.
6428 Assumption 1: Like the rest of the vectorizer, this function assumes
6429 a single loop exit that has a single predecessor.
6431 Assumption 2: The phi nodes in the LOOP header and in update_bb are
6432 organized in the same order.
6434 Assumption 3: The access function of the ivs is simple enough (see
6435 vect_can_advance_ivs_p). This assumption will be relaxed in the future.
6437 Assumption 4: Exactly one of the successors of LOOP exit-bb is on a path
6438 coming out of LOOP on which the ivs of LOOP are used (this is the path
6439 that leads to the epilog loop; other paths skip the epilog loop). This
6440 path starts with the edge UPDATE_E, and its destination (denoted update_bb)
6441 needs to have its phis updated.
/* Walks the phi nodes of LOOP's header and of UPDATE_E->dest in lockstep.
   For every phi that is neither virtual nor a reduction, it builds
   init + NITERS * step from the phi's scalar evolution, gimplifies that
   expression in the loop's exit block, and stores the resulting name as
   the phi argument for UPDATE_E in the matching phi of UPDATE_E->dest.
   NOTE(review): this listing does not show the return type, the third
   parameter (used below as update_e), the declarations of phi, phi1,
   step_expr and init_expr, the left-hand side of the unshare_expr call,
   or any braces -- confirm against the full file before editing.  */
6445 vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo, tree niters,
6448 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6449 basic_block exit_bb = single_exit (loop)->dest;
6451 basic_block update_bb = update_e->dest;
6453 /* gcc_assert (vect_can_advance_ivs_p (loop_vinfo)); */
6455 /* Make sure there exists a single-predecessor exit bb: */
6456 gcc_assert (single_pred_p (exit_bb));
/* Both phi chains are advanced together, one phi of each per iteration.  */
6458 for (phi = phi_nodes (loop->header), phi1 = phi_nodes (update_bb);
6460 phi = PHI_CHAIN (phi), phi1 = PHI_CHAIN (phi1))
6462 tree access_fn = NULL;
6463 tree evolution_part;
6466 tree var, ni, ni_name;
6467 block_stmt_iterator last_bsi;
6469 if (vect_print_dump_info (REPORT_DETAILS))
6471 fprintf (vect_dump, "vect_update_ivs_after_vectorizer: phi: ");
6472 print_generic_expr (vect_dump, phi, TDF_SLIM);
6475 /* Skip virtual phi's. */
6476 if (!is_gimple_reg (SSA_NAME_VAR (PHI_RESULT (phi))))
6478 if (vect_print_dump_info (REPORT_DETAILS))
6479 fprintf (vect_dump, "virtual phi. skip.");
6483 /* Skip reduction phis. */
6484 if (STMT_VINFO_DEF_TYPE (vinfo_for_stmt (phi)) == vect_reduction_def)
6486 if (vect_print_dump_info (REPORT_DETAILS))
6487 fprintf (vect_dump, "reduc phi. skip.");
6491 access_fn = analyze_scalar_evolution (loop, PHI_RESULT (phi));
6492 gcc_assert (access_fn);
6494 unshare_expr (evolution_part_in_loop_num (access_fn, loop->num));
6495 gcc_assert (evolution_part != NULL_TREE);
6497 /* FORNOW: We do not support IVs whose evolution function is a polynomial
6498 of degree >= 2 or exponential. */
6499 gcc_assert (!tree_is_chrec (evolution_part));
6501 step_expr = evolution_part;
6502 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn,
/* Pointer-typed IVs are advanced with POINTER_PLUS_EXPR and an offset
   converted to sizetype; the PLUS_EXPR form further down works in the
   type of init_expr.  */
6505 if (POINTER_TYPE_P (TREE_TYPE (init_expr)))
6506 ni = fold_build2 (POINTER_PLUS_EXPR, TREE_TYPE (init_expr),
6508 fold_convert (sizetype,
6509 fold_build2 (MULT_EXPR, TREE_TYPE (niters),
6510 niters, step_expr)));
6512 ni = fold_build2 (PLUS_EXPR, TREE_TYPE (init_expr),
6513 fold_build2 (MULT_EXPR, TREE_TYPE (init_expr),
6514 fold_convert (TREE_TYPE (init_expr),
6521 var = create_tmp_var (TREE_TYPE (init_expr), "tmp");
6522 add_referenced_var (var);
/* The new value is gimplified through an iterator obtained from
   bsi_last (exit_bb).  */
6524 last_bsi = bsi_last (exit_bb);
6525 ni_name = force_gimple_operand_bsi (&last_bsi, ni, false, var,
6526 true, BSI_SAME_STMT);
6528 /* Fix phi expressions in the successor bb. */
6529 SET_PHI_ARG_DEF (phi1, update_e->dest_idx, ni_name);
6533 /* Return the more conservative threshold between the
6534 min_profitable_iters returned by the cost model and the user
6535 specified threshold, if provided. */
/* Computes min_scalar_loop_bound as
   PARAM_MIN_VECT_LOOP_BOUND * LOOP_VINFO_VECT_FACTOR - 1 and starts from
   that value as the threshold.  The threshold is replaced by
   MIN_PROFITABLE_ITERS when MIN_PROFITABLE_ITERS is nonzero and either
   the bound is zero or MIN_PROFITABLE_ITERS is larger than the bound.
   NOTE(review): the return type, the declaration of th, the return
   statement, braces, the end of the "Use the cost model" comment and the
   continuation of the first fprintf call are not visible in this
   listing -- confirm against the full file before editing.  */
6538 conservative_cost_threshold (loop_vec_info loop_vinfo,
6539 int min_profitable_iters)
6542 int min_scalar_loop_bound;
6544 min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
6545 * LOOP_VINFO_VECT_FACTOR (loop_vinfo)) - 1);
6547 /* Use the cost model only if it is more conservative than user specified
6549 th = (unsigned) min_scalar_loop_bound;
6550 if (min_profitable_iters
6551 && (!min_scalar_loop_bound
6552 || min_profitable_iters > min_scalar_loop_bound))
6553 th = (unsigned) min_profitable_iters;
6555 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOOPS))
6556 fprintf (vect_dump, "not vectorized: vectorization may not be "
6559 if (th && vect_print_dump_info (REPORT_DETAILS))
6560 fprintf (vect_dump, "Vectorization may not be profitable.");
6565 /* Function vect_do_peeling_for_loop_bound
6567 Peel the last iterations of the loop represented by LOOP_VINFO.
6568 The peeled iterations form a new epilog loop. Given that the loop now
6569 iterates NITERS times, the new epilog loop iterates
6570 NITERS % VECTORIZATION_FACTOR times.
6572 The original loop will later be made to iterate
6573 NITERS / VECTORIZATION_FACTOR times (this value is placed into RATIO). */
/* Generates ni_name, ratio_mult_vf_name and *RATIO on the preheader, peels
   LOOP at its single exit edge with slpeel_tree_peel_loop_to_edge, finds
   the edge into the new loop's preheader that originates in LOOP's exit
   block, and advances LOOP's IVs along that edge with
   vect_update_ivs_after_vectorizer.  A profitability threshold is
   computed and check_profitability is set only when
   LOOP_VINFO_MAY_MISALIGN_STMTS and LOOP_VINFO_MAY_ALIAS_DDRS are empty
   and LOOP_PEELING_FOR_ALIGNMENT is zero.
   NOTE(review): the return type, braces, #endif, the else keyword, the
   call that follows the "reset scalar evolution" comment, and the
   declarations of update_e and loop_num are not visible in this listing
   -- confirm against the full file before editing.  */
6576 vect_do_peeling_for_loop_bound (loop_vec_info loop_vinfo, tree *ratio)
6578 tree ni_name, ratio_mult_vf_name;
6579 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6580 struct loop *new_loop;
6582 basic_block preheader;
6584 bool check_profitability = false;
6585 unsigned int th = 0;
6586 int min_profitable_iters;
6588 if (vect_print_dump_info (REPORT_DETAILS))
6589 fprintf (vect_dump, "=== vect_do_peeling_for_loop_bound ===");
6591 initialize_original_copy_tables ();
6593 /* Generate the following variables on the preheader of original loop:
6595 ni_name = number of iteration the original loop executes
6596 ratio = ni_name / vf
6597 ratio_mult_vf_name = ratio * vf */
6598 vect_generate_tmps_on_preheader (loop_vinfo, &ni_name,
6599 &ratio_mult_vf_name, ratio);
/* Remembered so the assertion after peeling can check that loop->num is
   unchanged.  */
6601 loop_num = loop->num;
6603 /* If cost model check not done during versioning and
6604 peeling for alignment. */
6605 if (!VEC_length (tree, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo))
6606 && !VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo))
6607 && !LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
6609 check_profitability = true;
6611 /* Get profitability threshold for vectorized loop. */
6612 min_profitable_iters = LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo);
6614 th = conservative_cost_threshold (loop_vinfo,
6615 min_profitable_iters);
6618 new_loop = slpeel_tree_peel_loop_to_edge (loop, single_exit (loop),
6619 ratio_mult_vf_name, ni_name, false,
6620 th, check_profitability);
6621 gcc_assert (new_loop);
6622 gcc_assert (loop_num == loop->num);
6623 #ifdef ENABLE_CHECKING
6624 slpeel_verify_cfg_after_peeling (loop, new_loop);
6627 /* A guard that controls whether the new_loop is to be executed or skipped
6628 is placed in LOOP->exit. LOOP->exit therefore has two successors - one
6629 is the preheader of NEW_LOOP, where the IVs from LOOP are used. The other
6630 is a bb after NEW_LOOP, where these IVs are not used. Find the edge that
6631 is on the path where the LOOP IVs are used and need to be updated. */
6633 preheader = loop_preheader_edge (new_loop)->src;
/* Predecessor edge 0 is chosen when it comes from LOOP's exit block; the
   assignment of edge 1 below is presumably the else arm -- confirm.  */
6634 if (EDGE_PRED (preheader, 0)->src == single_exit (loop)->dest)
6635 update_e = EDGE_PRED (preheader, 0);
6637 update_e = EDGE_PRED (preheader, 1);
6639 /* Update IVs of original loop as if they were advanced
6640 by ratio_mult_vf_name steps. */
6641 vect_update_ivs_after_vectorizer (loop_vinfo, ratio_mult_vf_name, update_e);
6643 /* After peeling we have to reset scalar evolution analyzer. */
6646 free_original_copy_tables ();
6650 /* Function vect_gen_niters_for_prolog_loop
6652 Set the number of iterations for the loop represented by LOOP_VINFO
6653 to the minimum between LOOP_NITERS (the original iteration count of the loop)
6654 and the misalignment of DR - the data reference recorded in
6655 LOOP_VINFO_UNALIGNED_DR (LOOP_VINFO). As a result, after the execution of
6656 this loop, the data reference DR will refer to an aligned location.
6658 The following computation is generated:
6660 If the misalignment of DR is known at compile time:
6661 addr_mis = int mis = DR_MISALIGNMENT (dr);
6662 Else, compute address misalignment in bytes:
6663 addr_mis = addr & (vectype_size - 1)
6665 prolog_niters = min ( LOOP_NITERS , (VF - addr_mis/elem_size)&(VF-1) )
6667 (elem_size = element type size; an element is the scalar element
6668 whose type is the inner type of the vectype)
6672 prolog_niters = min ( LOOP_NITERS ,
6673 (VF/group_size - addr_mis/elem_size)&(VF/group_size-1) )
6674 where group_size is the size of the interleaved group.
6676 The above formulas assume that VF == number of elements in the vector. This
6677 may not hold when there are multiple-types in the loop.
6678 In this case, for some data-references in the loop the VF does not represent
6679 the number of elements that fit in the vector. Therefore, instead of VF we
6680 use TYPE_VECTOR_SUBPARTS. */
/* Builds the prolog iteration count as a tree.  When
   LOOP_PEELING_FOR_ALIGNMENT is positive the count is an integer constant
   derived from that known byte misalignment.  The other visible path
   computes the count from the start address of the data reference and
   inserts the address computation on the loop preheader edge.  When
   LOOP_NITERS is not an INTEGER_CST the result is clamped with MIN_EXPR.
   The expression is then gimplified into a "prolog_loop_niters"
   temporary.
   NOTE(review): the return type and return statement, braces, the else
   keyword, the conditions around the two preheader insertions, and the
   declarations of var, stmt, pe, new_bb, group_size and the tree-typed
   byte_misalign / elem_misalign are not visible in this listing --
   confirm against the full file before editing.  */
6683 vect_gen_niters_for_prolog_loop (loop_vec_info loop_vinfo, tree loop_niters)
6685 struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
6686 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6688 tree iters, iters_name;
6691 tree dr_stmt = DR_STMT (dr);
6692 stmt_vec_info stmt_info = vinfo_for_stmt (dr_stmt);
6693 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6694 int vectype_align = TYPE_ALIGN (vectype) / BITS_PER_UNIT;
6695 tree niters_type = TREE_TYPE (loop_niters);
6697 int element_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
6698 int nelements = TYPE_VECTOR_SUBPARTS (vectype);
6700 if (STMT_VINFO_STRIDED_ACCESS (stmt_info))
6702 /* For interleaved access element size must be multiplied by the size of
6703 the interleaved group. */
6704 group_size = DR_GROUP_SIZE (vinfo_for_stmt (
6705 DR_GROUP_FIRST_DR (stmt_info)));
6706 element_size *= group_size;
6709 pe = loop_preheader_edge (loop);
6711 if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
6713 int byte_misalign = LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo);
6714 int elem_misalign = byte_misalign / element_size;
6716 if (vect_print_dump_info (REPORT_DETAILS))
6717 fprintf (vect_dump, "known alignment = %d.", byte_misalign);
/* NOTE(review): the subtraction uses nelements while the mask uses
   nelements/group_size-1; the function's header comment divides by
   group_size in both places -- confirm this asymmetry is intended.  */
6718 iters = build_int_cst (niters_type,
6719 (nelements - elem_misalign)&(nelements/group_size-1));
6723 tree new_stmts = NULL_TREE;
6724 tree start_addr = vect_create_addr_base_for_vector_ref (dr_stmt,
6725 &new_stmts, NULL_TREE, loop);
6726 tree ptr_type = TREE_TYPE (start_addr);
6727 tree size = TYPE_SIZE (ptr_type);
6728 tree type = lang_hooks.types.type_for_size (tree_low_cst (size, 1), 1);
6729 tree vectype_size_minus_1 = build_int_cst (type, vectype_align - 1);
/* vectype_align / nelements is used as the element size in bytes; its
   exact_log2 lets the division below be a right shift.  */
6730 tree elem_size_log =
6731 build_int_cst (type, exact_log2 (vectype_align/nelements));
6732 tree nelements_minus_1 = build_int_cst (type, nelements - 1);
6733 tree nelements_tree = build_int_cst (type, nelements);
6737 new_bb = bsi_insert_on_edge_immediate (pe, new_stmts);
6738 gcc_assert (!new_bb);
6740 /* Create: byte_misalign = addr & (vectype_size - 1) */
6742 fold_build2 (BIT_AND_EXPR, type, fold_convert (type, start_addr), vectype_size_minus_1);
6744 /* Create: elem_misalign = byte_misalign / element_size */
6746 fold_build2 (RSHIFT_EXPR, type, byte_misalign, elem_size_log);
6748 /* Create: (niters_type) (nelements - elem_misalign)&(nelements - 1) */
6749 iters = fold_build2 (MINUS_EXPR, type, nelements_tree, elem_misalign);
6750 iters = fold_build2 (BIT_AND_EXPR, type, iters, nelements_minus_1);
6751 iters = fold_convert (niters_type, iters);
6754 /* Create: prolog_loop_niters = min (iters, loop_niters) */
6755 /* If the loop bound is known at compile time we already verified that it is
6756 greater than vf; since the misalignment ('iters') is at most vf, there's
6757 no need to generate the MIN_EXPR in this case. */
6758 if (TREE_CODE (loop_niters) != INTEGER_CST)
6759 iters = fold_build2 (MIN_EXPR, niters_type, iters, loop_niters);
6761 if (vect_print_dump_info (REPORT_DETAILS))
6763 fprintf (vect_dump, "niters for prolog loop: ");
6764 print_generic_expr (vect_dump, iters, TDF_SLIM);
6767 var = create_tmp_var (niters_type, "prolog_loop_niters");
6768 add_referenced_var (var);
6769 iters_name = force_gimple_operand (iters, &stmt, false, var);
6771 /* Insert stmt on loop preheader edge. */
6774 basic_block new_bb = bsi_insert_on_edge_immediate (pe, stmt);
6775 gcc_assert (!new_bb);
6782 /* Function vect_update_init_of_dr
6784 NITERS iterations were peeled from LOOP. DR represents a data reference
6785 in LOOP. This function updates the information recorded in DR to
6786 account for the fact that the first NITERS iterations had already been
6787 executed. Specifically, it updates the OFFSET field of DR. */
/* Adds NITERS * DR_STEP (DR) to DR_OFFSET (DR) and stores the sum back
   into DR_OFFSET (DR).
   NOTE(review): the product is built in TREE_TYPE (niters) and the sum in
   TREE_TYPE (offset) with no explicit conversion between the operands --
   confirm the operand types agree.  The return type and braces are not
   visible in this listing.  */
6790 vect_update_init_of_dr (struct data_reference *dr, tree niters)
6792 tree offset = DR_OFFSET (dr);
6794 niters = fold_build2 (MULT_EXPR, TREE_TYPE (niters), niters, DR_STEP (dr));
6795 offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset), offset, niters);
6796 DR_OFFSET (dr) = offset;
6800 /* Function vect_update_inits_of_drs
6802 NITERS iterations were peeled from the loop represented by LOOP_VINFO.
6803 This function updates the information recorded for the data references in
6804 the loop to account for the fact that the first NITERS iterations had
6805 already been executed. Specifically, it updates the OFFSET field of every
6806 data reference in the loop (via vect_update_init_of_dr). */
/* Calls vect_update_init_of_dr with NITERS for every entry of
   LOOP_VINFO_DATAREFS (LOOP_VINFO).
   NOTE(review): the declaration of i, the return type and braces are not
   visible in this listing -- confirm against the full file.  */
6809 vect_update_inits_of_drs (loop_vec_info loop_vinfo, tree niters)
6812 VEC (data_reference_p, heap) *datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
6813 struct data_reference *dr;
6815 if (vect_print_dump_info (REPORT_DETAILS))
6816 fprintf (vect_dump, "=== vect_update_inits_of_dr ===");
6818 for (i = 0; VEC_iterate (data_reference_p, datarefs, i, dr); i++)
6819 vect_update_init_of_dr (dr, niters);
6823 /* Function vect_do_peeling_for_alignment
6825 Peel the first 'niters' iterations of the loop represented by LOOP_VINFO.
6826 'niters' is set to the misalignment of one of the data references in the
6827 loop, thereby forcing it to refer to an aligned location at the beginning
6828 of the execution of this loop. The data reference for which we are
6829 peeling is recorded in LOOP_VINFO_UNALIGNED_DR. */
/* Builds the loop iteration count and the prolog iteration count, peels
   a prolog loop at the loop preheader edge, subtracts the prolog count
   from LOOP_VINFO_NITERS, and updates every data reference through
   vect_update_inits_of_drs.  A profitability threshold is computed and
   check_profitability is set only when LOOP_VINFO_MAY_MISALIGN_STMTS and
   LOOP_VINFO_MAY_ALIAS_DDRS are both empty.
   NOTE(review): the return type, braces, #endif, the left-hand side of
   the slpeel_tree_peel_loop_to_edge call, the call that follows the
   "reset scalar evolution" comment and the declaration of n_iters are
   not visible in this listing -- confirm against the full file.  */
6832 vect_do_peeling_for_alignment (loop_vec_info loop_vinfo)
6834 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6835 tree niters_of_prolog_loop, ni_name;
6837 struct loop *new_loop;
6838 bool check_profitability = false;
6839 unsigned int th = 0;
6840 int min_profitable_iters;
6842 if (vect_print_dump_info (REPORT_DETAILS))
6843 fprintf (vect_dump, "=== vect_do_peeling_for_alignment ===");
6845 initialize_original_copy_tables ();
6847 ni_name = vect_build_loop_niters (loop_vinfo);
6848 niters_of_prolog_loop = vect_gen_niters_for_prolog_loop (loop_vinfo, ni_name);
6851 /* If cost model check not done during versioning. */
6852 if (!VEC_length (tree, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo))
6853 && !VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
6855 check_profitability = true;
6857 /* Get profitability threshold for vectorized loop. */
6858 min_profitable_iters = LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo);
6860 th = conservative_cost_threshold (loop_vinfo,
6861 min_profitable_iters);
6864 /* Peel the prolog loop and iterate it niters_of_prolog_loop. */
6866 slpeel_tree_peel_loop_to_edge (loop, loop_preheader_edge (loop),
6867 niters_of_prolog_loop, ni_name, true,
6868 th, check_profitability);
6870 gcc_assert (new_loop);
6871 #ifdef ENABLE_CHECKING
6872 slpeel_verify_cfg_after_peeling (new_loop, loop);
6875 /* Update number of times loop executes. */
/* After this, LOOP_VINFO_NITERS is the old count minus the prolog count.  */
6876 n_iters = LOOP_VINFO_NITERS (loop_vinfo);
6877 LOOP_VINFO_NITERS (loop_vinfo) = fold_build2 (MINUS_EXPR,
6878 TREE_TYPE (n_iters), n_iters, niters_of_prolog_loop);
6880 /* Update the init conditions of the access functions of all data refs. */
6881 vect_update_inits_of_drs (loop_vinfo, niters_of_prolog_loop);
6883 /* After peeling we have to reset scalar evolution analyzer. */
6886 free_original_copy_tables ();
6890 /* Function vect_create_cond_for_align_checks.
6892 Create a conditional expression that represents the alignment checks for
6893 all of data references (array element references) whose alignment must be
6897 COND_EXPR - input conditional expression. New conditions will be chained
6898 with logical AND operation.
6899 LOOP_VINFO - two fields of the loop information are used.
6900 LOOP_VINFO_PTR_MASK is the mask used to check the alignment.
6901 LOOP_VINFO_MAY_MISALIGN_STMTS contains the refs to be checked.
6904 COND_EXPR_STMT_LIST - statements needed to construct the conditional
6906 The returned value is the conditional expression to be used in the if
6907 statement that controls which version of the loop gets executed at runtime.
6909 The algorithm makes two assumptions:
6910 1) The number of bytes "n" in a vector is a power of 2.
6911 2) An address "a" is aligned if a%n is zero and that this
6912 test can be done as a&(n-1) == 0. For example, for 16
6913 byte vectors the test is a&0xf == 0. */
/* Appends to COND_EXPR_STMT_LIST statements that convert the start
   address of every statement in LOOP_VINFO_MAY_MISALIGN_STMTS to an
   integer type of pointer width, combine those integers with
   BIT_IOR_EXPR, and AND the result with LOOP_VINFO_PTR_MASK.  The test
   "masked value == 0" is then stored in, or AND-ed into, *COND_EXPR.
   NOTE(review): the return type, the middle parameter line (cond_expr is
   used below), braces, the if/else keywords around the final two
   assignments, and the declarations of i, tmp_name, psize, ref_stmt,
   addr_base, tmp, mask_cst and ptrsize_zero are not visible in this
   listing -- confirm against the full file before editing.  */
6916 vect_create_cond_for_align_checks (loop_vec_info loop_vinfo,
6918 tree *cond_expr_stmt_list)
6920 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6921 VEC(tree,heap) *may_misalign_stmts
6922 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
6924 int mask = LOOP_VINFO_PTR_MASK (loop_vinfo);
6928 tree int_ptrsize_type;
6930 tree or_tmp_name = NULL_TREE;
6931 tree and_tmp, and_tmp_name, and_stmt;
6933 tree part_cond_expr;
6935 /* Check that mask is one less than a power of 2, i.e., mask is
6936 all zeros followed by all ones. */
6937 gcc_assert ((mask != 0) && ((mask & (mask+1)) == 0));
6939 /* CHECKME: what is the best integer or unsigned type to use to hold a
6940 cast from a pointer value? */
6941 psize = TYPE_SIZE (ptr_type_node);
6943 = lang_hooks.types.type_for_size (tree_low_cst (psize, 1), 0);
6945 /* Create expression (mask & (dr_1 || ... || dr_n)) where dr_i is the address
6946 of the first vector of the i'th data reference. */
6948 for (i = 0; VEC_iterate (tree, may_misalign_stmts, i, ref_stmt); i++)
6950 tree new_stmt_list = NULL_TREE;
6952 tree addr_tmp, addr_tmp_name, addr_stmt;
6953 tree or_tmp, new_or_tmp_name, or_stmt;
6955 /* create: addr_tmp = (int)(address_of_first_vector) */
6956 addr_base = vect_create_addr_base_for_vector_ref (ref_stmt,
6957 &new_stmt_list, NULL_TREE, loop);
6959 if (new_stmt_list != NULL_TREE)
6960 append_to_statement_list_force (new_stmt_list, cond_expr_stmt_list);
/* NOTE(review): i is formatted with %d into tmp_name; neither the type
   of i nor the size of tmp_name is visible here -- confirm both.  */
6962 sprintf (tmp_name, "%s%d", "addr2int", i);
6963 addr_tmp = create_tmp_var (int_ptrsize_type, tmp_name);
6964 add_referenced_var (addr_tmp);
6965 addr_tmp_name = make_ssa_name (addr_tmp, NULL_TREE);
6966 addr_stmt = fold_convert (int_ptrsize_type, addr_base);
6967 addr_stmt = build_gimple_modify_stmt (addr_tmp_name, addr_stmt);
6968 SSA_NAME_DEF_STMT (addr_tmp_name) = addr_stmt;
6969 append_to_statement_list_force (addr_stmt, cond_expr_stmt_list);
6971 /* The addresses are OR together. */
/* or_tmp_name is NULL_TREE until the first address has been seen; that
   first address seeds it and later ones are folded in with BIT_IOR_EXPR.  */
6973 if (or_tmp_name != NULL_TREE)
6975 /* create: or_tmp = or_tmp | addr_tmp */
6976 sprintf (tmp_name, "%s%d", "orptrs", i);
6977 or_tmp = create_tmp_var (int_ptrsize_type, tmp_name);
6978 add_referenced_var (or_tmp);
6979 new_or_tmp_name = make_ssa_name (or_tmp, NULL_TREE);
6980 tmp = build2 (BIT_IOR_EXPR, int_ptrsize_type,
6981 or_tmp_name, addr_tmp_name);
6982 or_stmt = build_gimple_modify_stmt (new_or_tmp_name, tmp);
6983 SSA_NAME_DEF_STMT (new_or_tmp_name) = or_stmt;
6984 append_to_statement_list_force (or_stmt, cond_expr_stmt_list);
6985 or_tmp_name = new_or_tmp_name;
6988 or_tmp_name = addr_tmp_name;
6992 mask_cst = build_int_cst (int_ptrsize_type, mask);
6994 /* create: and_tmp = or_tmp & mask */
6995 and_tmp = create_tmp_var (int_ptrsize_type, "andmask" );
6996 add_referenced_var (and_tmp);
6997 and_tmp_name = make_ssa_name (and_tmp, NULL_TREE);
6999 tmp = build2 (BIT_AND_EXPR, int_ptrsize_type, or_tmp_name, mask_cst);
7000 and_stmt = build_gimple_modify_stmt (and_tmp_name, tmp);
7001 SSA_NAME_DEF_STMT (and_tmp_name) = and_stmt;
7002 append_to_statement_list_force (and_stmt, cond_expr_stmt_list);
7004 /* Make and_tmp the left operand of the conditional test against zero.
7005 if and_tmp has a nonzero bit then some address is unaligned. */
7006 ptrsize_zero = build_int_cst (int_ptrsize_type, 0);
7007 part_cond_expr = fold_build2 (EQ_EXPR, boolean_type_node,
7008 and_tmp_name, ptrsize_zero);
7010 *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
7011 *cond_expr, part_cond_expr);
7013 *cond_expr = part_cond_expr;
7016 /* Function vect_vfa_segment_size.
7018 Create an expression that computes the size of segment
7019 that will be accessed for a data reference. The function takes into
7020 account that realignment loads may access one more vector.
7023 DR: The data reference.
7024 VECT_FACTOR: vectorization factor.
7026 Return an expression whose value is the size of segment which will be
/* Multiplies DR_STEP (DR) by VECT_FACTOR, adds the TYPE_SIZE_UNIT of the
   statement's vector type when vect_supportable_dr_alignment reports
   dr_explicit_realign_optimized, and returns the result converted to
   sizetype.
   NOTE(review): the arithmetic is built in integer_type_node and only the
   final value is converted -- confirm the operand types match.  The
   return type and braces are not visible in this listing.  */
7030 vect_vfa_segment_size (struct data_reference *dr, tree vect_factor)
7032 tree segment_length = fold_build2 (MULT_EXPR, integer_type_node,
7033 DR_STEP (dr), vect_factor);
7035 if (vect_supportable_dr_alignment (dr) == dr_explicit_realign_optimized)
7037 tree vector_size = TYPE_SIZE_UNIT
7038 (STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr))));
7040 segment_length = fold_build2 (PLUS_EXPR, integer_type_node,
7041 segment_length, vector_size);
7043 return fold_convert (sizetype, segment_length);
7046 /* Function vect_create_cond_for_alias_checks.
7048 Create a conditional expression that represents the run-time checks for
7049 overlapping of address ranges represented by a list of data references
7050 relations passed as input.
7053 COND_EXPR - input conditional expression. New conditions will be chained
7054 with logical AND operation.
7055 LOOP_VINFO - field LOOP_VINFO_MAY_ALIAS_DDRS contains the list of ddrs
7059 COND_EXPR - conditional expression.
7060 COND_EXPR_STMT_LIST - statements needed to construct the conditional
7064 The returned value is the conditional expression to be used in the if
7065 statement that controls which version of the loop gets executed at runtime.
/* For every entry of LOOP_VINFO_MAY_ALIAS_DDRS, builds the start address
   and segment length of both references, switching to the first
   statement of the interleaving group when DR_GROUP_FIRST_DR is set.  It
   then builds a TRUTH_OR_EXPR of two LT_EXPR comparisons over
   POINTER_PLUS_EXPR values, as sketched in the comment below, and stores
   it in, or AND-s it into, *COND_EXPR.
   NOTE(review): the return type, the middle parameter line, braces,
   if/else keywords, several assignment targets and continuation lines of
   the address and comparison expressions, and the declarations of
   vect_factor, ddr and i are not visible in this listing -- confirm
   against the full file before editing.  */
7069 vect_create_cond_for_alias_checks (loop_vec_info loop_vinfo,
7071 tree * cond_expr_stmt_list)
7073 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7074 VEC (ddr_p, heap) * may_alias_ddrs =
7075 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
7077 build_int_cst (integer_type_node, LOOP_VINFO_VECT_FACTOR (loop_vinfo));
7081 tree part_cond_expr;
7083 /* Create expression
7084 ((store_ptr_0 + store_segment_length_0) < load_ptr_0)
7085 || (load_ptr_0 + load_segment_length_0) < store_ptr_0))
7089 ((store_ptr_n + store_segment_length_n) < load_ptr_n)
7090 || (load_ptr_n + load_segment_length_n) < store_ptr_n)) */
7092 if (VEC_empty (ddr_p, may_alias_ddrs))
7095 for (i = 0; VEC_iterate (ddr_p, may_alias_ddrs, i, ddr); i++)
7097 struct data_reference *dr_a, *dr_b;
7098 tree dr_group_first_a, dr_group_first_b;
7099 tree addr_base_a, addr_base_b;
7100 tree segment_length_a, segment_length_b;
7101 tree stmt_a, stmt_b;
7104 stmt_a = DR_STMT (DDR_A (ddr));
7105 dr_group_first_a = DR_GROUP_FIRST_DR (vinfo_for_stmt (stmt_a));
/* When the statement belongs to a group, the group's first statement and
   its data reference are used instead.  */
7106 if (dr_group_first_a)
7108 stmt_a = dr_group_first_a;
7109 dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
7113 stmt_b = DR_STMT (DDR_B (ddr));
7114 dr_group_first_b = DR_GROUP_FIRST_DR (vinfo_for_stmt (stmt_b));
7115 if (dr_group_first_b)
7117 stmt_b = dr_group_first_b;
7118 dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
7122 vect_create_addr_base_for_vector_ref (stmt_a, cond_expr_stmt_list,
7125 vect_create_addr_base_for_vector_ref (stmt_b, cond_expr_stmt_list,
7128 segment_length_a = vect_vfa_segment_size (dr_a, vect_factor);
7129 segment_length_b = vect_vfa_segment_size (dr_b, vect_factor);
7131 if (vect_print_dump_info (REPORT_DR_DETAILS))
7134 "create runtime check for data references ");
7135 print_generic_expr (vect_dump, DR_REF (dr_a), TDF_SLIM);
7136 fprintf (vect_dump, " and ");
7137 print_generic_expr (vect_dump, DR_REF (dr_b), TDF_SLIM);
7142 fold_build2 (TRUTH_OR_EXPR, boolean_type_node,
7143 fold_build2 (LT_EXPR, boolean_type_node,
7144 fold_build2 (POINTER_PLUS_EXPR, TREE_TYPE (addr_base_a),
7148 fold_build2 (LT_EXPR, boolean_type_node,
7149 fold_build2 (POINTER_PLUS_EXPR, TREE_TYPE (addr_base_b),
7155 *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
7156 *cond_expr, part_cond_expr);
7158 *cond_expr = part_cond_expr;
7160 if (vect_print_dump_info (REPORT_VECTORIZED_LOOPS))
7161 fprintf (vect_dump, "created %u versioning for alias checks.\n",
7162 VEC_length (ddr_p, may_alias_ddrs));
7166 /* Function vect_loop_versioning.
7168 If the loop has data references that may or may not be aligned or/and
7169 has data reference relations whose independence was not proven then
7170 two versions of the loop need to be generated, one which is vectorized
7171 and one which isn't. A test is then generated to control which of the
7172 loops is executed. The test checks for the alignment of all of the
7173 data references that may or may not be aligned. An additional
7174 sequence of runtime tests is generated for each pair of DDRs whose
7175 independence was not proven. The vectorized version of loop is
7176 executed only if both alias and alignment tests are passed.
7178 The test generated to check which version of loop is executed
7179 is modified to also check for profitability as indicated by the
7180 cost model initially. */
/* Builds the versioning condition: a profitability test
   scalar_loop_iters > th, extended by the alignment checks and the alias
   checks when their respective lists are non-empty.  Calls loop_version
   with that condition, then splits LOOP's single exit edge and creates a
   phi in the new block for every phi of the shared merge block, rewiring
   the merge-block phi argument to the new phi's result.  Finally runs
   update_ssa and inserts the condition statements before the last
   statement of the condition block.
   NOTE(review): the return type, braces, several assignment targets and
   continuation lines, and the declarations of nloop, e, new_exit_e and th
   are not visible in this listing -- confirm against the full file
   before editing.  */
7183 vect_loop_versioning (loop_vec_info loop_vinfo)
7185 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7187 tree cond_expr = NULL_TREE;
7188 tree cond_expr_stmt_list = NULL_TREE;
7189 basic_block condition_bb;
7190 block_stmt_iterator cond_exp_bsi;
7191 basic_block merge_bb;
7192 basic_block new_exit_bb;
7194 tree orig_phi, new_phi, arg;
/* Four fifths of REG_BR_PROB_BASE; passed to loop_version below.  */
7195 unsigned prob = 4 * REG_BR_PROB_BASE / 5;
7196 tree gimplify_stmt_list;
7197 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
7198 int min_profitable_iters = 0;
7201 /* Get profitability threshold for vectorized loop. */
7202 min_profitable_iters = LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo);
7204 th = conservative_cost_threshold (loop_vinfo,
7205 min_profitable_iters);
7208 build2 (GT_EXPR, boolean_type_node, scalar_loop_iters,
7209 build_int_cst (TREE_TYPE (scalar_loop_iters), th));
7211 cond_expr = force_gimple_operand (cond_expr, &cond_expr_stmt_list,
7214 if (VEC_length (tree, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo)))
7215 vect_create_cond_for_align_checks (loop_vinfo, &cond_expr,
7216 &cond_expr_stmt_list);
7218 if (VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
7219 vect_create_cond_for_alias_checks (loop_vinfo, &cond_expr,
7220 &cond_expr_stmt_list);
7223 fold_build2 (NE_EXPR, boolean_type_node, cond_expr, integer_zero_node);
7225 force_gimple_operand (cond_expr, &gimplify_stmt_list, true,
7227 append_to_statement_list (gimplify_stmt_list, &cond_expr_stmt_list);
7229 initialize_original_copy_tables ();
7230 nloop = loop_version (loop, cond_expr, &condition_bb,
7231 prob, prob, REG_BR_PROB_BASE - prob, true);
7232 free_original_copy_tables();
7234 /* Loop versioning violates an assumption we try to maintain during
7235 vectorization - that the loop exit block has a single predecessor.
7236 After versioning, the exit block of both loop versions is the same
7237 basic block (i.e. it has two predecessors). Just in order to simplify
7238 following transformations in the vectorizer, we fix this situation
7239 here by adding a new (empty) block on the exit-edge of the loop,
7240 with the proper loop-exit phis to maintain loop-closed-form. */
7242 merge_bb = single_exit (loop)->dest;
7243 gcc_assert (EDGE_COUNT (merge_bb->preds) == 2);
7244 new_exit_bb = split_edge (single_exit (loop));
7245 new_exit_e = single_exit (loop);
7246 e = EDGE_SUCC (new_exit_bb, 0);
/* Each merge-block phi gets a counterpart in the new exit block that
   receives the old argument; the merge-block phi then reads the
   counterpart's result on edge e.  */
7248 for (orig_phi = phi_nodes (merge_bb); orig_phi;
7249 orig_phi = PHI_CHAIN (orig_phi))
7251 new_phi = create_phi_node (SSA_NAME_VAR (PHI_RESULT (orig_phi)),
7253 arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, e);
7254 add_phi_arg (new_phi, arg, new_exit_e);
7255 SET_PHI_ARG_DEF (orig_phi, e->dest_idx, PHI_RESULT (new_phi));
7258 /* End loop-exit-fixes after versioning. */
7260 update_ssa (TODO_update_ssa);
7261 if (cond_expr_stmt_list)
7263 cond_exp_bsi = bsi_last (condition_bb);
7264 bsi_insert_before (&cond_exp_bsi, cond_expr_stmt_list, BSI_SAME_STMT);
7268 /* Remove a group of stores (for SLP or interleaving), free their
/* Starting at FIRST_STMT, removes the statement through a statement
   iterator, frees its stmt_vec_info and sets the statement's info to
   NULL.
   NOTE(review): the loop construct, the step that advances next, the
   return type, braces and the declarations of tmp and ann are not
   visible in this listing -- confirm against the full file.  */
7272 vect_remove_stores (tree first_stmt)
7275 tree next = first_stmt;
7277 stmt_vec_info next_stmt_info;
7278 block_stmt_iterator next_si;
7282 /* Free the attached stmt_vec_info and remove the stmt. */
7283 next_si = bsi_for_stmt (next);
7284 bsi_remove (&next_si, true);
7285 next_stmt_info = vinfo_for_stmt (next);
7286 ann = stmt_ann (next);
/* DR_GROUP_NEXT_DR is read into tmp before next_stmt_info is freed.  */
7287 tmp = DR_GROUP_NEXT_DR (next_stmt_info);
7288 free (next_stmt_info);
7289 set_stmt_info (ann, NULL);
7295 /* Vectorize SLP instance tree in postorder. */
/* Recurses into SLP_TREE_LEFT and SLP_TREE_RIGHT of NODE, then allocates
   NODE's vector-statement array with room for VEC_STMTS_SIZE entries,
   records that count, and transforms the first scalar statement of NODE
   with vect_transform_stmt.  The visible code afterwards calls
   vect_remove_stores on DR_GROUP_FIRST_DR (stmt_info) when it is set.
   NOTE(review): the return type, return statements, braces, any
   conditions not shown here (the in-body comment mentions IS_STORE) and
   the declaration of stmt are not visible in this listing -- confirm
   against the full file before editing.  */
7298 vect_schedule_slp_instance (slp_tree node, unsigned int vec_stmts_size)
7301 bool strided_store, is_store;
7302 block_stmt_iterator si;
7303 stmt_vec_info stmt_info;
7308 vect_schedule_slp_instance (SLP_TREE_LEFT (node), vec_stmts_size);
7309 vect_schedule_slp_instance (SLP_TREE_RIGHT (node), vec_stmts_size);
7311 stmt = VEC_index(tree, SLP_TREE_SCALAR_STMTS (node), 0);
7312 stmt_info = vinfo_for_stmt (stmt);
7313 SLP_TREE_VEC_STMTS (node) = VEC_alloc (tree, heap, vec_stmts_size);
7314 SLP_TREE_NUMBER_OF_VEC_STMTS (node) = vec_stmts_size;
7316 if (vect_print_dump_info (REPORT_DETAILS))
7318 fprintf (vect_dump, "------>vectorizing SLP node starting from: ");
7319 print_generic_expr (vect_dump, stmt, TDF_SLIM);
7322 si = bsi_for_stmt (stmt);
7323 is_store = vect_transform_stmt (stmt, &si, &strided_store, node);
7326 if (DR_GROUP_FIRST_DR (stmt_info))
7327 /* If IS_STORE is TRUE, the vectorization of the
7328 interleaving chain was completed - free all the stores in
7330 vect_remove_stores (DR_GROUP_FIRST_DR (stmt_info));
7332 /* FORNOW: SLP originates only from strided stores. */
7338 /* FORNOW: SLP originates only from strided stores. */
/* For each SLP instance of LOOP_VINFO, computes
   vec_stmts_size = vectorization_factor * group_size / NUNITS and
   schedules the instance's tree with vect_schedule_slp_instance, keeping
   the call's result in is_store.  A dump message is printed when either
   REPORT_VECTORIZED_LOOPS or REPORT_UNVECTORIZED_LOOPS is enabled.
   NOTE(review): the return type and return statement, braces, the end of
   the "For each SLP instance" comment and the second argument line of
   the vect_schedule_slp_instance call are not visible in this listing --
   confirm against the full file before editing.  */
7344 vect_schedule_slp (loop_vec_info loop_vinfo, unsigned int nunits)
7346 VEC (slp_instance, heap) *slp_instances =
7347 LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
7348 slp_instance instance;
7349 unsigned int vec_stmts_size;
7350 unsigned int group_size, i;
7351 unsigned int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
7352 bool is_store = false;
7354 for (i = 0; VEC_iterate (slp_instance, slp_instances, i, instance); i++)
7356 group_size = SLP_INSTANCE_GROUP_SIZE (instance);
7357 /* For each SLP instance calculate number of vector stmts to be created
7358 for the scalar stmts in each node of the SLP tree. Number of vector
7359 elements in one vector iteration is the number of scalar elements in
7360 one scalar iteration (GROUP_SIZE) multiplied by VF divided by vector
7362 vec_stmts_size = vectorization_factor * group_size / nunits;
7364 /* Schedule the tree of INSTANCE. */
7365 is_store = vect_schedule_slp_instance (SLP_INSTANCE_TREE (instance),
7368 if (vect_print_dump_info (REPORT_VECTORIZED_LOOPS)
7369 || vect_print_dump_info (REPORT_UNVECTORIZED_LOOPS))
7370 fprintf (vect_dump, "vectorizing stmts using SLP.");
7376 /* Function vect_transform_loop.
7378 The analysis phase has determined that the loop is vectorizable.
7379 Vectorize the loop - create vectorized stmts to replace the scalar
7380 stmts in the loop, and update the loop exit condition. */
7383 vect_transform_loop (loop_vec_info loop_vinfo)
7385 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7386 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
7387 int nbbs = loop->num_nodes;
7388 block_stmt_iterator si, next_si;
7391 int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
7393 bool slp_scheduled = false;
7394 unsigned int nunits;
7396 if (vect_print_dump_info (REPORT_DETAILS))
7397 fprintf (vect_dump, "=== vec_transform_loop ===");
7399 if (VEC_length (tree, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo))
7400 || VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
7401 vect_loop_versioning (loop_vinfo);
7403 /* CHECKME: we wouldn't need this if we called update_ssa once
7405 bitmap_zero (vect_memsyms_to_rename);
7407 /* Peel the loop if there are data refs with unknown alignment.
7408 Only one data ref with unknown store is allowed. */
7410 if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
7411 vect_do_peeling_for_alignment (loop_vinfo);
7413 /* If the loop has a symbolic number of iterations 'n' (i.e. it's not a
7414 compile time constant), or it is a constant that doesn't divide by the
7415 vectorization factor, then an epilog loop needs to be created.
7416 We therefore duplicate the loop: the original loop will be vectorized,
7417 and will compute the first (n/VF) iterations. The second copy of the loop
7418 will remain scalar and will compute the remaining (n%VF) iterations.
7419 (VF is the vectorization factor). */
7421 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
7422 || (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
7423 && LOOP_VINFO_INT_NITERS (loop_vinfo) % vectorization_factor != 0))
7424 vect_do_peeling_for_loop_bound (loop_vinfo, &ratio);
7426 ratio = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
7427 LOOP_VINFO_INT_NITERS (loop_vinfo) / vectorization_factor);
7429 /* 1) Make sure the loop header has exactly two entries
7430 2) Make sure we have a preheader basic block. */
7432 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
7434 split_edge (loop_preheader_edge (loop));
7436 /* FORNOW: the vectorizer supports only loops which body consist
7437 of one basic block (header + empty latch). When the vectorizer will
7438 support more involved loop forms, the order by which the BBs are
7439 traversed need to be reconsidered. */
7441 for (i = 0; i < nbbs; i++)
7443 basic_block bb = bbs[i];
7444 stmt_vec_info stmt_info;
7447 for (phi = phi_nodes (bb); phi; phi = PHI_CHAIN (phi))
7449 if (vect_print_dump_info (REPORT_DETAILS))
7451 fprintf (vect_dump, "------>vectorizing phi: ");
7452 print_generic_expr (vect_dump, phi, TDF_SLIM);
7454 stmt_info = vinfo_for_stmt (phi);
7458 if (!STMT_VINFO_RELEVANT_P (stmt_info)
7459 && !STMT_VINFO_LIVE_P (stmt_info))
7462 if ((TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
7463 != (unsigned HOST_WIDE_INT) vectorization_factor)
7464 && vect_print_dump_info (REPORT_DETAILS))
7465 fprintf (vect_dump, "multiple-types.");
7467 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
7469 if (vect_print_dump_info (REPORT_DETAILS))
7470 fprintf (vect_dump, "transform phi.");
7471 vect_transform_stmt (phi, NULL, NULL, NULL);
7475 for (si = bsi_start (bb); !bsi_end_p (si);)
7477 tree stmt = bsi_stmt (si);
7480 if (vect_print_dump_info (REPORT_DETAILS))
7482 fprintf (vect_dump, "------>vectorizing statement: ");
7483 print_generic_expr (vect_dump, stmt, TDF_SLIM);
7486 stmt_info = vinfo_for_stmt (stmt);
7488 /* vector stmts created in the outer-loop during vectorization of
7489 stmts in an inner-loop may not have a stmt_info, and do not
7490 need to be vectorized. */
7497 if (!STMT_VINFO_RELEVANT_P (stmt_info)
7498 && !STMT_VINFO_LIVE_P (stmt_info))
7504 gcc_assert (STMT_VINFO_VECTYPE (stmt_info));
7506 (unsigned int) TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
7507 if (!STMT_SLP_TYPE (stmt_info)
7508 && nunits != (unsigned int) vectorization_factor
7509 && vect_print_dump_info (REPORT_DETAILS))
7510 /* For SLP VF is set according to unrolling factor, and not to
7511 vector size, hence for SLP this print is not valid. */
7512 fprintf (vect_dump, "multiple-types.");
7514 /* SLP. Schedule all the SLP instances when the first SLP stmt is
7516 if (STMT_SLP_TYPE (stmt_info))
7520 slp_scheduled = true;
7522 if (vect_print_dump_info (REPORT_DETAILS))
7523 fprintf (vect_dump, "=== scheduling SLP instances ===");
7525 is_store = vect_schedule_slp (loop_vinfo, nunits);
7527 /* IS_STORE is true if STMT is a store. Stores cannot be of
7528 hybrid SLP type. They are removed in
7529 vect_schedule_slp_instance and their vinfo is destroyed. */
7537 /* Hybrid SLP stmts must be vectorized in addition to SLP. */
7538 if (PURE_SLP_STMT (stmt_info))
7545 /* -------- vectorize statement ------------ */
7546 if (vect_print_dump_info (REPORT_DETAILS))
7547 fprintf (vect_dump, "transform statement.");
7549 strided_store = false;
7550 is_store = vect_transform_stmt (stmt, &si, &strided_store, NULL);
7554 if (STMT_VINFO_STRIDED_ACCESS (stmt_info))
7556 /* Interleaving. If IS_STORE is TRUE, the vectorization of the
7557 interleaving chain was completed - free all the stores in
7559 tree next = DR_GROUP_FIRST_DR (stmt_info);
7561 stmt_vec_info next_stmt_info;
7565 next_si = bsi_for_stmt (next);
7566 next_stmt_info = vinfo_for_stmt (next);
7567 /* Free the attached stmt_vec_info and remove the stmt. */
7568 ann = stmt_ann (next);
7569 tmp = DR_GROUP_NEXT_DR (next_stmt_info);
7570 free (next_stmt_info);
7571 set_stmt_info (ann, NULL);
7572 bsi_remove (&next_si, true);
7575 bsi_remove (&si, true);
7580 /* Free the attached stmt_vec_info and remove the stmt. */
7581 ann = stmt_ann (stmt);
7583 set_stmt_info (ann, NULL);
7584 bsi_remove (&si, true);
7592 slpeel_make_loop_iterate_ntimes (loop, ratio);
7594 mark_set_for_renaming (vect_memsyms_to_rename);
7596 /* The memory tags and pointers in vectorized statements need to
7597 have their SSA forms updated. FIXME, why can't this be delayed
7598 until all the loops have been transformed? */
7599 update_ssa (TODO_update_ssa);
7601 if (vect_print_dump_info (REPORT_VECTORIZED_LOOPS))
7602 fprintf (vect_dump, "LOOP VECTORIZED.");
7603 if (loop->inner && vect_print_dump_info (REPORT_VECTORIZED_LOOPS))
7604 fprintf (vect_dump, "OUTER LOOP VECTORIZED.");