1 /* Transformation Utilities for Loop Vectorization.
2 Copyright (C) 2003, 2004, 2005, 2006, 2007 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
5 This file is part of GCC.
7 GCC is free software; you can redistribute it and/or modify it under
8 the terms of the GNU General Public License as published by the Free
9 Software Foundation; either version 3, or (at your option) any later
12 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
13 WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
23 #include "coretypes.h"
29 #include "basic-block.h"
30 #include "diagnostic.h"
31 #include "tree-flow.h"
32 #include "tree-dump.h"
39 #include "tree-data-ref.h"
40 #include "tree-chrec.h"
41 #include "tree-scalar-evolution.h"
42 #include "tree-vectorizer.h"
43 #include "langhooks.h"
44 #include "tree-pass.h"
48 /* Utility functions for the code transformation. */
49 static bool vect_transform_stmt (tree, block_stmt_iterator *, bool *, slp_tree);
50 static tree vect_create_destination_var (tree, tree);
51 static tree vect_create_data_ref_ptr
52 (tree, struct loop*, tree, tree *, tree *, bool, tree, bool *);
53 static tree vect_create_addr_base_for_vector_ref
54 (tree, tree *, tree, struct loop *);
55 static tree vect_get_new_vect_var (tree, enum vect_var_kind, const char *);
56 static tree vect_get_vec_def_for_operand (tree, tree, tree *);
57 static tree vect_init_vector (tree, tree, tree, block_stmt_iterator *);
58 static void vect_finish_stmt_generation
59 (tree stmt, tree vec_stmt, block_stmt_iterator *);
60 static bool vect_is_simple_cond (tree, loop_vec_info);
61 static void vect_create_epilog_for_reduction (tree, tree, enum tree_code, tree);
62 static tree get_initial_def_for_reduction (tree, tree, tree *);
64 /* Utility function dealing with loop peeling (not peeling itself). */
65 static void vect_generate_tmps_on_preheader
66 (loop_vec_info, tree *, tree *, tree *);
67 static tree vect_build_loop_niters (loop_vec_info);
68 static void vect_update_ivs_after_vectorizer (loop_vec_info, tree, edge);
69 static tree vect_gen_niters_for_prolog_loop (loop_vec_info, tree);
70 static void vect_update_init_of_dr (struct data_reference *, tree niters);
71 static void vect_update_inits_of_drs (loop_vec_info, tree);
72 static int vect_min_worthwhile_factor (enum tree_code);
/* Function cost_for_stmt.

   Return the scalar cost assigned to STMT.  The cost is selected by the
   vectorization type recorded in STMT's stmt_vec_info (STMT_VINFO_TYPE):
   loads cost TARG_SCALAR_LOAD_COST, stores cost TARG_SCALAR_STORE_COST, and
   the remaining listed statement kinds cost TARG_SCALAR_STMT_COST.

   NOTE(review): the return type and the handling of undef_vec_info_type are
   not visible in this listing - confirm against the complete source.  */
76 cost_for_stmt (tree stmt)
78 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
80 switch (STMT_VINFO_TYPE (stmt_info))
82 case load_vec_info_type:
83 return TARG_SCALAR_LOAD_COST;
84 case store_vec_info_type:
85 return TARG_SCALAR_STORE_COST;
/* All of the following kinds share the generic scalar statement cost.  */
86 case op_vec_info_type:
87 case condition_vec_info_type:
88 case assignment_vec_info_type:
89 case reduc_vec_info_type:
90 case induc_vec_info_type:
91 case type_promotion_vec_info_type:
92 case type_demotion_vec_info_type:
93 case type_conversion_vec_info_type:
94 case call_vec_info_type:
95 return TARG_SCALAR_STMT_COST;
96 case undef_vec_info_type:
103 /* Function vect_estimate_min_profitable_iters
105 Return the number of iterations required for the vector version of the
106 loop to be profitable relative to the cost of the scalar version of the
109 TODO: Take profile info into account before making vectorization
110 decisions, if available. */
113 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo)
116 int min_profitable_iters;
117 int peel_iters_prologue;
118 int peel_iters_epilogue;
119 int vec_inside_cost = 0;
120 int vec_outside_cost = 0;
121 int scalar_single_iter_cost = 0;
122 int scalar_outside_cost = 0;
123 bool runtime_test = false;
124 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
125 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
126 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
127 int nbbs = loop->num_nodes;
128 int byte_misalign = LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo);
129 int peel_guard_costs = 0;
130 int innerloop_iters = 0, factor;
131 VEC (slp_instance, heap) *slp_instances;
132 slp_instance instance;
134 /* Cost model disabled. */
135 if (!flag_vect_cost_model)
137 if (vect_print_dump_info (REPORT_DETAILS))
138 fprintf (vect_dump, "cost model disabled.");
142 /* If the number of iterations is unknown, or the
143 peeling-for-misalignment amount is unknown, we will have to generate
144 a runtime test to test the loop count against the threshold. */
145 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
146 || (byte_misalign < 0))
149 /* Requires loop versioning tests to handle misalignment. */
151 if (VEC_length (tree, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo)))
153 /* FIXME: Make cost depend on complexity of individual check. */
155 VEC_length (tree, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));
156 if (vect_print_dump_info (REPORT_DETAILS))
157 fprintf (vect_dump, "cost model: Adding cost of checks for loop "
158 "versioning to treat misalignment.\n");
161 if (VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
163 /* FIXME: Make cost depend on complexity of individual check. */
165 VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo));
166 if (vect_print_dump_info (REPORT_DETAILS))
167 fprintf (vect_dump, "cost model: Adding cost of checks for loop "
168 "versioning aliasing.\n");
171 if (VEC_length (tree, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo))
172 || VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
174 vec_outside_cost += TARG_COND_TAKEN_BRANCH_COST;
177 /* Count statements in scalar loop. Using this as scalar cost for a single
180 TODO: Add outer loop support.
182 TODO: Consider assigning different costs to different scalar
187 innerloop_iters = 50; /* FIXME */
189 for (i = 0; i < nbbs; i++)
191 block_stmt_iterator si;
192 basic_block bb = bbs[i];
194 if (bb->loop_father == loop->inner)
195 factor = innerloop_iters;
199 for (si = bsi_start (bb); !bsi_end_p (si); bsi_next (&si))
201 tree stmt = bsi_stmt (si);
202 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
203 if (!STMT_VINFO_RELEVANT_P (stmt_info)
204 && !STMT_VINFO_LIVE_P (stmt_info))
206 scalar_single_iter_cost += cost_for_stmt (stmt) * factor;
207 vec_inside_cost += STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) * factor;
208 /* FIXME: for stmts in the inner-loop in outer-loop vectorization,
209 some of the "outside" costs are generated inside the outer-loop. */
210 vec_outside_cost += STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info);
214 /* Add additional cost for the peeled instructions in prologue and epilogue
217 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
218 at compile-time - we assume it's vf/2 (the worst would be vf-1).
220 TODO: Build an expression that represents peel_iters for prologue and
221 epilogue to be used in a run-time test. */
223 if (byte_misalign < 0)
225 peel_iters_prologue = vf/2;
226 if (vect_print_dump_info (REPORT_DETAILS))
227 fprintf (vect_dump, "cost model: "
228 "prologue peel iters set to vf/2.");
230 /* If peeling for alignment is unknown, loop bound of main loop becomes
232 peel_iters_epilogue = vf/2;
233 if (vect_print_dump_info (REPORT_DETAILS))
234 fprintf (vect_dump, "cost model: "
235 "epilogue peel iters set to vf/2 because "
236 "peeling for alignment is unknown .");
238 /* If peeled iterations are unknown, count a taken branch and a not taken
239 branch per peeled loop. Even if scalar loop iterations are known,
240 vector iterations are not known since peeled prologue iterations are
241 not known. Hence guards remain the same. */
242 peel_guard_costs += 2 * (TARG_COND_TAKEN_BRANCH_COST
243 + TARG_COND_NOT_TAKEN_BRANCH_COST);
250 struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
251 int element_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
252 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr)));
253 int nelements = TYPE_VECTOR_SUBPARTS (vectype);
255 peel_iters_prologue = nelements - (byte_misalign / element_size);
258 peel_iters_prologue = 0;
260 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
262 peel_iters_epilogue = vf/2;
263 if (vect_print_dump_info (REPORT_DETAILS))
264 fprintf (vect_dump, "cost model: "
265 "epilogue peel iters set to vf/2 because "
266 "loop iterations are unknown .");
268 /* If peeled iterations are known but number of scalar loop
269 iterations are unknown, count a taken branch per peeled loop. */
270 peel_guard_costs += 2 * TARG_COND_TAKEN_BRANCH_COST;
275 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
276 peel_iters_prologue = niters < peel_iters_prologue ?
277 niters : peel_iters_prologue;
278 peel_iters_epilogue = (niters - peel_iters_prologue) % vf;
282 vec_outside_cost += (peel_iters_prologue * scalar_single_iter_cost)
283 + (peel_iters_epilogue * scalar_single_iter_cost)
286 /* FORNOW: The scalar outside cost is incremented in one of the
289 1. The vectorizer checks for alignment and aliasing and generates
290 a condition that allows dynamic vectorization. A cost model
291 check is ANDED with the versioning condition. Hence scalar code
292 path now has the added cost of the versioning check.
294 if (cost > th & versioning_check)
297 Hence run-time scalar is incremented by not-taken branch cost.
299 2. The vectorizer then checks if a prologue is required. If the
300 cost model check was not done before during versioning, it has to
301 be done before the prologue check.
304 prologue = scalar_iters
309 if (prologue == num_iters)
312 Hence the run-time scalar cost is incremented by a taken branch,
313 plus a not-taken branch, plus a taken branch cost.
315 3. The vectorizer then checks if an epilogue is required. If the
316 cost model check was not done before during prologue check, it
317 has to be done with the epilogue check.
323 if (prologue == num_iters)
326 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
329 Hence the run-time scalar cost should be incremented by 2 taken
332 TODO: The back end may reorder the BBS's differently and reverse
333 conditions/branch directions. Change the estimates below to
334 something more reasonable. */
338 /* Cost model check occurs at versioning. */
339 if (VEC_length (tree, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo))
340 || VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
341 scalar_outside_cost += TARG_COND_NOT_TAKEN_BRANCH_COST;
344 /* Cost model occurs at prologue generation. */
345 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
346 scalar_outside_cost += 2 * TARG_COND_TAKEN_BRANCH_COST
347 + TARG_COND_NOT_TAKEN_BRANCH_COST;
348 /* Cost model check occurs at epilogue generation. */
350 scalar_outside_cost += 2 * TARG_COND_TAKEN_BRANCH_COST;
355 slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
356 for (i = 0; VEC_iterate (slp_instance, slp_instances, i, instance); i++)
358 vec_outside_cost += SLP_INSTANCE_OUTSIDE_OF_LOOP_COST (instance);
359 vec_inside_cost += SLP_INSTANCE_INSIDE_OF_LOOP_COST (instance);
362 /* Calculate number of iterations required to make the vector version
363 profitable, relative to the loop bodies only. The following condition
365 SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
367 SIC = scalar iteration cost, VIC = vector iteration cost,
368 VOC = vector outside cost, VF = vectorization factor,
369 PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
370 SOC = scalar outside cost for run time cost model check. */
372 if ((scalar_single_iter_cost * vf) > vec_inside_cost)
374 if (vec_outside_cost <= 0)
375 min_profitable_iters = 1;
378 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost) * vf
379 - vec_inside_cost * peel_iters_prologue
380 - vec_inside_cost * peel_iters_epilogue)
381 / ((scalar_single_iter_cost * vf)
384 if ((scalar_single_iter_cost * vf * min_profitable_iters)
385 <= ((vec_inside_cost * min_profitable_iters)
386 + ((vec_outside_cost - scalar_outside_cost) * vf)))
387 min_profitable_iters++;
390 /* vector version will never be profitable. */
393 if (vect_print_dump_info (REPORT_DETAILS))
394 fprintf (vect_dump, "cost model: vector iteration cost = %d "
395 "is divisible by scalar iteration cost = %d by a factor "
396 "greater than or equal to the vectorization factor = %d .",
397 vec_inside_cost, scalar_single_iter_cost, vf);
401 if (vect_print_dump_info (REPORT_DETAILS))
403 fprintf (vect_dump, "Cost model analysis: \n");
404 fprintf (vect_dump, " Vector inside of loop cost: %d\n",
406 fprintf (vect_dump, " Vector outside of loop cost: %d\n",
408 fprintf (vect_dump, " Scalar iteration cost: %d\n",
409 scalar_single_iter_cost);
410 fprintf (vect_dump, " Scalar outside cost: %d\n", scalar_outside_cost);
411 fprintf (vect_dump, " prologue iterations: %d\n",
412 peel_iters_prologue);
413 fprintf (vect_dump, " epilogue iterations: %d\n",
414 peel_iters_epilogue);
415 fprintf (vect_dump, " Calculated minimum iters for profitability: %d\n",
416 min_profitable_iters);
419 min_profitable_iters =
420 min_profitable_iters < vf ? vf : min_profitable_iters;
422 /* Because the condition we create is:
423 if (niters <= min_profitable_iters)
424 then skip the vectorized loop. */
425 min_profitable_iters--;
427 if (vect_print_dump_info (REPORT_DETAILS))
428 fprintf (vect_dump, " Profitability threshold = %d\n",
429 min_profitable_iters);
431 return min_profitable_iters;
435 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
436 functions. Design better to avoid maintenance issues. */
438 /* Function vect_model_reduction_cost.
440 Models cost for a reduction operation, including the vector ops
441 generated within the strip-mine loop, the initial definition before
442 the loop, and the epilogue code that must be generated. */
/* STMT_INFO: the stmt_vec_info of the reduction statement.  Its
   inside-of-loop cost is incremented and its outside-of-loop cost is
   overwritten with the value accumulated in OUTER_COST.
   REDUC_CODE: presumably the tree code of a single-statement vector
   reduction; a value below NUM_TREE_CODES selects the one-statement
   epilogue estimate.
   NOTE(review): the rest of the parameter list (NCOPIES is used below) and
   the return type are not visible in this listing - confirm against the
   complete source.  */
445 vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
454 enum machine_mode mode;
455 tree operation = GIMPLE_STMT_OPERAND (STMT_VINFO_STMT (stmt_info), 1);
456 int op_type = TREE_CODE_LENGTH (TREE_CODE (operation));
457 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
458 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
460 /* Cost of reduction op inside loop. */
461 STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) += ncopies * TARG_VEC_STMT_COST;
/* The reduction operand is the last operand of the operation; its scalar
   type determines the vector type used for the epilogue estimate.  */
463 reduction_op = TREE_OPERAND (operation, op_type-1);
464 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
/* NOTE(review): the condition guarding the "unsupported data-type" dump
   below (presumably a missing vectype) is not visible in this listing.  */
467 if (vect_print_dump_info (REPORT_DETAILS))
469 fprintf (vect_dump, "unsupported data-type ");
470 print_generic_expr (vect_dump, TREE_TYPE (reduction_op), TDF_SLIM);
475 mode = TYPE_MODE (vectype);
/* NOTE(review): the guard that chooses between the related statement and
   STMT_INFO's own statement is not visible in this listing.  */
476 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
479 orig_stmt = STMT_VINFO_STMT (stmt_info);
481 code = TREE_CODE (GIMPLE_STMT_OPERAND (orig_stmt, 1));
483 /* Add in cost for initial definition. */
484 outer_cost += TARG_SCALAR_TO_VEC_COST;
486 /* Determine cost of epilogue code.
488 We have a reduction operator that will reduce the vector in one statement.
489 Also requires scalar extract. */
491 if (!nested_in_vect_loop_p (loop, orig_stmt))
493 if (reduc_code < NUM_TREE_CODES)
494 outer_cost += TARG_VEC_STMT_COST + TARG_VEC_TO_SCALAR_COST;
/* Presumably the fallback when no single-statement reduction exists: the
   estimate is derived from the number of elements per vector.  */
497 int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
499 TYPE_SIZE (TREE_TYPE ( GIMPLE_STMT_OPERAND (orig_stmt, 0)));
500 int element_bitsize = tree_low_cst (bitsize, 1);
501 int nelements = vec_size_in_bits / element_bitsize;
503 optab = optab_for_tree_code (code, vectype);
505 /* We have a whole vector shift available. */
506 if (VECTOR_MODE_P (mode)
507 && optab_handler (optab, mode)->insn_code != CODE_FOR_nothing
508 && optab_handler (vec_shr_optab, mode)->insn_code != CODE_FOR_nothing)
509 /* Final reduction via vector shifts and the reduction operator. Also
510 requires scalar extract. */
511 outer_cost += ((exact_log2(nelements) * 2) * TARG_VEC_STMT_COST
512 + TARG_VEC_TO_SCALAR_COST);
514 /* Use extracts and reduction op for final reduction. For N elements,
515 we have N extracts and N-1 reduction ops. */
516 outer_cost += ((nelements + nelements - 1) * TARG_VEC_STMT_COST);
/* Record the accumulated prologue/epilogue estimate (assignment, not
   accumulation).  */
520 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info) = outer_cost;
522 if (vect_print_dump_info (REPORT_DETAILS))
523 fprintf (vect_dump, "vect_model_reduction_cost: inside_cost = %d, "
524 "outside_cost = %d .", STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info),
525 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info));
531 /* Function vect_model_induction_cost.
533 Models cost for induction operations. */
/* STMT_INFO: the induction statement; both of its cost fields are
   overwritten (not accumulated) here.
   NCOPIES: number of vector statements charged inside the loop, each at
   TARG_VEC_STMT_COST.  The outside-of-loop cost is a fixed
   2 * TARG_SCALAR_TO_VEC_COST regardless of NCOPIES.  */
536 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
538 /* loop cost for vec_loop. */
539 STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) = ncopies * TARG_VEC_STMT_COST;
540 /* prologue cost for vec_init and vec_step. */
541 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info) = 2 * TARG_SCALAR_TO_VEC_COST;
543 if (vect_print_dump_info (REPORT_DETAILS))
544 fprintf (vect_dump, "vect_model_induction_cost: inside_cost = %d, "
545 "outside_cost = %d .", STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info),
546 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info));
550 /* Function vect_model_simple_cost.
552 Models cost for simple operations, i.e. those that only emit ncopies of a
553 single op. Right now, this does not account for multiple insns that could
554 be generated for the single vector op. We will handle that shortly. */
/* STMT_INFO: statement whose cost is being modeled.
   NCOPIES: number of vector statements emitted; each is charged
   TARG_VEC_STMT_COST inside the loop.
   DT: definition types of the operands.  DT[0] and DT[1] are both read
   unconditionally, so callers must supply at least two entries.  Each
   constant or invariant operand adds TARG_SCALAR_TO_VEC_COST outside the
   loop.
   SLP_NODE: passed through to the stmt_vinfo_set_*_cost helpers together
   with STMT_INFO.  */
557 vect_model_simple_cost (stmt_vec_info stmt_info, int ncopies,
558 enum vect_def_type *dt, slp_tree slp_node)
561 int inside_cost = 0, outside_cost = 0;
563 inside_cost = ncopies * TARG_VEC_STMT_COST;
565 /* FORNOW: Assuming maximum 2 args per stmts. */
566 for (i = 0; i < 2; i++)
568 if (dt[i] == vect_constant_def || dt[i] == vect_invariant_def)
569 outside_cost += TARG_SCALAR_TO_VEC_COST;
572 if (vect_print_dump_info (REPORT_DETAILS))
573 fprintf (vect_dump, "vect_model_simple_cost: inside_cost = %d, "
574 "outside_cost = %d .", inside_cost, outside_cost);
576 /* Set the costs either in STMT_INFO or SLP_NODE (if exists). */
577 stmt_vinfo_set_inside_of_loop_cost (stmt_info, slp_node, inside_cost);
578 stmt_vinfo_set_outside_of_loop_cost (stmt_info, slp_node, outside_cost);
582 /* Function vect_cost_strided_group_size
584 For strided load or store, return the group_size only if it is the first
585 load or store of a group, else return 1. This ensures that group size is
586 only returned once per group. */
/* STMT_INFO: stmt_vec_info of a load or store.  It is treated as the first
   access of its group when DR_GROUP_FIRST_DR (STMT_INFO) is STMT_INFO's own
   statement; only then is DR_GROUP_SIZE returned.
   NOTE(review): the fall-through return (1, per the comment above) and the
   return type are not visible in this listing.  */
589 vect_cost_strided_group_size (stmt_vec_info stmt_info)
591 tree first_stmt = DR_GROUP_FIRST_DR (stmt_info);
593 if (first_stmt == STMT_VINFO_STMT (stmt_info))
594 return DR_GROUP_SIZE (stmt_info);
600 /* Function vect_model_store_cost
602 Models cost for stores. In the case of strided accesses, one access
603 has the overhead of the strided access attributed to it. */
/* STMT_INFO: the store whose cost is being modeled.
   NCOPIES: number of vector stores; each is charged TARG_VEC_STORE_COST
   inside the loop.
   DT: a definition type (presumably that of the stored value - confirm
   against callers); a constant or invariant definition sets the
   outside-of-loop cost to TARG_SCALAR_TO_VEC_COST.
   SLP_NODE: passed through to the stmt_vinfo_set_*_cost helpers together
   with STMT_INFO.  */
606 vect_model_store_cost (stmt_vec_info stmt_info, int ncopies,
607 enum vect_def_type dt, slp_tree slp_node)
610 int inside_cost = 0, outside_cost = 0;
612 if (dt == vect_constant_def || dt == vect_invariant_def)
613 outside_cost = TARG_SCALAR_TO_VEC_COST;
615 /* Strided access? */
616 if (DR_GROUP_FIRST_DR (stmt_info))
617 group_size = vect_cost_strided_group_size (stmt_info);
618 /* Not a strided access. */
/* NOTE(review): the GROUP_SIZE assignment for the non-strided case and the
   guard around the permute cost below are not visible in this listing.  */
622 /* Is this an access in a group of stores, which provide strided access?
623 If so, add in the cost of the permutes. */
626 /* Uses a high and low interleave operation for each needed permute. */
627 inside_cost = ncopies * exact_log2(group_size) * group_size
628 * TARG_VEC_STMT_COST;
630 if (vect_print_dump_info (REPORT_DETAILS))
631 fprintf (vect_dump, "vect_model_store_cost: strided group_size = %d .",
636 /* Costs of the stores. */
637 inside_cost += ncopies * TARG_VEC_STORE_COST;
639 if (vect_print_dump_info (REPORT_DETAILS))
640 fprintf (vect_dump, "vect_model_store_cost: inside_cost = %d, "
641 "outside_cost = %d .", inside_cost, outside_cost);
643 /* Set the costs either in STMT_INFO or SLP_NODE (if exists). */
644 stmt_vinfo_set_inside_of_loop_cost (stmt_info, slp_node, inside_cost);
645 stmt_vinfo_set_outside_of_loop_cost (stmt_info, slp_node, outside_cost);
649 /* Function vect_model_load_cost
651 Models cost for loads. In the case of strided accesses, the last access
652 has the overhead of the strided access attributed to it. Since unaligned
653 accesses are supported for loads, we also account for the costs of the
654 access scheme chosen. */
657 vect_model_load_cost (stmt_vec_info stmt_info, int ncopies, slp_tree slp_node)
661 int alignment_support_cheme;
663 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info), *first_dr;
664 int inside_cost = 0, outside_cost = 0;
666 /* Strided accesses? */
667 first_stmt = DR_GROUP_FIRST_DR (stmt_info);
668 if (first_stmt && !slp_node)
670 group_size = vect_cost_strided_group_size (stmt_info);
671 first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
673 /* Not a strided access. */
680 alignment_support_cheme = vect_supportable_dr_alignment (first_dr);
682 /* Is this an access in a group of loads providing strided access?
683 If so, add in the cost of the permutes. */
686 /* Uses even and odd extract operations for each needed permute. */
687 inside_cost = ncopies * exact_log2(group_size) * group_size
688 * TARG_VEC_STMT_COST;
690 if (vect_print_dump_info (REPORT_DETAILS))
691 fprintf (vect_dump, "vect_model_load_cost: strided group_size = %d .",
696 /* The loads themselves. */
697 switch (alignment_support_cheme)
701 inside_cost += ncopies * TARG_VEC_LOAD_COST;
703 if (vect_print_dump_info (REPORT_DETAILS))
704 fprintf (vect_dump, "vect_model_load_cost: aligned.");
708 case dr_unaligned_supported:
710 /* Here, we assign an additional cost for the unaligned load. */
711 inside_cost += ncopies * TARG_VEC_UNALIGNED_LOAD_COST;
713 if (vect_print_dump_info (REPORT_DETAILS))
714 fprintf (vect_dump, "vect_model_load_cost: unaligned supported by "
719 case dr_explicit_realign:
721 inside_cost += ncopies * (2*TARG_VEC_LOAD_COST + TARG_VEC_STMT_COST);
723 /* FIXME: If the misalignment remains fixed across the iterations of
724 the containing loop, the following cost should be added to the
726 if (targetm.vectorize.builtin_mask_for_load)
727 inside_cost += TARG_VEC_STMT_COST;
731 case dr_explicit_realign_optimized:
733 if (vect_print_dump_info (REPORT_DETAILS))
734 fprintf (vect_dump, "vect_model_load_cost: unaligned software "
737 /* Unaligned software pipeline has a load of an address, an initial
738 load, and possibly a mask operation to "prime" the loop. However,
739 if this is an access in a group of loads, which provide strided
740 access, then the above cost should only be considered for one
741 access in the group. Inside the loop, there is a load op
742 and a realignment op. */
744 if ((!DR_GROUP_FIRST_DR (stmt_info)) || group_size > 1 || slp_node)
746 outside_cost = 2*TARG_VEC_STMT_COST;
747 if (targetm.vectorize.builtin_mask_for_load)
748 outside_cost += TARG_VEC_STMT_COST;
751 inside_cost += ncopies * (TARG_VEC_LOAD_COST + TARG_VEC_STMT_COST);
760 if (vect_print_dump_info (REPORT_DETAILS))
761 fprintf (vect_dump, "vect_model_load_cost: inside_cost = %d, "
762 "outside_cost = %d .", inside_cost, outside_cost);
764 /* Set the costs either in STMT_INFO or SLP_NODE (if exists). */
765 stmt_vinfo_set_inside_of_loop_cost (stmt_info, slp_node, inside_cost);
766 stmt_vinfo_set_outside_of_loop_cost (stmt_info, slp_node, outside_cost);
770 /* Function vect_get_new_vect_var.
772 Returns a name for a new variable. The current naming scheme uses the
773 prefix "vect_" or "vect_p" (depending on the value of VAR_KIND) as
774 the name of vectorizer generated variables, and appends NAME to it if
778 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
785 case vect_simple_var:
788 case vect_scalar_var:
791 case vect_pointer_var:
800 char* tmp = concat (prefix, name, NULL);
801 new_vect_var = create_tmp_var (type, tmp);
805 new_vect_var = create_tmp_var (type, prefix);
807 /* Mark vector typed variable as a gimple register variable. */
808 if (TREE_CODE (type) == VECTOR_TYPE)
809 DECL_GIMPLE_REG_P (new_vect_var) = true;
815 /* Function vect_create_addr_base_for_vector_ref.
817 Create an expression that computes the address of the first memory location
818 that will be accessed for a data reference.
821 STMT: The statement containing the data reference.
822 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
823 OFFSET: Optional. If supplied, it is added to the initial address.
824 LOOP: Specify relative to which loop-nest should the address be computed.
825 For example, when the dataref is in an inner-loop nested in an
826 outer-loop that is now being vectorized, LOOP can be either the
827 outer-loop, or the inner-loop. The first memory location accessed
828 by the following dataref ('in' points to short):
835 if LOOP=i_loop: &in (relative to i_loop)
836 if LOOP=j_loop: &in+i*2B (relative to j_loop)
839 1. Return an SSA_NAME whose value is the address of the memory location of
840 the first vector of the data reference.
841 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
842 these statement(s) which define the returned SSA_NAME.
844 FORNOW: We are only handling array accesses with step 1. */
847 vect_create_addr_base_for_vector_ref (tree stmt,
852 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
853 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
854 struct loop *containing_loop = (bb_for_stmt (stmt))->loop_father;
855 tree data_ref_base = unshare_expr (DR_BASE_ADDRESS (dr));
857 tree data_ref_base_var;
860 tree addr_base, addr_expr;
862 tree base_offset = unshare_expr (DR_OFFSET (dr));
863 tree init = unshare_expr (DR_INIT (dr));
864 tree vect_ptr_type, addr_expr2;
865 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
/* When LOOP is not the loop that directly contains STMT, replace the base
   address, offset and init taken from the data reference with the ones
   recorded in STMT_INFO (STMT_VINFO_DR_*).
   NOTE(review): the local LOOP declared inside this branch shadows the LOOP
   tested in the condition - confirm this is intended.  */
868 if (loop != containing_loop)
870 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
871 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
873 gcc_assert (nested_in_vect_loop_p (loop, stmt));
875 data_ref_base = unshare_expr (STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
876 base_offset = unshare_expr (STMT_VINFO_DR_OFFSET (stmt_info));
877 init = unshare_expr (STMT_VINFO_DR_INIT (stmt_info));
880 /* Create data_ref_base */
881 base_name = build_fold_indirect_ref (data_ref_base);
882 data_ref_base_var = create_tmp_var (TREE_TYPE (data_ref_base), "batmp");
883 add_referenced_var (data_ref_base_var);
884 data_ref_base = force_gimple_operand (data_ref_base, &new_base_stmt,
885 true, data_ref_base_var);
886 append_to_statement_list_force(new_base_stmt, new_stmt_list);
888 /* Create base_offset */
889 base_offset = size_binop (PLUS_EXPR, base_offset, init);
890 base_offset = fold_convert (sizetype, base_offset);
891 dest = create_tmp_var (TREE_TYPE (base_offset), "base_off");
892 add_referenced_var (dest);
893 base_offset = force_gimple_operand (base_offset, &new_stmt, true, dest);
894 append_to_statement_list_force (new_stmt, new_stmt_list);
/* OFFSET is scaled by STEP (the TYPE_SIZE_UNIT of the type of DR_REF)
   before it is added to BASE_OFFSET.
   NOTE(review): the guard testing whether OFFSET was supplied is not
   visible in this listing.  */
898 tree tmp = create_tmp_var (sizetype, "offset");
900 add_referenced_var (tmp);
901 offset = fold_build2 (MULT_EXPR, TREE_TYPE (offset), offset, step);
902 base_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (base_offset),
903 base_offset, offset);
904 base_offset = force_gimple_operand (base_offset, &new_stmt, false, tmp);
905 append_to_statement_list_force (new_stmt, new_stmt_list);
908 /* base + base_offset */
909 addr_base = fold_build2 (POINTER_PLUS_EXPR, TREE_TYPE (data_ref_base),
910 data_ref_base, base_offset);
912 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
914 /* addr_expr = addr_base */
/* NOTE(review): in the visible lines ADDR_EXPR is only registered with
   add_referenced_var; the converted address is gimplified into ADDR_EXPR2.
   Check whether ADDR_EXPR is still needed.  */
915 addr_expr = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var,
916 get_name (base_name));
917 add_referenced_var (addr_expr);
918 vec_stmt = fold_convert (vect_ptr_type, addr_base);
919 addr_expr2 = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var,
920 get_name (base_name));
921 add_referenced_var (addr_expr2);
922 vec_stmt = force_gimple_operand (vec_stmt, &new_stmt, false, addr_expr2);
923 append_to_statement_list_force (new_stmt, new_stmt_list);
925 if (vect_print_dump_info (REPORT_DETAILS))
927 fprintf (vect_dump, "created ");
928 print_generic_expr (vect_dump, vec_stmt, TDF_SLIM);
934 /* Function vect_create_data_ref_ptr.
936 Create a new pointer to vector type (vp), that points to the first location
937 accessed in the loop by STMT, along with the def-use update chain to
938 appropriately advance the pointer through the loop iterations. Also set
939 aliasing information for the pointer. This vector pointer is used by the
940 callers to this function to create a memory reference expression for vector
944 1. STMT: a stmt that references memory. Expected to be of the form
945 GIMPLE_MODIFY_STMT <name, data-ref> or
946 GIMPLE_MODIFY_STMT <data-ref, name>.
947 2. AT_LOOP: the loop where the vector memref is to be created.
948 3. OFFSET (optional): an offset to be added to the initial address accessed
949 by the data-ref in STMT.
950 4. ONLY_INIT: indicate if vp is to be updated in the loop, or remain
951 pointing to the initial address.
952 5. TYPE: if not NULL indicates the required type of the data-ref
955 1. Declare a new ptr to vector_type, and have it point to the base of the
956 data reference (initial address accessed by the data reference).
957 For example, for vector of type V8HI, the following code is generated:
960 vp = (v8hi *)initial_address;
962 if OFFSET is not supplied:
963 initial_address = &a[init];
964 if OFFSET is supplied:
965 initial_address = &a[init + OFFSET];
967 Return the initial_address in INITIAL_ADDRESS.
969 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
970 update the pointer in each iteration of the loop.
972 Return the increment stmt that updates the pointer in PTR_INCR.
974 3. Set INV_P to true if the access pattern of the data reference in the
975 vectorized loop is invariant. Set it to false otherwise.
977 4. Return the pointer. */
980 vect_create_data_ref_ptr (tree stmt, struct loop *at_loop,
981 tree offset, tree *initial_address, tree *ptr_incr,
982 bool only_init, tree type, bool *inv_p)
985 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
986 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
987 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
988 bool nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
989 struct loop *containing_loop = (bb_for_stmt (stmt))->loop_father;
990 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
996 tree new_stmt_list = NULL_TREE;
1000 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
1002 block_stmt_iterator incr_bsi;
1004 tree indx_before_incr, indx_after_incr;
1008 /* Check the step (evolution) of the load in LOOP, and record
1009 whether it's invariant. */
1010 if (nested_in_vect_loop)
1011 step = STMT_VINFO_DR_STEP (stmt_info);
1013 step = DR_STEP (STMT_VINFO_DATA_REF (stmt_info));
1015 if (tree_int_cst_compare (step, size_zero_node) == 0)
1020 /* Create an expression for the first address accessed by this load
1022 base_name = build_fold_indirect_ref (unshare_expr (DR_BASE_ADDRESS (dr)));
1024 if (vect_print_dump_info (REPORT_DETAILS))
1026 tree data_ref_base = base_name;
1027 fprintf (vect_dump, "create vector-pointer variable to type: ");
1028 print_generic_expr (vect_dump, vectype, TDF_SLIM);
1029 if (TREE_CODE (data_ref_base) == VAR_DECL)
1030 fprintf (vect_dump, " vectorizing a one dimensional array ref: ");
1031 else if (TREE_CODE (data_ref_base) == ARRAY_REF)
1032 fprintf (vect_dump, " vectorizing a multidimensional array ref: ");
1033 else if (TREE_CODE (data_ref_base) == COMPONENT_REF)
1034 fprintf (vect_dump, " vectorizing a record based array ref: ");
1035 else if (TREE_CODE (data_ref_base) == SSA_NAME)
1036 fprintf (vect_dump, " vectorizing a pointer ref: ");
1037 print_generic_expr (vect_dump, base_name, TDF_SLIM);
1040 /** (1) Create the new vector-pointer variable: **/
1042 vect_ptr_type = build_pointer_type (type);
1044 vect_ptr_type = build_pointer_type (vectype);
1045 vect_ptr = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var,
1046 get_name (base_name));
1047 add_referenced_var (vect_ptr);
1049 /** (2) Add aliasing information to the new vector-pointer:
1050 (The points-to info (DR_PTR_INFO) may be defined later.) **/
1052 tag = DR_SYMBOL_TAG (dr);
1055 /* If tag is a variable (and NOT_A_TAG) then a new symbol memory
1056 tag must be created with tag added to its may alias list. */
1058 new_type_alias (vect_ptr, tag, DR_REF (dr));
1060 set_symbol_mem_tag (vect_ptr, tag);
1062 var_ann (vect_ptr)->subvars = DR_SUBVARS (dr);
1064 /** Note: If the dataref is in an inner-loop nested in LOOP, and we are
1065 vectorizing LOOP (i.e. outer-loop vectorization), we need to create two
1066 def-use update cycles for the pointer: One relative to the outer-loop
1067 (LOOP), which is what steps (3) and (4) below do. The other is relative
1068 to the inner-loop (which is the inner-most loop containing the dataref),
1069 and this is done by step (5) below.
1071 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
1072 inner-most loop, and so steps (3),(4) work the same, and step (5) is
1073 redundant. Steps (3),(4) create the following:
1076 LOOP: vp1 = phi(vp0,vp2)
1082 If there is an inner-loop nested in loop, then step (5) will also be
1083 applied, and an additional update in the inner-loop will be created:
1086 LOOP: vp1 = phi(vp0,vp2)
1088 inner: vp3 = phi(vp1,vp4)
1089 vp4 = vp3 + inner_step
1095 /** (3) Calculate the initial address of the vector-pointer, and set
1096 the vector-pointer to point to it before the loop: **/
1098 /* Create: (&(base[init_val+offset]) in the loop preheader. */
1100 new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
1102 pe = loop_preheader_edge (loop);
1103 new_bb = bsi_insert_on_edge_immediate (pe, new_stmt_list);
1104 gcc_assert (!new_bb);
1105 *initial_address = new_temp;
1107 /* Create: p = (vectype *) initial_base */
1108 vec_stmt = fold_convert (vect_ptr_type, new_temp);
1109 vec_stmt = build_gimple_modify_stmt (vect_ptr, vec_stmt);
1110 vect_ptr_init = make_ssa_name (vect_ptr, vec_stmt);
1111 GIMPLE_STMT_OPERAND (vec_stmt, 0) = vect_ptr_init;
1112 new_bb = bsi_insert_on_edge_immediate (pe, vec_stmt);
1113 gcc_assert (!new_bb);
1116 /** (4) Handle the updating of the vector-pointer inside the loop.
1117 This is needed when ONLY_INIT is false, and also when AT_LOOP
1118 is the inner-loop nested in LOOP (during outer-loop vectorization).
1121 if (only_init && at_loop == loop) /* No update in loop is required. */
1123 /* Copy the points-to information if it exists. */
1124 if (DR_PTR_INFO (dr))
1125 duplicate_ssa_name_ptr_info (vect_ptr_init, DR_PTR_INFO (dr));
1126 vptr = vect_ptr_init;
1130 /* The step of the vector pointer is the Vector Size. */
1131 tree step = TYPE_SIZE_UNIT (vectype);
1132 /* One exception to the above is when the scalar step of the load in
1133 LOOP is zero. In this case the step here is also zero. */
1135 step = size_zero_node;
1137 standard_iv_increment_position (loop, &incr_bsi, &insert_after);
1139 create_iv (vect_ptr_init,
1140 fold_convert (vect_ptr_type, step),
1141 NULL_TREE, loop, &incr_bsi, insert_after,
1142 &indx_before_incr, &indx_after_incr);
1143 incr = bsi_stmt (incr_bsi);
1144 set_stmt_info (stmt_ann (incr),
1145 new_stmt_vec_info (incr, loop_vinfo));
1147 /* Copy the points-to information if it exists. */
1148 if (DR_PTR_INFO (dr))
1150 duplicate_ssa_name_ptr_info (indx_before_incr, DR_PTR_INFO (dr));
1151 duplicate_ssa_name_ptr_info (indx_after_incr, DR_PTR_INFO (dr));
1153 merge_alias_info (vect_ptr_init, indx_before_incr);
1154 merge_alias_info (vect_ptr_init, indx_after_incr);
1158 vptr = indx_before_incr;
1161 if (!nested_in_vect_loop || only_init)
1165 /** (5) Handle the updating of the vector-pointer inside the inner-loop
1166 nested in LOOP, if exists: **/
1168 gcc_assert (nested_in_vect_loop);
1171 standard_iv_increment_position (containing_loop, &incr_bsi,
1173 create_iv (vptr, fold_convert (vect_ptr_type, DR_STEP (dr)), NULL_TREE,
1174 containing_loop, &incr_bsi, insert_after, &indx_before_incr,
1176 incr = bsi_stmt (incr_bsi);
1177 set_stmt_info (stmt_ann (incr), new_stmt_vec_info (incr, loop_vinfo));
1179 /* Copy the points-to information if it exists. */
1180 if (DR_PTR_INFO (dr))
1182 duplicate_ssa_name_ptr_info (indx_before_incr, DR_PTR_INFO (dr));
1183 duplicate_ssa_name_ptr_info (indx_after_incr, DR_PTR_INFO (dr));
1185 merge_alias_info (vect_ptr_init, indx_before_incr);
1186 merge_alias_info (vect_ptr_init, indx_after_incr);
1190 return indx_before_incr;
1197 /* Function bump_vector_ptr
1199 Increment a pointer (to a vector type) by vector-size. If requested,
1200 i.e. if PTR-INCR is given, then also connect the new increment stmt
1201 to the existing def-use update-chain of the pointer, by modifying
1202 the PTR_INCR as illustrated below:
1204 The pointer def-use update-chain before this function:
1205 DATAREF_PTR = phi (p_0, p_2)
1207 PTR_INCR: p_2 = DATAREF_PTR + step
1209 The pointer def-use update-chain after this function:
1210 DATAREF_PTR = phi (p_0, p_2)
1212 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
1214 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
1217 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
1219 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
1220 the loop. The increment amount across iterations is expected
1222 BSI - location where the new update stmt is to be placed.
1223 STMT - the original scalar memory-access stmt that is being vectorized.
1224 BUMP - optional. The offset by which to bump the pointer. If not given,
1225 the offset is assumed to be vector_size.
1227 Output: Return NEW_DATAREF_PTR as illustrated above.
/* Implementation summary (from the statements visible below):
   - UPDATE starts out as TYPE_SIZE_UNIT of STMT's vectype.
   - The increment is built as `ptr_var = dataref_ptr POINTER_PLUS_EXPR
     update', given a fresh SSA name (NEW_DATAREF_PTR) and emitted through
     vect_finish_stmt_generation at BSI.
   - Points-to info of the data-ref, when present, is copied to the new
     name, and alias info is merged with DATAREF_PTR.
   - Every use of DATAREF_PTR in PTR_INCR is redirected to NEW_DATAREF_PTR;
     the remaining assert compares a use operand with UPDATE.
   NOTE(review): how BUMP overrides UPDATE, and the condition guarding the
   first `return', are not visible in this listing -- confirm against the
   complete source.  */
1232 bump_vector_ptr (tree dataref_ptr, tree ptr_incr, block_stmt_iterator *bsi,
1233 tree stmt, tree bump)
1235 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1236 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
1237 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1238 tree vptr_type = TREE_TYPE (dataref_ptr);
1239 tree ptr_var = SSA_NAME_VAR (dataref_ptr);
1240 tree update = TYPE_SIZE_UNIT (vectype);
1243 use_operand_p use_p;
1244 tree new_dataref_ptr;
/* Build the pointer bump and make its left-hand side a new SSA name of the
   same underlying variable as DATAREF_PTR.  */
1249 incr_stmt = build_gimple_modify_stmt (ptr_var,
1250 build2 (POINTER_PLUS_EXPR, vptr_type,
1251 dataref_ptr, update));
1252 new_dataref_ptr = make_ssa_name (ptr_var, incr_stmt);
1253 GIMPLE_STMT_OPERAND (incr_stmt, 0) = new_dataref_ptr;
1254 vect_finish_stmt_generation (stmt, incr_stmt, bsi);
1256 /* Copy the points-to information if it exists. */
1257 if (DR_PTR_INFO (dr))
1258 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
1259 merge_alias_info (new_dataref_ptr, dataref_ptr);
1262 return new_dataref_ptr;
1264 /* Update the vector-pointer's cross-iteration increment. */
1265 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
1267 tree use = USE_FROM_PTR (use_p);
1269 if (use == dataref_ptr)
1270 SET_USE (use_p, new_dataref_ptr);
1272 gcc_assert (tree_int_cst_compare (use, update) == 0);
1275 return new_dataref_ptr;
1279 /* Function vect_create_destination_var.
1281 Create a new temporary of type VECTYPE. */
/* SCALAR_DEST must be an SSA_NAME (asserted below).  When VECTYPE is
   non-null the new variable has kind vect_simple_var and type VECTYPE;
   otherwise it has kind vect_scalar_var and the type of SCALAR_DEST.  The
   variable is named after SCALAR_DEST (via get_name) and registered with
   add_referenced_var.
   NOTE(review): the return statement and any handling of a null name from
   get_name are not visible in this listing -- presumably the new variable
   is returned; confirm against the complete source.  */
1284 vect_create_destination_var (tree scalar_dest, tree vectype)
1287 const char *new_name;
1289 enum vect_var_kind kind;
1291 kind = vectype ? vect_simple_var : vect_scalar_var;
1292 type = vectype ? vectype : TREE_TYPE (scalar_dest);
1294 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
1296 new_name = get_name (scalar_dest);
1299 vec_dest = vect_get_new_vect_var (type, kind, new_name);
1300 add_referenced_var (vec_dest);
1306 /* Function vect_init_vector.
1308 Insert a new stmt (INIT_STMT) that initializes a new vector variable with
1309 the vector elements of VECTOR_VAR. Place the initialization at BSI if it
1310 is not NULL. Otherwise, place the initialization at the loop preheader.
1311 Return the DEF of INIT_STMT.
1312 It will be used in the vectorization of STMT. */
/* Implementation summary (from the statements visible below): a new
   vect_simple_var named "cst_" of type VECTOR_TYPE is created, INIT_STMT
   `new_var = VECTOR_VAR' is built and its left-hand side replaced by a
   fresh SSA name.  The stmt is then placed either with
   vect_finish_stmt_generation at BSI, or on the preheader edge of a loop
   (asserting that no new basic block had to be created).  The value read
   back at the end is operand 0 of INIT_STMT, i.e. the new SSA name.
   NOTE(review): the test that selects between the two placements, the
   effect of the nested_in_vect_loop_p check on LOOP, and the return
   statement are not visible in this listing -- confirm.  */
1315 vect_init_vector (tree stmt, tree vector_var, tree vector_type,
1316 block_stmt_iterator *bsi)
1318 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
1326 new_var = vect_get_new_vect_var (vector_type, vect_simple_var, "cst_");
1327 add_referenced_var (new_var);
1328 init_stmt = build_gimple_modify_stmt (new_var, vector_var);
1329 new_temp = make_ssa_name (new_var, init_stmt);
1330 GIMPLE_STMT_OPERAND (init_stmt, 0) = new_temp;
1333 vect_finish_stmt_generation (stmt, init_stmt, bsi);
1336 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
1337 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1339 if (nested_in_vect_loop_p (loop, stmt))
1341 pe = loop_preheader_edge (loop);
1342 new_bb = bsi_insert_on_edge_immediate (pe, init_stmt);
1343 gcc_assert (!new_bb);
1346 if (vect_print_dump_info (REPORT_DETAILS))
1348 fprintf (vect_dump, "created new init_stmt: ");
1349 print_generic_expr (vect_dump, init_stmt, TDF_SLIM);
1352 vec_oprnd = GIMPLE_STMT_OPERAND (init_stmt, 0);
1357 /* For constant and loop invariant defs of SLP_NODE this function returns
1358 (vector) defs (VEC_OPRNDS) that will be used in the vectorized stmts.
1359 OP_NUM determines if we gather defs for operand 0 or operand 1 of the scalar
/* Implementation summary (from the statements visible below): the scalar
   stmts of SLP_NODE are walked from the last to the first, NUMBER_OF_COPIES
   times.  Operand OP_NUM of each stmt's right-hand side is consed onto a
   list; each time NUNITS elements have been collected a vector is built
   (build_vector or build_constructor_from_list), initialised through
   vect_init_vector and pushed onto the temporary VOPRNDS.  VOPRNDS is then
   copied in reverse order into *VEC_OPRNDS and freed, and the entries of
   *VEC_OPRNDS are replicated until it holds
   SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_NODE) elements.
   *VEC_OPRNDS is only ever extended with VEC_quick_push here; in this file
   vect_get_slp_defs allocates it with VEC_alloc before calling.
   NOTE(review): the store-specific handling after the STMT_VINFO_DATA_REF
   test and the condition choosing between build_vector and
   build_constructor_from_list are not visible in this listing -- confirm.  */
1363 vect_get_constant_vectors (slp_tree slp_node, VEC(tree,heap) **vec_oprnds,
1364 unsigned int op_num)
1366 VEC (tree, heap) *stmts = SLP_TREE_SCALAR_STMTS (slp_node);
1367 tree stmt = VEC_index (tree, stmts, 0);
1368 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
1369 tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
1370 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
1373 int j, number_of_places_left_in_vector;
1375 tree op, vop, operation;
1376 int group_size = VEC_length (tree, stmts);
1377 unsigned int vec_num, i;
1378 int number_of_copies = 1;
1379 bool is_store = false;
1380 unsigned int number_of_vectors = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
1381 VEC (tree, heap) *voprnds = VEC_alloc (tree, heap, number_of_vectors);
1384 if (STMT_VINFO_DATA_REF (stmt_vinfo))
1387 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
1388 created vectors. It is greater than 1 if unrolling is performed.
1390 For example, we have two scalar operands, s1 and s2 (e.g., group of
1391 strided accesses of size two), while NUNITS is four (i.e., four scalars
1392 of this type can be packed in a vector). The output vector will contain
1393 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
1396 If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
1397 containing the operands.
1399 For example, NUNITS is four as before, and the group size is 8
1400 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
1401 {s5, s6, s7, s8}. */
1403 number_of_copies = least_common_multiple (nunits, group_size) / group_size;
1405 number_of_places_left_in_vector = nunits;
1407 for (j = 0; j < number_of_copies; j++)
1409 for (i = group_size - 1; VEC_iterate (tree, stmts, i, stmt); i--)
1411 operation = GIMPLE_STMT_OPERAND (stmt, 1);
1415 op = TREE_OPERAND (operation, op_num);
1416 if (!CONSTANT_CLASS_P (op))
1419 /* Create 'vect_ = {op0,op1,...,opn}'. */
1420 t = tree_cons (NULL_TREE, op, t);
1422 number_of_places_left_in_vector--;
1424 if (number_of_places_left_in_vector == 0)
1426 number_of_places_left_in_vector = nunits;
1428 vector_type = get_vectype_for_scalar_type (TREE_TYPE (op));
1429 gcc_assert (vector_type);
1431 vec_cst = build_vector (vector_type, t);
1433 vec_cst = build_constructor_from_list (vector_type, t);
1435 VEC_quick_push (tree, voprnds,
1436 vect_init_vector (stmt, vec_cst, vector_type,
1443 /* Since the vectors are created in the reverse order, we should invert
1445 vec_num = VEC_length (tree, voprnds);
1446 for (j = vec_num - 1; j >= 0; j--)
1448 vop = VEC_index (tree, voprnds, j);
1449 VEC_quick_push (tree, *vec_oprnds, vop);
1452 VEC_free (tree, heap, voprnds);
1454 /* In case that VF is greater than the unrolling factor needed for the SLP
1455 group of stmts, NUMBER_OF_VECTORS to be created is greater than
1456 NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
1457 to replicate the vectors. */
1458 while (number_of_vectors > VEC_length (tree, *vec_oprnds))
1460 for (i = 0; VEC_iterate (tree, *vec_oprnds, i, vop) && i < vec_num; i++)
1461 VEC_quick_push (tree, *vec_oprnds, vop);
1466 /* Get vectorized definitions from SLP_NODE that contains corresponding
1467 vectorized def-stmts. */
/* SLP_NODE must already have vectorized stmts (asserted).  For each stmt in
   SLP_TREE_VEC_STMTS (SLP_NODE), operand 0 of that stmt (its left-hand
   side) is appended to *VEC_OPRNDS with VEC_quick_push; in this file the
   caller vect_get_slp_defs allocates *VEC_OPRNDS beforehand.
   NOTE(review): the loop header around the VEC_iterate call is only
   partially visible in this listing.  */
1470 vect_get_slp_vect_defs (slp_tree slp_node, VEC (tree,heap) **vec_oprnds)
1476 gcc_assert (SLP_TREE_VEC_STMTS (slp_node));
1479 VEC_iterate (tree, SLP_TREE_VEC_STMTS (slp_node), i, vec_def_stmt);
1482 gcc_assert (vec_def_stmt);
1483 vec_oprnd = GIMPLE_STMT_OPERAND (vec_def_stmt, 0);
1484 VEC_quick_push (tree, *vec_oprnds, vec_oprnd);
1489 /* Get vectorized definitions for SLP_NODE.
1490 If the scalar definitions are loop invariants or constants, collect them and
1491 call vect_get_constant_vectors() to create vector stmts.
1492 Otherwise, the def-stmts must be already vectorized and the vectorized stmts
1493 must be stored in the LEFT/RIGHT node of SLP_NODE, and we call
1494 vect_get_slp_vect_defs() to retrieve them.
1495 If VEC_OPRNDS1 is NULL, don't get vector defs for the second operand (from
1496 the right node). This is used when the second operand must remain scalar. */
/* Implementation summary (from the statements visible below):
   *VEC_OPRNDS0 is allocated with room for
   SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_NODE) entries and filled either from
   the already vectorized LEFT child (vect_get_slp_vect_defs) or by building
   vectors from the scalar operand 0 (vect_get_constant_vectors).  The same
   is then done for *VEC_OPRNDS1 using the RIGHT child / operand 1.
   NOTE(review): the statements executed when the first scalar stmt has a
   data-ref, or when the operation is unary or VEC_OPRNDS1 is null, are not
   visible in this listing (presumably early returns) -- confirm.  */
1499 vect_get_slp_defs (slp_tree slp_node, VEC (tree,heap) **vec_oprnds0,
1500 VEC (tree,heap) **vec_oprnds1)
1502 tree operation, first_stmt;
1504 /* Allocate memory for vectorized defs. */
1505 *vec_oprnds0 = VEC_alloc (tree, heap,
1506 SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node));
1508 /* SLP_NODE corresponds either to a group of stores or to a group of
1509 unary/binary operations. We don't call this function for loads. */
1510 if (SLP_TREE_LEFT (slp_node))
1511 /* The defs are already vectorized. */
1512 vect_get_slp_vect_defs (SLP_TREE_LEFT (slp_node), vec_oprnds0);
1514 /* Build vectors from scalar defs. */
1515 vect_get_constant_vectors (slp_node, vec_oprnds0, 0);
1517 first_stmt = VEC_index (tree, SLP_TREE_SCALAR_STMTS (slp_node), 0);
1518 if (STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt)))
1519 /* Since we don't call this function with loads, this is a group of
1523 operation = GIMPLE_STMT_OPERAND (first_stmt, 1);
1524 if (TREE_OPERAND_LENGTH (operation) == unary_op || !vec_oprnds1)
1527 *vec_oprnds1 = VEC_alloc (tree, heap,
1528 SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node));
1530 if (SLP_TREE_RIGHT (slp_node))
1531 /* The defs are already vectorized. */
1532 vect_get_slp_vect_defs (SLP_TREE_RIGHT (slp_node), vec_oprnds1);
1534 /* Build vectors from scalar defs. */
1535 vect_get_constant_vectors (slp_node, vec_oprnds1, 1);
1539 /* Function get_initial_def_for_induction
1542 STMT - a stmt that performs an induction operation in the loop.
1543 IV_PHI - the initial value of the induction variable
1546 Return a vector variable, initialized with the first VF values of
1547 the induction variable. E.g., for an iv with IV_PHI='X' and
1548 evolution S, for a vector of 4 units, we want to return:
1549 [X, X + S, X + 2*S, X + 3*S]. */
/* Implementation summary (from the statements visible below):
   1. The vector type is derived from the scalar type of IV_PHI's result;
      NCOPIES is VF / NUNITS and is asserted to be at least 1.
   2. The evolution of the PHI result in IV_LOOP is analysed to obtain
      INIT_EXPR and STEP_EXPR.
   3. VEC_INIT is either fetched from the already vectorized preheader
      argument of IV_PHI (induction nested in the vectorized loop), or built
      as [X, X+S, ...] from scalar stmts inserted on the preheader edge.
   4. VEC_STEP is a constant vector of S (nested case) or VF*S.
   5. A new PHI "vec_iv_" is created in IV_LOOP's header together with the
      update `vec_iv + vec_step', inserted after the labels of IV_PHI's
      block; the PHI arguments are VEC_INIT (preheader) and the update
      (latch).
   6. Further copies stepping by NUNITS*S are chained through
      STMT_VINFO_RELATED_STMT (the loop runs for i = 1 .. NCOPIES-1).
   7. In the nested case the last created stmt is recorded as the vectorized
      stmt of the induction's exit phi.
   8. STMT_VINFO_VEC_STMT of IV_PHI is set to the new induction PHI.
   NOTE(review): several guards (e.g. the test on NCOPIES, the assignment of
   IV_LOOP in the non-nested case) and the return statement are not visible
   in this listing -- confirm against the complete source.  */
1552 get_initial_def_for_induction (tree iv_phi)
1554 stmt_vec_info stmt_vinfo = vinfo_for_stmt (iv_phi);
1555 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
1556 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1557 tree scalar_type = TREE_TYPE (PHI_RESULT_TREE (iv_phi));
1560 edge pe = loop_preheader_edge (loop);
1561 struct loop *iv_loop;
1563 tree vec, vec_init, vec_step, t;
1568 tree induction_phi, induc_def, new_stmt, vec_def, vec_dest;
1569 tree init_expr, step_expr;
1570 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1575 stmt_vec_info phi_info = vinfo_for_stmt (iv_phi);
1576 bool nested_in_vect_loop = false;
1578 imm_use_iterator imm_iter;
1579 use_operand_p use_p;
1583 block_stmt_iterator si;
1584 basic_block bb = bb_for_stmt (iv_phi);
1586 vectype = get_vectype_for_scalar_type (scalar_type);
1587 gcc_assert (vectype);
1588 nunits = TYPE_VECTOR_SUBPARTS (vectype);
1589 ncopies = vf / nunits;
1591 gcc_assert (phi_info);
1592 gcc_assert (ncopies >= 1);
1594 /* Find the first insertion point in the BB. */
1595 si = bsi_after_labels (bb);
/* Give STEP_EXPR a zero default of the scalar type; its address is passed
   to vect_is_simple_iv_evolution further down.  */
1597 if (INTEGRAL_TYPE_P (scalar_type))
1598 step_expr = build_int_cst (scalar_type, 0);
1600 step_expr = build_real (scalar_type, dconst0);
1602 /* Is phi in an inner-loop, while vectorizing an enclosing outer-loop? */
1603 if (nested_in_vect_loop_p (loop, iv_phi))
1605 nested_in_vect_loop = true;
1606 iv_loop = loop->inner;
1610 gcc_assert (iv_loop == (bb_for_stmt (iv_phi))->loop_father);
1612 latch_e = loop_latch_edge (iv_loop);
1613 loop_arg = PHI_ARG_DEF_FROM_EDGE (iv_phi, latch_e);
1615 access_fn = analyze_scalar_evolution (iv_loop, PHI_RESULT (iv_phi));
1616 gcc_assert (access_fn);
1617 ok = vect_is_simple_iv_evolution (iv_loop->num, access_fn,
1618 &init_expr, &step_expr);
1620 pe = loop_preheader_edge (iv_loop);
1622 /* Create the vector that holds the initial_value of the induction. */
1623 if (nested_in_vect_loop)
1625 /* iv_loop is nested in the loop to be vectorized. init_expr had already
1626 been created during vectorization of previous stmts; We obtain it from
1627 the STMT_VINFO_VEC_STMT of the defining stmt. */
1628 tree iv_def = PHI_ARG_DEF_FROM_EDGE (iv_phi, loop_preheader_edge (iv_loop));
1629 vec_init = vect_get_vec_def_for_operand (iv_def, iv_phi, NULL);
1633 /* iv_loop is the loop to be vectorized. Create:
1634 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
1635 new_var = vect_get_new_vect_var (scalar_type, vect_scalar_var, "var_");
1636 add_referenced_var (new_var);
1638 new_name = force_gimple_operand (init_expr, &stmts, false, new_var);
1641 new_bb = bsi_insert_on_edge_immediate (pe, stmts);
1642 gcc_assert (!new_bb);
1646 t = tree_cons (NULL_TREE, init_expr, t);
1647 for (i = 1; i < nunits; i++)
1651 /* Create: new_name_i = new_name + step_expr */
1652 tmp = fold_build2 (PLUS_EXPR, scalar_type, new_name, step_expr);
1653 init_stmt = build_gimple_modify_stmt (new_var, tmp);
1654 new_name = make_ssa_name (new_var, init_stmt);
1655 GIMPLE_STMT_OPERAND (init_stmt, 0) = new_name;
1657 new_bb = bsi_insert_on_edge_immediate (pe, init_stmt);
1658 gcc_assert (!new_bb);
1660 if (vect_print_dump_info (REPORT_DETAILS))
1662 fprintf (vect_dump, "created new init_stmt: ");
1663 print_generic_expr (vect_dump, init_stmt, TDF_SLIM);
1665 t = tree_cons (NULL_TREE, new_name, t);
1667 /* Create a vector from [new_name_0, new_name_1, ..., new_name_nunits-1] */
1668 vec = build_constructor_from_list (vectype, nreverse (t));
1669 vec_init = vect_init_vector (iv_phi, vec, vectype, NULL);
1673 /* Create the vector that holds the step of the induction. */
1674 if (nested_in_vect_loop)
1675 /* iv_loop is nested in the loop to be vectorized. Generate:
1676 vec_step = [S, S, S, S] */
1677 new_name = step_expr;
1680 /* iv_loop is the loop to be vectorized. Generate:
1681 vec_step = [VF*S, VF*S, VF*S, VF*S] */
1682 expr = build_int_cst (scalar_type, vf);
1683 new_name = fold_build2 (MULT_EXPR, scalar_type, expr, step_expr);
1687 for (i = 0; i < nunits; i++)
1688 t = tree_cons (NULL_TREE, unshare_expr (new_name), t);
1689 gcc_assert (CONSTANT_CLASS_P (new_name));
1690 vec = build_vector (vectype, t);
1691 vec_step = vect_init_vector (iv_phi, vec, vectype, NULL);
1694 /* Create the following def-use cycle:
1699 vec_iv = PHI <vec_init, vec_loop>
1703 vec_loop = vec_iv + vec_step; */
1705 /* Create the induction-phi that defines the induction-operand. */
1706 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
1707 add_referenced_var (vec_dest);
1708 induction_phi = create_phi_node (vec_dest, iv_loop->header);
1709 set_stmt_info (get_stmt_ann (induction_phi),
1710 new_stmt_vec_info (induction_phi, loop_vinfo));
1711 induc_def = PHI_RESULT (induction_phi);
1713 /* Create the iv update inside the loop */
1714 new_stmt = build_gimple_modify_stmt (NULL_TREE,
1715 build2 (PLUS_EXPR, vectype,
1716 induc_def, vec_step));
1717 vec_def = make_ssa_name (vec_dest, new_stmt);
1718 GIMPLE_STMT_OPERAND (new_stmt, 0) = vec_def;
1719 bsi_insert_before (&si, new_stmt, BSI_SAME_STMT);
1720 set_stmt_info (get_stmt_ann (new_stmt),
1721 new_stmt_vec_info (new_stmt, loop_vinfo));
1723 /* Set the arguments of the phi node: */
1724 add_phi_arg (induction_phi, vec_init, pe);
1725 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop));
1728 /* In case that vectorization factor (VF) is bigger than the number
1729 of elements that we can fit in a vectype (nunits), we have to generate
1730 more than one vector stmt - i.e - we need to "unroll" the
1731 vector stmt by a factor VF/nunits. For more details see documentation
1732 in vectorizable_operation. */
1736 stmt_vec_info prev_stmt_vinfo;
1737 /* FORNOW. This restriction should be relaxed. */
1738 gcc_assert (!nested_in_vect_loop);
1740 /* Create the vector that holds the step of the induction. */
1741 expr = build_int_cst (scalar_type, nunits);
1742 new_name = fold_build2 (MULT_EXPR, scalar_type, expr, step_expr);
1744 for (i = 0; i < nunits; i++)
1745 t = tree_cons (NULL_TREE, unshare_expr (new_name), t);
1746 gcc_assert (CONSTANT_CLASS_P (new_name));
1747 vec = build_vector (vectype, t);
1748 vec_step = vect_init_vector (iv_phi, vec, vectype, NULL);
1750 vec_def = induc_def;
1751 prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
1752 for (i = 1; i < ncopies; i++)
1756 /* vec_i = vec_prev + vec_step */
1757 tmp = build2 (PLUS_EXPR, vectype, vec_def, vec_step);
1758 new_stmt = build_gimple_modify_stmt (NULL_TREE, tmp);
1759 vec_def = make_ssa_name (vec_dest, new_stmt);
1760 GIMPLE_STMT_OPERAND (new_stmt, 0) = vec_def;
1761 bsi_insert_before (&si, new_stmt, BSI_SAME_STMT);
1762 set_stmt_info (get_stmt_ann (new_stmt),
1763 new_stmt_vec_info (new_stmt, loop_vinfo));
1764 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
1765 prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
1769 if (nested_in_vect_loop)
1771 /* Find the loop-closed exit-phi of the induction, and record
1772 the final vector of induction results: */
1774 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
1776 if (!flow_bb_inside_loop_p (iv_loop, bb_for_stmt (USE_STMT (use_p))))
1778 exit_phi = USE_STMT (use_p);
1784 stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
1785 /* FORNOW. Currently not supporting the case that an inner-loop induction
1786 is not used in the outer-loop (i.e. only outside the outer-loop). */
1787 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
1788 && !STMT_VINFO_LIVE_P (stmt_vinfo));
1790 STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
1791 if (vect_print_dump_info (REPORT_DETAILS))
1793 fprintf (vect_dump, "vector of inductions after inner-loop:");
1794 print_generic_expr (vect_dump, new_stmt, TDF_SLIM);
1800 if (vect_print_dump_info (REPORT_DETAILS))
1802 fprintf (vect_dump, "transform induction: created def-use cycle:");
1803 print_generic_expr (vect_dump, induction_phi, TDF_SLIM);
1804 fprintf (vect_dump, "\n");
1805 print_generic_expr (vect_dump, SSA_NAME_DEF_STMT (vec_def), TDF_SLIM);
1808 STMT_VINFO_VEC_STMT (phi_info) = induction_phi;
1813 /* Function vect_get_vec_def_for_operand.
1815 OP is an operand in STMT. This function returns a (vector) def that will be
1816 used in the vectorized stmt for STMT.
1818 In the case that OP is an SSA_NAME which is defined in the loop, then
1819 STMT_VINFO_VEC_STMT of the defining stmt holds the relevant def.
1821 In case OP is an invariant or constant, a new stmt that creates a vector def
1822 needs to be introduced. */
/* Implementation summary (from the statements visible below): OP is
   classified with vect_is_simple_use (asserted to succeed), which yields
   DEF_STMT, DEF and the def type DT.  The visible cases are:
     - vect_constant_def: a vector constant of NUNITS copies of OP is built
       and materialised with vect_init_vector;
     - vect_invariant_def: a constructor of NUNITS copies of DEF is built and
       materialised with vect_init_vector;
     - def inside the loop: the result is the PHI result or operand 0 of the
       vectorized stmt recorded for DEF_STMT, and *SCALAR_DEF receives
       DEF_STMT;
     - vect_reduction_def: delegated to get_initial_def_for_reduction using
       the preheader argument of the defining PHI;
     - vect_induction_def: the PHI result of the vectorized induction PHI.
   NUNITS is taken from the vectype recorded for STMT.
   NOTE(review): the dispatch statement itself, the guard on SCALAR_DEF
   before it is written, and some return statements are not visible in this
   listing -- confirm against the complete source.  */
1825 vect_get_vec_def_for_operand (tree op, tree stmt, tree *scalar_def)
1830 stmt_vec_info def_stmt_info = NULL;
1831 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
1832 tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
1833 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
1834 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
1840 enum vect_def_type dt;
1844 if (vect_print_dump_info (REPORT_DETAILS))
1846 fprintf (vect_dump, "vect_get_vec_def_for_operand: ");
1847 print_generic_expr (vect_dump, op, TDF_SLIM);
1850 is_simple_use = vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt);
1851 gcc_assert (is_simple_use);
1852 if (vect_print_dump_info (REPORT_DETAILS))
1856 fprintf (vect_dump, "def = ");
1857 print_generic_expr (vect_dump, def, TDF_SLIM);
1861 fprintf (vect_dump, " def_stmt = ");
1862 print_generic_expr (vect_dump, def_stmt, TDF_SLIM);
1868 /* Case 1: operand is a constant. */
1869 case vect_constant_def:
1874 /* Create 'vect_cst_ = {cst,cst,...,cst}' */
1875 if (vect_print_dump_info (REPORT_DETAILS))
1876 fprintf (vect_dump, "Create vector_cst. nunits = %d", nunits);
1878 for (i = nunits - 1; i >= 0; --i)
1880 t = tree_cons (NULL_TREE, op, t);
1882 vector_type = get_vectype_for_scalar_type (TREE_TYPE (op));
1883 gcc_assert (vector_type);
1884 vec_cst = build_vector (vector_type, t);
1886 return vect_init_vector (stmt, vec_cst, vector_type, NULL);
1889 /* Case 2: operand is defined outside the loop - loop invariant. */
1890 case vect_invariant_def:
1895 /* Create 'vec_inv = {inv,inv,..,inv}' */
1896 if (vect_print_dump_info (REPORT_DETAILS))
1897 fprintf (vect_dump, "Create vector_inv.");
1899 for (i = nunits - 1; i >= 0; --i)
1901 t = tree_cons (NULL_TREE, def, t);
1904 /* FIXME: use build_constructor directly. */
1905 vector_type = get_vectype_for_scalar_type (TREE_TYPE (def));
1906 gcc_assert (vector_type);
1907 vec_inv = build_constructor_from_list (vector_type, t);
1908 return vect_init_vector (stmt, vec_inv, vector_type, NULL);
1911 /* Case 3: operand is defined inside the loop. */
1915 *scalar_def = def_stmt;
1917 /* Get the def from the vectorized stmt. */
1918 def_stmt_info = vinfo_for_stmt (def_stmt);
1919 vec_stmt = STMT_VINFO_VEC_STMT (def_stmt_info);
1920 gcc_assert (vec_stmt);
1921 if (TREE_CODE (vec_stmt) == PHI_NODE)
1922 vec_oprnd = PHI_RESULT (vec_stmt);
1924 vec_oprnd = GIMPLE_STMT_OPERAND (vec_stmt, 0);
1928 /* Case 4: operand is defined by a loop header phi - reduction */
1929 case vect_reduction_def:
1933 gcc_assert (TREE_CODE (def_stmt) == PHI_NODE);
1934 loop = (bb_for_stmt (def_stmt))->loop_father;
1936 /* Get the def before the loop */
1937 op = PHI_ARG_DEF_FROM_EDGE (def_stmt, loop_preheader_edge (loop));
1938 return get_initial_def_for_reduction (stmt, op, scalar_def);
1941 /* Case 5: operand is defined by loop-header phi - induction. */
1942 case vect_induction_def:
1944 gcc_assert (TREE_CODE (def_stmt) == PHI_NODE);
1946 /* Get the def from the vectorized stmt. */
1947 def_stmt_info = vinfo_for_stmt (def_stmt);
1948 vec_stmt = STMT_VINFO_VEC_STMT (def_stmt_info);
1949 gcc_assert (vec_stmt && (TREE_CODE (vec_stmt) == PHI_NODE));
1950 vec_oprnd = PHI_RESULT (vec_stmt);
1960 /* Function vect_get_vec_def_for_stmt_copy
1962 Return a vector-def for an operand. This function is used when the
1963 vectorized stmt to be created (by the caller to this function) is a "copy"
1964 created in case the vectorized result cannot fit in one vector, and several
1965 copies of the vector-stmt are required. In this case the vector-def is
1966 retrieved from the vector stmt recorded in the STMT_VINFO_RELATED_STMT field
1967 of the stmt that defines VEC_OPRND.
1968 DT is the type of the vector def VEC_OPRND.
1971 In case the vectorization factor (VF) is bigger than the number
1972 of elements that can fit in a vectype (nunits), we have to generate
1973 more than one vector stmt to vectorize the scalar stmt. This situation
1974 arises when there are multiple data-types operated upon in the loop; the
1975 smallest data-type determines the VF, and as a result, when vectorizing
1976 stmts operating on wider types we need to create 'VF/nunits' "copies" of the
1977 vector stmt (each computing a vector of 'nunits' results, and together
1978 computing 'VF' results in each iteration). This function is called when
1979 vectorizing such a stmt (e.g. vectorizing S2 in the illustration below, in
1980 which VF=16 and nunits=4, so the number of copies required is 4):
1982 scalar stmt: vectorized into: STMT_VINFO_RELATED_STMT
1984 S1: x = load VS1.0: vx.0 = memref0 VS1.1
1985 VS1.1: vx.1 = memref1 VS1.2
1986 VS1.2: vx.2 = memref2 VS1.3
1987 VS1.3: vx.3 = memref3
1989 S2: z = x + ... VSnew.0: vz0 = vx.0 + ... VSnew.1
1990 VSnew.1: vz1 = vx.1 + ... VSnew.2
1991 VSnew.2: vz2 = vx.2 + ... VSnew.3
1992 VSnew.3: vz3 = vx.3 + ...
1994 The vectorization of S1 is explained in vectorizable_load.
1995 The vectorization of S2:
1996 To create the first vector-stmt out of the 4 copies - VSnew.0 -
1997 the function 'vect_get_vec_def_for_operand' is called to
1998 get the relevant vector-def for each operand of S2. For operand x it
1999 returns the vector-def 'vx.0'.
2001 To create the remaining copies of the vector-stmt (VSnew.j), this
2002 function is called to get the relevant vector-def for each operand. It is
2003 obtained from the respective VS1.j stmt, which is recorded in the
2004 STMT_VINFO_RELATED_STMT field of the stmt that defines VEC_OPRND.
2006 For example, to obtain the vector-def 'vx.1' in order to create the
2007 vector stmt 'VSnew.1', this function is called with VEC_OPRND='vx.0'.
2008 Given 'vx0' we obtain the stmt that defines it ('VS1.0'); from the
2009 STMT_VINFO_RELATED_STMT field of 'VS1.0' we obtain the next copy - 'VS1.1',
2010 and return its def ('vx.1').
2011 Overall, to create the above sequence this function will be called 3 times:
2012 vx.1 = vect_get_vec_def_for_stmt_copy (dt, vx.0);
2013 vx.2 = vect_get_vec_def_for_stmt_copy (dt, vx.1);
2014 vx.3 = vect_get_vec_def_for_stmt_copy (dt, vx.2); */
/* Implementation summary (from the statements visible below): for
   invariant and constant defs the same VEC_OPRND can be reused.  Otherwise
   the stmt defining VEC_OPRND is looked up, its stmt_vec_info and its
   STMT_VINFO_RELATED_STMT are both asserted to exist, and VEC_OPRND is
   replaced by operand 0 (the left-hand side) of that related stmt.
   NOTE(review): the return statements are not visible in this listing --
   presumably VEC_OPRND is returned on both paths; confirm.  */
2017 vect_get_vec_def_for_stmt_copy (enum vect_def_type dt, tree vec_oprnd)
2019 tree vec_stmt_for_operand;
2020 stmt_vec_info def_stmt_info;
2022 /* Do nothing; can reuse same def. */
2023 if (dt == vect_invariant_def || dt == vect_constant_def )
2026 vec_stmt_for_operand = SSA_NAME_DEF_STMT (vec_oprnd);
2027 def_stmt_info = vinfo_for_stmt (vec_stmt_for_operand);
2028 gcc_assert (def_stmt_info);
2029 vec_stmt_for_operand = STMT_VINFO_RELATED_STMT (def_stmt_info);
2030 gcc_assert (vec_stmt_for_operand);
2031 vec_oprnd = GIMPLE_STMT_OPERAND (vec_stmt_for_operand, 0);
2036 /* Get vectorized definitions for the operands to create a copy of an original
2037 stmt. See vect_get_vec_def_for_stmt_copy() for details. */
/* The last element of *VEC_OPRNDS0 is popped, advanced to the def of the
   next copy with vect_get_vec_def_for_stmt_copy using DT[0], and pushed
   back.  The same is done for *VEC_OPRNDS1 with DT[1], but only when
   VEC_OPRNDS1 and *VEC_OPRNDS1 are both non-null.  VEC_OPRNDS0 itself is
   dereferenced unconditionally.  */
2040 vect_get_vec_defs_for_stmt_copy (enum vect_def_type *dt,
2041 VEC(tree,heap) **vec_oprnds0,
2042 VEC(tree,heap) **vec_oprnds1)
2044 tree vec_oprnd = VEC_pop (tree, *vec_oprnds0);
2046 vec_oprnd = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd);
2047 VEC_quick_push (tree, *vec_oprnds0, vec_oprnd);
2049 if (vec_oprnds1 && *vec_oprnds1)
2051 vec_oprnd = VEC_pop (tree, *vec_oprnds1);
2052 vec_oprnd = vect_get_vec_def_for_stmt_copy (dt[1], vec_oprnd);
2053 VEC_quick_push (tree, *vec_oprnds1, vec_oprnd);
2058 /* Get vectorized definitions for OP0 and OP1, or SLP_NODE if it is not NULL. */
/* Two ways of filling the operand vectors are visible below: delegating to
   vect_get_slp_defs for SLP_NODE, or allocating one-element vectors and
   pushing the def obtained from vect_get_vec_def_for_operand for OP0 (into
   *VEC_OPRNDS0) and for OP1 (into *VEC_OPRNDS1).
   NOTE(review): the conditions selecting between these paths, and the guard
   around the OP1 part, are not visible in this listing -- presumably the
   SLP path is taken when SLP_NODE is non-null; confirm.  */
2061 vect_get_vec_defs (tree op0, tree op1, tree stmt, VEC(tree,heap) **vec_oprnds0,
2062 VEC(tree,heap) **vec_oprnds1, slp_tree slp_node)
2065 vect_get_slp_defs (slp_node, vec_oprnds0, vec_oprnds1);
2070 *vec_oprnds0 = VEC_alloc (tree, heap, 1);
2071 vec_oprnd = vect_get_vec_def_for_operand (op0, stmt, NULL);
2072 VEC_quick_push (tree, *vec_oprnds0, vec_oprnd);
2076 *vec_oprnds1 = VEC_alloc (tree, heap, 1);
2077 vec_oprnd = vect_get_vec_def_for_operand (op1, stmt, NULL);
2078 VEC_quick_push (tree, *vec_oprnds1, vec_oprnd);
2084 /* Function vect_finish_stmt_generation.
2086 Insert a new stmt. */
/* Preconditions asserted below: *BSI points at STMT, and STMT is not a
   LABEL_EXPR.  VEC_STMT is inserted before STMT with BSI_SAME_STMT, so BSI
   still points at STMT afterwards (asserted again).  A new stmt_vec_info
   tied to STMT's loop_vinfo is attached to VEC_STMT, the insertion is
   optionally dumped, and the source location of STMT is copied to VEC_STMT
   using whichever location macro pair matches USE_MAPPED_LOCATION.  */
2089 vect_finish_stmt_generation (tree stmt, tree vec_stmt,
2090 block_stmt_iterator *bsi)
2092 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2093 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2095 gcc_assert (stmt == bsi_stmt (*bsi));
2096 gcc_assert (TREE_CODE (stmt) != LABEL_EXPR);
2098 bsi_insert_before (bsi, vec_stmt, BSI_SAME_STMT);
2100 set_stmt_info (get_stmt_ann (vec_stmt),
2101 new_stmt_vec_info (vec_stmt, loop_vinfo));
2103 if (vect_print_dump_info (REPORT_DETAILS))
2105 fprintf (vect_dump, "add new stmt: ");
2106 print_generic_expr (vect_dump, vec_stmt, TDF_SLIM);
2109 /* Make sure bsi points to the stmt that is being vectorized. */
2110 gcc_assert (stmt == bsi_stmt (*bsi));
2112 #ifdef USE_MAPPED_LOCATION
2113 SET_EXPR_LOCATION (vec_stmt, EXPR_LOCATION (stmt));
2115 SET_EXPR_LOCUS (vec_stmt, EXPR_LOCUS (stmt));
2120 /* Function get_initial_def_for_reduction
2123 STMT - a stmt that performs a reduction operation in the loop.
2124 INIT_VAL - the initial value of the reduction variable
2127 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
2128 of the reduction (used for adjusting the epilog - see below).
2129 Return a vector variable, initialized according to the operation that STMT
2130 performs. This vector will be used as the initial value of the
2131 vector of partial results.
2133 Option1 (adjust in epilog): Initialize the vector as follows:
2136 min/max: [init_val,init_val,..,init_val,init_val]
2137 bit and/or: [init_val,init_val,..,init_val,init_val]
2138 and when necessary (e.g. add/mult case) let the caller know
2139 that it needs to adjust the result by init_val.
2141 Option2: Initialize the vector as follows:
2142 add: [0,0,...,0,init_val]
2143 mult: [1,1,...,1,init_val]
2144 min/max: [init_val,init_val,...,init_val]
2145 bit and/or: [init_val,init_val,...,init_val]
2146 and no adjustments are needed.
2148 For example, for the following code:
2154 STMT is 's = s + a[i]', and the reduction variable is 's'.
2155 For a vector of 4 units, we want to return either [0,0,0,init_val],
2156 or [0,0,0,0] and let the caller know that it needs to adjust
2157 the result at the end by 'init_val'.
2159 FORNOW, we are using the 'adjust in epilog' scheme, because this way the
2160 initialization vector is simpler (same element in all entries).
2161 A cost model should help decide between these two schemes. */
2164 get_initial_def_for_reduction (tree stmt, tree init_val, tree *adjustment_def)
2166 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
2167 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
2168 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2169 tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
/* NOTE(review): NUNITS is taken from STMT_VINFO_VECTYPE, while the vector
   built below uses the vectype derived from the type of DEF_FOR_INIT.  The
   comment on vectorizable_reduction says these can have different element
   counts for reduction patterns - confirm the count used here is right.  */
2170 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
/* CODE is the tree code of the right-hand side of STMT.  */
2171 enum tree_code code = TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 1));
2172 tree type = TREE_TYPE (init_val);
2179 bool nested_in_vect_loop = false;
/* Only pointer, integral and scalar floating-point initial values are
   expected here.  */
2181 gcc_assert (POINTER_TYPE_P (type) || INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type));
/* When STMT is nested in the loop being vectorized, switch to the loop that
   actually contains STMT.  NOTE(review): the line that reassigns LOOP is not
   visible in this excerpt; the assert below implies it.  */
2182 if (nested_in_vect_loop_p (loop, stmt))
2183 nested_in_vect_loop = true;
2185 gcc_assert (loop == (bb_for_stmt (stmt))->loop_father);
/* Vector def of INIT_VAL as produced by vect_get_vec_def_for_operand.  */
2187 vecdef = vect_get_vec_def_for_operand (init_val, stmt, NULL);
2191 case WIDEN_SUM_EXPR:
/* Adjust-in-epilog scheme: the initial vector is all zeros and INIT_VAL is
   handed back through ADJUSTMENT_DEF - in vector form (VECDEF) when nested,
   as the scalar INIT_VAL otherwise.  */
2194 if (nested_in_vect_loop)
2195 *adjustment_def = vecdef;
2197 *adjustment_def = init_val;
2198 /* Create a vector of zeros for init_def. */
2199 if (SCALAR_FLOAT_TYPE_P (type))
2200 def_for_init = build_real (type, dconst0);
2202 def_for_init = build_int_cst (type, 0);
/* Chain NUNITS copies of the zero constant into the list T.  */
2203 for (i = nunits - 1; i >= 0; --i)
2204 t = tree_cons (NULL_TREE, def_for_init, t);
2205 vector_type = get_vectype_for_scalar_type (TREE_TYPE (def_for_init));
2206 gcc_assert (vector_type);
2207 init_def = build_vector (vector_type, t);
/* On this path no epilog adjustment is requested from the caller.  */
2212 *adjustment_def = NULL_TREE;
2224 /* Function vect_create_epilog_for_reduction
2226 Create code at the loop-epilog to finalize the result of a reduction
2229 VECT_DEF is a vector of partial results.
2230 REDUC_CODE is the tree-code for the epilog reduction.
2231 STMT is the scalar reduction stmt that is being vectorized.
2232 REDUCTION_PHI is the phi-node that carries the reduction computation.
2235 1. Creates the reduction def-use cycle: sets the arguments for
2237 The loop-entry argument is the vectorized initial-value of the reduction.
2238 The loop-latch argument is VECT_DEF - the vector of partial sums.
2239 2. "Reduces" the vector of partial results VECT_DEF into a single result,
2240 by applying the operation specified by REDUC_CODE if available, or by
2241 other means (whole-vector shifts or a scalar loop).
2242 The function also creates a new phi node at the loop exit to preserve
2243 loop-closed form, as illustrated below.
2245 The flow at the entry to this function:
2248 vec_def = phi <null, null> # REDUCTION_PHI
2249 VECT_DEF = vector_stmt # vectorized form of STMT
2250 s_loop = scalar_stmt # (scalar) STMT
2252 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
2256 The above is transformed by this function into:
2259 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
2260 VECT_DEF = vector_stmt # vectorized form of STMT
2261 s_loop = scalar_stmt # (scalar) STMT
2263 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
2264 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
2265 v_out2 = reduce <v_out1>
2266 s_out3 = extract_field <v_out2, 0>
2267 s_out4 = adjust_result <s_out3>
2273 vect_create_epilog_for_reduction (tree vect_def, tree stmt,
2274 enum tree_code reduc_code, tree reduction_phi)
2276 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2278 enum machine_mode mode;
2279 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2280 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2281 basic_block exit_bb;
2285 block_stmt_iterator exit_bsi;
2287 tree new_temp = NULL_TREE;
2289 tree epilog_stmt = NULL_TREE;
2290 tree new_scalar_dest, exit_phi, new_dest;
2291 tree bitsize, bitpos, bytesize;
2292 enum tree_code code = TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 1));
2293 tree adjustment_def;
2294 tree vec_initial_def;
2296 imm_use_iterator imm_iter;
2297 use_operand_p use_p;
2298 bool extract_scalar_result = false;
2299 tree reduction_op, expr;
2302 tree operation = GIMPLE_STMT_OPERAND (stmt, 1);
2303 bool nested_in_vect_loop = false;
2305 VEC(tree,heap) *phis = NULL;
2308 if (nested_in_vect_loop_p (loop, stmt))
2311 nested_in_vect_loop = true;
2314 op_type = TREE_OPERAND_LENGTH (operation);
2315 reduction_op = TREE_OPERAND (operation, op_type-1);
2316 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
2317 gcc_assert (vectype);
2318 mode = TYPE_MODE (vectype);
2320 /*** 1. Create the reduction def-use cycle ***/
2322 /* 1.1 set the loop-entry arg of the reduction-phi: */
2323 /* For the case of reduction, vect_get_vec_def_for_operand returns
2324 the scalar def before the loop, that defines the initial value
2325 of the reduction variable. */
2326 vec_initial_def = vect_get_vec_def_for_operand (reduction_op, stmt,
2328 add_phi_arg (reduction_phi, vec_initial_def, loop_preheader_edge (loop));
2330 /* 1.2 set the loop-latch arg for the reduction-phi: */
2331 add_phi_arg (reduction_phi, vect_def, loop_latch_edge (loop));
2333 if (vect_print_dump_info (REPORT_DETAILS))
2335 fprintf (vect_dump, "transform reduction: created def-use cycle:");
2336 print_generic_expr (vect_dump, reduction_phi, TDF_SLIM);
2337 fprintf (vect_dump, "\n");
2338 print_generic_expr (vect_dump, SSA_NAME_DEF_STMT (vect_def), TDF_SLIM);
2342 /*** 2. Create epilog code
2343 The reduction epilog code operates across the elements of the vector
2344 of partial results computed by the vectorized loop.
2345 The reduction epilog code consists of:
2346 step 1: compute the scalar result in a vector (v_out2)
2347 step 2: extract the scalar result (s_out3) from the vector (v_out2)
2348 step 3: adjust the scalar result (s_out3) if needed.
2350 Step 1 can be accomplished using one of the following three schemes:
2351 (scheme 1) using reduc_code, if available.
2352 (scheme 2) using whole-vector shifts, if available.
2353 (scheme 3) using a scalar loop. In this case steps 1+2 above are
2356 The overall epilog code looks like this:
2358 s_out0 = phi <s_loop> # original EXIT_PHI
2359 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
2360 v_out2 = reduce <v_out1> # step 1
2361 s_out3 = extract_field <v_out2, 0> # step 2
2362 s_out4 = adjust_result <s_out3> # step 3
2364 (step 3 is optional, and steps 1 and 2 may be combined).
2365 Lastly, the uses of s_out0 are replaced by s_out4.
2369 /* 2.1 Create new loop-exit-phi to preserve loop-closed form:
2370 v_out1 = phi <v_loop> */
2372 exit_bb = single_exit (loop)->dest;
2373 new_phi = create_phi_node (SSA_NAME_VAR (vect_def), exit_bb);
2374 SET_PHI_ARG_DEF (new_phi, single_exit (loop)->dest_idx, vect_def);
/* All epilog stmts created below are inserted at this point, after the
   labels of the exit block.  */
2375 exit_bsi = bsi_after_labels (exit_bb);
2377 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
2378 (i.e. when reduc_code is not available) and in the final adjustment
2379 code (if needed). Also get the original scalar reduction variable as
2380 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
2381 represents a reduction pattern), the tree-code and scalar-def are
2382 taken from the original stmt that the pattern-stmt (STMT) replaces.
2383 Otherwise (it is a regular reduction) - the tree-code and scalar-def
2384 are taken from STMT. */
2386 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
2389 /* Regular reduction */
2394 /* Reduction pattern */
2395 stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
2396 gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
2397 gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
/* From here on CODE and SCALAR_DEST describe ORIG_STMT.  BITSIZE and
   BYTESIZE are the size of one scalar element in bits and in bytes.  */
2399 code = TREE_CODE (GIMPLE_STMT_OPERAND (orig_stmt, 1));
2400 scalar_dest = GIMPLE_STMT_OPERAND (orig_stmt, 0);
2401 scalar_type = TREE_TYPE (scalar_dest);
2402 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
2403 bitsize = TYPE_SIZE (scalar_type);
2404 bytesize = TYPE_SIZE_UNIT (scalar_type);
2407 /* In case this is a reduction in an inner-loop while vectorizing an outer
2408 loop - we don't need to extract a single scalar result at the end of the
2409 inner-loop. The final vector of partial results will be used in the
2410 vectorized outer-loop, or reduced to a scalar result at the end of the
2412 if (nested_in_vect_loop)
2413 goto vect_finalize_reduction;
2415 /* 2.3 Create the reduction code, using one of the three schemes described
2418 if (reduc_code < NUM_TREE_CODES)
2422 /*** Case 1: Create:
2423 v_out2 = reduc_expr <v_out1> */
/* This scheme is used when REDUC_CODE is a real tree code.
   vectorizable_reduction passes NUM_TREE_CODES instead when no direct
   reduction operation is available.  */
2425 if (vect_print_dump_info (REPORT_DETAILS))
2426 fprintf (vect_dump, "Reduce using direct vector reduction.");
2428 vec_dest = vect_create_destination_var (scalar_dest, vectype);
2429 tmp = build1 (reduc_code, vectype, PHI_RESULT (new_phi));
2430 epilog_stmt = build_gimple_modify_stmt (vec_dest, tmp);
2431 new_temp = make_ssa_name (vec_dest, epilog_stmt);
2432 GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_temp;
2433 bsi_insert_before (&exit_bsi, epilog_stmt, BSI_SAME_STMT);
/* The result is still a vector; step 2.4 extracts the scalar from it.  */
2435 extract_scalar_result = true;
/* No direct reduction code: choose between whole-vector shifts (scheme 2)
   and element-by-element scalar code (scheme 3).  */
2439 enum tree_code shift_code = 0;
2440 bool have_whole_vector_shift = true;
2442 int element_bitsize = tree_low_cst (bitsize, 1);
2443 int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
/* The shift scheme requires a vec_shr pattern for MODE.  */
2446 if (optab_handler (vec_shr_optab, mode)->insn_code != CODE_FOR_nothing)
2447 shift_code = VEC_RSHIFT_EXPR;
2449 have_whole_vector_shift = false;
2451 /* Regardless of whether we have a whole vector shift, if we're
2452 emulating the operation via tree-vect-generic, we don't want
2453 to use it. Only the first round of the reduction is likely
2454 to still be profitable via emulation. */
2455 /* ??? It might be better to emit a reduction tree code here, so that
2456 tree-vect-generic can expand the first round via bit tricks. */
2457 if (!VECTOR_MODE_P (mode))
2458 have_whole_vector_shift = false;
/* The shift scheme also applies CODE to whole vectors, so the target must
   support CODE in MODE.  */
2461 optab optab = optab_for_tree_code (code, vectype);
2462 if (optab_handler (optab, mode)->insn_code == CODE_FOR_nothing)
2463 have_whole_vector_shift = false;
2466 if (have_whole_vector_shift)
2468 /*** Case 2: Create:
2469 for (offset = VS/2; offset >= element_size; offset/=2)
2471 Create: va' = vec_shift <va, offset>
2472 Create: va = vop <va, va'>
2475 if (vect_print_dump_info (REPORT_DETAILS))
2476 fprintf (vect_dump, "Reduce using vector shifts");
2478 vec_dest = vect_create_destination_var (scalar_dest, vectype);
2479 new_temp = PHI_RESULT (new_phi);
2481 for (bit_offset = vec_size_in_bits/2;
2482 bit_offset >= element_bitsize;
2485 tree bitpos = size_int (bit_offset);
2486 tree tmp = build2 (shift_code, vectype, new_temp, bitpos);
2487 epilog_stmt = build_gimple_modify_stmt (vec_dest, tmp);
2488 new_name = make_ssa_name (vec_dest, epilog_stmt);
2489 GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_name;
2490 bsi_insert_before (&exit_bsi, epilog_stmt, BSI_SAME_STMT);
2492 tmp = build2 (code, vectype, new_name, new_temp);
2493 epilog_stmt = build_gimple_modify_stmt (vec_dest, tmp);
2494 new_temp = make_ssa_name (vec_dest, epilog_stmt);
2495 GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_temp;
2496 bsi_insert_before (&exit_bsi, epilog_stmt, BSI_SAME_STMT);
2499 extract_scalar_result = true;
2505 /*** Case 3: Create:
2506 s = extract_field <v_out2, 0>
2507 for (offset = element_size;
2508 offset < vector_size;
2509 offset += element_size;)
2511 Create: s' = extract_field <v_out2, offset>
2512 Create: s = op <s, s'>
2515 if (vect_print_dump_info (REPORT_DETAILS))
2516 fprintf (vect_dump, "Reduce using scalar code. ");
2518 vec_temp = PHI_RESULT (new_phi);
2519 vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
2520 rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
2522 BIT_FIELD_REF_UNSIGNED (rhs) = TYPE_UNSIGNED (scalar_type);
2523 epilog_stmt = build_gimple_modify_stmt (new_scalar_dest, rhs);
2524 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
2525 GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_temp;
2526 bsi_insert_before (&exit_bsi, epilog_stmt, BSI_SAME_STMT);
2528 for (bit_offset = element_bitsize;
2529 bit_offset < vec_size_in_bits;
2530 bit_offset += element_bitsize)
2533 tree bitpos = bitsize_int (bit_offset);
2534 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
2537 BIT_FIELD_REF_UNSIGNED (rhs) = TYPE_UNSIGNED (scalar_type);
2538 epilog_stmt = build_gimple_modify_stmt (new_scalar_dest, rhs);
2539 new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
2540 GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_name;
2541 bsi_insert_before (&exit_bsi, epilog_stmt, BSI_SAME_STMT);
2543 tmp = build2 (code, scalar_type, new_name, new_temp);
2544 epilog_stmt = build_gimple_modify_stmt (new_scalar_dest, tmp);
2545 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
2546 GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_temp;
2547 bsi_insert_before (&exit_bsi, epilog_stmt, BSI_SAME_STMT);
2550 extract_scalar_result = false;
2554 /* 2.4 Extract the final scalar result. Create:
2555 s_out3 = extract_field <v_out2, bitpos> */
2557 if (extract_scalar_result)
2561 gcc_assert (!nested_in_vect_loop);
2562 if (vect_print_dump_info (REPORT_DETAILS))
2563 fprintf (vect_dump, "extract scalar result");
/* On big-endian targets the result is read from the last element of the
   vector; otherwise it is read from bit position zero.  */
2565 if (BYTES_BIG_ENDIAN)
2566 bitpos = size_binop (MULT_EXPR,
2567 bitsize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1),
2568 TYPE_SIZE (scalar_type));
2570 bitpos = bitsize_zero_node;
2572 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp, bitsize, bitpos);
2573 BIT_FIELD_REF_UNSIGNED (rhs) = TYPE_UNSIGNED (scalar_type);
2574 epilog_stmt = build_gimple_modify_stmt (new_scalar_dest, rhs);
2575 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
2576 GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_temp;
2577 bsi_insert_before (&exit_bsi, epilog_stmt, BSI_SAME_STMT);
2580 vect_finalize_reduction:
2582 /* 2.5 Adjust the final result by the initial value of the reduction
2583 variable. (When such adjustment is not needed, then
2584 'adjustment_def' is zero). For example, if code is PLUS we create:
2585 new_temp = loop_exit_def + adjustment_def */
/* In the nested case the adjustment is applied to the whole vector of
   partial results and ADJUSTMENT_DEF must itself be a vector; otherwise it
   is applied to the scalar result and ADJUSTMENT_DEF must be a scalar.  */
2589 if (nested_in_vect_loop)
2591 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
2592 expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
2593 new_dest = vect_create_destination_var (scalar_dest, vectype);
2597 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
2598 expr = build2 (code, scalar_type, new_temp, adjustment_def);
2599 new_dest = vect_create_destination_var (scalar_dest, scalar_type);
2601 epilog_stmt = build_gimple_modify_stmt (new_dest, expr);
2602 new_temp = make_ssa_name (new_dest, epilog_stmt);
2603 GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_temp;
2604 bsi_insert_before (&exit_bsi, epilog_stmt, BSI_SAME_STMT);
2608 /* 2.6 Handle the loop-exit phi */
2610 /* Replace uses of s_out0 with uses of s_out3:
2611 Find the loop-closed-use at the loop exit of the original scalar result.
2612 (The reduction result is expected to have two immediate uses - one at the
2613 latch block, and one at the loop exit). */
/* Collect every use of SCALAR_DEST that lies outside LOOP.  */
2614 phis = VEC_alloc (tree, heap, 10);
2615 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
2617 if (!flow_bb_inside_loop_p (loop, bb_for_stmt (USE_STMT (use_p))))
2619 exit_phi = USE_STMT (use_p);
2620 VEC_quick_push (tree, phis, exit_phi);
2623 /* We expect to have found an exit_phi because of loop-closed-ssa form. */
2624 gcc_assert (!VEC_empty (tree, phis));
2626 for (i = 0; VEC_iterate (tree, phis, i, exit_phi); i++)
2628 if (nested_in_vect_loop)
2630 stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
2632 /* FORNOW. Currently not supporting the case that an inner-loop reduction
2633 is not used in the outer-loop (but only outside the outer-loop). */
2634 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
2635 && !STMT_VINFO_LIVE_P (stmt_vinfo));
/* Record the vectorized form of EXIT_PHI: the adjustment stmt when one was
   created, otherwise the new exit phi itself.  */
2637 epilog_stmt = adjustment_def ? epilog_stmt : new_phi;
2638 STMT_VINFO_VEC_STMT (stmt_vinfo) = epilog_stmt;
2639 set_stmt_info (get_stmt_ann (epilog_stmt),
2640 new_stmt_vec_info (epilog_stmt, loop_vinfo));
2644 /* Replace the uses: */
2645 orig_name = PHI_RESULT (exit_phi);
2646 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
2647 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
2648 SET_USE (use_p, new_temp);
/* Release the temporary list of exit phis.  */
2650 VEC_free (tree, heap, phis);
2654 /* Function vectorizable_reduction.
2656 Check if STMT performs a reduction operation that can be vectorized.
2657 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
2658 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
2659 Return FALSE if not a vectorizable STMT, TRUE otherwise.
2661 This function also handles reduction idioms (patterns) that have been
2662 recognized in advance during vect_pattern_recog. In this case, STMT may be
2664 X = pattern_expr (arg0, arg1, ..., X)
2665 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
2666 sequence that had been detected and replaced by the pattern-stmt (STMT).
2668 In some cases of reduction patterns, the type of the reduction variable X is
2669 different than the type of the other arguments of STMT.
2670 In such cases, the vectype that is used when transforming STMT into a vector
2671 stmt is different than the vectype that is used to determine the
2672 vectorization factor, because it consists of a different number of elements
2673 than the actual number of elements that are being operated upon in parallel.
2675 For example, consider an accumulation of shorts into an int accumulator.
2676 On some targets it's possible to vectorize this pattern operating on 8
2677 shorts at a time (hence, the vectype for purposes of determining the
2678 vectorization factor should be V8HI); on the other hand, the vectype that
2679 is used to create the vector form is actually V4SI (the type of the result).
2681 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
2682 indicates what is the actual level of parallelism (V8HI in the example), so
2683 that the right vectorization factor would be derived. This vectype
2684 corresponds to the type of arguments to the reduction stmt, and should *NOT*
2685 be used to create the vectorized stmt. The right vectype for the vectorized
2686 stmt is obtained from the type of the result X:
2687 get_vectype_for_scalar_type (TREE_TYPE (X))
2689 This means that, contrary to "regular" reductions (or "regular" stmts in
2690 general), the following equation:
2691 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
2692 does *NOT* necessarily hold for reduction patterns. */
2695 vectorizable_reduction (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
2700 tree loop_vec_def0 = NULL_TREE, loop_vec_def1 = NULL_TREE;
2701 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2702 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2703 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2704 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2706 enum tree_code code, orig_code, epilog_reduc_code = 0;
2707 enum machine_mode vec_mode;
2709 optab optab, reduc_optab;
2710 tree new_temp = NULL_TREE;
2712 enum vect_def_type dt;
2717 stmt_vec_info orig_stmt_info;
2718 tree expr = NULL_TREE;
/* NCOPIES is the number of vector stmts generated for STMT: the
   vectorization factor divided by the element count of VECTYPE.  */
2720 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
2721 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
2722 stmt_vec_info prev_stmt_info;
2724 tree new_stmt = NULL_TREE;
2727 if (nested_in_vect_loop_p (loop, stmt))
2730 /* FORNOW. This restriction should be relaxed. */
2733 if (vect_print_dump_info (REPORT_DETAILS))
2734 fprintf (vect_dump, "multiple types in nested loop.");
2739 gcc_assert (ncopies >= 1);
2741 /* FORNOW: SLP not supported. */
2742 if (STMT_SLP_TYPE (stmt_info))
2745 /* 1. Is vectorizable reduction? */
2747 /* Not supportable if the reduction variable is used in the loop. */
2748 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer)
2751 /* Reductions that are not used even in an enclosing outer-loop,
2752 are expected to be "live" (used out of the loop). */
2753 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_loop
2754 && !STMT_VINFO_LIVE_P (stmt_info))
2757 /* Make sure it was already recognized as a reduction computation. */
2758 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def)
2761 /* 2. Has this been recognized as a reduction pattern?
2763 Check if STMT represents a pattern that has been recognized
2764 in earlier analysis stages. For stmts that represent a pattern,
2765 the STMT_VINFO_RELATED_STMT field records the last stmt in
2766 the original sequence that constitutes the pattern. */
2768 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
/* Consistency checks on the pattern link: ORIG_STMT must point back at STMT
   and be marked as part of a pattern, while STMT itself must not be.  */
2771 orig_stmt_info = vinfo_for_stmt (orig_stmt);
2772 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt);
2773 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
2774 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
2777 /* 3. Check the operands of the operation. The first operands are defined
2778 inside the loop body. The last operand is the reduction variable,
2779 which is defined by the loop-header-phi. */
2781 gcc_assert (TREE_CODE (stmt) == GIMPLE_MODIFY_STMT);
2783 operation = GIMPLE_STMT_OPERAND (stmt, 1);
2784 code = TREE_CODE (operation);
2785 op_type = TREE_OPERAND_LENGTH (operation);
2786 if (op_type != binary_op && op_type != ternary_op)
2788 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
2789 scalar_type = TREE_TYPE (scalar_dest);
2790 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
2791 && !SCALAR_FLOAT_TYPE_P (scalar_type))
2794 /* All uses but the last are expected to be defined in the loop.
2795 The last use is the reduction variable. */
2796 for (i = 0; i < op_type-1; i++)
2798 op = TREE_OPERAND (operation, i);
2799 is_simple_use = vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt);
2800 gcc_assert (is_simple_use);
2801 if (dt != vect_loop_def
2802 && dt != vect_invariant_def
2803 && dt != vect_constant_def
2804 && dt != vect_induction_def)
/* After the loop I indexes the last operand, i.e. the reduction variable;
   its def stmt must be the reduction phi.  */
2808 op = TREE_OPERAND (operation, i);
2809 is_simple_use = vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt);
2810 gcc_assert (is_simple_use);
2811 gcc_assert (dt == vect_reduction_def);
2812 gcc_assert (TREE_CODE (def_stmt) == PHI_NODE);
2814 gcc_assert (orig_stmt == vect_is_simple_reduction (loop_vinfo, def_stmt));
2816 gcc_assert (stmt == vect_is_simple_reduction (loop_vinfo, def_stmt));
2818 if (STMT_VINFO_LIVE_P (vinfo_for_stmt (def_stmt)))
2821 /* 4. Supportable by target? */
2823 /* 4.1. check support for the operation in the loop */
2824 optab = optab_for_tree_code (code, vectype);
2827 if (vect_print_dump_info (REPORT_DETAILS))
2828 fprintf (vect_dump, "no optab.");
2831 vec_mode = TYPE_MODE (vectype);
2832 if (optab_handler (optab, vec_mode)->insn_code == CODE_FOR_nothing)
2834 if (vect_print_dump_info (REPORT_DETAILS))
2835 fprintf (vect_dump, "op not supported by target.");
2836 if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
2837 || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
2838 < vect_min_worthwhile_factor (code))
2840 if (vect_print_dump_info (REPORT_DETAILS))
2841 fprintf (vect_dump, "proceeding using word mode.");
2844 /* Worthwhile without SIMD support? */
2845 if (!VECTOR_MODE_P (TYPE_MODE (vectype))
2846 && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
2847 < vect_min_worthwhile_factor (code))
2849 if (vect_print_dump_info (REPORT_DETAILS))
2850 fprintf (vect_dump, "not worthwhile without SIMD support.");
2854 /* 4.2. Check support for the epilog operation.
2856 If STMT represents a reduction pattern, then the type of the
2857 reduction variable may be different than the type of the rest
2858 of the arguments. For example, consider the case of accumulation
2859 of shorts into an int accumulator; The original code:
2860 S1: int_a = (int) short_a;
2861 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
2864 STMT: int_acc = widen_sum <short_a, int_acc>
2867 1. The tree-code that is used to create the vector operation in the
2868 epilog code (that reduces the partial results) is not the
2869 tree-code of STMT, but is rather the tree-code of the original
2870 stmt from the pattern that STMT is replacing. I.e, in the example
2871 above we want to use 'widen_sum' in the loop, but 'plus' in the
2873 2. The type (mode) we use to check available target support
2874 for the vector operation to be created in the *epilog*, is
2875 determined by the type of the reduction variable (in the example
2876 above we'd check this: plus_optab[vect_int_mode]).
2877 However the type (mode) we use to check available target support
2878 for the vector operation to be created *inside the loop*, is
2879 determined by the type of the other arguments to STMT (in the
2880 example we'd check this: widen_sum_optab[vect_short_mode]).
2882 This is contrary to "regular" reductions, in which the types of all
2883 the arguments are the same as the type of the reduction variable.
2884 For "regular" reductions we can therefore use the same vector type
2885 (and also the same tree-code) when generating the epilog code and
2886 when generating the code inside the loop. */
2890 /* This is a reduction pattern: get the vectype from the type of the
2891 reduction variable, and get the tree-code from orig_stmt. */
2892 orig_code = TREE_CODE (GIMPLE_STMT_OPERAND (orig_stmt, 1));
2893 vectype = get_vectype_for_scalar_type (TREE_TYPE (def));
2896 if (vect_print_dump_info (REPORT_DETAILS))
2898 fprintf (vect_dump, "unsupported data-type ");
2899 print_generic_expr (vect_dump, TREE_TYPE (def), TDF_SLIM);
2904 vec_mode = TYPE_MODE (vectype);
2908 /* Regular reduction: the same vectype and tree-code as used for
2909 the vector code inside the loop can be used for the epilog code. */
2913 if (!reduction_code_for_scalar_code (orig_code, &epilog_reduc_code))
2915 reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype);
2918 if (vect_print_dump_info (REPORT_DETAILS))
2919 fprintf (vect_dump, "no optab for reduction.");
/* NUM_TREE_CODES tells vect_create_epilog_for_reduction that no direct
   reduction operation can be used.  */
2920 epilog_reduc_code = NUM_TREE_CODES;
/* NOTE(review): REDUC_OPTAB is passed to optab_handler below even after the
   path that reports "no optab for reduction."; confirm that a null
   REDUC_OPTAB cannot reach this test.  */
2922 if (optab_handler (reduc_optab, vec_mode)->insn_code == CODE_FOR_nothing)
2924 if (vect_print_dump_info (REPORT_DETAILS))
2925 fprintf (vect_dump, "reduc op not supported by target.");
2926 epilog_reduc_code = NUM_TREE_CODES;
2929 if (!vec_stmt) /* transformation not required. */
/* Analysis-only call: tag STMT as a reduction and consult the cost model.  */
2931 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
2932 if (!vect_model_reduction_cost (stmt_info, epilog_reduc_code, ncopies))
2939 if (vect_print_dump_info (REPORT_DETAILS))
2940 fprintf (vect_dump, "transform reduction.");
2942 /* Create the destination vector */
2943 vec_dest = vect_create_destination_var (scalar_dest, vectype);
2945 /* Create the reduction-phi that defines the reduction-operand. */
2946 new_phi = create_phi_node (vec_dest, loop->header);
2948 /* In case the vectorization factor (VF) is bigger than the number
2949 of elements that we can fit in a vectype (nunits), we have to generate
2950 more than one vector stmt - i.e - we need to "unroll" the
2951 vector stmt by a factor VF/nunits. For more details see documentation
2952 in vectorizable_operation. */
2954 prev_stmt_info = NULL;
2955 for (j = 0; j < ncopies; j++)
/* Operand defs taken directly from the scalar operands of STMT; the branch
   further below derives the defs from those of the previous copy.  */
2960 op = TREE_OPERAND (operation, 0);
2961 loop_vec_def0 = vect_get_vec_def_for_operand (op, stmt, NULL);
2962 if (op_type == ternary_op)
2964 op = TREE_OPERAND (operation, 1);
2965 loop_vec_def1 = vect_get_vec_def_for_operand (op, stmt, NULL);
2968 /* Get the vector def for the reduction variable from the phi node */
2969 reduc_def = PHI_RESULT (new_phi);
2973 enum vect_def_type dt = vect_unknown_def_type; /* Dummy */
2974 loop_vec_def0 = vect_get_vec_def_for_stmt_copy (dt, loop_vec_def0);
2975 if (op_type == ternary_op)
2976 loop_vec_def1 = vect_get_vec_def_for_stmt_copy (dt, loop_vec_def1);
2978 /* Get the vector def for the reduction variable from the vectorized
2979 reduction operation generated in the previous iteration (j-1) */
2980 reduc_def = GIMPLE_STMT_OPERAND (new_stmt ,0);
2983 /* Arguments are ready. create the new vector stmt. */
2984 if (op_type == binary_op)
2985 expr = build2 (code, vectype, loop_vec_def0, reduc_def);
2987 expr = build3 (code, vectype, loop_vec_def0, loop_vec_def1,
2989 new_stmt = build_gimple_modify_stmt (vec_dest, expr);
2990 new_temp = make_ssa_name (vec_dest, new_stmt);
2991 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
2992 vect_finish_stmt_generation (stmt, new_stmt, bsi);
/* Record the copy: either as the vectorized stmt of STMT (also returned
   through VEC_STMT), or chained to the previous copy through
   STMT_VINFO_RELATED_STMT.  NOTE(review): the lines selecting between the
   two are not visible in this excerpt.  */
2995 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
2997 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
2998 prev_stmt_info = vinfo_for_stmt (new_stmt);
3001 /* Finalize the reduction-phi (set its arguments) and create the
3002 epilog reduction code. */
3003 vect_create_epilog_for_reduction (new_temp, stmt, epilog_reduc_code, new_phi);
3007 /* Checks if CALL can be vectorized with result type VECTYPE_OUT and argument
3008 type VECTYPE_IN. Returns a function declaration if the target has a vectorized
3009 version of the function, or NULL_TREE if the function cannot be vectorized. */
/* CALL is the call expression; its callee is looked up with
   get_callee_fndecl.  Both vector types are forwarded unchanged to the
   target hook builtin_vectorized_function.  */
3012 vectorizable_function (tree call, tree vectype_out, tree vectype_in)
3014 tree fndecl = get_callee_fndecl (call);
3015 enum built_in_function code;
3017 /* We only handle functions that do not read or clobber memory -- i.e.
3018 const or novops ones. */
3019 if (!(call_expr_flags (call) & (ECF_CONST | ECF_NOVOPS)))
/* Only built-in FUNCTION_DECLs are candidates.  */
3023 || TREE_CODE (fndecl) != FUNCTION_DECL
3024 || !DECL_BUILT_IN (fndecl))
/* Let the target decide whether this builtin has a vector variant for the
   requested types.  */
3027 code = DECL_FUNCTION_CODE (fndecl);
3028 return targetm.vectorize.builtin_vectorized_function (code, vectype_out,
3032 /* Function vectorizable_call.
3034 Check if STMT performs a function call that can be vectorized.
3035 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
3036 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
3037 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
3040 vectorizable_call (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
3046 tree vec_oprnd0 = NULL_TREE, vec_oprnd1 = NULL_TREE;
3047 stmt_vec_info stmt_info = vinfo_for_stmt (stmt), prev_stmt_info;
3048 tree vectype_out, vectype_in;
3051 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3052 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3053 tree fndecl, rhs, new_temp, def, def_stmt, rhs_type, lhs_type;
3054 enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
3056 int ncopies, j, nargs;
3057 call_expr_arg_iterator iter;
3059 enum { NARROW, NONE, WIDEN } modifier;
3061 if (!STMT_VINFO_RELEVANT_P (stmt_info))
3064 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
3067 /* FORNOW: SLP not supported. */
3068 if (STMT_SLP_TYPE (stmt_info))
3071 /* Is STMT a vectorizable call? */
/* STMT must be a GIMPLE_MODIFY_STMT whose left-hand side is an SSA_NAME and
   whose right-hand side is a CALL_EXPR. */
3072 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
3075 if (TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 0)) != SSA_NAME)
3078 operation = GIMPLE_STMT_OPERAND (stmt, 1);
3079 if (TREE_CODE (operation) != CALL_EXPR)
3082 /* Process function arguments. */
/* RHS_TYPE tracks the common type of the arguments seen so far; the def type
   of each argument is stored in DT[NARGS]. */
3083 rhs_type = NULL_TREE;
3085 FOR_EACH_CALL_EXPR_ARG (op, iter, operation)
3087 /* Bail out if the function has more than two arguments, we
3088 do not have interesting builtin functions to vectorize with
3089 more than two arguments. */
3093 /* We can only handle calls with arguments of the same type. */
3095 && rhs_type != TREE_TYPE (op))
3097 if (vect_print_dump_info (REPORT_DETAILS))
3098 fprintf (vect_dump, "argument types differ.");
3101 rhs_type = TREE_TYPE (op);
3103 if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt[nargs]))
3105 if (vect_print_dump_info (REPORT_DETAILS))
3106 fprintf (vect_dump, "use not simple.");
3113 /* No arguments is also not good. */
/* Vector types for the argument type and for the type of the left-hand
   side, together with their element counts. */
3117 vectype_in = get_vectype_for_scalar_type (rhs_type);
3120 nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
3122 lhs_type = TREE_TYPE (GIMPLE_STMT_OPERAND (stmt, 0));
3123 vectype_out = get_vectype_for_scalar_type (lhs_type);
3126 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
/* NOTE(review): the three element-count tests below appear to select
   MODIFIER (NARROW / NONE / WIDEN) -- confirm against the assignments. */
3129 if (nunits_in == nunits_out / 2)
3131 else if (nunits_out == nunits_in)
3133 else if (nunits_out == nunits_in / 2)
3138 /* For now, we only vectorize functions if a target specific builtin
3139 is available. TODO -- in some cases, it might be profitable to
3140 insert the calls for pieces of the vector, in order to be able
3141 to vectorize other operations in the loop. */
3142 fndecl = vectorizable_function (operation, vectype_out, vectype_in);
3143 if (fndecl == NULL_TREE)
3145 if (vect_print_dump_info (REPORT_DETAILS))
3146 fprintf (vect_dump, "function is not vectorizable.");
3151 gcc_assert (ZERO_SSA_OPERANDS (stmt, SSA_OP_ALL_VIRTUALS));
/* A NARROW call needs VF / nunits_out copies of the vector stmt; the other
   computation uses VF / nunits_in. */
3153 if (modifier == NARROW)
3154 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_out;
3156 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in;
3158 /* Sanity check: make sure that at least one copy of the vectorized stmt
3159 needs to be generated. */
3160 gcc_assert (ncopies >= 1);
3162 /* FORNOW. This restriction should be relaxed. */
3163 if (nested_in_vect_loop_p (loop, stmt) && ncopies > 1)
3165 if (vect_print_dump_info (REPORT_DETAILS))
3166 fprintf (vect_dump, "multiple types in nested loop.");
/* Analysis only: record the stmt kind and its cost. */
3170 if (!vec_stmt) /* transformation not required. */
3172 STMT_VINFO_TYPE (stmt_info) = call_vec_info_type;
3173 if (vect_print_dump_info (REPORT_DETAILS))
3174 fprintf (vect_dump, "=== vectorizable_call ===");
3175 vect_model_simple_cost (stmt_info, ncopies, dt, NULL);
3181 if (vect_print_dump_info (REPORT_DETAILS))
3182 fprintf (vect_dump, "transform operation.");
3184 /* FORNOW. This restriction should be relaxed. */
/* NOTE(review): same condition and dump message as the nested-loop check
   made before the analysis-only code above. */
3185 if (nested_in_vect_loop_p (loop, stmt) && ncopies > 1)
3187 if (vect_print_dump_info (REPORT_DETAILS))
3188 fprintf (vect_dump, "multiple types in nested loop.");
3193 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
3194 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
3196 prev_stmt_info = NULL;
/* First generation loop: every scalar argument contributes one vector
   operand (VEC_OPRND0) to the argument list of each copy. */
3200 for (j = 0; j < ncopies; ++j)
3202 /* Build argument list for the vectorized call. */
3203 /* FIXME: Rewrite this so that it doesn't
3204 construct a temporary list. */
3207 FOR_EACH_CALL_EXPR_ARG (op, iter, operation)
3211 = vect_get_vec_def_for_operand (op, stmt, NULL);
3214 = vect_get_vec_def_for_stmt_copy (dt[nargs], vec_oprnd0);
3216 vargs = tree_cons (NULL_TREE, vec_oprnd0, vargs);
/* The list was built by prepending, so restore argument order. */
3220 vargs = nreverse (vargs);
3222 rhs = build_function_call_expr (fndecl, vargs);
3223 new_stmt = build_gimple_modify_stmt (vec_dest, rhs);
3224 new_temp = make_ssa_name (vec_dest, new_stmt);
3225 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
3227 vect_finish_stmt_generation (stmt, new_stmt, bsi);
/* Chain the copies through STMT_VINFO_RELATED_STMT. */
3230 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
3232 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3234 prev_stmt_info = vinfo_for_stmt (new_stmt);
/* Second generation loop: every scalar argument contributes two vector
   operands (VEC_OPRND0 and VEC_OPRND1) to the argument list of each copy. */
3240 for (j = 0; j < ncopies; ++j)
3242 /* Build argument list for the vectorized call. */
3243 /* FIXME: Rewrite this so that it doesn't
3244 construct a temporary list. */
3247 FOR_EACH_CALL_EXPR_ARG (op, iter, operation)
3252 = vect_get_vec_def_for_operand (op, stmt, NULL);
3254 = vect_get_vec_def_for_stmt_copy (dt[nargs], vec_oprnd0);
3259 = vect_get_vec_def_for_stmt_copy (dt[nargs], vec_oprnd1);
3261 = vect_get_vec_def_for_stmt_copy (dt[nargs], vec_oprnd0);
3264 vargs = tree_cons (NULL_TREE, vec_oprnd0, vargs);
3265 vargs = tree_cons (NULL_TREE, vec_oprnd1, vargs);
3269 vargs = nreverse (vargs);
3271 rhs = build_function_call_expr (fndecl, vargs);
3272 new_stmt = build_gimple_modify_stmt (vec_dest, rhs);
3273 new_temp = make_ssa_name (vec_dest, new_stmt);
3274 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
3276 vect_finish_stmt_generation (stmt, new_stmt, bsi);
3279 STMT_VINFO_VEC_STMT (stmt_info) = new_stmt;
3281 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3283 prev_stmt_info = vinfo_for_stmt (new_stmt);
3286 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
3291 /* No current target implements this case. */
3295 /* The call in STMT might prevent it from being removed in dce.
3296 We however cannot remove it here, due to the way the ssa name
3297 it defines is mapped to the new definition. So just replace
3298 rhs of the statement with something harmless. */
/* The harmless replacement is a zero converted to the type of SCALAR_DEST. */
3299 type = TREE_TYPE (scalar_dest);
3300 GIMPLE_STMT_OPERAND (stmt, 1) = fold_convert (type, integer_zero_node);
3307 /* Function vect_gen_widened_results_half
3309 Create a vector stmt whose code, type, number of arguments, and result
3310 variable are CODE, VECTYPE, OP_TYPE, and VEC_DEST, and its arguments are
3311 VEC_OPRND0 and VEC_OPRND1. The new vector stmt is to be inserted at BSI.
3312 In the case that CODE is a CALL_EXPR, this means that a call to DECL
3313 needs to be created (DECL is a function-decl of a target-builtin).
3314 STMT is the original scalar stmt that we are vectorizing. */
3317 vect_gen_widened_results_half (enum tree_code code, tree vectype, tree decl,
3318 tree vec_oprnd0, tree vec_oprnd1, int op_type,
3319 tree vec_dest, block_stmt_iterator *bsi,
3328 /* Generate half of the widened result: */
3329 if (code == CALL_EXPR)
3331 /* Target specific support */
/* Call DECL with two operands for a binary op, with one operand otherwise. */
3332 if (op_type == binary_op)
3333 expr = build_call_expr (decl, 2, vec_oprnd0, vec_oprnd1);
3335 expr = build_call_expr (decl, 1, vec_oprnd0);
3339 /* Generic support */
/* OP_TYPE must agree with the operand count of CODE. */
3340 gcc_assert (op_type == TREE_CODE_LENGTH (code));
3341 if (op_type == binary_op)
3342 expr = build2 (code, vectype, vec_oprnd0, vec_oprnd1);
3344 expr = build1 (code, vectype, vec_oprnd0);
/* Assign EXPR to a fresh SSA name created from VEC_DEST and emit the new
   stmt through vect_finish_stmt_generation. */
3346 new_stmt = build_gimple_modify_stmt (vec_dest, expr);
3347 new_temp = make_ssa_name (vec_dest, new_stmt);
3348 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
3349 vect_finish_stmt_generation (stmt, new_stmt, bsi);
/* For a call, mark the variable behind each virtual operand of the new stmt
   for renaming. */
3351 if (code == CALL_EXPR)
3353 FOR_EACH_SSA_TREE_OPERAND (sym, new_stmt, iter, SSA_OP_ALL_VIRTUALS)
3355 if (TREE_CODE (sym) == SSA_NAME)
3356 sym = SSA_NAME_VAR (sym);
3357 mark_sym_for_renaming (sym);
3365 /* Check if STMT performs a conversion operation, that can be vectorized.
3366 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
3367 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
3368 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
3371 vectorizable_conversion (tree stmt, block_stmt_iterator *bsi,
3372 tree *vec_stmt, slp_tree slp_node)
3378 tree vec_oprnd0 = NULL_TREE, vec_oprnd1 = NULL_TREE;
3379 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3380 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3381 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3382 enum tree_code code, code1 = ERROR_MARK, code2 = ERROR_MARK;
3383 tree decl1 = NULL_TREE, decl2 = NULL_TREE;
3386 enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
3387 tree new_stmt = NULL_TREE;
3388 stmt_vec_info prev_stmt_info;
3391 tree vectype_out, vectype_in;
3394 tree rhs_type, lhs_type;
3396 enum { NARROW, NONE, WIDEN } modifier;
3398 VEC(tree,heap) *vec_oprnds0 = NULL;
3401 /* Is STMT a vectorizable conversion? */
3403 if (!STMT_VINFO_RELEVANT_P (stmt_info))
3406 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
3409 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
3412 if (TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 0)) != SSA_NAME)
/* Only FIX_TRUNC_EXPR and FLOAT_EXPR pass the code test below. */
3415 operation = GIMPLE_STMT_OPERAND (stmt, 1);
3416 code = TREE_CODE (operation);
3417 if (code != FIX_TRUNC_EXPR && code != FLOAT_EXPR)
3420 /* Check types of lhs and rhs. */
3421 op0 = TREE_OPERAND (operation, 0);
3422 rhs_type = TREE_TYPE (op0);
3423 vectype_in = get_vectype_for_scalar_type (rhs_type);
3426 nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
3428 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
3429 lhs_type = TREE_TYPE (scalar_dest);
3430 vectype_out = get_vectype_for_scalar_type (lhs_type);
3433 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
/* NOTE(review): the three element-count tests below appear to select
   MODIFIER (NARROW / NONE / WIDEN) -- confirm against the assignments. */
3436 if (nunits_in == nunits_out / 2)
3438 else if (nunits_out == nunits_in)
3440 else if (nunits_out == nunits_in / 2)
3445 if (modifier == NONE)
3446 gcc_assert (STMT_VINFO_VECTYPE (stmt_info) == vectype_out);
3448 /* Bail out if the types are both integral or non-integral. */
3449 if ((INTEGRAL_TYPE_P (rhs_type) && INTEGRAL_TYPE_P (lhs_type))
3450 || (!INTEGRAL_TYPE_P (rhs_type) && !INTEGRAL_TYPE_P (lhs_type)))
/* A NARROW conversion needs VF / nunits_out copies of the vector stmt; the
   other computation uses VF / nunits_in. */
3453 if (modifier == NARROW)
3454 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_out;
3456 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in;
3458 /* FORNOW: SLP with multiple types is not supported. The SLP analysis verifies
3459 this, so we can safely override NCOPIES with 1 here. */
3463 /* Sanity check: make sure that at least one copy of the vectorized stmt
3464 needs to be generated. */
3465 gcc_assert (ncopies >= 1);
3467 /* FORNOW. This restriction should be relaxed. */
3468 if (nested_in_vect_loop_p (loop, stmt) && ncopies > 1)
3470 if (vect_print_dump_info (REPORT_DETAILS))
3471 fprintf (vect_dump, "multiple types in nested loop.");
3475 /* Check the operands of the operation. */
3476 if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt[0]))
3478 if (vect_print_dump_info (REPORT_DETAILS))
3479 fprintf (vect_dump, "use not simple.");
3483 /* Supportable by target? */
/* NONE is checked through the target's builtin_conversion hook; WIDEN and
   NARROW are checked through supportable_widening_operation and
   supportable_narrowing_operation respectively. */
3484 if ((modifier == NONE
3485 && !targetm.vectorize.builtin_conversion (code, vectype_in))
3486 || (modifier == WIDEN
3487 && !supportable_widening_operation (code, stmt, vectype_in,
3490 || (modifier == NARROW
3491 && !supportable_narrowing_operation (code, stmt, vectype_in,
3494 if (vect_print_dump_info (REPORT_DETAILS))
3495 fprintf (vect_dump, "op not supported by target.");
3499 if (modifier != NONE)
3501 STMT_VINFO_VECTYPE (stmt_info) = vectype_in;
3502 /* FORNOW: SLP not supported. */
3503 if (STMT_SLP_TYPE (stmt_info))
3507 if (!vec_stmt) /* transformation not required. */
3509 STMT_VINFO_TYPE (stmt_info) = type_conversion_vec_info_type;
3514 if (vect_print_dump_info (REPORT_DETAILS))
3515 fprintf (vect_dump, "transform conversion.");
3518 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
3520 if (modifier == NONE && !slp_node)
3521 vec_oprnds0 = VEC_alloc (tree, heap, 1);
3523 prev_stmt_info = NULL;
/* This loop calls BUILTIN_DECL on each vector operand in VEC_OPRNDS0; the
   target's builtin_conversion hook is queried just before the calls
   (presumably to obtain BUILTIN_DECL -- confirm). */
3527 for (j = 0; j < ncopies; j++)
3533 vect_get_vec_defs (op0, NULL, stmt, &vec_oprnds0, NULL, slp_node);
3535 vect_get_vec_defs_for_stmt_copy (dt, &vec_oprnds0, NULL);
3538 targetm.vectorize.builtin_conversion (code, vectype_in);
3539 for (i = 0; VEC_iterate (tree, vec_oprnds0, i, vop0); i++)
3541 new_stmt = build_call_expr (builtin_decl, 1, vop0);
3543 /* Arguments are ready. create the new vector stmt. */
3544 new_stmt = build_gimple_modify_stmt (vec_dest, new_stmt);
3545 new_temp = make_ssa_name (vec_dest, new_stmt);
3546 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
3547 vect_finish_stmt_generation (stmt, new_stmt, bsi);
3548 FOR_EACH_SSA_TREE_OPERAND (sym, new_stmt, iter,
3549 SSA_OP_ALL_VIRTUALS)
3551 if (TREE_CODE (sym) == SSA_NAME)
3552 sym = SSA_NAME_VAR (sym);
3553 mark_sym_for_renaming (sym);
3556 VEC_quick_push (tree, SLP_TREE_VEC_STMTS (slp_node), new_stmt);
3560 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
3562 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3563 prev_stmt_info = vinfo_for_stmt (new_stmt);
3568 /* In case the vectorization factor (VF) is bigger than the number
3569 of elements that we can fit in a vectype (nunits), we have to
3570 generate more than one vector stmt - i.e - we need to "unroll"
3571 the vector stmt by a factor VF/nunits. */
3572 for (j = 0; j < ncopies; j++)
3575 vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
3577 vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd0);
3579 STMT_VINFO_VECTYPE (stmt_info) = vectype_in;
/* Each copy emits two stmts: one built from CODE1/DECL1 and one built from
   CODE2/DECL2, both reading VEC_OPRND0. */
3581 /* Generate first half of the widened result: */
3583 = vect_gen_widened_results_half (code1, vectype_out, decl1,
3584 vec_oprnd0, vec_oprnd1,
3585 unary_op, vec_dest, bsi, stmt);
3587 STMT_VINFO_VEC_STMT (stmt_info) = new_stmt;
3589 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3590 prev_stmt_info = vinfo_for_stmt (new_stmt);
3592 /* Generate second half of the widened result: */
3594 = vect_gen_widened_results_half (code2, vectype_out, decl2,
3595 vec_oprnd0, vec_oprnd1,
3596 unary_op, vec_dest, bsi, stmt);
3597 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3598 prev_stmt_info = vinfo_for_stmt (new_stmt);
3603 /* In case the vectorization factor (VF) is bigger than the number
3604 of elements that we can fit in a vectype (nunits), we have to
3605 generate more than one vector stmt - i.e - we need to "unroll"
3606 the vector stmt by a factor VF/nunits. */
3607 for (j = 0; j < ncopies; j++)
/* Each copy takes two consecutive vector defs of OP0 (VEC_OPRND0 and
   VEC_OPRND1) and combines them with CODE1 into one VECTYPE_OUT value. */
3612 vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
3613 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd0);
3617 vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd1);
3618 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd0);
3621 /* Arguments are ready. Create the new vector stmt. */
3622 expr = build2 (code1, vectype_out, vec_oprnd0, vec_oprnd1);
3623 new_stmt = build_gimple_modify_stmt (vec_dest, expr);
3624 new_temp = make_ssa_name (vec_dest, new_stmt);
3625 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
3626 vect_finish_stmt_generation (stmt, new_stmt, bsi);
3629 STMT_VINFO_VEC_STMT (stmt_info) = new_stmt;
3631 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3633 prev_stmt_info = vinfo_for_stmt (new_stmt);
3636 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
3643 /* Function vectorizable_assignment.
3645 Check if STMT performs an assignment (copy) that can be vectorized.
3646 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
3647 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
3648 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
3651 vectorizable_assignment (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt,
3657 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3658 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3659 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3662 enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
/* NCOPIES is the vectorization factor divided by the element count of
   VECTYPE. */
3663 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
3664 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
3666 VEC(tree,heap) *vec_oprnds = NULL;
3669 gcc_assert (ncopies >= 1);
3671 return false; /* FORNOW */
3673 if (!STMT_VINFO_RELEVANT_P (stmt_info))
3676 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
3679 /* Is vectorizable assignment? */
3680 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
3683 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
3684 if (TREE_CODE (scalar_dest) != SSA_NAME)
/* The right-hand side must be a simple use; its def type goes to DT[0]. */
3687 op = GIMPLE_STMT_OPERAND (stmt, 1);
3688 if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt[0]))
3690 if (vect_print_dump_info (REPORT_DETAILS))
3691 fprintf (vect_dump, "use not simple.");
/* Analysis only: record the stmt kind and its cost. */
3695 if (!vec_stmt) /* transformation not required. */
3697 STMT_VINFO_TYPE (stmt_info) = assignment_vec_info_type;
3698 if (vect_print_dump_info (REPORT_DETAILS))
3699 fprintf (vect_dump, "=== vectorizable_assignment ===");
3700 vect_model_simple_cost (stmt_info, ncopies, dt, NULL);
3705 if (vect_print_dump_info (REPORT_DETAILS))
3706 fprintf (vect_dump, "transform assignment.");
3709 vec_dest = vect_create_destination_var (scalar_dest, vectype);
/* Collect the vector defs of OP, then emit one vector copy per def. */
3712 vect_get_vec_defs (op, NULL, stmt, &vec_oprnds, NULL, slp_node);
3714 /* Arguments are ready. create the new vector stmt. */
3715 for (i = 0; VEC_iterate (tree, vec_oprnds, i, vop); i++)
3717 *vec_stmt = build_gimple_modify_stmt (vec_dest, vop);
3718 new_temp = make_ssa_name (vec_dest, *vec_stmt);
3719 GIMPLE_STMT_OPERAND (*vec_stmt, 0) = new_temp;
3720 vect_finish_stmt_generation (stmt, *vec_stmt, bsi);
3721 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt;
/* Record the new stmt in the vector stmt list of SLP_NODE. */
3724 VEC_quick_push (tree, SLP_TREE_VEC_STMTS (slp_node), *vec_stmt);
3727 VEC_free (tree, heap, vec_oprnds);
3732 /* Function vect_min_worthwhile_factor.
3734 For a loop where we could vectorize the operation indicated by CODE,
3735 return the minimum vectorization factor that makes it worthwhile
3736 to use generic vectors. */
/* vectorizable_operation compares the returned value against
   LOOP_VINFO_VECT_FACTOR when the target has no instruction for the
   operation or the vector type does not have a vector mode. */
3738 vect_min_worthwhile_factor (enum tree_code code)
3759 /* Function vectorizable_induction
3761 Check if PHI performs an induction computation that can be vectorized.
3762 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
3763 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
3764 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
3767 vectorizable_induction (tree phi, block_stmt_iterator *bsi ATTRIBUTE_UNUSED,
3770 stmt_vec_info stmt_info = vinfo_for_stmt (phi);
3771 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3772 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
/* NCOPIES is the vectorization factor divided by the element count of
   VECTYPE. */
3773 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
3774 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
3777 gcc_assert (ncopies >= 1);
3779 if (!STMT_VINFO_RELEVANT_P (stmt_info))
3782 /* FORNOW: SLP not supported. */
3783 if (STMT_SLP_TYPE (stmt_info))
/* A relevant, non-SLP stmt reaching this point must already be classified
   as an induction definition. */
3786 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def);
3788 if (TREE_CODE (phi) != PHI_NODE)
/* Analysis only: record the stmt kind and the induction cost. */
3791 if (!vec_stmt) /* transformation not required. */
3793 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
3794 if (vect_print_dump_info (REPORT_DETAILS))
3795 fprintf (vect_dump, "=== vectorizable_induction ===");
3796 vect_model_induction_cost (stmt_info, ncopies);
3802 if (vect_print_dump_info (REPORT_DETAILS))
3803 fprintf (vect_dump, "transform induction phi.");
/* *VEC_STMT becomes the stmt that defines the SSA name returned by
   get_initial_def_for_induction. */
3805 vec_def = get_initial_def_for_induction (phi);
3806 *vec_stmt = SSA_NAME_DEF_STMT (vec_def);
3811 /* Function vectorizable_operation.
3813 Check if STMT performs a binary or unary operation that can be vectorized.
3814 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
3815 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
3816 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
3819 vectorizable_operation (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt,
3825 tree op0, op1 = NULL;
3826 tree vec_oprnd1 = NULL_TREE;
3827 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3828 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3829 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3830 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3831 enum tree_code code;
3832 enum machine_mode vec_mode;
3837 enum machine_mode optab_op2_mode;
3839 enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
3840 tree new_stmt = NULL_TREE;
3841 stmt_vec_info prev_stmt_info;
3842 int nunits_in = TYPE_VECTOR_SUBPARTS (vectype);
3845 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in;
3847 VEC(tree,heap) *vec_oprnds0 = NULL, *vec_oprnds1 = NULL;
3850 bool scalar_shift_arg = false;
3852 /* FORNOW: SLP with multiple types is not supported. The SLP analysis verifies
3853 this, so we can safely override NCOPIES with 1 here. */
3856 gcc_assert (ncopies >= 1);
3857 /* FORNOW. This restriction should be relaxed. */
3858 if (nested_in_vect_loop_p (loop, stmt) && ncopies > 1)
3860 if (vect_print_dump_info (REPORT_DETAILS))
3861 fprintf (vect_dump, "multiple types in nested loop.");
3865 if (!STMT_VINFO_RELEVANT_P (stmt_info))
3868 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
3871 /* Is STMT a vectorizable binary/unary operation? */
3872 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
3875 if (TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 0)) != SSA_NAME)
/* The element count of the result's vector type is compared with that of
   VECTYPE (NUNITS_IN). */
3878 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
3879 vectype_out = get_vectype_for_scalar_type (TREE_TYPE (scalar_dest));
3882 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
3883 if (nunits_out != nunits_in)
3886 operation = GIMPLE_STMT_OPERAND (stmt, 1);
3887 code = TREE_CODE (operation);
3889 /* For pointer addition, we should use the normal plus for
3890 the vector addition. */
3891 if (code == POINTER_PLUS_EXPR)
3894 optab = optab_for_tree_code (code, vectype);
3896 /* Support only unary or binary operations. */
3897 op_type = TREE_OPERAND_LENGTH (operation);
3898 if (op_type != unary_op && op_type != binary_op)
3900 if (vect_print_dump_info (REPORT_DETAILS))
3901 fprintf (vect_dump, "num. args = %d (not unary/binary op).", op_type);
/* Operand 0 (and operand 1 of a binary op) must be simple uses; their def
   types are stored in DT[0] and DT[1]. */
3905 op0 = TREE_OPERAND (operation, 0);
3906 if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt[0]))
3908 if (vect_print_dump_info (REPORT_DETAILS))
3909 fprintf (vect_dump, "use not simple.");
3913 if (op_type == binary_op)
3915 op1 = TREE_OPERAND (operation, 1);
3916 if (!vect_is_simple_use (op1, loop_vinfo, &def_stmt, &def, &dt[1]))
3918 if (vect_print_dump_info (REPORT_DETAILS))
3919 fprintf (vect_dump, "use not simple.");
3924 /* Supportable by target? */
3927 if (vect_print_dump_info (REPORT_DETAILS))
3928 fprintf (vect_dump, "no optab.");
/* Look up the instruction code registered for OPTAB in the mode of
   VECTYPE. */
3931 vec_mode = TYPE_MODE (vectype);
3932 icode = (int) optab_handler (optab, vec_mode)->insn_code;
3933 if (icode == CODE_FOR_nothing)
3935 if (vect_print_dump_info (REPORT_DETAILS))
3936 fprintf (vect_dump, "op not supported by target.");
3937 /* Check only during analysis. */
3938 if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
3939 || (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
3940 < vect_min_worthwhile_factor (code)
3943 if (vect_print_dump_info (REPORT_DETAILS))
3944 fprintf (vect_dump, "proceeding using word mode.");
3947 /* Worthwhile without SIMD support? Check only during analysis. */
3948 if (!VECTOR_MODE_P (TYPE_MODE (vectype))
3949 && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
3950 < vect_min_worthwhile_factor (code)
3953 if (vect_print_dump_info (REPORT_DETAILS))
3954 fprintf (vect_dump, "not worthwhile without SIMD support.");
/* Shifts: when the mode of the instruction's operand 2 is not a vector
   mode, DT[1] is tested for being a constant or loop-invariant def and
   SCALAR_SHIFT_ARG is set. */
3958 if (code == LSHIFT_EXPR || code == RSHIFT_EXPR)
3960 /* FORNOW: not yet supported. */
3961 if (!VECTOR_MODE_P (vec_mode))
3964 /* Invariant argument is needed for a vector shift
3965 by a scalar shift operand. */
3966 optab_op2_mode = insn_data[icode].operand[2].mode;
3967 if (!VECTOR_MODE_P (optab_op2_mode))
3969 if (dt[1] != vect_constant_def && dt[1] != vect_invariant_def)
3971 if (vect_print_dump_info (REPORT_DETAILS))
3972 fprintf (vect_dump, "operand mode requires invariant"
3977 scalar_shift_arg = true;
/* Analysis only: record the stmt kind and its cost. */
3981 if (!vec_stmt) /* transformation not required. */
3983 STMT_VINFO_TYPE (stmt_info) = op_vec_info_type;
3984 if (vect_print_dump_info (REPORT_DETAILS))
3985 fprintf (vect_dump, "=== vectorizable_operation ===");
3986 vect_model_simple_cost (stmt_info, ncopies, dt, NULL);
3992 if (vect_print_dump_info (REPORT_DETAILS))
3993 fprintf (vect_dump, "transform binary/unary operation.");
3996 vec_dest = vect_create_destination_var (scalar_dest, vectype);
3998 /* Allocate VECs for vector operands. In case of SLP, vector operands are
3999 created in the previous stages of the recursion, so no allocation is
4000 needed, except for the case of shift with scalar shift argument. In that
4001 case we store the scalar operand in VEC_OPRNDS1 for every vector stmt to
4002 be created to vectorize the SLP group, i.e., SLP_NODE->VEC_STMTS_SIZE.
4003 In case of loop-based vectorization we allocate VECs of size 1. We
4004 allocate VEC_OPRNDS1 only in case of binary operation. */
4007 vec_oprnds0 = VEC_alloc (tree, heap, 1);
4008 if (op_type == binary_op)
4009 vec_oprnds1 = VEC_alloc (tree, heap, 1);
4011 else if (scalar_shift_arg)
4012 vec_oprnds1 = VEC_alloc (tree, heap, slp_node->vec_stmts_size);
4014 /* In case the vectorization factor (VF) is bigger than the number
4015 of elements that we can fit in a vectype (nunits), we have to generate
4016 more than one vector stmt - i.e - we need to "unroll" the
4017 vector stmt by a factor VF/nunits. In doing so, we record a pointer
4018 from one copy of the vector stmt to the next, in the field
4019 STMT_VINFO_RELATED_STMT. This is necessary in order to allow following
4020 stages to find the correct vector defs to be used when vectorizing
4021 stmts that use the defs of the current stmt. The example below illustrates
4022 the vectorization process when VF=16 and nunits=4 (i.e - we need to create
4023 4 vectorized stmts):
4025 before vectorization:
4026 RELATED_STMT VEC_STMT
4030 step 1: vectorize stmt S1 (done in vectorizable_load. See more details
4032 RELATED_STMT VEC_STMT
4033 VS1_0: vx0 = memref0 VS1_1 -
4034 VS1_1: vx1 = memref1 VS1_2 -
4035 VS1_2: vx2 = memref2 VS1_3 -
4036 VS1_3: vx3 = memref3 - -
4037 S1: x = load - VS1_0
4040 step2: vectorize stmt S2 (done here):
4041 To vectorize stmt S2 we first need to find the relevant vector
4042 def for the first operand 'x'. This is, as usual, obtained from
4043 the vector stmt recorded in the STMT_VINFO_VEC_STMT of the stmt
4044 that defines 'x' (S1). This way we find the stmt VS1_0, and the
4045 relevant vector def 'vx0'. Having found 'vx0' we can generate
4046 the vector stmt VS2_0, and as usual, record it in the
4047 STMT_VINFO_VEC_STMT of stmt S2.
4048 When creating the second copy (VS2_1), we obtain the relevant vector
4049 def from the vector stmt recorded in the STMT_VINFO_RELATED_STMT of
4050 stmt VS1_0. This way we find the stmt VS1_1 and the relevant
4051 vector def 'vx1'. Using 'vx1' we create stmt VS2_1 and record a
4052 pointer to it in the STMT_VINFO_RELATED_STMT of the vector stmt VS2_0.
4053 Similarly when creating stmts VS2_2 and VS2_3. This is the resulting
4054 chain of stmts and pointers:
4055 RELATED_STMT VEC_STMT
4056 VS1_0: vx0 = memref0 VS1_1 -
4057 VS1_1: vx1 = memref1 VS1_2 -
4058 VS1_2: vx2 = memref2 VS1_3 -
4059 VS1_3: vx3 = memref3 - -
4060 S1: x = load - VS1_0
4061 VS2_0: vz0 = vx0 + v1 VS2_1 -
4062 VS2_1: vz1 = vx1 + v1 VS2_2 -
4063 VS2_2: vz2 = vx2 + v1 VS2_3 -
4064 VS2_3: vz3 = vx3 + v1 - -
4065 S2: z = x + 1 - VS2_0 */
4067 prev_stmt_info = NULL;
4068 for (j = 0; j < ncopies; j++)
4073 if (op_type == binary_op
4074 && (code == LSHIFT_EXPR || code == RSHIFT_EXPR))
4076 /* Vector shl and shr insn patterns can be defined with scalar
4077 operand 2 (shift operand). In this case, use constant or loop
4078 invariant op1 directly, without extending it to vector mode
4080 optab_op2_mode = insn_data[icode].operand[2].mode;
4081 if (!VECTOR_MODE_P (optab_op2_mode))
4083 if (vect_print_dump_info (REPORT_DETAILS))
4084 fprintf (vect_dump, "operand 1 using scalar mode.");
4086 VEC_quick_push (tree, vec_oprnds1, vec_oprnd1);
4089 /* Store vec_oprnd1 for every vector stmt to be created
4090 for SLP_NODE. We check during the analysis that all the
4091 shift arguments are the same.
4092 TODO: Allow different constants for different vector
4093 stmts generated for an SLP instance. */
4094 for (k = 0; k < slp_node->vec_stmts_size - 1; k++)
4095 VEC_quick_push (tree, vec_oprnds1, vec_oprnd1);
4100 /* vec_oprnd1 is available if operand 1 should be of a scalar-type
4101 (a special case for certain kind of vector shifts); otherwise,
4102 operand 1 should be of a vector type (the usual case). */
4103 if (op_type == binary_op && !vec_oprnd1)
4104 vect_get_vec_defs (op0, op1, stmt, &vec_oprnds0, &vec_oprnds1,
4107 vect_get_vec_defs (op0, NULL_TREE, stmt, &vec_oprnds0, NULL,
4111 vect_get_vec_defs_for_stmt_copy (dt, &vec_oprnds0, &vec_oprnds1);
4113 /* Arguments are ready. Create the new vector stmt. */
/* One stmt is built per entry of VEC_OPRNDS0; a binary op pairs it with the
   entry at the same index of VEC_OPRNDS1. */
4114 for (i = 0; VEC_iterate (tree, vec_oprnds0, i, vop0); i++)
4116 if (op_type == binary_op)
4118 vop1 = VEC_index (tree, vec_oprnds1, i);
4119 new_stmt = build_gimple_modify_stmt (vec_dest,
4120 build2 (code, vectype, vop0, vop1));
4123 new_stmt = build_gimple_modify_stmt (vec_dest,
4124 build1 (code, vectype, vop0));
4126 new_temp = make_ssa_name (vec_dest, new_stmt);
4127 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
4128 vect_finish_stmt_generation (stmt, new_stmt, bsi);
4130 VEC_quick_push (tree, SLP_TREE_VEC_STMTS (slp_node), new_stmt);
4134 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
4136 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
4137 prev_stmt_info = vinfo_for_stmt (new_stmt);
/* Release the operand vectors. */
4140 VEC_free (tree, heap, vec_oprnds0);
4142 VEC_free (tree, heap, vec_oprnds1);
4148 /* Function vectorizable_type_demotion
4150 Check if STMT performs a binary or unary operation that involves
4151 type demotion, and if it can be vectorized.
4152 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
4153 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
4154 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
4157 vectorizable_type_demotion (tree stmt, block_stmt_iterator *bsi,
4164 tree vec_oprnd0=NULL, vec_oprnd1=NULL;
4165 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4166 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4167 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4168 enum tree_code code, code1 = ERROR_MARK;
4171 enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
4173 stmt_vec_info prev_stmt_info;
4182 if (!STMT_VINFO_RELEVANT_P (stmt_info))
4185 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
4188 /* Is STMT a vectorizable type-demotion operation? */
4189 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
4192 if (TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 0)) != SSA_NAME)
/* Only NOP_EXPR and CONVERT_EXPR pass the code test below. */
4195 operation = GIMPLE_STMT_OPERAND (stmt, 1);
4196 code = TREE_CODE (operation);
4197 if (code != NOP_EXPR && code != CONVERT_EXPR)
4200 op0 = TREE_OPERAND (operation, 0);
4201 vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op0));
4204 nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
4206 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
4207 vectype_out = get_vectype_for_scalar_type (TREE_TYPE (scalar_dest));
4210 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
/* FORNOW the output vector type must hold exactly twice as many elements as
   the input vector type. */
4211 if (nunits_in != nunits_out / 2) /* FORNOW */
4214 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_out;
4215 gcc_assert (ncopies >= 1);
4216 /* FORNOW. This restriction should be relaxed. */
4217 if (nested_in_vect_loop_p (loop, stmt) && ncopies > 1)
4219 if (vect_print_dump_info (REPORT_DETAILS))
4220 fprintf (vect_dump, "multiple types in nested loop.");
/* The condition below is the negation of: both types integral, or both
   types scalar floating-point.
   NOTE(review): the CODE test on its last line repeats the
   NOP_EXPR/CONVERT_EXPR check already made above. */
4224 if (! ((INTEGRAL_TYPE_P (TREE_TYPE (scalar_dest))
4225 && INTEGRAL_TYPE_P (TREE_TYPE (op0)))
4226 || (SCALAR_FLOAT_TYPE_P (TREE_TYPE (scalar_dest))
4227 && SCALAR_FLOAT_TYPE_P (TREE_TYPE (op0))
4228 && (code == NOP_EXPR || code == CONVERT_EXPR))))
4231 /* Check the operands of the operation. */
4232 if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt[0]))
4234 if (vect_print_dump_info (REPORT_DETAILS))
4235 fprintf (vect_dump, "use not simple.");
4239 /* Supportable by target? */
/* supportable_narrowing_operation receives the address of CODE1, the tree
   code later used to build the vector stmt. */
4240 if (!supportable_narrowing_operation (code, stmt, vectype_in, &code1))
4243 STMT_VINFO_VECTYPE (stmt_info) = vectype_in;
/* Analysis only: record the stmt kind and its cost. */
4245 if (!vec_stmt) /* transformation not required. */
4247 STMT_VINFO_TYPE (stmt_info) = type_demotion_vec_info_type;
4248 if (vect_print_dump_info (REPORT_DETAILS))
4249 fprintf (vect_dump, "=== vectorizable_demotion ===");
4250 vect_model_simple_cost (stmt_info, ncopies, dt, NULL);
4255 if (vect_print_dump_info (REPORT_DETAILS))
4256 fprintf (vect_dump, "transform type demotion operation. ncopies = %d.",
4260 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
4262 /* In case the vectorization factor (VF) is bigger than the number
4263 of elements that we can fit in a vectype (nunits), we have to generate
4264 more than one vector stmt - i.e - we need to "unroll" the
4265 vector stmt by a factor VF/nunits. */
4266 prev_stmt_info = NULL;
4267 for (j = 0; j < ncopies; j++)
/* Each copy takes two consecutive vector defs of OP0 (VEC_OPRND0 and
   VEC_OPRND1) and combines them with CODE1 into one VECTYPE_OUT value. */
4272 vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
4273 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd0);
4277 vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd1);
4278 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd0);
4281 /* Arguments are ready. Create the new vector stmt. */
4282 expr = build2 (code1, vectype_out, vec_oprnd0, vec_oprnd1);
4283 new_stmt = build_gimple_modify_stmt (vec_dest, expr);
4284 new_temp = make_ssa_name (vec_dest, new_stmt);
4285 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
4286 vect_finish_stmt_generation (stmt, new_stmt, bsi);
/* Chain the copies through STMT_VINFO_RELATED_STMT. */
4289 STMT_VINFO_VEC_STMT (stmt_info) = new_stmt;
4291 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
4293 prev_stmt_info = vinfo_for_stmt (new_stmt);
4296 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
4301 /* Function vectorizable_type_promotion
4303 Check if STMT performs a binary or unary operation that involves
4304 type promotion, and if it can be vectorized.
4305 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
4306 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
     The operations accepted are NOP_EXPR, CONVERT_EXPR and WIDEN_MULT_EXPR.
     Because the widened result does not fit in one vector, every copy of
     the vectorized stmt is emitted as two vector stmts (one per half of
     the widened result).
4307 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
4310 vectorizable_type_promotion (tree stmt, block_stmt_iterator *bsi,
4316 tree op0, op1 = NULL;
4317 tree vec_oprnd0=NULL, vec_oprnd1=NULL;
4318 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4319 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4320 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4321 enum tree_code code, code1 = ERROR_MARK, code2 = ERROR_MARK;
4322 tree decl1 = NULL_TREE, decl2 = NULL_TREE;
4325 enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
4327 stmt_vec_info prev_stmt_info;
4335 if (!STMT_VINFO_RELEVANT_P (stmt_info))
4338 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
4341 /* Is STMT a vectorizable type-promotion operation? */
4342 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
4345 if (TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 0)) != SSA_NAME)
4348 operation = GIMPLE_STMT_OPERAND (stmt, 1);
4349 code = TREE_CODE (operation);
4350 if (code != NOP_EXPR && code != CONVERT_EXPR
4351 && code != WIDEN_MULT_EXPR)
4354 op0 = TREE_OPERAND (operation, 0);
4355 vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op0));
4358 nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
4360 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
4361 vectype_out = get_vectype_for_scalar_type (TREE_TYPE (scalar_dest));
4364 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
/* Only a 2x widening is handled: the output vector type must hold exactly
   half as many elements as the input vector type. */
4365 if (nunits_out != nunits_in / 2) /* FORNOW */
/* NCOPIES is derived from the input vector type, i.e. the one with the
   larger number of elements. */
4368 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in;
4369 gcc_assert (ncopies >= 1);
4370 /* FORNOW. This restriction should be relaxed. */
4371 if (nested_in_vect_loop_p (loop, stmt) && ncopies > 1)
4373 if (vect_print_dump_info (REPORT_DETAILS))
4374 fprintf (vect_dump, "multiple types in nested loop.");
/* Accept integral -> integral operations, and float -> float operations
   only when they are plain conversions (CONVERT_EXPR or NOP_EXPR). */
4378 if (! ((INTEGRAL_TYPE_P (TREE_TYPE (scalar_dest))
4379 && INTEGRAL_TYPE_P (TREE_TYPE (op0)))
4380 || (SCALAR_FLOAT_TYPE_P (TREE_TYPE (scalar_dest))
4381 && SCALAR_FLOAT_TYPE_P (TREE_TYPE (op0))
4382 && (code == CONVERT_EXPR || code == NOP_EXPR))))
4385 /* Check the operands of the operation. */
4386 if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt[0]))
4388 if (vect_print_dump_info (REPORT_DETAILS))
4389 fprintf (vect_dump, "use not simple.");
4393 op_type = TREE_CODE_LENGTH (code);
4394 if (op_type == binary_op)
4396 op1 = TREE_OPERAND (operation, 1);
4397 if (!vect_is_simple_use (op1, loop_vinfo, &def_stmt, &def, &dt[1]))
4399 if (vect_print_dump_info (REPORT_DETAILS))
4400 fprintf (vect_dump, "use not simple.");
4405 /* Supportable by target? */
4406 if (!supportable_widening_operation (code, stmt, vectype_in,
4407 &decl1, &decl2, &code1, &code2))
4410 STMT_VINFO_VECTYPE (stmt_info) = vectype_in;
4412 if (!vec_stmt) /* transformation not required. */
4414 STMT_VINFO_TYPE (stmt_info) = type_promotion_vec_info_type;
4415 if (vect_print_dump_info (REPORT_DETAILS))
4416 fprintf (vect_dump, "=== vectorizable_promotion ===");
/* The count passed is 2*ncopies, matching the two vector stmts that the
   transformation below generates for every copy. */
4417 vect_model_simple_cost (stmt_info, 2*ncopies, dt, NULL);
4423 if (vect_print_dump_info (REPORT_DETAILS))
4424 fprintf (vect_dump, "transform type promotion operation. ncopies = %d.",
4428 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
4430 /* In case the vectorization factor (VF) is bigger than the number
4431 of elements that we can fit in a vectype (nunits), we have to generate
4432 more than one vector stmt - i.e - we need to "unroll" the
4433 vector stmt by a factor VF/nunits. */
4435 prev_stmt_info = NULL;
4436 for (j = 0; j < ncopies; j++)
/* Initial vector defs, obtained from the scalar operands of STMT. */
4441 vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
4442 if (op_type == binary_op)
4443 vec_oprnd1 = vect_get_vec_def_for_operand (op1, stmt, NULL);
/* Vector defs for a further copy, derived from the defs used by the
   previous copy. */
4447 vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd0);
4448 if (op_type == binary_op)
4449 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt[1], vec_oprnd1);
4452 /* Arguments are ready. Create the new vector stmt. We are creating
4453 two vector defs because the widened result does not fit in one vector.
4454 The vectorized stmt can be expressed as a call to a target builtin,
4455 or using a tree-code. */
4456 /* Generate first half of the widened result: */
4457 new_stmt = vect_gen_widened_results_half (code1, vectype_out, decl1,
4458 vec_oprnd0, vec_oprnd1, op_type, vec_dest, bsi, stmt);
/* The generated stmts are chained: the first one is recorded in
   STMT_VINFO_VEC_STMT of STMT, and each later one in
   STMT_VINFO_RELATED_STMT of the stmt generated before it. */
4460 STMT_VINFO_VEC_STMT (stmt_info) = new_stmt;
4462 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
4463 prev_stmt_info = vinfo_for_stmt (new_stmt);
4465 /* Generate second half of the widened result: */
4466 new_stmt = vect_gen_widened_results_half (code2, vectype_out, decl2,
4467 vec_oprnd0, vec_oprnd1, op_type, vec_dest, bsi, stmt);
4468 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
4469 prev_stmt_info = vinfo_for_stmt (new_stmt);
4473 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
4478 /* Function vect_strided_store_supported.
4480 Returns TRUE if INTERLEAVE_HIGH and INTERLEAVE_LOW operations are supported,
4481 and FALSE otherwise.
     VECTYPE is the vector type to check; its machine mode is the mode looked
     up in the two interleave optabs. */
4484 vect_strided_store_supported (tree vectype)
4486 optab interleave_high_optab, interleave_low_optab;
4489 mode = (int) TYPE_MODE (vectype);
4491 /* Check that the operation is supported. */
/* Two things are required: an optab must exist for each of the two
   interleave tree codes, and each optab must provide an insn for MODE. */
4492 interleave_high_optab = optab_for_tree_code (VEC_INTERLEAVE_HIGH_EXPR,
4494 interleave_low_optab = optab_for_tree_code (VEC_INTERLEAVE_LOW_EXPR,
4496 if (!interleave_high_optab || !interleave_low_optab)
4498 if (vect_print_dump_info (REPORT_DETAILS))
4499 fprintf (vect_dump, "no optab for interleave.");
/* CODE_FOR_nothing means the target has no insn for that optab in MODE. */
4503 if (optab_handler (interleave_high_optab, mode)->insn_code
4505 || optab_handler (interleave_low_optab, mode)->insn_code
4506 == CODE_FOR_nothing)
4508 if (vect_print_dump_info (REPORT_DETAILS))
4509 fprintf (vect_dump, "interleave op not supported by target.");
4517 /* Function vect_permute_store_chain.
4519 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
4520 a power of 2, generate interleave_high/low stmts to reorder the data
4521 correctly for the stores. Return the final references for stores in
4524 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4525 The input is 4 vectors each containing 8 elements. We assign a number to each
4526 element, the input sequence is:
4528 1st vec: 0 1 2 3 4 5 6 7
4529 2nd vec: 8 9 10 11 12 13 14 15
4530 3rd vec: 16 17 18 19 20 21 22 23
4531 4th vec: 24 25 26 27 28 29 30 31
4533 The output sequence should be:
4535 1st vec: 0 8 16 24 1 9 17 25
4536 2nd vec: 2 10 18 26 3 11 19 27
4537 3rd vec: 4 12 20 28 5 13 21 29
4538 4th vec: 6 14 22 30 7 15 23 31
4540 i.e., we interleave the contents of the four vectors in their order.
4542 We use interleave_high/low instructions to create such output. The input of
4543 each interleave_high/low operation is two vectors:
4546 the even elements of the result vector are obtained left-to-right from the
4547 high/low elements of the first vector. The odd elements of the result are
4548 obtained left-to-right from the high/low elements of the second vector.
4549 The output of interleave_high will be: 0 4 1 5
4550 and of interleave_low: 2 6 3 7
4553 The permutation is done in log LENGTH stages. In each stage interleave_high
4554 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
4555 where the first argument is taken from the first half of DR_CHAIN and the
4556 second argument from its second half.
4559 I1: interleave_high (1st vec, 3rd vec)
4560 I2: interleave_low (1st vec, 3rd vec)
4561 I3: interleave_high (2nd vec, 4th vec)
4562 I4: interleave_low (2nd vec, 4th vec)
4564 The output for the first stage is:
4566 I1: 0 16 1 17 2 18 3 19
4567 I2: 4 20 5 21 6 22 7 23
4568 I3: 8 24 9 25 10 26 11 27
4569 I4: 12 28 13 29 14 30 15 31
4571 The output of the second stage, i.e. the final result is:
4573 I1: 0 8 16 24 1 9 17 25
4574 I2: 2 10 18 26 3 11 19 27
4575 I3: 4 12 20 28 5 13 21 29
4576 I4: 6 14 22 30 7 15 23 31. */
4579 vect_permute_store_chain (VEC(tree,heap) *dr_chain,
4580 unsigned int length,
4582 block_stmt_iterator *bsi,
4583 VEC(tree,heap) **result_chain)
4585 tree perm_dest, perm_stmt, vect1, vect2, high, low;
4586 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
4587 tree scalar_dest, tmp;
4590 VEC(tree,heap) *first, *second;
4592 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
/* NOTE(review): FIRST and SECOND are not referenced again in the lines
   visible here, and no VEC_free for them is visible either - confirm whether
   these two allocations are needed; if not, they leak. */
4593 first = VEC_alloc (tree, heap, length/2);
4594 second = VEC_alloc (tree, heap, length/2);
4596 /* Check that the operation is supported. */
4597 if (!vect_strided_store_supported (vectype))
/* Start from a copy of the input so that every slot of *RESULT_CHAIN exists
   and can be overwritten with VEC_replace below. */
4600 *result_chain = VEC_copy (tree, heap, dr_chain);
/* One iteration per permutation stage (log2 of LENGTH stages). */
4602 for (i = 0; i < exact_log2 (length); i++)
/* Pair element J of the first half of DR_CHAIN with element J of the
   second half. */
4604 for (j = 0; j < length/2; j++)
4606 vect1 = VEC_index (tree, dr_chain, j);
4607 vect2 = VEC_index (tree, dr_chain, j+length/2);
4609 /* Create interleaving stmt:
4610 in the case of big endian:
4611 high = interleave_high (vect1, vect2)
4612 and in the case of little endian:
4613 high = interleave_low (vect1, vect2). */
4614 perm_dest = create_tmp_var (vectype, "vect_inter_high");
4615 DECL_GIMPLE_REG_P (perm_dest) = 1;
4616 add_referenced_var (perm_dest);
4617 if (BYTES_BIG_ENDIAN)
4618 tmp = build2 (VEC_INTERLEAVE_HIGH_EXPR, vectype, vect1, vect2);
4620 tmp = build2 (VEC_INTERLEAVE_LOW_EXPR, vectype, vect1, vect2);
4621 perm_stmt = build_gimple_modify_stmt (perm_dest, tmp);
4622 high = make_ssa_name (perm_dest, perm_stmt);
4623 GIMPLE_STMT_OPERAND (perm_stmt, 0) = high;
4624 vect_finish_stmt_generation (stmt, perm_stmt, bsi);
/* The "high" result of pair J goes to the even slot 2*J. */
4625 VEC_replace (tree, *result_chain, 2*j, high);
4627 /* Create interleaving stmt:
4628 in the case of big endian:
4629 low = interleave_low (vect1, vect2)
4630 and in the case of little endian:
4631 low = interleave_high (vect1, vect2). */
4632 perm_dest = create_tmp_var (vectype, "vect_inter_low");
4633 DECL_GIMPLE_REG_P (perm_dest) = 1;
4634 add_referenced_var (perm_dest);
4635 if (BYTES_BIG_ENDIAN)
4636 tmp = build2 (VEC_INTERLEAVE_LOW_EXPR, vectype, vect1, vect2);
4638 tmp = build2 (VEC_INTERLEAVE_HIGH_EXPR, vectype, vect1, vect2);
4639 perm_stmt = build_gimple_modify_stmt (perm_dest, tmp);
4640 low = make_ssa_name (perm_dest, perm_stmt);
4641 GIMPLE_STMT_OPERAND (perm_stmt, 0) = low;
4642 vect_finish_stmt_generation (stmt, perm_stmt, bsi);
/* The "low" result of pair J goes to the odd slot 2*J+1. */
4643 VEC_replace (tree, *result_chain, 2*j+1, low);
/* The output of this stage becomes the input of the next stage.
   NOTE(review): this makes a fresh heap copy every stage and only rebinds
   the local DR_CHAIN; no VEC_free of the previous copy is visible here. */
4645 dr_chain = VEC_copy (tree, heap, *result_chain);
4651 /* Function vectorizable_store.
4653 Check if STMT defines a non scalar data-ref (array/pointer/structure) that
     can be vectorized.
4655 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
4656 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
     For a store that is part of an interleaving group, the vector stores for
     the whole group are generated only when the last store of the group is
     reached; for earlier members *VEC_STMT is set to NULL_TREE.
4657 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
4660 vectorizable_store (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt,
4666 tree vec_oprnd = NULL_TREE;
4667 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4668 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info), *first_dr = NULL;
4669 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4670 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4671 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4672 enum machine_mode vec_mode;
4674 enum dr_alignment_support alignment_support_scheme;
4676 enum vect_def_type dt;
4677 stmt_vec_info prev_stmt_info = NULL;
4678 tree dataref_ptr = NULL_TREE;
4679 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
4680 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
4682 tree next_stmt, first_stmt = NULL_TREE;
4683 bool strided_store = false;
4684 unsigned int group_size, i;
4685 VEC(tree,heap) *dr_chain = NULL, *oprnds = NULL, *result_chain = NULL;
4687 VEC(tree,heap) *vec_oprnds = NULL;
4688 bool slp = (slp_node != NULL);
4689 stmt_vec_info first_stmt_vinfo;
4690 unsigned int vec_num;
4692 /* FORNOW: SLP with multiple types is not supported. The SLP analysis verifies
4693 this, so we can safely override NCOPIES with 1 here. */
4697 gcc_assert (ncopies >= 1);
4699 /* FORNOW. This restriction should be relaxed. */
4700 if (nested_in_vect_loop_p (loop, stmt) && ncopies > 1)
4702 if (vect_print_dump_info (REPORT_DETAILS))
4703 fprintf (vect_dump, "multiple types in nested loop.");
4707 if (!STMT_VINFO_RELEVANT_P (stmt_info))
4710 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
4713 /* Is vectorizable store? */
4715 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
4718 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
/* The destination must be an ARRAY_REF or INDIRECT_REF, unless STMT is a
   strided access. */
4719 if (TREE_CODE (scalar_dest) != ARRAY_REF
4720 && TREE_CODE (scalar_dest) != INDIRECT_REF
4721 && !STMT_VINFO_STRIDED_ACCESS (stmt_info))
4724 op = GIMPLE_STMT_OPERAND (stmt, 1);
4725 if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt))
4727 if (vect_print_dump_info (REPORT_DETAILS))
4728 fprintf (vect_dump, "use not simple.");
4732 vec_mode = TYPE_MODE (vectype);
4733 /* FORNOW. In some cases can vectorize even if data-type not supported
4734 (e.g. - array initialization with 0). */
4735 if (optab_handler (mov_optab, (int)vec_mode)->insn_code == CODE_FOR_nothing)
4738 if (!STMT_VINFO_DATA_REF (stmt_info))
4741 if (STMT_VINFO_STRIDED_ACCESS (stmt_info))
4743 strided_store = true;
4744 first_stmt = DR_GROUP_FIRST_DR (stmt_info);
/* The interleave permutation is only required when the store is not
   handled by SLP. */
4745 if (!vect_strided_store_supported (vectype)
4746 && !PURE_SLP_STMT (stmt_info) && !slp)
4749 if (first_stmt == stmt)
4751 /* STMT is the leader of the group. Check the operands of all the
4752 stmts of the group. */
4753 next_stmt = DR_GROUP_NEXT_DR (stmt_info);
4756 op = GIMPLE_STMT_OPERAND (next_stmt, 1);
4757 if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt))
4759 if (vect_print_dump_info (REPORT_DETAILS))
4760 fprintf (vect_dump, "use not simple.");
4763 next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt));
4768 if (!vec_stmt) /* transformation not required. */
4770 STMT_VINFO_TYPE (stmt_info) = store_vec_info_type;
4771 if (!PURE_SLP_STMT (stmt_info))
4772 vect_model_store_cost (stmt_info, ncopies, dt, NULL);
4780 first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
4781 group_size = DR_GROUP_SIZE (vinfo_for_stmt (first_stmt));
/* Count this member of the group; the count is compared against the group
   size below to detect the last store of the group. */
4783 DR_GROUP_STORE_COUNT (vinfo_for_stmt (first_stmt))++;
4786 gcc_assert (!nested_in_vect_loop_p (loop, stmt));
4788 /* We vectorize all the stmts of the interleaving group when we
4789 reach the last stmt in the group. */
4790 if (DR_GROUP_STORE_COUNT (vinfo_for_stmt (first_stmt))
4791 < DR_GROUP_SIZE (vinfo_for_stmt (first_stmt))
4794 *vec_stmt = NULL_TREE;
4799 strided_store = false;
4801 /* VEC_NUM is the number of vect stmts to be created for this group. */
4802 if (slp && SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node) < group_size)
4803 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
4805 vec_num = group_size;
4811 group_size = vec_num = 1;
4812 first_stmt_vinfo = stmt_info;
4815 if (vect_print_dump_info (REPORT_DETAILS))
4816 fprintf (vect_dump, "transform store. ncopies = %d",ncopies);
4818 dr_chain = VEC_alloc (tree, heap, group_size);
4819 oprnds = VEC_alloc (tree, heap, group_size);
4821 alignment_support_scheme = vect_supportable_dr_alignment (first_dr);
4822 gcc_assert (alignment_support_scheme);
4823 gcc_assert (alignment_support_scheme == dr_aligned); /* FORNOW */
4825 /* In case the vectorization factor (VF) is bigger than the number
4826 of elements that we can fit in a vectype (nunits), we have to generate
4827 more than one vector stmt - i.e - we need to "unroll" the
4828 vector stmt by a factor VF/nunits. For more details see documentation in
4829 vect_get_vec_def_for_stmt_copy. */
4831 /* In case of interleaving (non-unit strided access):
4838 We create vectorized stores starting from base address (the access of the
4839 first stmt in the chain (S2 in the above example), when the last store stmt
4840 of the chain (S4) is reached:
4843 VS2: &base + vec_size*1 = vx0
4844 VS3: &base + vec_size*2 = vx1
4845 VS4: &base + vec_size*3 = vx3
4847 Then permutation statements are generated:
4849 VS5: vx5 = VEC_INTERLEAVE_HIGH_EXPR < vx0, vx3 >
4850 VS6: vx6 = VEC_INTERLEAVE_LOW_EXPR < vx0, vx3 >
4853 And they are put in STMT_VINFO_VEC_STMT of the corresponding scalar stmts
4854 (the order of the data-refs in the output of vect_permute_store_chain
4855 corresponds to the order of scalar stmts in the interleaving chain - see
4856 the documentation of vect_permute_store_chain()).
4858 In case of both multiple types and interleaving, above vector stores and
4859 permutation stmts are created for every copy. The result vector stmts are
4860 put in STMT_VINFO_VEC_STMT for the first copy and in the corresponding
4861 STMT_VINFO_RELATED_STMT for the next copies.
4864 prev_stmt_info = NULL;
4865 for (j = 0; j < ncopies; j++)
4874 /* Get vectorized arguments for SLP_NODE. */
4875 vect_get_slp_defs (slp_node, &vec_oprnds, NULL);
4877 vec_oprnd = VEC_index (tree, vec_oprnds, 0);
4881 /* For interleaved stores we collect vectorized defs for all the
4882 stores in the group in DR_CHAIN and OPRNDS. DR_CHAIN is then
4883 used as an input to vect_permute_store_chain(), and OPRNDS as
4884 an input to vect_get_vec_def_for_stmt_copy() for the next copy.
4886 If the store is not strided, GROUP_SIZE is 1, and DR_CHAIN and
4887 OPRNDS are of size 1. */
4888 next_stmt = first_stmt;
4889 for (i = 0; i < group_size; i++)
4891 /* Since gaps are not supported for interleaved stores,
4892 GROUP_SIZE is the exact number of stmts in the chain.
4893 Therefore, NEXT_STMT can't be NULL_TREE. In case that
4894 there is no interleaving, GROUP_SIZE is 1, and only one
4895 iteration of the loop will be executed. */
4896 gcc_assert (next_stmt);
4897 op = GIMPLE_STMT_OPERAND (next_stmt, 1);
4899 vec_oprnd = vect_get_vec_def_for_operand (op, next_stmt,
/* VEC_quick_push does not grow the vector; both vectors were allocated
   above with room for GROUP_SIZE elements. */
4901 VEC_quick_push(tree, dr_chain, vec_oprnd);
4902 VEC_quick_push(tree, oprnds, vec_oprnd);
4903 next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt));
/* The data-ref pointer is created for FIRST_STMT, i.e. the stores of the
   group are emitted starting from the address of the first store. */
4906 dataref_ptr = vect_create_data_ref_ptr (first_stmt, NULL, NULL_TREE,
4907 &dummy, &ptr_incr, false,
4908 TREE_TYPE (vec_oprnd), &inv_p);
4909 gcc_assert (!inv_p);
4913 /* FORNOW SLP doesn't work for multiple types. */
4916 /* For interleaved stores we created vectorized defs for all the
4917 defs stored in OPRNDS in the previous iteration (previous copy).
4918 DR_CHAIN is then used as an input to vect_permute_store_chain(),
4919 and OPRNDS as an input to vect_get_vec_def_for_stmt_copy() for the
4921 If the store is not strided, GROUP_SIZE is 1, and DR_CHAIN and
4922 OPRNDS are of size 1. */
4923 for (i = 0; i < group_size; i++)
4925 op = VEC_index (tree, oprnds, i);
/* Called to obtain DT for OP; the return value is not checked here. */
4926 vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt);
4927 vec_oprnd = vect_get_vec_def_for_stmt_copy (dt, op);
4928 VEC_replace(tree, dr_chain, i, vec_oprnd);
4929 VEC_replace(tree, oprnds, i, vec_oprnd);
4932 bump_vector_ptr (dataref_ptr, ptr_incr, bsi, stmt, NULL_TREE);
/* NOTE(review): vect_permute_store_chain assigns a fresh VEC_copy to its
   RESULT_CHAIN output; presumably &result_chain is what is passed below, in
   which case the vector allocated here is overwritten without being freed -
   confirm. */
4937 result_chain = VEC_alloc (tree, heap, group_size);
4939 if (!vect_permute_store_chain (dr_chain, group_size, stmt, bsi,
4944 next_stmt = first_stmt;
4945 for (i = 0; i < vec_num; i++)
4948 /* Bump the vector pointer. */
4949 dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, bsi, stmt,
4953 vec_oprnd = VEC_index (tree, vec_oprnds, i);
4954 else if (strided_store)
4955 /* For strided stores vectorized defs are interleaved in
4956 vect_permute_store_chain(). */
4957 vec_oprnd = VEC_index (tree, result_chain, i);
4959 data_ref = build_fold_indirect_ref (dataref_ptr);
4960 /* Arguments are ready. Create the new vector stmt. */
4961 new_stmt = build_gimple_modify_stmt (data_ref, vec_oprnd);
4962 vect_finish_stmt_generation (stmt, new_stmt, bsi);
4963 mark_symbols_for_renaming (new_stmt);
/* Chain the generated stores: the first is recorded in STMT_VINFO_VEC_STMT
   (and *VEC_STMT), later ones in STMT_VINFO_RELATED_STMT of the previous
   one. */
4966 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
4968 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
4970 prev_stmt_info = vinfo_for_stmt (new_stmt);
4971 next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt));
4981 /* Function vect_setup_realignment
4983 This function is called when vectorizing an unaligned load using
4984 the dr_explicit_realign[_optimized] scheme.
4985 This function generates the following code at the loop prolog:
4988 x msq_init = *(floor(p)); # prolog load
4989 realignment_token = call target_builtin;
4991 x msq = phi (msq_init, ---)
4993 The stmts marked with x are generated only for the case of
4994 dr_explicit_realign_optimized.
4996 The code above sets up a new (vector) pointer, pointing to the first
4997 location accessed by STMT, and a "floor-aligned" load using that pointer.
4998 It also generates code to compute the "realignment-token" (if the relevant
4999 target hook was defined), and creates a phi-node at the loop-header bb
5000 whose arguments are the result of the prolog-load (created by this
5001 function) and the result of a load that takes place in the loop (to be
5002 created by the caller to this function).
5004 For the case of dr_explicit_realign_optimized:
5005 The caller to this function uses the phi-result (msq) to create the
5006 realignment code inside the loop, and sets up the missing phi argument,
5009 msq = phi (msq_init, lsq)
5010 lsq = *(floor(p')); # load in loop
5011 result = realign_load (msq, lsq, realignment_token);
5013 For the case of dr_explicit_realign:
5015 msq = *(floor(p)); # load in loop
5017 lsq = *(floor(p')); # load in loop
5018 result = realign_load (msq, lsq, realignment_token);
5021 STMT - (scalar) load stmt to be vectorized. This load accesses
5022 a memory location that may be unaligned.
5023 BSI - place where new code is to be inserted.
5024 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
5028 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
5029 target hook, if defined.
     AT_LOOP - set to the loop chosen for the initial vector load (LOOP or,
     for a nested access that evolves in LOOP, its inner loop).
5030 Return value - the result of the loop-header phi node. */
5033 vect_setup_realignment (tree stmt, block_stmt_iterator *bsi,
5034 tree *realignment_token,
5035 enum dr_alignment_support alignment_support_scheme,
5037 struct loop **at_loop)
5039 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5040 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5041 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5042 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5044 tree scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
5051 tree msq_init = NULL_TREE;
5054 tree msq = NULL_TREE;
5055 tree stmts = NULL_TREE;
5057 bool compute_in_loop = false;
5058 bool nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
5059 struct loop *containing_loop = (bb_for_stmt (stmt))->loop_father;
5060 struct loop *loop_for_initial_load;
5062 gcc_assert (alignment_support_scheme == dr_explicit_realign
5063 || alignment_support_scheme == dr_explicit_realign_optimized);
5065 /* We need to generate three things:
5066 1. the misalignment computation
5067 2. the extra vector load (for the optimized realignment scheme).
5068 3. the phi node for the two vectors from which the realignment is
5069 done (for the optimized realignment scheme).
5072 /* 1. Determine where to generate the misalignment computation.
5074 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
5075 calculation will be generated by this function, outside the loop (in the
5076 preheader). Otherwise, INIT_ADDR had already been computed for us by the
5077 caller, inside the loop.
5079 Background: If the misalignment remains fixed throughout the iterations of
5080 the loop, then both realignment schemes are applicable, and also the
5081 misalignment computation can be done outside LOOP. This is because we are
5082 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
5083 are a multiple of VS (the Vector Size), and therefore the misalignment in
5084 different vectorized LOOP iterations is always the same.
5085 The problem arises only if the memory access is in an inner-loop nested
5086 inside LOOP, which is now being vectorized using outer-loop vectorization.
5087 This is the only case when the misalignment of the memory access may not
5088 remain fixed throughout the iterations of the inner-loop (as explained in
5089 detail in vect_supportable_dr_alignment). In this case, not only is the
5090 optimized realignment scheme not applicable, but also the misalignment
5091 computation (and generation of the realignment token that is passed to
5092 REALIGN_LOAD) have to be done inside the loop.
5094 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
5095 or not, which in turn determines if the misalignment is computed inside
5096 the inner-loop, or outside LOOP. */
5098 if (init_addr != NULL_TREE)
5100 compute_in_loop = true;
5101 gcc_assert (alignment_support_scheme == dr_explicit_realign);
5105 /* 2. Determine where to generate the extra vector load.
5107 For the optimized realignment scheme, instead of generating two vector
5108 loads in each iteration, we generate a single extra vector load in the
5109 preheader of the loop, and in each iteration reuse the result of the
5110 vector load from the previous iteration. In case the memory access is in
5111 an inner-loop nested inside LOOP, which is now being vectorized using
5112 outer-loop vectorization, we need to determine whether this initial vector
5113 load should be generated at the preheader of the inner-loop, or can be
5114 generated at the preheader of LOOP. If the memory access has no evolution
5115 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
5116 to be generated inside LOOP (in the preheader of the inner-loop). */
5118 if (nested_in_vect_loop)
/* A zero STMT_VINFO_DR_STEP is what is treated here as "no evolution in
   LOOP". */
5120 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
5121 bool invariant_in_outerloop =
5122 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
5123 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
5126 loop_for_initial_load = loop;
/* Report the chosen loop back to the caller. */
5128 *at_loop = loop_for_initial_load;
5130 /* 3. For the case of the optimized realignment, create the first vector
5131 load at the loop preheader. */
5133 if (alignment_support_scheme == dr_explicit_realign_optimized)
5135 /* Create msq_init = *(floor(p1)) in the loop preheader */
5137 gcc_assert (!compute_in_loop);
5138 pe = loop_preheader_edge (loop_for_initial_load);
5139 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5140 ptr = vect_create_data_ref_ptr (stmt, loop_for_initial_load, NULL_TREE,
5141 &init_addr, &inc, true, NULL_TREE, &inv_p);
5142 data_ref = build1 (ALIGN_INDIRECT_REF, vectype, ptr);
5143 new_stmt = build_gimple_modify_stmt (vec_dest, data_ref);
5144 new_temp = make_ssa_name (vec_dest, new_stmt);
5145 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
5146 mark_symbols_for_renaming (new_stmt);
/* Inserting on the preheader edge is asserted not to create a new basic
   block. */
5147 new_bb = bsi_insert_on_edge_immediate (pe, new_stmt);
5148 gcc_assert (!new_bb);
5149 msq_init = GIMPLE_STMT_OPERAND (new_stmt, 0);
5152 /* 4. Create realignment token using a target builtin, if available.
5153 It is done either inside the containing loop, or before LOOP (as
5154 determined above). */
5156 if (targetm.vectorize.builtin_mask_for_load)
5160 /* Compute INIT_ADDR - the initial address accessed by this memref. */
5161 if (compute_in_loop)
5162 gcc_assert (init_addr); /* already computed by the caller. */
5165 /* Generate the INIT_ADDR computation outside LOOP. */
5166 init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
5168 pe = loop_preheader_edge (loop);
5169 new_bb = bsi_insert_on_edge_immediate (pe, stmts);
5170 gcc_assert (!new_bb);
/* Build "token = builtin_mask_for_load (init_addr)"; the destination
   variable takes the type of the call expression. */
5173 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5174 new_stmt = build_call_expr (builtin_decl, 1, init_addr);
5175 vec_dest = vect_create_destination_var (scalar_dest,
5176 TREE_TYPE (new_stmt));
5177 new_stmt = build_gimple_modify_stmt (vec_dest, new_stmt);
5178 new_temp = make_ssa_name (vec_dest, new_stmt);
5179 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
5181 if (compute_in_loop)
5182 bsi_insert_before (bsi, new_stmt, BSI_SAME_STMT);
5185 /* Generate the misalignment computation outside LOOP. */
5186 pe = loop_preheader_edge (loop);
5187 new_bb = bsi_insert_on_edge_immediate (pe, new_stmt);
5188 gcc_assert (!new_bb);
5191 *realignment_token = GIMPLE_STMT_OPERAND (new_stmt, 0);
5193 /* The result of the CALL_EXPR to this builtin is determined from
5194 the value of the parameter and no global variables are touched
5195 which makes the builtin a "const" function. Requiring the
5196 builtin to have the "const" attribute makes it unnecessary
5197 to call mark_call_clobbered. */
5198 gcc_assert (TREE_READONLY (builtin_decl));
/* The phi node created in step 5 is only for the optimized scheme, as the
   assertions below require. */
5201 if (alignment_support_scheme == dr_explicit_realign)
5204 gcc_assert (!compute_in_loop);
5205 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5208 /* 5. Create msq = phi <msq_init, lsq> in loop */
/* Only the preheader argument (MSQ_INIT) is added here; the header comment
   of this function states that the caller sets up the remaining argument. */
5210 pe = loop_preheader_edge (containing_loop);
5211 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5212 msq = make_ssa_name (vec_dest, NULL_TREE);
5213 phi_stmt = create_phi_node (msq, containing_loop->header);
5214 SSA_NAME_DEF_STMT (msq) = phi_stmt;
5215 add_phi_arg (phi_stmt, msq_init, pe);
5221 /* Function vect_strided_load_supported.
5223 Returns TRUE if EXTRACT_EVEN and EXTRACT_ODD operations are supported,
5224 and FALSE otherwise.
     VECTYPE is the vector type to check; its machine mode is the mode looked
     up in the extract-even and extract-odd optabs. */
5227 vect_strided_load_supported (tree vectype)
5229 optab perm_even_optab, perm_odd_optab;
5232 mode = (int) TYPE_MODE (vectype);
/* Extract-even: an optab must exist for the tree code, and it must provide
   an insn for MODE (CODE_FOR_nothing means it does not). */
5234 perm_even_optab = optab_for_tree_code (VEC_EXTRACT_EVEN_EXPR, vectype);
5235 if (!perm_even_optab)
5237 if (vect_print_dump_info (REPORT_DETAILS))
5238 fprintf (vect_dump, "no optab for perm_even.");
5242 if (optab_handler (perm_even_optab, mode)->insn_code == CODE_FOR_nothing)
5244 if (vect_print_dump_info (REPORT_DETAILS))
5245 fprintf (vect_dump, "perm_even op not supported by target.");
/* Extract-odd: the same two checks. */
5249 perm_odd_optab = optab_for_tree_code (VEC_EXTRACT_ODD_EXPR, vectype);
5250 if (!perm_odd_optab)
5252 if (vect_print_dump_info (REPORT_DETAILS))
5253 fprintf (vect_dump, "no optab for perm_odd.");
5257 if (optab_handler (perm_odd_optab, mode)->insn_code == CODE_FOR_nothing)
5259 if (vect_print_dump_info (REPORT_DETAILS))
5260 fprintf (vect_dump, "perm_odd op not supported by target.");
5267 /* Function vect_permute_load_chain.
5269 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5270 a power of 2, generate extract_even/odd stmts to reorder the input data
5271 correctly. Return the final references for loads in RESULT_CHAIN.
STMT supplies the vector type of the generated stmts (through its
stmt_vec_info); the new stmts are emitted with vect_finish_stmt_generation
at BSI.
5273 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5274 The input is 4 vectors each containing 8 elements. We assign a number to each
5275 element, the input sequence is:
5277 1st vec: 0 1 2 3 4 5 6 7
5278 2nd vec: 8 9 10 11 12 13 14 15
5279 3rd vec: 16 17 18 19 20 21 22 23
5280 4th vec: 24 25 26 27 28 29 30 31
5282 The output sequence should be:
5284 1st vec: 0 4 8 12 16 20 24 28
5285 2nd vec: 1 5 9 13 17 21 25 29
5286 3rd vec: 2 6 10 14 18 22 26 30
5287 4th vec: 3 7 11 15 19 23 27 31
5289 i.e., the first output vector should contain the first elements of each
5290 interleaving group, etc.
5292 We use extract_even/odd instructions to create such output. The input of each
5293 extract_even/odd operation is two vectors
(for instance a 1st vec holding 0 1 2 3 and a 2nd vec holding 4 5 6 7)
5297 and the output is the vector of extracted even/odd elements. The output of
5298 extract_even will be: 0 2 4 6
5299 and of extract_odd: 1 3 5 7
5302 The permutation is done in log LENGTH stages. In each stage extract_even and
5303 extract_odd stmts are created for each pair of vectors in DR_CHAIN in their
5304 order. In our example,
5306 E1: extract_even (1st vec, 2nd vec)
5307 E2: extract_odd (1st vec, 2nd vec)
5308 E3: extract_even (3rd vec, 4th vec)
5309 E4: extract_odd (3rd vec, 4th vec)
5311 The output for the first stage will be:
5313 E1: 0 2 4 6 8 10 12 14
5314 E2: 1 3 5 7 9 11 13 15
5315 E3: 16 18 20 22 24 26 28 30
5316 E4: 17 19 21 23 25 27 29 31
5318 In order to proceed and create the correct sequence for the next stage (or
5319 for the correct output, if the second stage is the last one, as in our
5320 example), we first put the output of extract_even operation and then the
5321 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5322 The input for the second stage is:
5324 1st vec (E1): 0 2 4 6 8 10 12 14
5325 2nd vec (E3): 16 18 20 22 24 26 28 30
5326 3rd vec (E2): 1 3 5 7 9 11 13 15
5327 4th vec (E4): 17 19 21 23 25 27 29 31
5329 The output of the second stage:
5331 E1: 0 4 8 12 16 20 24 28
5332 E2: 2 6 10 14 18 22 26 30
5333 E3: 1 5 9 13 17 21 25 29
5334 E4: 3 7 11 15 19 23 27 31
5336 And RESULT_CHAIN after reordering:
5338 1st vec (E1): 0 4 8 12 16 20 24 28
5339 2nd vec (E3): 1 5 9 13 17 21 25 29
5340 3rd vec (E2): 2 6 10 14 18 22 26 30
5341 4th vec (E4): 3 7 11 15 19 23 27 31. */
5344 vect_permute_load_chain (VEC(tree,heap) *dr_chain,
5345 unsigned int length,
5347 block_stmt_iterator *bsi,
5348 VEC(tree,heap) **result_chain)
5350 tree perm_dest, perm_stmt, data_ref, first_vect, second_vect;
5351 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5356 /* Check that the operation is supported. */
5357 if (!vect_strided_load_supported (vectype))
/* RESULT_CHAIN starts out as a copy of DR_CHAIN, so every slot exists
   and can simply be overwritten with VEC_replace below. */
5360 *result_chain = VEC_copy (tree, heap, dr_chain);
/* One pass per permutation stage: exact_log2 (LENGTH) stages in total. */
5361 for (i = 0; i < exact_log2 (length); i++)
/* Walk DR_CHAIN two vectors at a time. */
5363 for (j = 0; j < length; j +=2)
5365 first_vect = VEC_index (tree, dr_chain, j);
5366 second_vect = VEC_index (tree, dr_chain, j+1);
5368 /* data_ref = permute_even (first_data_ref, second_data_ref); */
5369 perm_dest = create_tmp_var (vectype, "vect_perm_even");
5370 DECL_GIMPLE_REG_P (perm_dest) = 1;
5371 add_referenced_var (perm_dest);
5373 tmp = build2 (VEC_EXTRACT_EVEN_EXPR, vectype,
5374 first_vect, second_vect);
5375 perm_stmt = build_gimple_modify_stmt (perm_dest, tmp);
5377 data_ref = make_ssa_name (perm_dest, perm_stmt);
5378 GIMPLE_STMT_OPERAND (perm_stmt, 0) = data_ref;
5379 vect_finish_stmt_generation (stmt, perm_stmt, bsi);
5380 mark_symbols_for_renaming (perm_stmt);
/* The even results fill the first half of RESULT_CHAIN (slot j/2). */
5382 VEC_replace (tree, *result_chain, j/2, data_ref);
5384 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
5385 perm_dest = create_tmp_var (vectype, "vect_perm_odd");
5386 DECL_GIMPLE_REG_P (perm_dest) = 1;
5387 add_referenced_var (perm_dest);
5389 tmp = build2 (VEC_EXTRACT_ODD_EXPR, vectype,
5390 first_vect, second_vect);
5391 perm_stmt = build_gimple_modify_stmt (perm_dest, tmp);
5392 data_ref = make_ssa_name (perm_dest, perm_stmt);
5393 GIMPLE_STMT_OPERAND (perm_stmt, 0) = data_ref;
5394 vect_finish_stmt_generation (stmt, perm_stmt, bsi);
5395 mark_symbols_for_renaming (perm_stmt);
/* The odd results fill the second half (slot j/2 + LENGTH/2). */
5397 VEC_replace (tree, *result_chain, j/2+length/2, data_ref);
/* The output of this stage becomes the input of the next one.
   NOTE(review): a fresh heap copy is taken here and no VEC_free of the
   previous DR_CHAIN copy is visible in this function -- confirm that
   these copies are released somewhere. */
5399 dr_chain = VEC_copy (tree, heap, *result_chain);
5405 /* Function vect_transform_strided_load.
5407 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
5408 to perform their permutation and ascribe the result vectorized statements to
5409 the scalar statements.
SIZE is handed to vect_permute_load_chain as the chain length and is also
used to size RESULT_CHAIN; BSI is forwarded to vect_permute_load_chain.
The scalar statements are visited starting from DR_GROUP_FIRST_DR of STMT
and following DR_GROUP_NEXT_DR.
5413 vect_transform_strided_load (tree stmt, VEC(tree,heap) *dr_chain, int size,
5414 block_stmt_iterator *bsi)
5416 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5417 tree first_stmt = DR_GROUP_FIRST_DR (stmt_info);
5418 tree next_stmt, new_stmt;
5419 VEC(tree,heap) *result_chain = NULL;
5420 unsigned int i, gap_count;
5423 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
5424 RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
5425 vectors, that are ready for vector computation. */
5426 result_chain = VEC_alloc (tree, heap, size);
/* Generate the extract_even/odd stmts; the call's result is tested for
   failure. */
5428 if (!vect_permute_load_chain (dr_chain, size, stmt, bsi, &result_chain))
5431 /* Put a permuted data-ref in the VECTORIZED_STMT field.
5432 Since we scan the chain starting from its first node, their order
5433 corresponds to the order of data-refs in RESULT_CHAIN. */
5434 next_stmt = first_stmt;
5436 for (i = 0; VEC_iterate (tree, result_chain, i, tmp_data_ref); i++)
5441 /* Skip the gaps. Loads created for the gaps will be removed by dead
5442 code elimination pass later.
5443 DR_GROUP_GAP is the number of steps in elements from the previous
5444 access (if there is no gap DR_GROUP_GAP is 1). We skip loads that
5445 correspond to the gaps.
5447 if (gap_count < DR_GROUP_GAP (vinfo_for_stmt (next_stmt)))
5455 new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
5456 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
5457 copies, and we put the new vector statement in the first available
RELATED_STMT field of the chain that starts at VEC_STMT.
5459 if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
5460 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
5463 tree prev_stmt = STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
5464 tree rel_stmt = STMT_VINFO_RELATED_STMT (
5465 vinfo_for_stmt (prev_stmt));
5468 prev_stmt = rel_stmt;
5469 rel_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
5471 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) = new_stmt;
5473 next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt));
5475 /* If NEXT_STMT accesses the same DR as the previous statement,
5476 put the same TMP_DATA_REF as its vectorized statement; otherwise
5477 get the next data-ref from RESULT_CHAIN. */
5478 if (!next_stmt || !DR_GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5486 /* vectorizable_load.
5488 Check if STMT reads a non scalar data-ref (array/pointer/structure) that
can be vectorized.
5490 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
5491 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
When VEC_STMT is not passed only the analysis is performed: STMT is marked
as load_vec_info_type and its cost is modelled with vect_model_load_cost.
SLP_NODE, when non-NULL, selects the SLP path: the generated vector loads
are pushed on SLP_TREE_VEC_STMTS (SLP_NODE).
5492 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
5495 vectorizable_load (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt,
5499 tree vec_dest = NULL;
5500 tree data_ref = NULL;
5502 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5503 stmt_vec_info prev_stmt_info;
5504 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5505 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5506 struct loop *containing_loop = (bb_for_stmt (stmt))->loop_father;
5507 bool nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
5508 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info), *first_dr;
5509 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5512 tree new_stmt = NULL_TREE;
5514 enum dr_alignment_support alignment_support_scheme;
5515 tree dataref_ptr = NULL_TREE;
5517 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
5518 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
5519 int i, j, group_size;
5520 tree msq = NULL_TREE, lsq;
5521 tree offset = NULL_TREE;
5522 tree realignment_token = NULL_TREE;
5523 tree phi = NULL_TREE;
5524 VEC(tree,heap) *dr_chain = NULL;
5525 bool strided_load = false;
5529 bool compute_in_loop = false;
5530 struct loop *at_loop;
5532 bool slp = (slp_node != NULL);
5534 /* FORNOW: SLP with multiple types is not supported. The SLP analysis verifies
5535 this, so we can safely override NCOPIES with 1 here. */
5539 gcc_assert (ncopies >= 1);
5541 /* FORNOW. This restriction should be relaxed. */
5542 if (nested_in_vect_loop && ncopies > 1)
5544 if (vect_print_dump_info (REPORT_DETAILS))
5545 fprintf (vect_dump, "multiple types in nested loop.");
5549 if (!STMT_VINFO_RELEVANT_P (stmt_info))
5552 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
5555 /* Is vectorizable load? */
5556 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
5559 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
5560 if (TREE_CODE (scalar_dest) != SSA_NAME)
5563 op = GIMPLE_STMT_OPERAND (stmt, 1);
5564 if (TREE_CODE (op) != ARRAY_REF
5565 && TREE_CODE (op) != INDIRECT_REF
5566 && !STMT_VINFO_STRIDED_ACCESS (stmt_info))
5569 if (!STMT_VINFO_DATA_REF (stmt_info))
5572 scalar_type = TREE_TYPE (DR_REF (dr));
5573 mode = (int) TYPE_MODE (vectype);
5575 /* FORNOW. In some cases can vectorize even if data-type not supported
5576 (e.g. - data copies). */
5577 if (optab_handler (mov_optab, mode)->insn_code == CODE_FOR_nothing)
5579 if (vect_print_dump_info (REPORT_DETAILS))
5580 fprintf (vect_dump, "Aligned load, but unsupported type.");
5584 /* Check if the load is a part of an interleaving chain. */
5585 if (STMT_VINFO_STRIDED_ACCESS (stmt_info))
5587 strided_load = true;
5589 gcc_assert (! nested_in_vect_loop);
5591 /* Check if interleaving is supported. */
5592 if (!vect_strided_load_supported (vectype)
5593 && !PURE_SLP_STMT (stmt_info) && !slp)
5597 if (!vec_stmt) /* transformation not required. */
5599 STMT_VINFO_TYPE (stmt_info) = load_vec_info_type;
5600 vect_model_load_cost (stmt_info, ncopies, NULL);
5604 if (vect_print_dump_info (REPORT_DETAILS))
5605 fprintf (vect_dump, "transform load.");
5611 first_stmt = DR_GROUP_FIRST_DR (stmt_info);
5612 /* Check if the chain of loads is already vectorized. */
5613 if (STMT_VINFO_VEC_STMT (vinfo_for_stmt (first_stmt)))
5615 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
5618 first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
5619 group_size = DR_GROUP_SIZE (vinfo_for_stmt (first_stmt));
5620 dr_chain = VEC_alloc (tree, heap, group_size);
5622 /* VEC_NUM is the number of vect stmts to be created for this group. */
5625 strided_load = false;
5626 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
5629 vec_num = group_size;
5635 group_size = vec_num = 1;
/* Ask how the alignment of FIRST_DR can be handled; a zero answer is
   asserted not to occur at this point. */
5638 alignment_support_scheme = vect_supportable_dr_alignment (first_dr);
5639 gcc_assert (alignment_support_scheme);
5641 /* In case the vectorization factor (VF) is bigger than the number
5642 of elements that we can fit in a vectype (nunits), we have to generate
5643 more than one vector stmt - i.e - we need to "unroll" the
5644 vector stmt by a factor VF/nunits. In doing so, we record a pointer
5645 from one copy of the vector stmt to the next, in the field
5646 STMT_VINFO_RELATED_STMT. This is necessary in order to allow following
5647 stages to find the correct vector defs to be used when vectorizing
5648 stmts that use the defs of the current stmt. The example below illustrates
5649 the vectorization process when VF=16 and nunits=4 (i.e - we need to create
5650 4 vectorized stmts):
5652 before vectorization:
5653 RELATED_STMT VEC_STMT
5657 step 1: vectorize stmt S1:
5658 We first create the vector stmt VS1_0, and, as usual, record a
5659 pointer to it in the STMT_VINFO_VEC_STMT of the scalar stmt S1.
5660 Next, we create the vector stmt VS1_1, and record a pointer to
5661 it in the STMT_VINFO_RELATED_STMT of the vector stmt VS1_0.
5662 Similarly, for VS1_2 and VS1_3. This is the resulting chain of
5664 RELATED_STMT VEC_STMT
5665 VS1_0: vx0 = memref0 VS1_1 -
5666 VS1_1: vx1 = memref1 VS1_2 -
5667 VS1_2: vx2 = memref2 VS1_3 -
5668 VS1_3: vx3 = memref3 - -
5669 S1: x = load - VS1_0
5672 See the documentation of vect_get_vec_def_for_stmt_copy for how the
5673 information we recorded in RELATED_STMT field is used to vectorize
the stmts that use these defs.
5676 /* In case of interleaving (non-unit strided access):
5683 Vectorized loads are created in the order of memory accesses
5684 starting from the access of the first stmt of the chain:
5687 VS2: vx1 = &base + vec_size*1
5688 VS3: vx3 = &base + vec_size*2
5689 VS4: vx4 = &base + vec_size*3
5691 Then permutation statements are generated:
5693 VS5: vx5 = VEC_EXTRACT_EVEN_EXPR < vx0, vx1 >
5694 VS6: vx6 = VEC_EXTRACT_ODD_EXPR < vx0, vx1 >
5697 And they are put in STMT_VINFO_VEC_STMT of the corresponding scalar stmts
5698 (the order of the data-refs in the output of vect_permute_load_chain
5699 corresponds to the order of scalar stmts in the interleaving chain - see
5700 the documentation of vect_permute_load_chain()).
5701 The generation of permutation stmts and recording them in
5702 STMT_VINFO_VEC_STMT is done in vect_transform_strided_load().
5704 In case of both multiple types and interleaving, the vector loads and
5705 permutation stmts above are created for every copy. The result vector stmts
5706 are put in STMT_VINFO_VEC_STMT for the first copy and in the corresponding
5707 STMT_VINFO_RELATED_STMT for the next copies. */
5709 /* If the data reference is aligned (dr_aligned) or potentially unaligned
5710 on a target that supports unaligned accesses (dr_unaligned_supported)
5711 we generate the following code:
5715 p = p + indx * vectype_size;
5720 Otherwise, the data reference is potentially unaligned on a target that
5721 does not support unaligned accesses (dr_explicit_realign_optimized) -
5722 then generate the following code, in which the data in each iteration is
5723 obtained by two vector loads, one from the previous iteration, and one
5724 from the current iteration:
5726 msq_init = *(floor(p1))
5727 p2 = initial_addr + VS - 1;
5728 realignment_token = call target_builtin;
5731 p2 = p2 + indx * vectype_size
5733 vec_dest = realign_load (msq, lsq, realignment_token)
5738 /* If the misalignment remains the same throughout the execution of the
5739 loop, we can create the init_addr and permutation mask at the loop
5740 preheader. Otherwise, it needs to be created inside the loop.
5741 This can only occur when vectorizing memory accesses in the inner-loop
5742 nested within an outer-loop that is being vectorized. */
5744 if (nested_in_vect_loop_p (loop, stmt)
5745 && (TREE_INT_CST_LOW (DR_STEP (dr)) % UNITS_PER_SIMD_WORD != 0))
5747 gcc_assert (alignment_support_scheme != dr_explicit_realign_optimized);
5748 compute_in_loop = true;
5751 if ((alignment_support_scheme == dr_explicit_realign_optimized
5752 || alignment_support_scheme == dr_explicit_realign)
5753 && !compute_in_loop)
5755 msq = vect_setup_realignment (first_stmt, bsi, &realignment_token,
5756 alignment_support_scheme, NULL_TREE,
5758 if (alignment_support_scheme == dr_explicit_realign_optimized)
5760 phi = SSA_NAME_DEF_STMT (msq);
5761 offset = size_int (TYPE_VECTOR_SUBPARTS (vectype) - 1);
/* Generate NCOPIES copies of the load; each copy creates VEC_NUM vector
   loads in the inner loop below. */
5767 prev_stmt_info = NULL;
5768 for (j = 0; j < ncopies; j++)
5770 /* 1. Create the vector pointer update chain. */
5772 dataref_ptr = vect_create_data_ref_ptr (first_stmt,
5774 &dummy, &ptr_incr, false,
5778 bump_vector_ptr (dataref_ptr, ptr_incr, bsi, stmt, NULL_TREE);
5780 for (i = 0; i < vec_num; i++)
5783 dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, bsi, stmt,
5786 /* 2. Create the vector-load in the loop. */
5787 switch (alignment_support_scheme)
5790 gcc_assert (aligned_access_p (first_dr));
5791 data_ref = build_fold_indirect_ref (dataref_ptr);
5793 case dr_unaligned_supported:
5795 int mis = DR_MISALIGNMENT (first_dr);
5796 tree tmis = (mis == -1 ? size_zero_node : size_int (mis));
5798 tmis = size_binop (MULT_EXPR, tmis, size_int(BITS_PER_UNIT));
5800 build2 (MISALIGNED_INDIRECT_REF, vectype, dataref_ptr, tmis);
5803 case dr_explicit_realign:
5806 tree vs_minus_1 = size_int (TYPE_VECTOR_SUBPARTS (vectype) - 1);
5808 if (compute_in_loop)
5809 msq = vect_setup_realignment (first_stmt, bsi,
5811 dr_explicit_realign,
5814 data_ref = build1 (ALIGN_INDIRECT_REF, vectype, dataref_ptr);
5815 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5816 new_stmt = build_gimple_modify_stmt (vec_dest, data_ref);
5817 new_temp = make_ssa_name (vec_dest, new_stmt);
5818 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
5819 vect_finish_stmt_generation (stmt, new_stmt, bsi);
5820 copy_virtual_operands (new_stmt, stmt);
5821 mark_symbols_for_renaming (new_stmt);
5824 bump = size_binop (MULT_EXPR, vs_minus_1,
5825 TYPE_SIZE_UNIT (scalar_type));
5826 ptr = bump_vector_ptr (dataref_ptr, NULL_TREE, bsi, stmt, bump);
5827 data_ref = build1 (ALIGN_INDIRECT_REF, vectype, ptr);
5830 case dr_explicit_realign_optimized:
5831 data_ref = build1 (ALIGN_INDIRECT_REF, vectype, dataref_ptr);
5836 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5837 new_stmt = build_gimple_modify_stmt (vec_dest, data_ref);
5838 new_temp = make_ssa_name (vec_dest, new_stmt);
5839 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
5840 vect_finish_stmt_generation (stmt, new_stmt, bsi);
5841 mark_symbols_for_renaming (new_stmt);
5843 /* 3. Handle explicit realignment if necessary/supported. Create in
5844 loop: vec_dest = realign_load (msq, lsq, realignment_token) */
5845 if (alignment_support_scheme == dr_explicit_realign_optimized
5846 || alignment_support_scheme == dr_explicit_realign)
5848 lsq = GIMPLE_STMT_OPERAND (new_stmt, 0);
5849 if (!realignment_token)
5850 realignment_token = dataref_ptr;
5851 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5852 new_stmt = build3 (REALIGN_LOAD_EXPR, vectype, msq, lsq,
5854 new_stmt = build_gimple_modify_stmt (vec_dest, new_stmt);
5855 new_temp = make_ssa_name (vec_dest, new_stmt);
5856 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
5857 vect_finish_stmt_generation (stmt, new_stmt, bsi);
5859 if (alignment_support_scheme == dr_explicit_realign_optimized)
/* For the last load of the last copy, LSQ is fed back into PHI along the
   latch edge of the loop that contains STMT. */
5861 if (i == vec_num - 1 && j == ncopies - 1)
5862 add_phi_arg (phi, lsq, loop_latch_edge (containing_loop));
5867 /* 4. Handle invariant-load. */
5870 gcc_assert (!strided_load);
5871 gcc_assert (nested_in_vect_loop_p (loop, stmt));
5876 tree vec_inv, bitpos, bitsize = TYPE_SIZE (scalar_type);
5878 /* CHECKME: bitpos depends on endianness? */
5879 bitpos = bitsize_zero_node;
5880 vec_inv = build3 (BIT_FIELD_REF, scalar_type, new_temp,
5882 BIT_FIELD_REF_UNSIGNED (vec_inv) =
5883 TYPE_UNSIGNED (scalar_type);
5885 vect_create_destination_var (scalar_dest, NULL_TREE);
5886 new_stmt = build_gimple_modify_stmt (vec_dest, vec_inv);
5887 new_temp = make_ssa_name (vec_dest, new_stmt);
5888 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
5889 vect_finish_stmt_generation (stmt, new_stmt, bsi);
/* Build a list holding NUNITS copies of the extracted scalar NEW_TEMP;
   it becomes the vector constructor below. */
5891 for (k = nunits - 1; k >= 0; --k)
5892 t = tree_cons (NULL_TREE, new_temp, t);
5893 /* FIXME: use build_constructor directly. */
5894 vec_inv = build_constructor_from_list (vectype, t);
5895 new_temp = vect_init_vector (stmt, vec_inv, vectype, bsi);
5896 new_stmt = SSA_NAME_DEF_STMT (new_temp);
5899 gcc_unreachable (); /* FORNOW. */
5902 /* Collect vector loads and later create their permutation in
5903 vect_transform_strided_load (). */
5905 VEC_quick_push (tree, dr_chain, new_temp);
5907 /* Store vector loads in the corresponding SLP_NODE. */
5909 VEC_quick_push (tree, SLP_TREE_VEC_STMTS (slp_node), new_stmt);
5912 /* FORNOW: SLP with multiple types is unsupported. */
5918 if (!vect_transform_strided_load (stmt, dr_chain, group_size, bsi))
5920 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
/* NOTE(review): a fresh DR_CHAIN is allocated for the next copy and no
   VEC_free of the previous vector is visible here -- confirm that the old
   chain is released somewhere. */
5921 dr_chain = VEC_alloc (tree, heap, group_size);
5926 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
5928 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
5929 prev_stmt_info = vinfo_for_stmt (new_stmt);
5937 /* Function vectorizable_live_operation.
5939 STMT computes a value that is used outside the loop. Check if
5940 it can be supported.
BSI and VEC_STMT are not used (both are marked ATTRIBUTE_UNUSED): the
supported cases need no transformation. STMT must be marked live
(STMT_VINFO_LIVE_P), which is asserted. */
5943 vectorizable_live_operation (tree stmt,
5944 block_stmt_iterator *bsi ATTRIBUTE_UNUSED,
5945 tree *vec_stmt ATTRIBUTE_UNUSED)
5948 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5949 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5950 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5955 enum vect_def_type dt;
5957 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
5959 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
5962 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
5965 if (TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 0)) != SSA_NAME)
5968 /* FORNOW. CHECKME. */
5969 if (nested_in_vect_loop_p (loop, stmt))
5972 operation = GIMPLE_STMT_OPERAND (stmt, 1);
5973 op_type = TREE_OPERAND_LENGTH (operation);
5975 /* FORNOW: support only if all uses are invariant. This means
5976 that the scalar operations can remain in place, unvectorized.
5977 The original last scalar value that they compute will be used. */
5979 for (i = 0; i < op_type; i++)
5981 op = TREE_OPERAND (operation, i);
5982 if (op && !vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt))
5984 if (vect_print_dump_info (REPORT_DETAILS))
5985 fprintf (vect_dump, "use not simple.");
/* Each operand's def type is checked for being loop-invariant or
   constant. */
5989 if (dt != vect_invariant_def && dt != vect_constant_def)
5993 /* No transformation is required for the cases we currently support. */
5998 /* Function vect_is_simple_cond.
6001 LOOP - the loop that is being vectorized.
6002 COND - Condition that is checked for simple use.
6004 Returns whether a COND can be vectorized. Checks whether
6005 condition operands are supportable using vect_is_simple_use. */
6008 vect_is_simple_cond (tree cond, loop_vec_info loop_vinfo)
6012 enum vect_def_type dt;
6014 if (!COMPARISON_CLASS_P (cond))
6017 lhs = TREE_OPERAND (cond, 0);
6018 rhs = TREE_OPERAND (cond, 1);
/* Each operand is examined the same way: an SSA_NAME is checked with
   vect_is_simple_use, anything else is checked for being an INTEGER_CST,
   REAL_CST or FIXED_CST. */
6020 if (TREE_CODE (lhs) == SSA_NAME)
6022 tree lhs_def_stmt = SSA_NAME_DEF_STMT (lhs);
6023 if (!vect_is_simple_use (lhs, loop_vinfo, &lhs_def_stmt, &def, &dt))
6026 else if (TREE_CODE (lhs) != INTEGER_CST && TREE_CODE (lhs) != REAL_CST
6027 && TREE_CODE (lhs) != FIXED_CST)
6030 if (TREE_CODE (rhs) == SSA_NAME)
6032 tree rhs_def_stmt = SSA_NAME_DEF_STMT (rhs);
6033 if (!vect_is_simple_use (rhs, loop_vinfo, &rhs_def_stmt, &def, &dt))
6036 else if (TREE_CODE (rhs) != INTEGER_CST && TREE_CODE (rhs) != REAL_CST
6037 && TREE_CODE (rhs) != FIXED_CST)
6043 /* vectorizable_condition.
6045 Check if STMT is conditional modify expression that can be vectorized.
6046 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
6047 stmt using VEC_COND_EXPR to replace it, put it in VEC_STMT, and insert it
at BSI.
In the analysis path STMT is marked as condition_vec_info_type and the
result of expand_vec_cond_expr_p for the COND_EXPR and the vector mode is
returned.
6050 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
6053 vectorizable_condition (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
6055 tree scalar_dest = NULL_TREE;
6056 tree vec_dest = NULL_TREE;
6057 tree op = NULL_TREE;
6058 tree cond_expr, then_clause, else_clause;
6059 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6060 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6061 tree vec_cond_lhs, vec_cond_rhs, vec_then_clause, vec_else_clause;
6062 tree vec_compare, vec_cond_expr;
6064 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6065 enum machine_mode vec_mode;
6067 enum vect_def_type dt;
6068 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
6069 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
6071 gcc_assert (ncopies >= 1);
6073 return false; /* FORNOW */
6075 if (!STMT_VINFO_RELEVANT_P (stmt_info))
6078 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
6081 /* FORNOW: SLP not supported. */
6082 if (STMT_SLP_TYPE (stmt_info))
6085 /* FORNOW: not yet supported. */
6086 if (STMT_VINFO_LIVE_P (stmt_info))
6088 if (vect_print_dump_info (REPORT_DETAILS))
6089 fprintf (vect_dump, "value used after loop.");
6093 /* Is vectorizable conditional operation? */
6094 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
6097 op = GIMPLE_STMT_OPERAND (stmt, 1);
6099 if (TREE_CODE (op) != COND_EXPR)
6102 cond_expr = TREE_OPERAND (op, 0);
6103 then_clause = TREE_OPERAND (op, 1);
6104 else_clause = TREE_OPERAND (op, 2);
6106 if (!vect_is_simple_cond (cond_expr, loop_vinfo))
6109 /* We do not handle two different vector types for the condition
and the values: the type of the first comparison operand is compared
with the element type of VECTYPE.
6111 if (TREE_TYPE (TREE_OPERAND (cond_expr, 0)) != TREE_TYPE (vectype))
6114 if (TREE_CODE (then_clause) == SSA_NAME)
6116 tree then_def_stmt = SSA_NAME_DEF_STMT (then_clause);
6117 if (!vect_is_simple_use (then_clause, loop_vinfo,
6118 &then_def_stmt, &def, &dt))
6121 else if (TREE_CODE (then_clause) != INTEGER_CST
6122 && TREE_CODE (then_clause) != REAL_CST
6123 && TREE_CODE (then_clause) != FIXED_CST)
6126 if (TREE_CODE (else_clause) == SSA_NAME)
6128 tree else_def_stmt = SSA_NAME_DEF_STMT (else_clause);
6129 if (!vect_is_simple_use (else_clause, loop_vinfo,
6130 &else_def_stmt, &def, &dt))
6133 else if (TREE_CODE (else_clause) != INTEGER_CST
6134 && TREE_CODE (else_clause) != REAL_CST
6135 && TREE_CODE (else_clause) != FIXED_CST)
6139 vec_mode = TYPE_MODE (vectype);
6143 STMT_VINFO_TYPE (stmt_info) = condition_vec_info_type;
6144 return expand_vec_cond_expr_p (op, vec_mode);
6150 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
6151 vec_dest = vect_create_destination_var (scalar_dest, vectype);
6153 /* Handle cond expr. */
6155 vect_get_vec_def_for_operand (TREE_OPERAND (cond_expr, 0), stmt, NULL);
6157 vect_get_vec_def_for_operand (TREE_OPERAND (cond_expr, 1), stmt, NULL);
6158 vec_then_clause = vect_get_vec_def_for_operand (then_clause, stmt, NULL);
6159 vec_else_clause = vect_get_vec_def_for_operand (else_clause, stmt, NULL);
6161 /* Arguments are ready. create the new vector stmt. */
6162 vec_compare = build2 (TREE_CODE (cond_expr), vectype,
6163 vec_cond_lhs, vec_cond_rhs);
6164 vec_cond_expr = build3 (VEC_COND_EXPR, vectype,
6165 vec_compare, vec_then_clause, vec_else_clause);
6167 *vec_stmt = build_gimple_modify_stmt (vec_dest, vec_cond_expr);
6168 new_temp = make_ssa_name (vec_dest, *vec_stmt);
6169 GIMPLE_STMT_OPERAND (*vec_stmt, 0) = new_temp;
6170 vect_finish_stmt_generation (stmt, *vec_stmt, bsi);
6176 /* Function vect_transform_stmt.
6178 Create a vectorized stmt to replace STMT, and insert it at BSI.
The work is dispatched on STMT_VINFO_TYPE of STMT to the matching
vectorizable_* routine. *STRIDED_STORE is set to true when STMT is a store
that is part of a strided access. The resulting vector stmt is recorded in
STMT_VINFO_VEC_STMT of STMT and, when STMT stands for a recognized pattern,
also in the stmt_info of the original pattern stmt. */
6181 vect_transform_stmt (tree stmt, block_stmt_iterator *bsi, bool *strided_store,
6184 bool is_store = false;
6185 tree vec_stmt = NULL_TREE;
6186 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6187 tree orig_stmt_in_pattern;
/* Dispatch on the kind of vectorization recorded in STMT_VINFO_TYPE. The
   cases that assert !SLP_NODE call routines that take no SLP node. */
6190 switch (STMT_VINFO_TYPE (stmt_info))
6192 case type_demotion_vec_info_type:
6193 gcc_assert (!slp_node);
6194 done = vectorizable_type_demotion (stmt, bsi, &vec_stmt);
6198 case type_promotion_vec_info_type:
6199 gcc_assert (!slp_node);
6200 done = vectorizable_type_promotion (stmt, bsi, &vec_stmt);
6204 case type_conversion_vec_info_type:
6205 done = vectorizable_conversion (stmt, bsi, &vec_stmt, slp_node);
6209 case induc_vec_info_type:
6210 gcc_assert (!slp_node);
6211 done = vectorizable_induction (stmt, bsi, &vec_stmt);
6215 case op_vec_info_type:
6216 done = vectorizable_operation (stmt, bsi, &vec_stmt, slp_node);
6220 case assignment_vec_info_type:
6221 done = vectorizable_assignment (stmt, bsi, &vec_stmt, slp_node);
6225 case load_vec_info_type:
6226 done = vectorizable_load (stmt, bsi, &vec_stmt, slp_node);
6230 case store_vec_info_type:
6231 done = vectorizable_store (stmt, bsi, &vec_stmt, slp_node);
6233 if (STMT_VINFO_STRIDED_ACCESS (stmt_info))
6235 /* In case of interleaving, the whole chain is vectorized when the
6236 last store in the chain is reached. Store stmts before the last
6237 one are skipped, and their vec_stmt_info shouldn't be freed
6239 *strided_store = true;
6240 if (STMT_VINFO_VEC_STMT (stmt_info))
6247 case condition_vec_info_type:
6248 gcc_assert (!slp_node);
6249 done = vectorizable_condition (stmt, bsi, &vec_stmt);
6253 case call_vec_info_type:
6254 gcc_assert (!slp_node);
6255 done = vectorizable_call (stmt, bsi, &vec_stmt);
6258 case reduc_vec_info_type:
6259 gcc_assert (!slp_node);
6260 done = vectorizable_reduction (stmt, bsi, &vec_stmt);
6265 if (!STMT_VINFO_LIVE_P (stmt_info))
6267 if (vect_print_dump_info (REPORT_DETAILS))
6268 fprintf (vect_dump, "stmt not supported.");
6273 if (STMT_VINFO_LIVE_P (stmt_info)
6274 && STMT_VINFO_TYPE (stmt_info) != reduc_vec_info_type)
6276 done = vectorizable_live_operation (stmt, bsi, &vec_stmt);
6282 STMT_VINFO_VEC_STMT (stmt_info) = vec_stmt;
6283 orig_stmt_in_pattern = STMT_VINFO_RELATED_STMT (stmt_info);
6284 if (orig_stmt_in_pattern)
6286 stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt_in_pattern);
6287 /* STMT was inserted by the vectorizer to replace a computation idiom.
6288 ORIG_STMT_IN_PATTERN is a stmt in the original sequence that
6289 computed this idiom. We need to record a pointer to VEC_STMT in
6290 the stmt_info of ORIG_STMT_IN_PATTERN. See more details in the
6291 documentation of vect_pattern_recog. */
6292 if (STMT_VINFO_IN_PATTERN_P (stmt_vinfo))
6294 gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
6295 STMT_VINFO_VEC_STMT (stmt_vinfo) = vec_stmt;
6304 /* This function builds ni_name = number of iterations loop executes
6305 on the loop preheader.
The expression comes from LOOP_VINFO_NITERS (LOOP_VINFO) and is unshared
before being turned into a gimple operand. */
6308 vect_build_loop_niters (loop_vec_info loop_vinfo)
6310 tree ni_name, stmt, var;
6312 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6313 tree ni = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo));
6315 var = create_tmp_var (TREE_TYPE (ni), "niters");
6316 add_referenced_var (var);
/* Turn NI into a gimple operand based on VAR. STMT presumably receives
   the statements needed to compute it -- TODO confirm against
   force_gimple_operand. */
6317 ni_name = force_gimple_operand (ni, &stmt, false, var);
6319 pe = loop_preheader_edge (loop);
/* Inserting on the preheader edge is asserted not to create a new basic
   block. */
6322 basic_block new_bb = bsi_insert_on_edge_immediate (pe, stmt);
6323 gcc_assert (!new_bb);
6330 /* This function generates the following statements:
6332 ni_name = number of iterations loop executes
6333 ratio = ni_name / vf
6334 ratio_mult_vf_name = ratio * vf
6336 and places them at the loop preheader edge.
The division and the multiplication are emitted as shifts by
exact_log2 (vf), so VF is presumably a power of 2 here -- TODO confirm.
The three names are returned through NI_NAME_PTR, RATIO_MULT_VF_NAME_PTR
and RATIO_NAME_PTR. */
6339 vect_generate_tmps_on_preheader (loop_vec_info loop_vinfo,
6341 tree *ratio_mult_vf_name_ptr,
6342 tree *ratio_name_ptr)
6350 tree ratio_mult_vf_name;
6351 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6352 tree ni = LOOP_VINFO_NITERS (loop_vinfo);
6353 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
6356 pe = loop_preheader_edge (loop);
6358 /* Generate temporary variable that contains
6359 number of iterations loop executes. */
6361 ni_name = vect_build_loop_niters (loop_vinfo);
6362 log_vf = build_int_cst (TREE_TYPE (ni), exact_log2 (vf));
6364 /* Create: ratio = ni >> log2(vf) */
6366 ratio_name = fold_build2 (RSHIFT_EXPR, TREE_TYPE (ni_name), ni_name, log_vf);
/* If folding did not already yield a gimple value, compute it into a
   temporary on the preheader edge. */
6367 if (!is_gimple_val (ratio_name))
6369 var = create_tmp_var (TREE_TYPE (ni), "bnd");
6370 add_referenced_var (var);
6372 ratio_name = force_gimple_operand (ratio_name, &stmt, true, var);
6373 pe = loop_preheader_edge (loop);
6374 new_bb = bsi_insert_on_edge_immediate (pe, stmt);
6375 gcc_assert (!new_bb);
6378 /* Create: ratio_mult_vf = ratio << log2 (vf). */
6380 ratio_mult_vf_name = fold_build2 (LSHIFT_EXPR, TREE_TYPE (ratio_name),
6381 ratio_name, log_vf);
6382 if (!is_gimple_val (ratio_mult_vf_name))
6384 var = create_tmp_var (TREE_TYPE (ni), "ratio_mult_vf");
6385 add_referenced_var (var);
6387 ratio_mult_vf_name = force_gimple_operand (ratio_mult_vf_name, &stmt,
6389 pe = loop_preheader_edge (loop);
6390 new_bb = bsi_insert_on_edge_immediate (pe, stmt);
6391 gcc_assert (!new_bb);
6394 *ni_name_ptr = ni_name;
6395 *ratio_mult_vf_name_ptr = ratio_mult_vf_name;
6396 *ratio_name_ptr = ratio_name;
6402 /* Function vect_update_ivs_after_vectorizer.
6404 "Advance" the induction variables of LOOP to the value they should take
6405 after the execution of LOOP. This is currently necessary because the
6406 vectorizer does not handle induction variables that are used after the
6407 loop. Such a situation occurs when the last iterations of LOOP are peeled, because:
6409 1. We introduced new uses after LOOP for IVs that were not originally used
6410 after LOOP: the IVs of LOOP are now used by an epilog loop.
6411 2. LOOP is going to be vectorized; this means that it will iterate N/VF
6412 times, whereas the loop IVs should be bumped N times.
6415 - LOOP - a loop that is going to be vectorized. The last few iterations
6416 of LOOP were peeled.
6417 - NITERS - the number of iterations that LOOP executes (before it is
6418 vectorized). i.e, the number of times the ivs should be bumped.
6419 - UPDATE_E - a successor edge of LOOP->exit that is on the (only) path
6420 coming out from LOOP on which there are uses of the LOOP ivs
6421 (this is the path from LOOP->exit to epilog_loop->preheader).
6423 The new definitions of the ivs are placed in LOOP->exit.
6424 The phi args associated with the edge UPDATE_E in the bb
6425 UPDATE_E->dest are updated accordingly.
6427 Assumption 1: Like the rest of the vectorizer, this function assumes
6428 a single loop exit that has a single predecessor.
6430 Assumption 2: The phi nodes in the LOOP header and in update_bb are
6431 organized in the same order.
6433 Assumption 3: The access function of the ivs is simple enough (see
6434 vect_can_advance_ivs_p). This assumption will be relaxed in the future.
6436 Assumption 4: Exactly one of the successors of LOOP exit-bb is on a path
6437 coming out of LOOP on which the ivs of LOOP are used (this is the path
6438 that leads to the epilog loop; other paths skip the epilog loop). This
6439 path starts with the edge UPDATE_E, and its destination (denoted update_bb)
6440 needs to have its phis updated.
6444 vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo, tree niters,
6447 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6448 basic_block exit_bb = single_exit (loop)->dest;
6450 basic_block update_bb = update_e->dest;
6452 /* gcc_assert (vect_can_advance_ivs_p (loop_vinfo)); */
6454 /* Make sure there exists a single-predecessor exit bb: */
6455 gcc_assert (single_pred_p (exit_bb));
6457 for (phi = phi_nodes (loop->header), phi1 = phi_nodes (update_bb);
6459 phi = PHI_CHAIN (phi), phi1 = PHI_CHAIN (phi1))
6461 tree access_fn = NULL;
6462 tree evolution_part;
6465 tree var, ni, ni_name;
6466 block_stmt_iterator last_bsi;
6468 if (vect_print_dump_info (REPORT_DETAILS))
6470 fprintf (vect_dump, "vect_update_ivs_after_vectorizer: phi: ");
6471 print_generic_expr (vect_dump, phi, TDF_SLIM);
6474 /* Skip virtual phi's. */
6475 if (!is_gimple_reg (SSA_NAME_VAR (PHI_RESULT (phi))))
6477 if (vect_print_dump_info (REPORT_DETAILS))
6478 fprintf (vect_dump, "virtual phi. skip.");
6482 /* Skip reduction phis. */
6483 if (STMT_VINFO_DEF_TYPE (vinfo_for_stmt (phi)) == vect_reduction_def)
6485 if (vect_print_dump_info (REPORT_DETAILS))
6486 fprintf (vect_dump, "reduc phi. skip.");
6490 access_fn = analyze_scalar_evolution (loop, PHI_RESULT (phi));
6491 gcc_assert (access_fn);
6493 unshare_expr (evolution_part_in_loop_num (access_fn, loop->num));
6494 gcc_assert (evolution_part != NULL_TREE);
6496 /* FORNOW: We do not support IVs whose evolution function is a polynomial
6497 of degree >= 2 or exponential. */
6498 gcc_assert (!tree_is_chrec (evolution_part));
6500 step_expr = evolution_part;
6501 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn,
6504 if (POINTER_TYPE_P (TREE_TYPE (init_expr)))
6505 ni = fold_build2 (POINTER_PLUS_EXPR, TREE_TYPE (init_expr),
6507 fold_convert (sizetype,
6508 fold_build2 (MULT_EXPR, TREE_TYPE (niters),
6509 niters, step_expr)));
6511 ni = fold_build2 (PLUS_EXPR, TREE_TYPE (init_expr),
6512 fold_build2 (MULT_EXPR, TREE_TYPE (init_expr),
6513 fold_convert (TREE_TYPE (init_expr),
6520 var = create_tmp_var (TREE_TYPE (init_expr), "tmp");
6521 add_referenced_var (var);
6523 last_bsi = bsi_last (exit_bb);
6524 ni_name = force_gimple_operand_bsi (&last_bsi, ni, false, var,
6525 true, BSI_SAME_STMT);
6527 /* Fix phi expressions in the successor bb. */
6528 SET_PHI_ARG_DEF (phi1, update_e->dest_idx, ni_name);
6532 /* Return the more conservative threshold between the
6533 min_profitable_iters returned by the cost model and the user
6534 specified threshold, if provided. */
/* LOOP_VINFO supplies the vectorization factor; MIN_PROFITABLE_ITERS is the */
/* value produced by the cost model. */
6537 conservative_cost_threshold (loop_vec_info loop_vinfo,
6538 int min_profitable_iters)
6541 int min_scalar_loop_bound;
/* User bound: PARAM_MIN_VECT_LOOP_BOUND times the vectorization factor, minus 1. */
6543 min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
6544 * LOOP_VINFO_VECT_FACTOR (loop_vinfo)) - 1);
/* TH starts as the user bound. A nonzero cost-model value replaces it when the */
/* user bound is zero or smaller than the cost-model value. */
/* NOTE(review): the comment that follows has lost its closing line in this */
/* copy, as have the declaration of TH and the return statement. */
6546 /* Use the cost model only if it is more conservative than user specified
6548 th = (unsigned) min_scalar_loop_bound;
6549 if (min_profitable_iters
6550 && (!min_scalar_loop_bound
6551 || min_profitable_iters > min_scalar_loop_bound))
6552 th = (unsigned) min_profitable_iters;
6554 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOOPS))
6555 fprintf (vect_dump, "not vectorized: vectorization may not be"
6558 if (th && vect_print_dump_info (REPORT_DETAILS))
6559 fprintf (vect_dump, "Vectorization may not be profitable.");
6564 /* Function vect_do_peeling_for_loop_bound
6566 Peel the last iterations of the loop represented by LOOP_VINFO.
6567 The peeled iterations form a new epilog loop. Given that the loop now
6568 iterates NITERS times, the new epilog loop iterates
6569 NITERS % VECTORIZATION_FACTOR times.
6571 The original loop will later be made to iterate
6572 NITERS / VECTORIZATION_FACTOR times (this value is placed into RATIO). */
/* NOTE(review): the return type, braces and a few declarations (loop_num, */
/* update_e) are not present in this copy. */
6575 vect_do_peeling_for_loop_bound (loop_vec_info loop_vinfo, tree *ratio)
6577 tree ni_name, ratio_mult_vf_name;
6578 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6579 struct loop *new_loop;
6581 basic_block preheader;
6583 bool check_profitability = false;
6584 unsigned int th = 0;
6585 int min_profitable_iters;
6587 if (vect_print_dump_info (REPORT_DETAILS))
6588 fprintf (vect_dump, "=== vect_do_peeling_for_loop_bound ===");
6590 initialize_original_copy_tables ();
6592 /* Generate the following variables on the preheader of original loop:
6594 ni_name = number of iteration the original loop executes
6595 ratio = ni_name / vf
6596 ratio_mult_vf_name = ratio * vf */
6597 vect_generate_tmps_on_preheader (loop_vinfo, &ni_name,
6598 &ratio_mult_vf_name, ratio);
/* Remember the loop number so it can be checked after peeling. */
6600 loop_num = loop->num;
6602 /* If cost model check not done during versioning and
6603 peeling for alignment. */
6604 if (!VEC_length (tree, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo))
6605 && !VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo))
6606 && !LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
6608 check_profitability = true;
6610 /* Get profitability threshold for vectorized loop. */
6611 min_profitable_iters = LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo);
6613 th = conservative_cost_threshold (loop_vinfo,
6614 min_profitable_iters);
/* Peel on the single exit edge of LOOP, passing ratio_mult_vf_name, ni_name, */
/* the threshold TH and the profitability flag to the peeling routine. */
6617 new_loop = slpeel_tree_peel_loop_to_edge (loop, single_exit (loop),
6618 ratio_mult_vf_name, ni_name, false,
6619 th, check_profitability);
6620 gcc_assert (new_loop);
/* The original loop must have kept its number across peeling. */
6621 gcc_assert (loop_num == loop->num);
6622 #ifdef ENABLE_CHECKING
6623 slpeel_verify_cfg_after_peeling (loop, new_loop);
6626 /* A guard that controls whether the new_loop is to be executed or skipped
6627 is placed in LOOP->exit. LOOP->exit therefore has two successors - one
6628 is the preheader of NEW_LOOP, where the IVs from LOOP are used. The other
6629 is a bb after NEW_LOOP, where these IVs are not used. Find the edge that
6630 is on the path where the LOOP IVs are used and need to be updated. */
/* Of the first two predecessor edges of PREHEADER, take the one whose source */
/* is the exit block of LOOP. */
6632 preheader = loop_preheader_edge (new_loop)->src;
6633 if (EDGE_PRED (preheader, 0)->src == single_exit (loop)->dest)
6634 update_e = EDGE_PRED (preheader, 0);
6636 update_e = EDGE_PRED (preheader, 1);
6638 /* Update IVs of original loop as if they were advanced
6639 by ratio_mult_vf_name steps. */
6640 vect_update_ivs_after_vectorizer (loop_vinfo, ratio_mult_vf_name, update_e);
6642 /* After peeling we have to reset scalar evolution analyzer. */
6645 free_original_copy_tables ();
6649 /* Function vect_gen_niters_for_prolog_loop
6651 Set the number of iterations for the loop represented by LOOP_VINFO
6652 to the minimum between LOOP_NITERS (the original iteration count of the loop)
6653 and the misalignment of DR - the data reference recorded in
6654 LOOP_VINFO_UNALIGNED_DR (LOOP_VINFO). As a result, after the execution of
6655 this loop, the data reference DR will refer to an aligned location.
6657 The following computation is generated:
6659 If the misalignment of DR is known at compile time:
6660 addr_mis = int mis = DR_MISALIGNMENT (dr);
6661 Else, compute address misalignment in bytes:
6662 addr_mis = addr & (vectype_size - 1)
6664 prolog_niters = min ( LOOP_NITERS , (VF - addr_mis/elem_size)&(VF-1) )
6666 (elem_size = element type size; an element is the scalar element
6667 whose type is the inner type of the vectype)
6671 prolog_niters = min ( LOOP_NITERS ,
6672 (VF/group_size - addr_mis/elem_size)&(VF/group_size-1) )
6673 where group_size is the size of the interleaved group.
6675 The above formulas assume that VF == number of elements in the vector. This
6676 may not hold when there are multiple-types in the loop.
6677 In this case, for some data-references in the loop the VF does not represent
6678 the number of elements that fit in the vector. Therefore, instead of VF we
6679 use TYPE_VECTOR_SUBPARTS. */
/* NOTE(review): the return type, braces, some declarations (e.g. group_size, */
/* pe, var, stmt) and the return statement are not present in this copy. */
6682 vect_gen_niters_for_prolog_loop (loop_vec_info loop_vinfo, tree loop_niters)
6684 struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
6685 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6687 tree iters, iters_name;
6690 tree dr_stmt = DR_STMT (dr);
6691 stmt_vec_info stmt_info = vinfo_for_stmt (dr_stmt);
6692 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
/* TYPE_ALIGN divided by BITS_PER_UNIT: alignment of the vector type in units. */
6693 int vectype_align = TYPE_ALIGN (vectype) / BITS_PER_UNIT;
6694 tree niters_type = TREE_TYPE (loop_niters);
6696 int element_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
6697 int nelements = TYPE_VECTOR_SUBPARTS (vectype);
6699 if (STMT_VINFO_STRIDED_ACCESS (stmt_info))
6701 /* For interleaved access element size must be multiplied by the size of
6702 the interleaved group. */
6703 group_size = DR_GROUP_SIZE (vinfo_for_stmt (
6704 DR_GROUP_FIRST_DR (stmt_info)));
6705 element_size *= group_size;
6708 pe = loop_preheader_edge (loop);
/* A positive LOOP_PEELING_FOR_ALIGNMENT is used as the known byte */
/* misalignment, and the iteration count is built as a constant. */
6710 if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
6712 int byte_misalign = LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo);
6713 int elem_misalign = byte_misalign / element_size;
6715 if (vect_print_dump_info (REPORT_DETAILS))
6716 fprintf (vect_dump, "known alignment = %d.", byte_misalign);
/* NOTE(review): the minuend is nelements while the header formula uses */
/* VF/group_size; the two agree under the mask only if nelements/group_size */
/* is a power of two that divides nelements - confirm. */
6717 iters = build_int_cst (niters_type,
6718 (nelements - elem_misalign)&(nelements/group_size-1));
/* Remaining path: the misalignment is computed at run time from the address of */
/* the first vector of the data reference. */
6722 tree new_stmts = NULL_TREE;
6723 tree start_addr = vect_create_addr_base_for_vector_ref (dr_stmt,
6724 &new_stmts, NULL_TREE, loop);
6725 tree ptr_type = TREE_TYPE (start_addr);
6726 tree size = TYPE_SIZE (ptr_type);
/* Integer type with the same size as the pointer type; the trailing 1 is */
/* presumably the "unsigned" flag of the language hook - verify. */
6727 tree type = lang_hooks.types.type_for_size (tree_low_cst (size, 1), 1);
6728 tree vectype_size_minus_1 = build_int_cst (type, vectype_align - 1);
/* NOTE(review): the shift count below assumes vectype_align/nelements is a */
/* power of two; confirm what exact_log2 returns otherwise. */
6729 tree elem_size_log =
6730 build_int_cst (type, exact_log2 (vectype_align/nelements));
6731 tree nelements_minus_1 = build_int_cst (type, nelements - 1);
6732 tree nelements_tree = build_int_cst (type, nelements);
/* The statements that compute the start address go on the preheader edge; */
/* the insertion is asserted not to create a new basic block. */
6736 new_bb = bsi_insert_on_edge_immediate (pe, new_stmts);
6737 gcc_assert (!new_bb);
6739 /* Create: byte_misalign = addr & (vectype_size - 1) */
6741 fold_build2 (BIT_AND_EXPR, type, fold_convert (type, start_addr), vectype_size_minus_1);
6743 /* Create: elem_misalign = byte_misalign / element_size */
6745 fold_build2 (RSHIFT_EXPR, type, byte_misalign, elem_size_log);
6747 /* Create: (niters_type) (nelements - elem_misalign)&(nelements - 1) */
6748 iters = fold_build2 (MINUS_EXPR, type, nelements_tree, elem_misalign);
6749 iters = fold_build2 (BIT_AND_EXPR, type, iters, nelements_minus_1);
6750 iters = fold_convert (niters_type, iters);
6753 /* Create: prolog_loop_niters = min (iters, loop_niters) */
6754 /* If the loop bound is known at compile time we already verified that it is
6755 greater than vf; since the misalignment ('iters') is at most vf, there's
6756 no need to generate the MIN_EXPR in this case. */
6757 if (TREE_CODE (loop_niters) != INTEGER_CST)
6758 iters = fold_build2 (MIN_EXPR, niters_type, iters, loop_niters);
6760 if (vect_print_dump_info (REPORT_DETAILS))
6762 fprintf (vect_dump, "niters for prolog loop: ");
6763 print_generic_expr (vect_dump, iters, TDF_SLIM);
/* Gimplify ITERS into an operand based on a "prolog_loop_niters" temporary. */
6766 var = create_tmp_var (niters_type, "prolog_loop_niters");
6767 add_referenced_var (var);
6768 iters_name = force_gimple_operand (iters, &stmt, false, var);
6770 /* Insert stmt on loop preheader edge. */
6773 basic_block new_bb = bsi_insert_on_edge_immediate (pe, stmt);
6774 gcc_assert (!new_bb);
6781 /* Function vect_update_init_of_dr
6783 NITERS iterations were peeled from LOOP. DR represents a data reference
6784 in LOOP. This function updates the information recorded in DR to
6785 account for the fact that the first NITERS iterations had already been
6786 executed. Specifically, it updates the OFFSET field of DR. */
/* NOTE(review): the return type and braces are not present in this copy. */
6789 vect_update_init_of_dr (struct data_reference *dr, tree niters)
6791 tree offset = DR_OFFSET (dr);
/* New offset = old offset + NITERS * DR_STEP (dr). */
/* NOTE(review): the product is built in TREE_TYPE (niters) and the sum in */
/* TREE_TYPE (offset) with no conversion between them - confirm that the */
/* types of NITERS, DR_STEP and DR_OFFSET always agree here. */
6793 niters = fold_build2 (MULT_EXPR, TREE_TYPE (niters), niters, DR_STEP (dr));
6794 offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset), offset, niters);
6795 DR_OFFSET (dr) = offset;
6799 /* Function vect_update_inits_of_drs
6801 NITERS iterations were peeled from the loop represented by LOOP_VINFO.
6802 This function updates the information recorded for the data references in
6803 the loop to account for the fact that the first NITERS iterations had
6804 already been executed. Specifically, it calls vect_update_init_of_dr on
6805 each of the data references recorded in LOOP_VINFO_DATAREFS. */
/* NOTE(review): the return type, braces and the declaration of the index I are */
/* not present in this copy. */
6808 vect_update_inits_of_drs (loop_vec_info loop_vinfo, tree niters)
6811 VEC (data_reference_p, heap) *datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
6812 struct data_reference *dr;
6814 if (vect_print_dump_info (REPORT_DETAILS))
6815 fprintf (vect_dump, "=== vect_update_inits_of_dr ===");
/* Apply the per-reference update to every element of DATAREFS. */
6817 for (i = 0; VEC_iterate (data_reference_p, datarefs, i, dr); i++)
6818 vect_update_init_of_dr (dr, niters);
6822 /* Function vect_do_peeling_for_alignment
6824 Peel the first 'niters' iterations of the loop represented by LOOP_VINFO.
6825 'niters' is set to the misalignment of one of the data references in the
6826 loop, thereby forcing it to refer to an aligned location at the beginning
6827 of the execution of this loop. The data reference for which we are
6828 peeling is recorded in LOOP_VINFO_UNALIGNED_DR. */
/* NOTE(review): the return type, braces and the declaration of n_iters are not */
/* present in this copy. */
6831 vect_do_peeling_for_alignment (loop_vec_info loop_vinfo)
6833 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6834 tree niters_of_prolog_loop, ni_name;
6836 struct loop *new_loop;
6837 bool check_profitability = false;
6838 unsigned int th = 0;
6839 int min_profitable_iters;
6841 if (vect_print_dump_info (REPORT_DETAILS))
6842 fprintf (vect_dump, "=== vect_do_peeling_for_alignment ===");
6844 initialize_original_copy_tables ();
/* Build the iteration count of the loop, then derive the number of prolog */
/* iterations from it. */
6846 ni_name = vect_build_loop_niters (loop_vinfo);
6847 niters_of_prolog_loop = vect_gen_niters_for_prolog_loop (loop_vinfo, ni_name);
6850 /* If cost model check not done during versioning. */
6851 if (!VEC_length (tree, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo))
6852 && !VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
6854 check_profitability = true;
6856 /* Get profitability threshold for vectorized loop. */
6857 min_profitable_iters = LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo);
6859 th = conservative_cost_threshold (loop_vinfo,
6860 min_profitable_iters);
6863 /* Peel the prolog loop and iterate it niters_of_prolog_loop. */
/* NOTE(review): the line assigning this call's result (presumably to new_loop, */
/* which is asserted right after) is missing from this copy. */
6865 slpeel_tree_peel_loop_to_edge (loop, loop_preheader_edge (loop),
6866 niters_of_prolog_loop, ni_name, true,
6867 th, check_profitability);
6869 gcc_assert (new_loop);
6870 #ifdef ENABLE_CHECKING
6871 slpeel_verify_cfg_after_peeling (new_loop, loop);
6874 /* Update number of times loop executes. */
/* The recorded iteration count becomes n_iters - niters_of_prolog_loop. */
6875 n_iters = LOOP_VINFO_NITERS (loop_vinfo);
6876 LOOP_VINFO_NITERS (loop_vinfo) = fold_build2 (MINUS_EXPR,
6877 TREE_TYPE (n_iters), n_iters, niters_of_prolog_loop);
6879 /* Update the init conditions of the access functions of all data refs. */
6880 vect_update_inits_of_drs (loop_vinfo, niters_of_prolog_loop);
6882 /* After peeling we have to reset scalar evolution analyzer. */
6885 free_original_copy_tables ();
6889 /* Function vect_create_cond_for_align_checks.
6891 Create a conditional expression that represents the alignment checks for
6892 all of data references (array element references) whose alignment must be
6896 COND_EXPR - input conditional expression. New conditions will be chained
6897 with logical AND operation.
6898 LOOP_VINFO - two fields of the loop information are used.
6899 LOOP_VINFO_PTR_MASK is the mask used to check the alignment.
6900 LOOP_VINFO_MAY_MISALIGN_STMTS contains the refs to be checked.
6903 COND_EXPR_STMT_LIST - statements needed to construct the conditional
6905 The returned value is the conditional expression to be used in the if
6906 statement that controls which version of the loop gets executed at runtime.
6908 The algorithm makes two assumptions:
6909 1) The number of bytes "n" in a vector is a power of 2.
6910 2) An address "a" is aligned if a%n is zero and that this
6911 test can be done as a&(n-1) == 0. For example, for 16
6912 byte vectors the test is a&0xf == 0. */
/* NOTE(review): the return type, braces, one parameter line and several */
/* declarations (e.g. tmp_name, i, ref_stmt, psize) are missing in this copy. */
6915 vect_create_cond_for_align_checks (loop_vec_info loop_vinfo,
6917 tree *cond_expr_stmt_list)
6919 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6920 VEC(tree,heap) *may_misalign_stmts
6921 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
6923 int mask = LOOP_VINFO_PTR_MASK (loop_vinfo);
6927 tree int_ptrsize_type;
6929 tree or_tmp_name = NULL_TREE;
6930 tree and_tmp, and_tmp_name, and_stmt;
6932 tree part_cond_expr;
6934 /* Check that mask is one less than a power of 2, i.e., mask is
6935 all zeros followed by all ones. */
6936 gcc_assert ((mask != 0) && ((mask & (mask+1)) == 0));
6938 /* CHECKME: what is the best integer or unsigned type to use to hold a
6939 cast from a pointer value? */
6940 psize = TYPE_SIZE (ptr_type_node);
6942 = lang_hooks.types.type_for_size (tree_low_cst (psize, 1), 0);
6944 /* Create expression (mask & (dr_1 | ... | dr_n)) where dr_i is the address
6945 of the first vector of the i'th data reference. */
6947 for (i = 0; VEC_iterate (tree, may_misalign_stmts, i, ref_stmt); i++)
6949 tree new_stmt_list = NULL_TREE;
6951 tree addr_tmp, addr_tmp_name, addr_stmt;
6952 tree or_tmp, new_or_tmp_name, or_stmt;
6954 /* create: addr_tmp = (int)(address_of_first_vector) */
6955 addr_base = vect_create_addr_base_for_vector_ref (ref_stmt,
6956 &new_stmt_list, NULL_TREE, loop);
6958 if (new_stmt_list != NULL_TREE)
6959 append_to_statement_list_force (new_stmt_list, cond_expr_stmt_list);
/* NOTE(review): tmp_name is written with an unbounded sprintf and its */
/* declaration is not visible here - confirm the buffer holds the prefix plus */
/* the decimal index and the terminator. */
6961 sprintf (tmp_name, "%s%d", "addr2int", i);
6962 addr_tmp = create_tmp_var (int_ptrsize_type, tmp_name);
6963 add_referenced_var (addr_tmp);
/* Hand-built assignment: addr_tmp_name = (int_ptrsize_type) addr_base, with */
/* the SSA name's defining statement set explicitly. */
6964 addr_tmp_name = make_ssa_name (addr_tmp, NULL_TREE);
6965 addr_stmt = fold_convert (int_ptrsize_type, addr_base);
6966 addr_stmt = build_gimple_modify_stmt (addr_tmp_name, addr_stmt);
6967 SSA_NAME_DEF_STMT (addr_tmp_name) = addr_stmt;
6968 append_to_statement_list_force (addr_stmt, cond_expr_stmt_list);
6970 /* The addresses are ORed together. */
6972 if (or_tmp_name != NULL_TREE)
6974 /* create: or_tmp = or_tmp | addr_tmp */
6975 sprintf (tmp_name, "%s%d", "orptrs", i);
6976 or_tmp = create_tmp_var (int_ptrsize_type, tmp_name);
6977 add_referenced_var (or_tmp);
6978 new_or_tmp_name = make_ssa_name (or_tmp, NULL_TREE);
6979 tmp = build2 (BIT_IOR_EXPR, int_ptrsize_type,
6980 or_tmp_name, addr_tmp_name);
6981 or_stmt = build_gimple_modify_stmt (new_or_tmp_name, tmp);
6982 SSA_NAME_DEF_STMT (new_or_tmp_name) = or_stmt;
6983 append_to_statement_list_force (or_stmt, cond_expr_stmt_list);
6984 or_tmp_name = new_or_tmp_name;
/* No accumulated value yet: this address starts the OR chain. */
6987 or_tmp_name = addr_tmp_name;
6991 mask_cst = build_int_cst (int_ptrsize_type, mask);
6993 /* create: and_tmp = or_tmp & mask */
6994 and_tmp = create_tmp_var (int_ptrsize_type, "andmask" );
6995 add_referenced_var (and_tmp);
6996 and_tmp_name = make_ssa_name (and_tmp, NULL_TREE);
6998 tmp = build2 (BIT_AND_EXPR, int_ptrsize_type, or_tmp_name, mask_cst);
6999 and_stmt = build_gimple_modify_stmt (and_tmp_name, tmp);
7000 SSA_NAME_DEF_STMT (and_tmp_name) = and_stmt;
7001 append_to_statement_list_force (and_stmt, cond_expr_stmt_list);
7003 /* Make and_tmp the left operand of the conditional test against zero.
7004 if and_tmp has a nonzero bit then some address is unaligned. */
7005 ptrsize_zero = build_int_cst (int_ptrsize_type, 0);
7006 part_cond_expr = fold_build2 (EQ_EXPR, boolean_type_node,
7007 and_tmp_name, ptrsize_zero);
/* The new test is either ANDed onto *COND_EXPR or stored as *COND_EXPR on its */
/* own; the condition selecting between the two is missing from this copy. */
7009 *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
7010 *cond_expr, part_cond_expr);
7012 *cond_expr = part_cond_expr;
/* Summary of the code below: the segment length is DR_STEP (dr) * VECT_FACTOR, */
/* built in integer_type_node. When vect_supportable_dr_alignment reports */
/* dr_explicit_realign_optimized, TYPE_SIZE_UNIT of the statement's vector type */
/* is added. The result is converted to sizetype and returned. */
/* NOTE(review): the arithmetic is carried out in integer_type_node before the */
/* conversion to sizetype - confirm that large steps cannot be truncated. */
/* NOTE(review): the header comment below has lost its closing line, and the */
/* return type and braces of the function are missing, in this copy. */
7015 /* Function vect_vfa_segment_size.
7017 Create an expression that computes the size of segment
7018 that will be accessed for a data reference. The functions takes into
7019 account that realignment loads may access one more vector.
7022 DR: The data reference.
7023 VECT_FACTOR: vectorization factor.
7025 Return an expression whose value is the size of segment which will be
7029 vect_vfa_segment_size (struct data_reference *dr, tree vect_factor)
7031 tree segment_length = fold_build2 (MULT_EXPR, integer_type_node,
7032 DR_STEP (dr), vect_factor);
7034 if (vect_supportable_dr_alignment (dr) == dr_explicit_realign_optimized)
7036 tree vector_size = TYPE_SIZE_UNIT
7037 (STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr))));
7039 segment_length = fold_build2 (PLUS_EXPR, integer_type_node,
7040 segment_length, vector_size);
7042 return fold_convert (sizetype, segment_length);
/* NOTE(review): in this copy the header comment below has lost its closing */
/* line, and the return type, braces, one parameter line and several */
/* declarations and operand lines of the function are missing. */
7045 /* Function vect_create_cond_for_alias_checks.
7047 Create a conditional expression that represents the run-time checks for
7048 overlapping of address ranges represented by a list of data references
7049 relations passed as input.
7052 COND_EXPR - input conditional expression. New conditions will be chained
7053 with logical AND operation.
7054 LOOP_VINFO - field LOOP_VINFO_MAY_ALIAS_DDRS contains the list of ddrs
7058 COND_EXPR - conditional expression.
7059 COND_EXPR_STMT_LIST - statements needed to construct the conditional
7063 The returned value is the conditional expression to be used in the if
7064 statement that controls which version of the loop gets executed at runtime.
7068 vect_create_cond_for_alias_checks (loop_vec_info loop_vinfo,
7070 tree * cond_expr_stmt_list)
7072 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7073 VEC (ddr_p, heap) * may_alias_ddrs =
7074 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
7076 build_int_cst (integer_type_node, LOOP_VINFO_VECT_FACTOR (loop_vinfo));
7080 tree part_cond_expr;
7082 /* Create expression
7083 ((store_ptr_0 + store_segment_length_0) < load_ptr_0)
7084 || (load_ptr_0 + load_segment_length_0) < store_ptr_0))
7088 ((store_ptr_n + store_segment_length_n) < load_ptr_n)
7089 || (load_ptr_n + load_segment_length_n) < store_ptr_n)) */
/* Empty DDR list: the statement guarded by this test is missing from this copy */
/* (presumably an early return - verify). */
7091 if (VEC_empty (ddr_p, may_alias_ddrs))
7094 for (i = 0; VEC_iterate (ddr_p, may_alias_ddrs, i, ddr); i++)
7096 struct data_reference *dr_a, *dr_b;
7097 tree dr_group_first_a, dr_group_first_b;
7098 tree addr_base_a, addr_base_b;
7099 tree segment_length_a, segment_length_b;
7100 tree stmt_a, stmt_b;
/* When the statement of a reference belongs to a group, the first statement */
/* of the group and its data reference are used instead. */
7103 stmt_a = DR_STMT (DDR_A (ddr));
7104 dr_group_first_a = DR_GROUP_FIRST_DR (vinfo_for_stmt (stmt_a));
7105 if (dr_group_first_a)
7107 stmt_a = dr_group_first_a;
7108 dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
7112 stmt_b = DR_STMT (DDR_B (ddr));
7113 dr_group_first_b = DR_GROUP_FIRST_DR (vinfo_for_stmt (stmt_b));
7114 if (dr_group_first_b)
7116 stmt_b = dr_group_first_b;
7117 dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
/* Statements needed to compute the two base addresses are appended to */
/* COND_EXPR_STMT_LIST by the address-building helper. */
7121 vect_create_addr_base_for_vector_ref (stmt_a, cond_expr_stmt_list,
7124 vect_create_addr_base_for_vector_ref (stmt_b, cond_expr_stmt_list,
7127 segment_length_a = vect_vfa_segment_size (dr_a, vect_factor);
7128 segment_length_b = vect_vfa_segment_size (dr_b, vect_factor);
7130 if (vect_print_dump_info (REPORT_DR_DETAILS))
7133 "create runtime check for data references ");
7134 print_generic_expr (vect_dump, DR_REF (dr_a), TDF_SLIM);
7135 fprintf (vect_dump, " and ");
7136 print_generic_expr (vect_dump, DR_REF (dr_b), TDF_SLIM);
/* Per-pair test: a TRUTH_OR_EXPR of two LT_EXPR comparisons, each on a */
/* POINTER_PLUS_EXPR built from one of the base addresses. */
7141 fold_build2 (TRUTH_OR_EXPR, boolean_type_node,
7142 fold_build2 (LT_EXPR, boolean_type_node,
7143 fold_build2 (POINTER_PLUS_EXPR, TREE_TYPE (addr_base_a),
7147 fold_build2 (LT_EXPR, boolean_type_node,
7148 fold_build2 (POINTER_PLUS_EXPR, TREE_TYPE (addr_base_b),
/* The per-pair test is either ANDed onto *COND_EXPR or stored as *COND_EXPR on */
/* its own; the selecting condition is missing from this copy. */
7154 *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
7155 *cond_expr, part_cond_expr);
7157 *cond_expr = part_cond_expr;
7159 if (vect_print_dump_info (REPORT_VECTORIZED_LOOPS))
7160 fprintf (vect_dump, "created %u versioning for alias checks.\n",
7161 VEC_length (ddr_p, may_alias_ddrs));
7165 /* Function vect_loop_versioning.
7167 If the loop has data references that may or may not be aligned or/and
7168 has data reference relations whose independence was not proven then
7169 two versions of the loop need to be generated, one which is vectorized
7170 and one which isn't. A test is then generated to control which of the
7171 loops is executed. The test checks for the alignment of all of the
7172 data references that may or may not be aligned. An additional
7173 sequence of runtime tests is generated for each pairs of DDRs whose
7174 independence was not proven. The vectorized version of loop is
7175 executed only if both alias and alignment tests are passed.
7177 The test generated to check which version of loop is executed
7178 is modified to also check for profitability as indicated by the
7179 cost model initially. */
/* NOTE(review): the return type, braces, some declarations (e.g. nloop, e, */
/* new_exit_e, th) and several continuation lines are missing in this copy. */
7182 vect_loop_versioning (loop_vec_info loop_vinfo)
7184 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7186 tree cond_expr = NULL_TREE;
7187 tree cond_expr_stmt_list = NULL_TREE;
7188 basic_block condition_bb;
7189 block_stmt_iterator cond_exp_bsi;
7190 basic_block merge_bb;
7191 basic_block new_exit_bb;
7193 tree orig_phi, new_phi, arg;
/* Four fifths of REG_BR_PROB_BASE; handed to loop_version below together with */
/* its complement. */
7194 unsigned prob = 4 * REG_BR_PROB_BASE / 5;
7195 tree gimplify_stmt_list;
7196 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
7197 int min_profitable_iters = 0;
7200 /* Get profitability threshold for vectorized loop. */
7201 min_profitable_iters = LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo);
7203 th = conservative_cost_threshold (loop_vinfo,
7204 min_profitable_iters);
/* Profitability test: scalar_loop_iters > th, with TH built as a constant of */
/* the iteration-count type. */
7207 build2 (GT_EXPR, boolean_type_node, scalar_loop_iters,
7208 build_int_cst (TREE_TYPE (scalar_loop_iters), th));
7210 cond_expr = force_gimple_operand (cond_expr, &cond_expr_stmt_list,
/* Extend the condition with the alignment and alias tests when the */
/* corresponding lists are non-empty. */
7213 if (VEC_length (tree, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo)))
7214 vect_create_cond_for_align_checks (loop_vinfo, &cond_expr,
7215 &cond_expr_stmt_list);
7217 if (VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
7218 vect_create_cond_for_alias_checks (loop_vinfo, &cond_expr,
7219 &cond_expr_stmt_list);
7222 fold_build2 (NE_EXPR, boolean_type_node, cond_expr, integer_zero_node);
7224 force_gimple_operand (cond_expr, &gimplify_stmt_list, true,
7226 append_to_statement_list (gimplify_stmt_list, &cond_expr_stmt_list);
/* Version the loop under COND_EXPR; the copy tables are set up only around */
/* this call. */
7228 initialize_original_copy_tables ();
7229 nloop = loop_version (loop, cond_expr, &condition_bb,
7230 prob, prob, REG_BR_PROB_BASE - prob, true);
7231 free_original_copy_tables();
7233 /* Loop versioning violates an assumption we try to maintain during
7234 vectorization - that the loop exit block has a single predecessor.
7235 After versioning, the exit block of both loop versions is the same
7236 basic block (i.e. it has two predecessors). Just in order to simplify
7237 following transformations in the vectorizer, we fix this situation
7238 here by adding a new (empty) block on the exit-edge of the loop,
7239 with the proper loop-exit phis to maintain loop-closed-form. */
7241 merge_bb = single_exit (loop)->dest;
7242 gcc_assert (EDGE_COUNT (merge_bb->preds) == 2);
7243 new_exit_bb = split_edge (single_exit (loop));
7244 new_exit_e = single_exit (loop);
7245 e = EDGE_SUCC (new_exit_bb, 0);
/* For every phi of merge_bb, create a phi that receives the original argument */
/* over the new exit edge, and make the original phi use its result on edge E. */
7247 for (orig_phi = phi_nodes (merge_bb); orig_phi;
7248 orig_phi = PHI_CHAIN (orig_phi))
7250 new_phi = create_phi_node (SSA_NAME_VAR (PHI_RESULT (orig_phi)),
7252 arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, e);
7253 add_phi_arg (new_phi, arg, new_exit_e);
7254 SET_PHI_ARG_DEF (orig_phi, e->dest_idx, PHI_RESULT (new_phi));
7257 /* End loop-exit-fixes after versioning. */
7259 update_ssa (TODO_update_ssa);
/* The statements computing the condition are inserted before the last */
/* statement of condition_bb. */
7260 if (cond_expr_stmt_list)
7262 cond_exp_bsi = bsi_last (condition_bb);
7263 bsi_insert_before (&cond_exp_bsi, cond_expr_stmt_list, BSI_SAME_STMT);
/* NOTE(review): in this copy the header comment below has lost its closing */
/* line, and the return type, braces, the loop header and the statement */
/* advancing NEXT are missing. */
7267 /* Remove a group of stores (for SLP or interleaving), free their
7271 vect_remove_stores (tree first_stmt)
7274 tree next = first_stmt;
7276 stmt_vec_info next_stmt_info;
7277 block_stmt_iterator next_si;
7281 /* Free the attached stmt_vec_info and remove the stmt. */
7282 next_si = bsi_for_stmt (next);
7283 bsi_remove (&next_si, true);
7284 next_stmt_info = vinfo_for_stmt (next);
7285 ann = stmt_ann (next);
/* Read the link to the next store of the group before the info is freed. */
7286 tmp = DR_GROUP_NEXT_DR (next_stmt_info);
7287 free (next_stmt_info);
/* Clear the statement's info pointer so it does not dangle. */
7288 set_stmt_info (ann, NULL);
7294 /* Vectorize SLP instance tree in postorder. */
/* NODE is the current SLP tree node; VEC_STMTS_SIZE is the number of vector */
/* statements recorded for each node. */
/* NOTE(review): the return type, braces, the declaration of STMT, the guard */
/* before the recursive calls and the return statements are missing here. */
7297 vect_schedule_slp_instance (slp_tree node, unsigned int vec_stmts_size)
7300 bool strided_store, is_store;
7301 block_stmt_iterator si;
7302 stmt_vec_info stmt_info;
/* Both children are scheduled before the node itself (postorder). */
7307 vect_schedule_slp_instance (SLP_TREE_LEFT (node), vec_stmts_size);
7308 vect_schedule_slp_instance (SLP_TREE_RIGHT (node), vec_stmts_size);
/* The first scalar statement of the node drives the transformation. */
7310 stmt = VEC_index(tree, SLP_TREE_SCALAR_STMTS (node), 0);
7311 stmt_info = vinfo_for_stmt (stmt);
/* Reserve room for VEC_STMTS_SIZE vector statements and record that count. */
7312 SLP_TREE_VEC_STMTS (node) = VEC_alloc (tree, heap, vec_stmts_size);
7313 SLP_TREE_NUMBER_OF_VEC_STMTS (node) = vec_stmts_size;
7315 if (vect_print_dump_info (REPORT_DETAILS))
7317 fprintf (vect_dump, "------>vectorizing SLP node starting from: ");
7318 print_generic_expr (vect_dump, stmt, TDF_SLIM);
7321 si = bsi_for_stmt (stmt);
7322 is_store = vect_transform_stmt (stmt, &si, &strided_store, node);
/* For a statement that belongs to a group, the whole chain of stores is */
/* removed starting from the first statement of the group. */
7325 if (DR_GROUP_FIRST_DR (stmt_info))
7326 /* If IS_STORE is TRUE, the vectorization of the
7327 interleaving chain was completed - free all the stores in
7329 vect_remove_stores (DR_GROUP_FIRST_DR (stmt_info));
7331 /* FORNOW: SLP originates only from strided stores. */
7337 /* FORNOW: SLP originates only from strided stores. */
/* Schedule every SLP instance recorded in LOOP_VINFO. For each instance the */
/* number of vector statements per node is computed as */
/* vectorization_factor * group_size / NUNITS and the instance tree is handed */
/* to vect_schedule_slp_instance, whose result is kept in IS_STORE. */
/* NOTE(review): the return type, braces and the return statement are missing */
/* in this copy, and the comment inside the loop has lost its closing line. */
7343 vect_schedule_slp (loop_vec_info loop_vinfo, unsigned int nunits)
7345 VEC (slp_instance, heap) *slp_instances =
7346 LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
7347 slp_instance instance;
7348 unsigned int vec_stmts_size;
7349 unsigned int group_size, i;
7350 unsigned int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
7351 bool is_store = false;
7353 for (i = 0; VEC_iterate (slp_instance, slp_instances, i, instance); i++)
7355 group_size = SLP_INSTANCE_GROUP_SIZE (instance);
7356 /* For each SLP instance calculate number of vector stmts to be created
7357 for the scalar stmts in each node of the SLP tree. Number of vector
7358 elements in one vector iteration is the number of scalar elements in
7359 one scalar iteration (GROUP_SIZE) multiplied by VF divided by vector
7361 vec_stmts_size = vectorization_factor * group_size / nunits;
7363 /* Schedule the tree of INSTANCE. */
7364 is_store = vect_schedule_slp_instance (SLP_INSTANCE_TREE (instance),
7367 if (vect_print_dump_info (REPORT_VECTORIZED_LOOPS)
7368 || vect_print_dump_info (REPORT_UNVECTORIZED_LOOPS))
7369 fprintf (vect_dump, "vectorizing stmts using SLP.");
7375 /* Function vect_transform_loop.
7377 The analysis phase has determined that the loop is vectorizable.
7378 Vectorize the loop - created vectorized stmts to replace the scalar
7379 stmts in the loop, and update the loop exit condition. */
7382 vect_transform_loop (loop_vec_info loop_vinfo)
7384 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7385 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
7386 int nbbs = loop->num_nodes;
7387 block_stmt_iterator si, next_si;
7390 int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
7392 bool slp_scheduled = false;
7393 unsigned int nunits;
7395 if (vect_print_dump_info (REPORT_DETAILS))
7396 fprintf (vect_dump, "=== vec_transform_loop ===");
7398 if (VEC_length (tree, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo))
7399 || VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
7400 vect_loop_versioning (loop_vinfo);
7402 /* CHECKME: we wouldn't need this if we called update_ssa once
7404 bitmap_zero (vect_memsyms_to_rename);
7406 /* Peel the loop if there are data refs with unknown alignment.
7407 Only one data ref with unknown store is allowed. */
7409 if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
7410 vect_do_peeling_for_alignment (loop_vinfo);
7412 /* If the loop has a symbolic number of iterations 'n' (i.e. it's not a
7413 compile time constant), or it is a constant that doesn't divide by the
7414 vectorization factor, then an epilog loop needs to be created.
7415 We therefore duplicate the loop: the original loop will be vectorized,
7416 and will compute the first (n/VF) iterations. The second copy of the loop
7417 will remain scalar and will compute the remaining (n%VF) iterations.
7418 (VF is the vectorization factor). */
7420 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
7421 || (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
7422 && LOOP_VINFO_INT_NITERS (loop_vinfo) % vectorization_factor != 0))
7423 vect_do_peeling_for_loop_bound (loop_vinfo, &ratio);
7425 ratio = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
7426 LOOP_VINFO_INT_NITERS (loop_vinfo) / vectorization_factor);
7428 /* 1) Make sure the loop header has exactly two entries
7429 2) Make sure we have a preheader basic block. */
7431 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
7433 split_edge (loop_preheader_edge (loop));
7435 /* FORNOW: the vectorizer supports only loops whose body consists
7436 of one basic block (header + empty latch). When the vectorizer
7437 supports more involved loop forms, the order in which the BBs are
7438 traversed needs to be reconsidered. */
7440 for (i = 0; i < nbbs; i++)
7442 basic_block bb = bbs[i];
7443 stmt_vec_info stmt_info;
7446 for (phi = phi_nodes (bb); phi; phi = PHI_CHAIN (phi))
7448 if (vect_print_dump_info (REPORT_DETAILS))
7450 fprintf (vect_dump, "------>vectorizing phi: ");
7451 print_generic_expr (vect_dump, phi, TDF_SLIM);
7453 stmt_info = vinfo_for_stmt (phi);
7457 if (!STMT_VINFO_RELEVANT_P (stmt_info)
7458 && !STMT_VINFO_LIVE_P (stmt_info))
7461 if ((TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
7462 != (unsigned HOST_WIDE_INT) vectorization_factor)
7463 && vect_print_dump_info (REPORT_DETAILS))
7464 fprintf (vect_dump, "multiple-types.");
7466 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
7468 if (vect_print_dump_info (REPORT_DETAILS))
7469 fprintf (vect_dump, "transform phi.");
7470 vect_transform_stmt (phi, NULL, NULL, NULL);
7474 for (si = bsi_start (bb); !bsi_end_p (si);)
7476 tree stmt = bsi_stmt (si);
7479 if (vect_print_dump_info (REPORT_DETAILS))
7481 fprintf (vect_dump, "------>vectorizing statement: ");
7482 print_generic_expr (vect_dump, stmt, TDF_SLIM);
7485 stmt_info = vinfo_for_stmt (stmt);
7487 /* vector stmts created in the outer-loop during vectorization of
7488 stmts in an inner-loop may not have a stmt_info, and do not
7489 need to be vectorized. */
7496 if (!STMT_VINFO_RELEVANT_P (stmt_info)
7497 && !STMT_VINFO_LIVE_P (stmt_info))
7503 gcc_assert (STMT_VINFO_VECTYPE (stmt_info));
7505 (unsigned int) TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
7506 if (!STMT_SLP_TYPE (stmt_info)
7507 && nunits != (unsigned int) vectorization_factor
7508 && vect_print_dump_info (REPORT_DETAILS))
7509 /* For SLP VF is set according to unrolling factor, and not to
7510 vector size, hence for SLP this print is not valid. */
7511 fprintf (vect_dump, "multiple-types.");
7513 /* SLP. Schedule all the SLP instances when the first SLP stmt is
7514 reached. */
7515 if (STMT_SLP_TYPE (stmt_info))
7519 slp_scheduled = true;
7521 if (vect_print_dump_info (REPORT_DETAILS))
7522 fprintf (vect_dump, "=== scheduling SLP instances ===");
7524 is_store = vect_schedule_slp (loop_vinfo, nunits);
7526 /* IS_STORE is true if STMT is a store. Stores cannot be of
7527 hybrid SLP type. They are removed in
7528 vect_schedule_slp_instance and their vinfo is destroyed. */
7536 /* Hybrid SLP stmts must be vectorized in addition to SLP. */
7537 if (PURE_SLP_STMT (stmt_info))
7544 /* -------- vectorize statement ------------ */
7545 if (vect_print_dump_info (REPORT_DETAILS))
7546 fprintf (vect_dump, "transform statement.");
7548 strided_store = false;
7549 is_store = vect_transform_stmt (stmt, &si, &strided_store, NULL);
7553 if (STMT_VINFO_STRIDED_ACCESS (stmt_info))
7555 /* Interleaving. If IS_STORE is TRUE, the vectorization of the
7556 interleaving chain was completed - free all the stores in
7557 the chain. */
7558 tree next = DR_GROUP_FIRST_DR (stmt_info);
7560 stmt_vec_info next_stmt_info;
7564 next_si = bsi_for_stmt (next);
7565 next_stmt_info = vinfo_for_stmt (next);
7566 /* Free the attached stmt_vec_info and remove the stmt. */
7567 ann = stmt_ann (next);
7568 tmp = DR_GROUP_NEXT_DR (next_stmt_info);
7569 free (next_stmt_info);
7570 set_stmt_info (ann, NULL);
7571 bsi_remove (&next_si, true);
7574 bsi_remove (&si, true);
7579 /* Free the attached stmt_vec_info and remove the stmt. */
7580 ann = stmt_ann (stmt);
7582 set_stmt_info (ann, NULL);
7583 bsi_remove (&si, true);
7591 slpeel_make_loop_iterate_ntimes (loop, ratio);
7593 mark_set_for_renaming (vect_memsyms_to_rename);
7595 /* The memory tags and pointers in vectorized statements need to
7596 have their SSA forms updated. FIXME, why can't this be delayed
7597 until all the loops have been transformed? */
7598 update_ssa (TODO_update_ssa);
7600 if (vect_print_dump_info (REPORT_VECTORIZED_LOOPS))
7601 fprintf (vect_dump, "LOOP VECTORIZED.");
7602 if (loop->inner && vect_print_dump_info (REPORT_VECTORIZED_LOOPS))
7603 fprintf (vect_dump, "OUTER LOOP VECTORIZED.");