/* Transformation Utilities for Loop Vectorization.
   Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008 Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */
23 #include "coretypes.h"
29 #include "basic-block.h"
30 #include "diagnostic.h"
31 #include "tree-flow.h"
32 #include "tree-dump.h"
39 #include "tree-data-ref.h"
40 #include "tree-chrec.h"
41 #include "tree-scalar-evolution.h"
42 #include "tree-vectorizer.h"
43 #include "langhooks.h"
44 #include "tree-pass.h"
/* Utility functions for the code transformation.  */
static bool vect_transform_stmt (gimple, gimple_stmt_iterator *, bool *,
                                 slp_tree, slp_instance);
static tree vect_create_destination_var (tree, tree);
static tree vect_create_data_ref_ptr
  (gimple, struct loop*, tree, tree *, gimple *, bool, bool *);
static tree vect_create_addr_base_for_vector_ref
  (gimple, gimple_seq *, tree, struct loop *);
static tree vect_get_new_vect_var (tree, enum vect_var_kind, const char *);
static tree vect_get_vec_def_for_operand (tree, gimple, tree *);
static tree vect_init_vector (gimple, tree, tree, gimple_stmt_iterator *);
static void vect_finish_stmt_generation
  (gimple stmt, gimple vec_stmt, gimple_stmt_iterator *);
static bool vect_is_simple_cond (tree, loop_vec_info);
static void vect_create_epilog_for_reduction (tree, gimple, enum tree_code,
                                              gimple);
static tree get_initial_def_for_reduction (gimple, tree, tree *);

/* Utility function dealing with loop peeling (not peeling itself).  */
static void vect_generate_tmps_on_preheader
  (loop_vec_info, tree *, tree *, tree *);
static tree vect_build_loop_niters (loop_vec_info);
static void vect_update_ivs_after_vectorizer (loop_vec_info, tree, edge);
static tree vect_gen_niters_for_prolog_loop (loop_vec_info, tree);
static void vect_update_init_of_dr (struct data_reference *, tree niters);
static void vect_update_inits_of_drs (loop_vec_info, tree);
static int vect_min_worthwhile_factor (enum tree_code);
static int
cost_for_stmt (gimple stmt)
{
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);

  switch (STMT_VINFO_TYPE (stmt_info))
    {
    case load_vec_info_type:
      return TARG_SCALAR_LOAD_COST;
    case store_vec_info_type:
      return TARG_SCALAR_STORE_COST;
    case op_vec_info_type:
    case condition_vec_info_type:
    case assignment_vec_info_type:
    case reduc_vec_info_type:
    case induc_vec_info_type:
    case type_promotion_vec_info_type:
    case type_demotion_vec_info_type:
    case type_conversion_vec_info_type:
    case call_vec_info_type:
      return TARG_SCALAR_STMT_COST;
    case undef_vec_info_type:
    default:
      gcc_unreachable ();
    }
}
/* Function vect_estimate_min_profitable_iters

   Return the number of iterations required for the vector version of the
   loop to be profitable relative to the cost of the scalar version of the
   loop.

   TODO: Take profile info into account before making vectorization
   decisions, if available.  */

int
vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo)
{
  int i;
  int min_profitable_iters;
  int peel_iters_prologue;
  int peel_iters_epilogue;
  int vec_inside_cost = 0;
  int vec_outside_cost = 0;
  int scalar_single_iter_cost = 0;
  int scalar_outside_cost = 0;
  bool runtime_test = false;
  int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
  int nbbs = loop->num_nodes;
  int byte_misalign = LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo);
  int peel_guard_costs = 0;
  int innerloop_iters = 0, factor;
  VEC (slp_instance, heap) *slp_instances;
  slp_instance instance;
  /* Cost model disabled.  */
  if (!flag_vect_cost_model)
    {
      if (vect_print_dump_info (REPORT_COST))
        fprintf (vect_dump, "cost model disabled.");
      return 0;
    }
  /* If the number of iterations is unknown, or the
     peeling-for-misalignment amount is unknown, we will have to generate
     a runtime test to test the loop count against the threshold.  */
  if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
      || (byte_misalign < 0))
    runtime_test = true;

  /* Requires loop versioning tests to handle misalignment.  */

  if (VEC_length (gimple, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo)))
    {
      /* FIXME: Make cost depend on complexity of individual check.  */
      vec_outside_cost +=
        VEC_length (gimple, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));
      if (vect_print_dump_info (REPORT_COST))
        fprintf (vect_dump, "cost model: Adding cost of checks for loop "
                 "versioning to treat misalignment.\n");
    }

  /* Requires loop versioning with alias checks.  */
  if (VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
    {
      /* FIXME: Make cost depend on complexity of individual check.  */
      vec_outside_cost +=
        VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo));
      if (vect_print_dump_info (REPORT_COST))
        fprintf (vect_dump, "cost model: Adding cost of checks for loop "
                 "versioning aliasing.\n");
    }

  if (VEC_length (gimple, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo))
      || VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
    vec_outside_cost += TARG_COND_TAKEN_BRANCH_COST;
  /* Count statements in scalar loop.  Using this as scalar cost for a single
     iteration for now.

     TODO: Add outer loop support.

     TODO: Consider assigning different costs to different scalar
     statements.  */

  /* FORNOW.  */
  if (loop->inner)
    innerloop_iters = 50; /* FIXME */

  for (i = 0; i < nbbs; i++)
    {
      gimple_stmt_iterator si;
      basic_block bb = bbs[i];

      if (bb->loop_father == loop->inner)
        factor = innerloop_iters;
      else
        factor = 1;

      for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
        {
          gimple stmt = gsi_stmt (si);
          stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
          /* Skip stmts that are not vectorized inside the loop.  */
          if (!STMT_VINFO_RELEVANT_P (stmt_info)
              && (!STMT_VINFO_LIVE_P (stmt_info)
                  || STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def))
            continue;
          scalar_single_iter_cost += cost_for_stmt (stmt) * factor;
          vec_inside_cost += STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) * factor;
          /* FIXME: for stmts in the inner-loop in outer-loop vectorization,
             some of the "outside" costs are generated inside the outer-loop.  */
          vec_outside_cost += STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info);
        }
    }
  /* Add additional cost for the peeled instructions in prologue and epilogue
     loop.

     FORNOW: If we don't know the value of peel_iters for prologue or epilogue
     at compile-time - we assume it's vf/2 (the worst would be vf-1).

     TODO: Build an expression that represents peel_iters for prologue and
     epilogue to be used in a run-time test.  */

  if (byte_misalign < 0)
    {
      peel_iters_prologue = vf/2;
      if (vect_print_dump_info (REPORT_COST))
        fprintf (vect_dump, "cost model: "
                 "prologue peel iters set to vf/2.");

      /* If peeling for alignment is unknown, loop bound of main loop becomes
         unknown.  */
      peel_iters_epilogue = vf/2;
      if (vect_print_dump_info (REPORT_COST))
        fprintf (vect_dump, "cost model: "
                 "epilogue peel iters set to vf/2 because "
                 "peeling for alignment is unknown.");

      /* If peeled iterations are unknown, count a taken branch and a not taken
         branch per peeled loop.  Even if scalar loop iterations are known,
         vector iterations are not known since peeled prologue iterations are
         not known.  Hence guards remain the same.  */
      peel_guard_costs += 2 * (TARG_COND_TAKEN_BRANCH_COST
                               + TARG_COND_NOT_TAKEN_BRANCH_COST);
    }
  else
    {
      if (byte_misalign)
        {
          struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
          int element_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
          tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr)));
          int nelements = TYPE_VECTOR_SUBPARTS (vectype);

          peel_iters_prologue = nelements - (byte_misalign / element_size);
        }
      else
        peel_iters_prologue = 0;

      if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
        {
          peel_iters_epilogue = vf/2;
          if (vect_print_dump_info (REPORT_COST))
            fprintf (vect_dump, "cost model: "
                     "epilogue peel iters set to vf/2 because "
                     "loop iterations are unknown.");

          /* If peeled iterations are known but number of scalar loop
             iterations are unknown, count a taken branch per peeled loop.  */
          peel_guard_costs += 2 * TARG_COND_TAKEN_BRANCH_COST;
        }
      else
        {
          int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
          peel_iters_prologue = niters < peel_iters_prologue ?
                                niters : peel_iters_prologue;
          peel_iters_epilogue = (niters - peel_iters_prologue) % vf;
        }
    }
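  /* For instance (an illustrative example, not from the original sources):
     with VF=4, a known misalignment of 4 bytes, 2-byte elements and a V8HI
     vector type, the prologue peels 8 - 4/2 = 6 scalar iterations; with
     niters=100 the epilogue then runs (100 - 6) % 4 = 2 iterations.  */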
  vec_outside_cost += (peel_iters_prologue * scalar_single_iter_cost)
                      + (peel_iters_epilogue * scalar_single_iter_cost)
                      + peel_guard_costs;
  /* FORNOW: The scalar outside cost is incremented in one of the
     following ways:

     1. The vectorizer checks for alignment and aliasing and generates
     a condition that allows dynamic vectorization.  A cost model
     check is ANDED with the versioning condition.  Hence scalar code
     path now has the added cost of the versioning check.

       if (cost > th & versioning_check)
         jmp to vector code

     Hence run-time scalar is incremented by not-taken branch cost.

     2. The vectorizer then checks if a prologue is required.  If the
     cost model check was not done before during versioning, it has to
     be done before the prologue check.

       if (cost <= th)
         prologue = scalar_iters
       if (prologue == 0)
         jmp to vector code
       else
         execute prologue
       if (prologue == num_iters)
         go to exit

     Hence the run-time scalar cost is incremented by a taken branch,
     plus a not-taken branch, plus a taken branch cost.

     3. The vectorizer then checks if an epilogue is required.  If the
     cost model check was not done before during prologue check, it
     has to be done with the epilogue check.

       if (prologue == 0)
         jmp to vector code
       else
         execute prologue
       if (prologue == num_iters)
         go to exit
       vector code:
         if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
           jmp to epilogue

     Hence the run-time scalar cost should be incremented by 2 taken
     branches.

     TODO: The back end may reorder the BBS's differently and reverse
     conditions/branch directions.  Change the estimates below to
     something more reasonable.  */
  if (runtime_test)
    {
      /* Cost model check occurs at versioning.  */
      if (VEC_length (gimple, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo))
          || VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
        scalar_outside_cost += TARG_COND_NOT_TAKEN_BRANCH_COST;
      else
        {
          /* Cost model check occurs at prologue generation.  */
          if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
            scalar_outside_cost += 2 * TARG_COND_TAKEN_BRANCH_COST
              + TARG_COND_NOT_TAKEN_BRANCH_COST;
          /* Cost model check occurs at epilogue generation.  */
          else
            scalar_outside_cost += 2 * TARG_COND_TAKEN_BRANCH_COST;
        }
    }
  /* Add SLP costs.  */
  slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
  for (i = 0; VEC_iterate (slp_instance, slp_instances, i, instance); i++)
    {
      vec_outside_cost += SLP_INSTANCE_OUTSIDE_OF_LOOP_COST (instance);
      vec_inside_cost += SLP_INSTANCE_INSIDE_OF_LOOP_COST (instance);
    }
  /* Calculate number of iterations required to make the vector version
     profitable, relative to the loop bodies only.  The following condition
     must hold true:
     SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
     where
     SIC = scalar iteration cost, VIC = vector iteration cost,
     VOC = vector outside cost, VF = vectorization factor,
     PL_ITERS = prologue iterations, EP_ITERS = epilogue iterations,
     SOC = scalar outside cost for run time cost model check.  */

  if ((scalar_single_iter_cost * vf) > vec_inside_cost)
    {
      if (vec_outside_cost <= 0)
        min_profitable_iters = 1;
      else
        {
          min_profitable_iters = ((vec_outside_cost - scalar_outside_cost) * vf
                                  - vec_inside_cost * peel_iters_prologue
                                  - vec_inside_cost * peel_iters_epilogue)
                                 / ((scalar_single_iter_cost * vf)
                                    - vec_inside_cost);

          if ((scalar_single_iter_cost * vf * min_profitable_iters)
              <= ((vec_inside_cost * min_profitable_iters)
                  + ((vec_outside_cost - scalar_outside_cost) * vf)))
            min_profitable_iters++;
        }
    }
  /* vector version will never be profitable.  */
  else
    {
      if (vect_print_dump_info (REPORT_COST))
        fprintf (vect_dump, "cost model: vector iteration cost = %d "
                 "is divisible by scalar iteration cost = %d by a factor "
                 "greater than or equal to the vectorization factor = %d .",
                 vec_inside_cost, scalar_single_iter_cost, vf);
      return -1;
    }
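  /* An illustrative derivation of the quotient above (not from the original
     sources): multiplying the condition
       SIC * niters + SOC > VIC * ((niters - PL_ITERS - EP_ITERS) / VF) + VOC
     through by VF and collecting the niters terms gives
       niters * (SIC * VF - VIC)
         > (VOC - SOC) * VF - VIC * PL_ITERS - VIC * EP_ITERS.
     E.g., with SIC=4, VIC=6, VOC=14, SOC=0, VF=4 and no peeling, the
     integer division yields (14*4)/(4*4-6) = 5, and the correction step
     bumps it to 6 since 4*4*5 <= 6*5 + 14*4.  */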
  if (vect_print_dump_info (REPORT_COST))
    {
      fprintf (vect_dump, "Cost model analysis: \n");
      fprintf (vect_dump, "  Vector inside of loop cost: %d\n",
               vec_inside_cost);
      fprintf (vect_dump, "  Vector outside of loop cost: %d\n",
               vec_outside_cost);
      fprintf (vect_dump, "  Scalar iteration cost: %d\n",
               scalar_single_iter_cost);
      fprintf (vect_dump, "  Scalar outside cost: %d\n", scalar_outside_cost);
      fprintf (vect_dump, "  prologue iterations: %d\n",
               peel_iters_prologue);
      fprintf (vect_dump, "  epilogue iterations: %d\n",
               peel_iters_epilogue);
      fprintf (vect_dump, "  Calculated minimum iters for profitability: %d\n",
               min_profitable_iters);
    }

  min_profitable_iters =
        min_profitable_iters < vf ? vf : min_profitable_iters;

  /* Because the condition we create is:
     if (niters <= min_profitable_iters)
       then skip the vectorized loop.  */
  min_profitable_iters--;

  if (vect_print_dump_info (REPORT_COST))
    fprintf (vect_dump, "  Profitability threshold = %d\n",
             min_profitable_iters);

  return min_profitable_iters;
}
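/* For example (illustration only): if the computation above yielded
   min_profitable_iters = 6 with vf = 4, the clamp keeps 6 and the decrement
   returns 5, so the runtime guard "if (niters <= 5) skip" runs the vector
   loop only when at least 6 iterations are available.  */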
/* TODO: Close dependency between vect_model_*_cost and vectorizable_*
   functions.  Design better to avoid maintenance issues.  */

/* Function vect_model_reduction_cost.

   Models cost for a reduction operation, including the vector ops
   generated within the strip-mine loop, the initial definition before
   the loop, and the epilogue code that must be generated.  */
static void
vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
                           int ncopies)
{
  int outer_cost = 0;
  enum tree_code code;
  optab optab;
  tree vectype;
  gimple stmt, orig_stmt;
  tree reduction_op;
  enum machine_mode mode;
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);

  /* Cost of reduction op inside loop.  */
  STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) += ncopies * TARG_VEC_STMT_COST;

  stmt = STMT_VINFO_STMT (stmt_info);

  switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
    {
    case GIMPLE_SINGLE_RHS:
      gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt)) == ternary_op);
      reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), 2);
      break;
    case GIMPLE_UNARY_RHS:
      reduction_op = gimple_assign_rhs1 (stmt);
      break;
    case GIMPLE_BINARY_RHS:
      reduction_op = gimple_assign_rhs2 (stmt);
      break;
    default:
      gcc_unreachable ();
    }

  vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
  if (!vectype)
    {
      if (vect_print_dump_info (REPORT_COST))
        {
          fprintf (vect_dump, "unsupported data-type ");
          print_generic_expr (vect_dump, TREE_TYPE (reduction_op), TDF_SLIM);
        }
      return;
    }

  mode = TYPE_MODE (vectype);
  orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);

  if (!orig_stmt)
    orig_stmt = STMT_VINFO_STMT (stmt_info);

  code = gimple_assign_rhs_code (orig_stmt);

  /* Add in cost for initial definition.  */
  outer_cost += TARG_SCALAR_TO_VEC_COST;

  /* Determine cost of epilogue code.

     If we have a reduction operator that reduces the whole vector in one
     statement, the epilogue costs that statement plus a scalar extract.  */

  if (!nested_in_vect_loop_p (loop, orig_stmt))
    {
      if (reduc_code < NUM_TREE_CODES)
        outer_cost += TARG_VEC_STMT_COST + TARG_VEC_TO_SCALAR_COST;
      else
        {
          int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
          tree bitsize =
            TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
          int element_bitsize = tree_low_cst (bitsize, 1);
          int nelements = vec_size_in_bits / element_bitsize;

          optab = optab_for_tree_code (code, vectype, optab_default);

          /* We have a whole vector shift available.  */
          if (VECTOR_MODE_P (mode)
              && optab_handler (optab, mode)->insn_code != CODE_FOR_nothing
              && optab_handler (vec_shr_optab, mode)->insn_code != CODE_FOR_nothing)
            /* Final reduction via vector shifts and the reduction operator.  Also
               requires scalar extract.  */
            outer_cost += ((exact_log2(nelements) * 2) * TARG_VEC_STMT_COST
                           + TARG_VEC_TO_SCALAR_COST);
          else
            /* Use extracts and reduction op for final reduction.  For N elements,
               we have N extracts and N-1 reduction ops.  */
            outer_cost += ((nelements + nelements - 1) * TARG_VEC_STMT_COST);
        }
    }

  STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info) = outer_cost;

  if (vect_print_dump_info (REPORT_COST))
    fprintf (vect_dump, "vect_model_reduction_cost: inside_cost = %d, "
             "outside_cost = %d .", STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info),
             STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info));
}
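/* Worked example of the epilogue costing above (illustration only): for a
   V8HI reduction, nelements = 8.  With a whole-vector shift available the
   epilogue costs exact_log2 (8) * 2 = 6 vector stmts plus one extract;
   without it, 8 extracts plus 7 reduction ops, i.e. 15 vector stmts.  */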
/* Function vect_model_induction_cost.

   Models cost for induction operations.  */

static void
vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
{
  /* loop cost for vec_loop.  */
  STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) = ncopies * TARG_VEC_STMT_COST;
  /* prologue cost for vec_init and vec_step.  */
  STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info) = 2 * TARG_SCALAR_TO_VEC_COST;

  if (vect_print_dump_info (REPORT_COST))
    fprintf (vect_dump, "vect_model_induction_cost: inside_cost = %d, "
             "outside_cost = %d .", STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info),
             STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info));
}
/* Function vect_model_simple_cost.

   Models cost for simple operations, i.e. those that only emit ncopies of a
   single op.  Right now, this does not account for multiple insns that could
   be generated for the single vector op.  We will handle that shortly.  */

void
vect_model_simple_cost (stmt_vec_info stmt_info, int ncopies,
                        enum vect_def_type *dt, slp_tree slp_node)
{
  int i;
  int inside_cost = 0, outside_cost = 0;

  inside_cost = ncopies * TARG_VEC_STMT_COST;

  /* FORNOW: Assuming maximum 2 args per stmts.  */
  for (i = 0; i < 2; i++)
    {
      if (dt[i] == vect_constant_def || dt[i] == vect_invariant_def)
        outside_cost += TARG_SCALAR_TO_VEC_COST;
    }

  if (vect_print_dump_info (REPORT_COST))
    fprintf (vect_dump, "vect_model_simple_cost: inside_cost = %d, "
             "outside_cost = %d .", inside_cost, outside_cost);

  /* Set the costs either in STMT_INFO or SLP_NODE (if exists).  */
  stmt_vinfo_set_inside_of_loop_cost (stmt_info, slp_node, inside_cost);
  stmt_vinfo_set_outside_of_loop_cost (stmt_info, slp_node, outside_cost);
}
/* Function vect_cost_strided_group_size

   For strided load or store, return the group_size only if it is the first
   load or store of a group, else return 1.  This ensures that group size is
   only returned once per group.  */

static int
vect_cost_strided_group_size (stmt_vec_info stmt_info)
{
  gimple first_stmt = DR_GROUP_FIRST_DR (stmt_info);

  if (first_stmt == STMT_VINFO_STMT (stmt_info))
    return DR_GROUP_SIZE (stmt_info);

  return 1;
}
/* Function vect_model_store_cost

   Models cost for stores.  In the case of strided accesses, one access
   has the overhead of the strided access attributed to it.  */

void
vect_model_store_cost (stmt_vec_info stmt_info, int ncopies,
                       enum vect_def_type dt, slp_tree slp_node)
{
  int group_size;
  int inside_cost = 0, outside_cost = 0;

  if (dt == vect_constant_def || dt == vect_invariant_def)
    outside_cost = TARG_SCALAR_TO_VEC_COST;

  /* Strided access?  */
  if (DR_GROUP_FIRST_DR (stmt_info))
    group_size = vect_cost_strided_group_size (stmt_info);
  /* Not a strided access.  */
  else
    group_size = 1;

  /* Is this an access in a group of stores, which provide strided access?
     If so, add in the cost of the permutes.  */
  if (group_size > 1)
    {
      /* Uses a high and low interleave operation for each needed permute.  */
      inside_cost = ncopies * exact_log2(group_size) * group_size
                    * TARG_VEC_STMT_COST;

      if (vect_print_dump_info (REPORT_COST))
        fprintf (vect_dump, "vect_model_store_cost: strided group_size = %d .",
                 group_size);
    }

  /* Costs of the stores.  */
  inside_cost += ncopies * TARG_VEC_STORE_COST;

  if (vect_print_dump_info (REPORT_COST))
    fprintf (vect_dump, "vect_model_store_cost: inside_cost = %d, "
             "outside_cost = %d .", inside_cost, outside_cost);

  /* Set the costs either in STMT_INFO or SLP_NODE (if exists).  */
  stmt_vinfo_set_inside_of_loop_cost (stmt_info, slp_node, inside_cost);
  stmt_vinfo_set_outside_of_loop_cost (stmt_info, slp_node, outside_cost);
}
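/* For instance (illustration only): storing a group of 4 interleaved arrays
   with ncopies = 1 charges exact_log2 (4) * 4 = 8 interleave stmts against
   the first store of the group (the others get group_size 1), plus one
   TARG_VEC_STORE_COST per copy of each store in the group.  */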
/* Function vect_model_load_cost

   Models cost for loads.  In the case of strided accesses, the last access
   has the overhead of the strided access attributed to it.  Since unaligned
   accesses are supported for loads, we also account for the costs of the
   access scheme chosen.  */

void
vect_model_load_cost (stmt_vec_info stmt_info, int ncopies, slp_tree slp_node)
{
  int group_size;
  int alignment_support_scheme;
  gimple first_stmt;
  struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info), *first_dr;
  int inside_cost = 0, outside_cost = 0;

  /* Strided accesses?  */
  first_stmt = DR_GROUP_FIRST_DR (stmt_info);
  if (first_stmt && !slp_node)
    {
      group_size = vect_cost_strided_group_size (stmt_info);
      first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
    }
  /* Not a strided access.  */
  else
    {
      group_size = 1;
      first_dr = dr;
    }

  alignment_support_scheme = vect_supportable_dr_alignment (first_dr);

  /* Is this an access in a group of loads providing strided access?
     If so, add in the cost of the permutes.  */
  if (group_size > 1)
    {
      /* Uses an even and odd extract operation for each needed permute.  */
      inside_cost = ncopies * exact_log2(group_size) * group_size
                    * TARG_VEC_STMT_COST;

      if (vect_print_dump_info (REPORT_COST))
        fprintf (vect_dump, "vect_model_load_cost: strided group_size = %d .",
                 group_size);
    }

  /* The loads themselves.  */
  switch (alignment_support_scheme)
    {
    case dr_aligned:
      {
        inside_cost += ncopies * TARG_VEC_LOAD_COST;

        if (vect_print_dump_info (REPORT_COST))
          fprintf (vect_dump, "vect_model_load_cost: aligned.");

        break;
      }
    case dr_unaligned_supported:
      {
        /* Here, we assign an additional cost for the unaligned load.  */
        inside_cost += ncopies * TARG_VEC_UNALIGNED_LOAD_COST;

        if (vect_print_dump_info (REPORT_COST))
          fprintf (vect_dump, "vect_model_load_cost: unaligned supported by "
                   "hardware.");

        break;
      }
    case dr_explicit_realign:
      {
        inside_cost += ncopies * (2*TARG_VEC_LOAD_COST + TARG_VEC_STMT_COST);

        /* FIXME: If the misalignment remains fixed across the iterations of
           the containing loop, the following cost should be added to the
           outside costs.  */
        if (targetm.vectorize.builtin_mask_for_load)
          inside_cost += TARG_VEC_STMT_COST;

        break;
      }
    case dr_explicit_realign_optimized:
      {
        if (vect_print_dump_info (REPORT_COST))
          fprintf (vect_dump, "vect_model_load_cost: unaligned software "
                   "pipelined.");

        /* Unaligned software pipeline has a load of an address, an initial
           load, and possibly a mask operation to "prime" the loop.  However,
           if this is an access in a group of loads, which provide strided
           access, then the above cost should only be considered for one
           access in the group.  Inside the loop, there is a load op
           and a realignment op.  */

        if ((!DR_GROUP_FIRST_DR (stmt_info)) || group_size > 1 || slp_node)
          {
            outside_cost = 2*TARG_VEC_STMT_COST;
            if (targetm.vectorize.builtin_mask_for_load)
              outside_cost += TARG_VEC_STMT_COST;
          }

        inside_cost += ncopies * (TARG_VEC_LOAD_COST + TARG_VEC_STMT_COST);

        break;
      }

    default:
      gcc_unreachable ();
    }

  if (vect_print_dump_info (REPORT_COST))
    fprintf (vect_dump, "vect_model_load_cost: inside_cost = %d, "
             "outside_cost = %d .", inside_cost, outside_cost);

  /* Set the costs either in STMT_INFO or SLP_NODE (if exists).  */
  stmt_vinfo_set_inside_of_loop_cost (stmt_info, slp_node, inside_cost);
  stmt_vinfo_set_outside_of_loop_cost (stmt_info, slp_node, outside_cost);
}
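/* For instance (illustration only): with ncopies = 1, an aligned load costs
   one TARG_VEC_LOAD_COST, while dr_explicit_realign costs two vector loads
   plus one realignment stmt per copy (plus a mask stmt when the target
   provides builtin_mask_for_load).  */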
/* Function vect_get_new_vect_var.

   Returns a name for a new variable.  The current naming scheme appends the
   prefix "vect_" or "vect_p" (depending on the value of VAR_KIND) to
   the name of vectorizer generated variables, and appends that to NAME if
   provided.  */

static tree
vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
{
  const char *prefix;
  tree new_vect_var;

  switch (var_kind)
  {
  case vect_simple_var:
    prefix = "vect_";
    break;
  case vect_scalar_var:
    prefix = "stmp_";
    break;
  case vect_pointer_var:
    prefix = "vect_p";
    break;
  default:
    gcc_unreachable ();
  }

  if (name)
    {
      char* tmp = concat (prefix, name, NULL);
      new_vect_var = create_tmp_var (type, tmp);
      free (tmp);
    }
  else
    new_vect_var = create_tmp_var (type, prefix);

  /* Mark vector typed variable as a gimple register variable.  */
  if (TREE_CODE (type) == VECTOR_TYPE)
    DECL_GIMPLE_REG_P (new_vect_var) = true;

  return new_vect_var;
}
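/* E.g. (illustration only): vect_get_new_vect_var (ptr_type, vect_pointer_var,
   "in") creates a temporary named "vect_pin", while passing NULL for NAME
   yields a temporary named just with the "vect_p" prefix.  */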
/* Function vect_create_addr_base_for_vector_ref.

   Create an expression that computes the address of the first memory location
   that will be accessed for a data reference.

   Input:
   STMT: The statement containing the data reference.
   NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
   OFFSET: Optional.  If supplied, it is added to the initial address.
   LOOP:  Specify relative to which loop-nest should the address be computed.
          For example, when the dataref is in an inner-loop nested in an
          outer-loop that is now being vectorized, LOOP can be either the
          outer-loop, or the inner-loop.  The first memory location accessed
          by the following dataref ('in' points to short):

                for (i=0; i<N; i++)
                  for (j=0; j<M; j++)
                    s += in[i+j]

          is as follows:
          if LOOP=i_loop:     &in             (relative to i_loop)
          if LOOP=j_loop:     &in+i*2B        (relative to j_loop)

   Output:
   1. Return an SSA_NAME whose value is the address of the memory location of
      the first vector of the data reference.
   2. If new_stmt_list is not NULL_TREE after return then the caller must insert
      these statement(s) which define the returned SSA_NAME.

   FORNOW: We are only handling array accesses with step 1.  */
static tree
vect_create_addr_base_for_vector_ref (gimple stmt,
                                      gimple_seq *new_stmt_list,
                                      tree offset,
                                      struct loop *loop)
{
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
  struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
  tree data_ref_base = unshare_expr (DR_BASE_ADDRESS (dr));
  tree base_name;
  tree data_ref_base_var;
  tree vec_stmt;
  tree addr_base, addr_expr;
  tree dest;
  gimple_seq seq = NULL;
  tree base_offset = unshare_expr (DR_OFFSET (dr));
  tree init = unshare_expr (DR_INIT (dr));
  tree vect_ptr_type, addr_expr2;
  tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));

  gcc_assert (loop);
  if (loop != containing_loop)
    {
      loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
      struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);

      gcc_assert (nested_in_vect_loop_p (loop, stmt));

      data_ref_base = unshare_expr (STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
      base_offset = unshare_expr (STMT_VINFO_DR_OFFSET (stmt_info));
      init = unshare_expr (STMT_VINFO_DR_INIT (stmt_info));
    }

  /* Create data_ref_base */
  base_name = build_fold_indirect_ref (data_ref_base);
  data_ref_base_var = create_tmp_var (TREE_TYPE (data_ref_base), "batmp");
  add_referenced_var (data_ref_base_var);
  data_ref_base = force_gimple_operand (data_ref_base, &seq, true,
                                        data_ref_base_var);
  gimple_seq_add_seq (new_stmt_list, seq);

  /* Create base_offset */
  base_offset = size_binop (PLUS_EXPR, base_offset, init);
  base_offset = fold_convert (sizetype, base_offset);
  dest = create_tmp_var (TREE_TYPE (base_offset), "base_off");
  add_referenced_var (dest);
  base_offset = force_gimple_operand (base_offset, &seq, true, dest);
  gimple_seq_add_seq (new_stmt_list, seq);

  if (offset)
    {
      tree tmp = create_tmp_var (sizetype, "offset");

      add_referenced_var (tmp);
      offset = fold_build2 (MULT_EXPR, TREE_TYPE (offset), offset, step);
      base_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (base_offset),
                                 base_offset, offset);
      base_offset = force_gimple_operand (base_offset, &seq, false, tmp);
      gimple_seq_add_seq (new_stmt_list, seq);
    }

  /* base + base_offset */
  addr_base = fold_build2 (POINTER_PLUS_EXPR, TREE_TYPE (data_ref_base),
                           data_ref_base, base_offset);

  vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));

  /* addr_expr = addr_base */
  addr_expr = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var,
                                     get_name (base_name));
  add_referenced_var (addr_expr);
  vec_stmt = fold_convert (vect_ptr_type, addr_base);
  addr_expr2 = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var,
                                      get_name (base_name));
  add_referenced_var (addr_expr2);
  vec_stmt = force_gimple_operand (vec_stmt, &seq, false, addr_expr2);
  gimple_seq_add_seq (new_stmt_list, seq);

  if (vect_print_dump_info (REPORT_DETAILS))
    {
      fprintf (vect_dump, "created ");
      print_generic_expr (vect_dump, vec_stmt, TDF_SLIM);
    }
  return vec_stmt;
}
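/* A sketch of the gimple this produces for "a[i]" (illustration only; the
   temporary names are invented by create_tmp_var):
     batmp.1 = &a;
     base_off.2 = <DR_OFFSET + DR_INIT, in bytes>;
     vect_pa.3 = (vectype *) (batmp.1 p+ base_off.2);
   The returned SSA_NAME is the final temporary; its defining stmts are
   appended to NEW_STMT_LIST for the caller to insert.  */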
/* Function vect_create_data_ref_ptr.

   Create a new pointer to vector type (vp), that points to the first location
   accessed in the loop by STMT, along with the def-use update chain to
   appropriately advance the pointer through the loop iterations.  Also set
   aliasing information for the pointer.  This vector pointer is used by the
   callers to this function to create a memory reference expression for vector
   load/store access.

   Input:
   1. STMT: a stmt that references memory.  Expected to be of the form
         GIMPLE_ASSIGN <name, data-ref> or
         GIMPLE_ASSIGN <data-ref, name>.
   2. AT_LOOP: the loop where the vector memref is to be created.
   3. OFFSET (optional): an offset to be added to the initial address accessed
        by the data-ref in STMT.
   4. ONLY_INIT: indicate if vp is to be updated in the loop, or remain
        pointing to the initial address.

   Output:
   1. Declare a new ptr to vector_type, and have it point to the base of the
      data reference (initial addressed accessed by the data reference).
      For example, for vector of type V8HI, the following code is generated:

      v8hi *vp;
      vp = (v8hi *)initial_address;

      if OFFSET is not supplied:
         initial_address = &a[init];
      if OFFSET is supplied:
         initial_address = &a[init + OFFSET];

      Return the initial_address in INITIAL_ADDRESS.

   2. If ONLY_INIT is true, just return the initial pointer.  Otherwise, also
      update the pointer in each iteration of the loop.

      Return the increment stmt that updates the pointer in PTR_INCR.

   3. Set INV_P to true if the access pattern of the data reference in the
      vectorized loop is invariant.  Set it to false otherwise.

   4. Return the pointer.  */
static tree
vect_create_data_ref_ptr (gimple stmt, struct loop *at_loop,
                          tree offset, tree *initial_address, gimple *ptr_incr,
                          bool only_init, bool *inv_p)
{
  tree base_name;
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  bool nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
  struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
  tree vect_ptr_type;
  tree vect_ptr;
  tree tag;
  tree new_temp;
  gimple vec_stmt;
  gimple_seq new_stmt_list = NULL;
  edge pe;
  basic_block new_bb;
  tree vect_ptr_init;
  struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
  tree vptr;
  gimple_stmt_iterator incr_gsi;
  bool insert_after;
  tree indx_before_incr, indx_after_incr;
  gimple incr;
  tree step;

  /* Check the step (evolution) of the load in LOOP, and record
     whether it's invariant.  */
  if (nested_in_vect_loop)
    step = STMT_VINFO_DR_STEP (stmt_info);
  else
    step = DR_STEP (STMT_VINFO_DATA_REF (stmt_info));

  if (tree_int_cst_compare (step, size_zero_node) == 0)
    *inv_p = true;
  else
    *inv_p = false;

  /* Create an expression for the first address accessed by this load
     in LOOP.  */
  base_name = build_fold_indirect_ref (unshare_expr (DR_BASE_ADDRESS (dr)));

  if (vect_print_dump_info (REPORT_DETAILS))
    {
      tree data_ref_base = base_name;
      fprintf (vect_dump, "create vector-pointer variable to type: ");
      print_generic_expr (vect_dump, vectype, TDF_SLIM);
      if (TREE_CODE (data_ref_base) == VAR_DECL)
        fprintf (vect_dump, "  vectorizing a one dimensional array ref: ");
      else if (TREE_CODE (data_ref_base) == ARRAY_REF)
        fprintf (vect_dump, "  vectorizing a multidimensional array ref: ");
      else if (TREE_CODE (data_ref_base) == COMPONENT_REF)
        fprintf (vect_dump, "  vectorizing a record based array ref: ");
      else if (TREE_CODE (data_ref_base) == SSA_NAME)
        fprintf (vect_dump, "  vectorizing a pointer ref: ");
      print_generic_expr (vect_dump, base_name, TDF_SLIM);
    }

  /** (1) Create the new vector-pointer variable:  **/
  vect_ptr_type = build_pointer_type (vectype);

  vect_ptr = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var,
                                    get_name (base_name));
  add_referenced_var (vect_ptr);

  /** (2) Add aliasing information to the new vector-pointer:
          (The points-to info (DR_PTR_INFO) may be defined later.)  **/

  tag = DR_SYMBOL_TAG (dr);
  gcc_assert (tag);

  /* If tag is a variable (and NOT_A_TAG) then a new symbol memory
     tag must be created with tag added to its may alias list.  */
  if (!MTAG_P (tag))
    new_type_alias (vect_ptr, tag, DR_REF (dr));
  else
    set_symbol_mem_tag (vect_ptr, tag);
  /** Note: If the dataref is in an inner-loop nested in LOOP, and we are
      vectorizing LOOP (i.e. outer-loop vectorization), we need to create two
      def-use update cycles for the pointer: One relative to the outer-loop
      (LOOP), which is what steps (3) and (4) below do.  The other is relative
      to the inner-loop (which is the inner-most loop containing the dataref),
      and this is done by step (5) below.

      When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
      inner-most loop, and so steps (3),(4) work the same, and step (5) is
      redundant.  Steps (3),(4) create the following:

        vp0 = &base_addr;
        LOOP:   vp1 = phi(vp0,vp2)
                ...
                ...
                vp2 = vp1 + step
                goto LOOP

      If there is an inner-loop nested in loop, then step (5) will also be
      applied, and an additional update in the inner-loop will be created:

        vp0 = &base_addr;
        LOOP:   vp1 = phi(vp0,vp2)
                ...
        inner:     vp3 = phi(vp1,vp4)
                   vp4 = vp3 + inner_step
                   if () goto inner
                ...
                vp2 = vp1 + step
                if () goto LOOP   */
  /** (3) Calculate the initial address of the vector-pointer, and set
          the vector-pointer to point to it before the loop:  **/

  /* Create: (&(base[init_val+offset]) in the loop preheader.  */

  new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
                                                   offset, loop);
  pe = loop_preheader_edge (loop);
  if (new_stmt_list)
    {
      new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
      gcc_assert (!new_bb);
    }

  *initial_address = new_temp;

  /* Create: p = (vectype *) initial_base  */
  vec_stmt = gimple_build_assign (vect_ptr,
                                  fold_convert (vect_ptr_type, new_temp));
  vect_ptr_init = make_ssa_name (vect_ptr, vec_stmt);
  gimple_assign_set_lhs (vec_stmt, vect_ptr_init);
  new_bb = gsi_insert_on_edge_immediate (pe, vec_stmt);
  gcc_assert (!new_bb);


  /** (4) Handle the updating of the vector-pointer inside the loop.
          This is needed when ONLY_INIT is false, and also when AT_LOOP
          is the inner-loop nested in LOOP (during outer-loop vectorization).
   **/

  if (only_init && at_loop == loop) /* No update in loop is required.  */
    {
      /* Copy the points-to information if it exists.  */
      if (DR_PTR_INFO (dr))
        duplicate_ssa_name_ptr_info (vect_ptr_init, DR_PTR_INFO (dr));
      vptr = vect_ptr_init;
    }
  else
    {
      /* The step of the vector pointer is the Vector Size.  */
      tree step = TYPE_SIZE_UNIT (vectype);
      /* One exception to the above is when the scalar step of the load in
         LOOP is zero.  In this case the step here is also zero.  */
      if (*inv_p)
        step = size_zero_node;

      standard_iv_increment_position (loop, &incr_gsi, &insert_after);

      create_iv (vect_ptr_init,
                 fold_convert (vect_ptr_type, step),
                 NULL_TREE, loop, &incr_gsi, insert_after,
                 &indx_before_incr, &indx_after_incr);
      incr = gsi_stmt (incr_gsi);
      set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));

      /* Copy the points-to information if it exists.  */
      if (DR_PTR_INFO (dr))
        {
          duplicate_ssa_name_ptr_info (indx_before_incr, DR_PTR_INFO (dr));
          duplicate_ssa_name_ptr_info (indx_after_incr, DR_PTR_INFO (dr));
        }
      merge_alias_info (vect_ptr_init, indx_before_incr);
      merge_alias_info (vect_ptr_init, indx_after_incr);
      if (ptr_incr)
        *ptr_incr = incr;

      vptr = indx_before_incr;
    }
  if (!nested_in_vect_loop || only_init)
    return vptr;


  /** (5) Handle the updating of the vector-pointer inside the inner-loop
          nested in LOOP, if exists: **/

  gcc_assert (nested_in_vect_loop);
  if (!only_init)
    {
      standard_iv_increment_position (containing_loop, &incr_gsi,
                                      &insert_after);
      create_iv (vptr, fold_convert (vect_ptr_type, DR_STEP (dr)), NULL_TREE,
                 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
                 &indx_after_incr);
      incr = gsi_stmt (incr_gsi);
      set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));

      /* Copy the points-to information if it exists.  */
      if (DR_PTR_INFO (dr))
        {
          duplicate_ssa_name_ptr_info (indx_before_incr, DR_PTR_INFO (dr));
          duplicate_ssa_name_ptr_info (indx_after_incr, DR_PTR_INFO (dr));
        }
      merge_alias_info (vect_ptr_init, indx_before_incr);
      merge_alias_info (vect_ptr_init, indx_after_incr);
      if (ptr_incr)
        *ptr_incr = incr;

      return indx_before_incr;
    }
  else
    gcc_unreachable ();
}
/* Function bump_vector_ptr

   Increment a pointer (to a vector type) by vector-size.  If requested,
   i.e. if PTR_INCR is given, then also connect the new increment stmt
   to the existing def-use update-chain of the pointer, by modifying
   the PTR_INCR as illustrated below:

   The pointer def-use update-chain before this function:
                        DATAREF_PTR = phi (p_0, p_2)
                        ....
        PTR_INCR:       p_2 = DATAREF_PTR + step

   The pointer def-use update-chain after this function:
                        DATAREF_PTR = phi (p_0, p_2)
                        ....
                        NEW_DATAREF_PTR = DATAREF_PTR + BUMP
                        ....
        PTR_INCR:       p_2 = NEW_DATAREF_PTR + step

   Input:
   DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
                 in the loop.
   PTR_INCR - optional.  The stmt that updates the pointer in each iteration of
              the loop.  The increment amount across iterations is expected
              to be vector_size.
   BSI - location where the new update stmt is to be placed.
   STMT - the original scalar memory-access stmt that is being vectorized.
   BUMP - optional.  The offset by which to bump the pointer.  If not given,
          the offset is assumed to be vector_size.

   Output: Return NEW_DATAREF_PTR as illustrated above.  */
static tree
bump_vector_ptr (tree dataref_ptr, gimple ptr_incr, gimple_stmt_iterator *gsi,
                 gimple stmt, tree bump)
{
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
  tree ptr_var = SSA_NAME_VAR (dataref_ptr);
  tree update = TYPE_SIZE_UNIT (vectype);
  gimple incr_stmt;
  ssa_op_iter iter;
  use_operand_p use_p;
  tree new_dataref_ptr;

  if (bump)
    update = bump;

  incr_stmt = gimple_build_assign_with_ops (POINTER_PLUS_EXPR, ptr_var,
                                            dataref_ptr, update);
  new_dataref_ptr = make_ssa_name (ptr_var, incr_stmt);
  gimple_assign_set_lhs (incr_stmt, new_dataref_ptr);
  vect_finish_stmt_generation (stmt, incr_stmt, gsi);

  /* Copy the points-to information if it exists.  */
  if (DR_PTR_INFO (dr))
    duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
  merge_alias_info (new_dataref_ptr, dataref_ptr);

  if (!ptr_incr)
    return new_dataref_ptr;

  /* Update the vector-pointer's cross-iteration increment.  */
  FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
    {
      tree use = USE_FROM_PTR (use_p);

      if (use == dataref_ptr)
        SET_USE (use_p, new_dataref_ptr);
      else
        gcc_assert (tree_int_cst_compare (use, update) == 0);
    }

  return new_dataref_ptr;
}
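/* Typical use (a sketch, not a quote from the callers): code that emits one
   vector load per vector in an interleaved group advances the pointer
   between loads with
     dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, gsi, stmt, NULL_TREE);
   so that PTR_INCR keeps closing the cross-iteration cycle over the last
   bumped pointer.  */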
/* Function vect_create_destination_var.

   Create a new temporary of type VECTYPE.  */

static tree
vect_create_destination_var (tree scalar_dest, tree vectype)
{
  tree vec_dest;
  const char *new_name;
  tree type;
  enum vect_var_kind kind;

  kind = vectype ? vect_simple_var : vect_scalar_var;
  type = vectype ? vectype : TREE_TYPE (scalar_dest);

  gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);

  new_name = get_name (scalar_dest);
  if (!new_name)
    new_name = "var_";
  vec_dest = vect_get_new_vect_var (type, kind, new_name);
  add_referenced_var (vec_dest);

  return vec_dest;
}
/* Function vect_init_vector.

   Insert a new stmt (INIT_STMT) that initializes a new vector variable with
   the vector elements of VECTOR_VAR.  Place the initialization at BSI if it
   is not NULL.  Otherwise, place the initialization at the loop preheader.
   Return the DEF of INIT_STMT.
   It will be used in the vectorization of STMT.  */

static tree
vect_init_vector (gimple stmt, tree vector_var, tree vector_type,
                  gimple_stmt_iterator *gsi)
{
  stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
  tree new_var;
  gimple init_stmt;
  tree vec_oprnd;
  edge pe;
  tree new_temp;
  basic_block new_bb;

  new_var = vect_get_new_vect_var (vector_type, vect_simple_var, "cst_");
  add_referenced_var (new_var);
  init_stmt = gimple_build_assign (new_var, vector_var);
  new_temp = make_ssa_name (new_var, init_stmt);
  gimple_assign_set_lhs (init_stmt, new_temp);

  if (gsi)
    vect_finish_stmt_generation (stmt, init_stmt, gsi);
  else
    {
      loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
      struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);

      if (nested_in_vect_loop_p (loop, stmt))
        loop = loop->inner;
      pe = loop_preheader_edge (loop);
      new_bb = gsi_insert_on_edge_immediate (pe, init_stmt);
      gcc_assert (!new_bb);
    }

  if (vect_print_dump_info (REPORT_DETAILS))
    {
      fprintf (vect_dump, "created new init_stmt: ");
      print_gimple_stmt (vect_dump, init_stmt, 0, TDF_SLIM);
    }

  vec_oprnd = gimple_assign_lhs (init_stmt);
  return vec_oprnd;
}
/* For constant and loop invariant defs of SLP_NODE this function returns
   (vector) defs (VEC_OPRNDS) that will be used in the vectorized stmts.
   OP_NUM determines if we gather defs for operand 0 or operand 1 of the scalar
   stmts.  */

static void
vect_get_constant_vectors (slp_tree slp_node, VEC(tree,heap) **vec_oprnds,
                           unsigned int op_num)
{
  VEC (gimple, heap) *stmts = SLP_TREE_SCALAR_STMTS (slp_node);
  gimple stmt = VEC_index (gimple, stmts, 0);
  stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
  tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
  int nunits = TYPE_VECTOR_SUBPARTS (vectype);
  tree vec_cst;
  tree t = NULL_TREE;
  int j, number_of_places_left_in_vector;
  tree vector_type;
  tree op, vop;
  int group_size = VEC_length (gimple, stmts);
  unsigned int vec_num, i;
  int number_of_copies = 1;
  bool is_store = false;
  unsigned int number_of_vectors = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
  VEC (tree, heap) *voprnds = VEC_alloc (tree, heap, number_of_vectors);
  bool constant_p;

  if (STMT_VINFO_DATA_REF (stmt_vinfo))
    is_store = true;

  /* NUMBER_OF_COPIES is the number of times we need to use the same values in
     created vectors.  It is greater than 1 if unrolling is performed.

     For example, we have two scalar operands, s1 and s2 (e.g., group of
     strided accesses of size two), while NUNITS is four (i.e., four scalars
     of this type can be packed in a vector).  The output vector will contain
     two copies of each scalar operand: {s1, s2, s1, s2}.  (NUMBER_OF_COPIES
     will be 2).

     If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
     containing the operands.

     For example, NUNITS is four as before, and the group size is 8
     (s1, s2, ..., s8).  We will create two vectors {s1, s2, s3, s4} and
     {s5, s6, s7, s8}.  */
  number_of_copies = least_common_multiple (nunits, group_size) / group_size;

  number_of_places_left_in_vector = nunits;
  constant_p = true;
  for (j = 0; j < number_of_copies; j++)
    {
      for (i = group_size - 1; VEC_iterate (gimple, stmts, i, stmt); i--)
        {
          if (is_store)
            op = gimple_assign_rhs1 (stmt);
          else
            op = gimple_op (stmt, op_num + 1);
          if (!CONSTANT_CLASS_P (op))
            constant_p = false;

          /* Create 'vect_ = {op0,op1,...,opn}'.  */
          t = tree_cons (NULL_TREE, op, t);

          number_of_places_left_in_vector--;

          if (number_of_places_left_in_vector == 0)
            {
              number_of_places_left_in_vector = nunits;

              vector_type = get_vectype_for_scalar_type (TREE_TYPE (op));
              gcc_assert (vector_type);
              if (constant_p)
                vec_cst = build_vector (vector_type, t);
              else
                vec_cst = build_constructor_from_list (vector_type, t);
              constant_p = true;
              VEC_quick_push (tree, voprnds,
                              vect_init_vector (stmt, vec_cst, vector_type,
                                                NULL));
              t = NULL_TREE;
            }
        }
    }

  /* Since the vectors are created in the reverse order, we should invert
     them.  */
  vec_num = VEC_length (tree, voprnds);
  for (j = vec_num - 1; j >= 0; j--)
    {
      vop = VEC_index (tree, voprnds, j);
      VEC_quick_push (tree, *vec_oprnds, vop);
    }

  VEC_free (tree, heap, voprnds);

  /* In case that VF is greater than the unrolling factor needed for the SLP
     group of stmts, NUMBER_OF_VECTORS to be created is greater than
     NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
     to replicate the vectors.  */
  while (number_of_vectors > VEC_length (tree, *vec_oprnds))
    {
      for (i = 0; VEC_iterate (tree, *vec_oprnds, i, vop) && i < vec_num; i++)
        VEC_quick_push (tree, *vec_oprnds, vop);
    }
}
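/* Note on ordering (an explanatory aside, not original text): the group is
   walked from its last scalar stmt to its first while tree_cons prepends,
   so each finished TREE_LIST ends up in source order; the finished vectors
   themselves, however, accumulate in reverse, which is why VOPRNDS is
   copied out backwards above.  */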
/* Get vectorized definitions from SLP_NODE that contains corresponding
   vectorized def-stmts.  */

static void
vect_get_slp_vect_defs (slp_tree slp_node, VEC (tree,heap) **vec_oprnds)
{
  tree vec_oprnd;
  gimple vec_def_stmt;
  unsigned int i;

  gcc_assert (SLP_TREE_VEC_STMTS (slp_node));

  for (i = 0;
       VEC_iterate (gimple, SLP_TREE_VEC_STMTS (slp_node), i, vec_def_stmt);
       i++)
    {
      gcc_assert (vec_def_stmt);
      vec_oprnd = gimple_get_lhs (vec_def_stmt);
      VEC_quick_push (tree, *vec_oprnds, vec_oprnd);
    }
}
/* Get vectorized definitions for SLP_NODE.
   If the scalar definitions are loop invariants or constants, collect them and
   call vect_get_constant_vectors() to create vector stmts.
   Otherwise, the def-stmts must be already vectorized and the vectorized stmts
   must be stored in the LEFT/RIGHT node of SLP_NODE, and we call
   vect_get_slp_vect_defs() to retrieve them.
   If VEC_OPRNDS1 is NULL, don't get vector defs for the second operand (from
   the right node).  This is used when the second operand must remain scalar.  */

static void
vect_get_slp_defs (slp_tree slp_node, VEC (tree,heap) **vec_oprnds0,
                   VEC (tree,heap) **vec_oprnds1)
{
  gimple first_stmt;
  enum tree_code code;

  /* Allocate memory for vectorized defs.  */
  *vec_oprnds0 = VEC_alloc (tree, heap,
                            SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node));

  /* SLP_NODE corresponds either to a group of stores or to a group of
     unary/binary operations.  We don't call this function for loads.  */
  if (SLP_TREE_LEFT (slp_node))
    /* The defs are already vectorized.  */
    vect_get_slp_vect_defs (SLP_TREE_LEFT (slp_node), vec_oprnds0);
  else
    /* Build vectors from scalar defs.  */
    vect_get_constant_vectors (slp_node, vec_oprnds0, 0);

  first_stmt = VEC_index (gimple, SLP_TREE_SCALAR_STMTS (slp_node), 0);
  if (STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt)))
    /* Since we don't call this function with loads, this is a group of
       stores.  */
    return;

  code = gimple_assign_rhs_code (first_stmt);
  if (get_gimple_rhs_class (code) != GIMPLE_BINARY_RHS || !vec_oprnds1)
    return;

  *vec_oprnds1 = VEC_alloc (tree, heap,
                            SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node));

  if (SLP_TREE_RIGHT (slp_node))
    /* The defs are already vectorized.  */
    vect_get_slp_vect_defs (SLP_TREE_RIGHT (slp_node), vec_oprnds1);
  else
    /* Build vectors from scalar defs.  */
    vect_get_constant_vectors (slp_node, vec_oprnds1, 1);
}
/* Function get_initial_def_for_induction

   Input:
   STMT - a stmt that performs an induction operation in the loop.
   IV_PHI - the initial value of the induction variable

   Output:
   Return a vector variable, initialized with the first VF values of
   the induction variable.  E.g., for an iv with IV_PHI='X' and
   evolution S, for a vector of 4 units, we want to return:
   [X, X + S, X + 2*S, X + 3*S].  */
static tree
get_initial_def_for_induction (gimple iv_phi)
{
  stmt_vec_info stmt_vinfo = vinfo_for_stmt (iv_phi);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  tree scalar_type = TREE_TYPE (gimple_phi_result (iv_phi));
  tree vectype;
  int nunits;
  edge pe = loop_preheader_edge (loop);
  struct loop *iv_loop;
  basic_block new_bb;
  tree vec, vec_init, vec_step, t;
  tree access_fn;
  tree new_var;
  tree new_name;
  gimple init_stmt, induction_phi, new_stmt;
  tree induc_def, vec_def, vec_dest;
  tree init_expr, step_expr;
  int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
  int i;
  bool ok;
  int ncopies;
  tree expr;
  stmt_vec_info phi_info = vinfo_for_stmt (iv_phi);
  bool nested_in_vect_loop = false;
  gimple_seq stmts = NULL;
  imm_use_iterator imm_iter;
  use_operand_p use_p;
  gimple exit_phi;
  edge latch_e;
  tree loop_arg;
  gimple_stmt_iterator si;
  basic_block bb = gimple_bb (iv_phi);

  vectype = get_vectype_for_scalar_type (scalar_type);
  gcc_assert (vectype);
  nunits = TYPE_VECTOR_SUBPARTS (vectype);
  ncopies = vf / nunits;

  gcc_assert (phi_info);
  gcc_assert (ncopies >= 1);

  /* Find the first insertion point in the BB.  */
  si = gsi_after_labels (bb);
  if (INTEGRAL_TYPE_P (scalar_type) || POINTER_TYPE_P (scalar_type))
    step_expr = build_int_cst (scalar_type, 0);
  else
    step_expr = build_real (scalar_type, dconst0);

  /* Is phi in an inner-loop, while vectorizing an enclosing outer-loop?  */
  if (nested_in_vect_loop_p (loop, iv_phi))
    {
      nested_in_vect_loop = true;
      iv_loop = loop->inner;
    }
  else
    iv_loop = loop;
  gcc_assert (iv_loop == (gimple_bb (iv_phi))->loop_father);

  latch_e = loop_latch_edge (iv_loop);
  loop_arg = PHI_ARG_DEF_FROM_EDGE (iv_phi, latch_e);

  access_fn = analyze_scalar_evolution (iv_loop, PHI_RESULT (iv_phi));
  gcc_assert (access_fn);
  ok = vect_is_simple_iv_evolution (iv_loop->num, access_fn,
                                    &init_expr, &step_expr);
  gcc_assert (ok);
  pe = loop_preheader_edge (iv_loop);
  /* Create the vector that holds the initial_value of the induction.  */
  if (nested_in_vect_loop)
    {
      /* iv_loop is nested in the loop to be vectorized.  init_expr had already
         been created during vectorization of previous stmts; We obtain it from
         the STMT_VINFO_VEC_STMT of the defining stmt.  */
      tree iv_def = PHI_ARG_DEF_FROM_EDGE (iv_phi, loop_preheader_edge (iv_loop));
      vec_init = vect_get_vec_def_for_operand (iv_def, iv_phi, NULL);
    }
  else
    {
      /* iv_loop is the loop to be vectorized.  Create:
         vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr)  */
      new_var = vect_get_new_vect_var (scalar_type, vect_scalar_var, "var_");
      add_referenced_var (new_var);

      new_name = force_gimple_operand (init_expr, &stmts, false, new_var);
      if (stmts)
        {
          new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
          gcc_assert (!new_bb);
        }

      t = NULL_TREE;
      t = tree_cons (NULL_TREE, init_expr, t);
      for (i = 1; i < nunits; i++)
        {
          /* Create: new_name_i = new_name + step_expr  */
          enum tree_code code = POINTER_TYPE_P (scalar_type)
                                ? POINTER_PLUS_EXPR : PLUS_EXPR;
          init_stmt = gimple_build_assign_with_ops (code, new_var,
                                                    new_name, step_expr);
          new_name = make_ssa_name (new_var, init_stmt);
          gimple_assign_set_lhs (init_stmt, new_name);

          new_bb = gsi_insert_on_edge_immediate (pe, init_stmt);
          gcc_assert (!new_bb);

          if (vect_print_dump_info (REPORT_DETAILS))
            {
              fprintf (vect_dump, "created new init_stmt: ");
              print_gimple_stmt (vect_dump, init_stmt, 0, TDF_SLIM);
            }
          t = tree_cons (NULL_TREE, new_name, t);
        }
      /* Create a vector from [new_name_0, new_name_1, ..., new_name_nunits-1]  */
      vec = build_constructor_from_list (vectype, nreverse (t));
      vec_init = vect_init_vector (iv_phi, vec, vectype, NULL);
    }
  /* Create the vector that holds the step of the induction.  */
  if (nested_in_vect_loop)
    /* iv_loop is nested in the loop to be vectorized.  Generate:
       vec_step = [S, S, S, S]  */
    new_name = step_expr;
  else
    {
      /* iv_loop is the loop to be vectorized.  Generate:
         vec_step = [VF*S, VF*S, VF*S, VF*S]  */
      expr = build_int_cst (scalar_type, vf);
      new_name = fold_build2 (MULT_EXPR, scalar_type, expr, step_expr);
    }

  t = NULL_TREE;
  for (i = 0; i < nunits; i++)
    t = tree_cons (NULL_TREE, unshare_expr (new_name), t);
  gcc_assert (CONSTANT_CLASS_P (new_name));
  vec = build_vector (vectype, t);
  vec_step = vect_init_vector (iv_phi, vec, vectype, NULL);
  /* Create the following def-use cycle:
     loop prolog:
         vec_init = ...
         vec_step = ...
     loop:
         vec_iv = PHI <vec_init, vec_loop>
         ...
         STMT
         ...
         vec_loop = vec_iv + vec_step;  */

  /* Create the induction-phi that defines the induction-operand.  */
  vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
  add_referenced_var (vec_dest);
  induction_phi = create_phi_node (vec_dest, iv_loop->header);
  set_vinfo_for_stmt (induction_phi,
                      new_stmt_vec_info (induction_phi, loop_vinfo));
  induc_def = PHI_RESULT (induction_phi);

  /* Create the iv update inside the loop  */
  new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, vec_dest,
                                           induc_def, vec_step);
  vec_def = make_ssa_name (vec_dest, new_stmt);
  gimple_assign_set_lhs (new_stmt, vec_def);
  gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
  set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo));

  /* Set the arguments of the phi node:  */
  add_phi_arg (induction_phi, vec_init, pe);
  add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop));
  /* In case that vectorization factor (VF) is bigger than the number
     of elements that we can fit in a vectype (nunits), we have to generate
     more than one vector stmt - i.e - we need to "unroll" the
     vector stmt by a factor VF/nunits.  For more details see documentation
     in vectorizable_operation.  */

  if (ncopies > 1)
    {
      stmt_vec_info prev_stmt_vinfo;
      /* FORNOW. This restriction should be relaxed.  */
      gcc_assert (!nested_in_vect_loop);

      /* Create the vector that holds the step of the induction.  */
      expr = build_int_cst (scalar_type, nunits);
      new_name = fold_build2 (MULT_EXPR, scalar_type, expr, step_expr);
      t = NULL_TREE;
      for (i = 0; i < nunits; i++)
        t = tree_cons (NULL_TREE, unshare_expr (new_name), t);
      gcc_assert (CONSTANT_CLASS_P (new_name));
      vec = build_vector (vectype, t);
      vec_step = vect_init_vector (iv_phi, vec, vectype, NULL);

      vec_def = induc_def;
      prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
      for (i = 1; i < ncopies; i++)
        {
          /* vec_i = vec_prev + vec_step  */
          new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, vec_dest,
                                                   vec_def, vec_step);
          vec_def = make_ssa_name (vec_dest, new_stmt);
          gimple_assign_set_lhs (new_stmt, vec_def);

          gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
          set_vinfo_for_stmt (new_stmt,
                              new_stmt_vec_info (new_stmt, loop_vinfo));
          STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
          prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
        }
    }
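  /* Worked example (illustration only): with VF=8 and nunits=4, ncopies=2.
     The first vector is [X, X+S, X+2*S, X+3*S]; the copy above adds
     [4*S, 4*S, 4*S, 4*S] to produce [X+4*S, ..., X+7*S], while the
     cross-iteration update uses vec_step = [8*S, ..., 8*S].  */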
  if (nested_in_vect_loop)
    {
      /* Find the loop-closed exit-phi of the induction, and record
         the final vector of induction results:  */
      exit_phi = NULL;
      FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
        {
          if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (USE_STMT (use_p))))
            {
              exit_phi = USE_STMT (use_p);
              break;
            }
        }
      if (exit_phi)
        {
          stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
          /* FORNOW. Currently not supporting the case that an inner-loop induction
             is not used in the outer-loop (i.e. only outside the outer-loop).  */
          gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
                      && !STMT_VINFO_LIVE_P (stmt_vinfo));

          STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
          if (vect_print_dump_info (REPORT_DETAILS))
            {
              fprintf (vect_dump, "vector of inductions after inner-loop:");
              print_gimple_stmt (vect_dump, new_stmt, 0, TDF_SLIM);
            }
        }
    }
  if (vect_print_dump_info (REPORT_DETAILS))
    {
      fprintf (vect_dump, "transform induction: created def-use cycle:");
      print_gimple_stmt (vect_dump, induction_phi, 0, TDF_SLIM);
      fprintf (vect_dump, "\n");
      print_gimple_stmt (vect_dump, SSA_NAME_DEF_STMT (vec_def), 0, TDF_SLIM);
    }

  STMT_VINFO_VEC_STMT (phi_info) = induction_phi;
  return induc_def;
}
/* Function vect_get_vec_def_for_operand.

   OP is an operand in STMT.  This function returns a (vector) def that will be
   used in the vectorized stmt for STMT.

   In the case that OP is an SSA_NAME which is defined in the loop, then
   STMT_VINFO_VEC_STMT of the defining stmt holds the relevant def.

   In case OP is an invariant or constant, a new stmt that creates a vector def
   needs to be introduced.  */
static tree
vect_get_vec_def_for_operand (tree op, gimple stmt, tree *scalar_def)
{
  tree vec_oprnd;
  gimple vec_stmt;
  gimple def_stmt;
  stmt_vec_info def_stmt_info = NULL;
  stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
  tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
  int nunits = TYPE_VECTOR_SUBPARTS (vectype);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
  tree vec_inv;
  tree vec_cst;
  tree t = NULL_TREE;
  tree def;
  int i;
  enum vect_def_type dt;
  bool is_simple_use;
  tree vector_type;

  if (vect_print_dump_info (REPORT_DETAILS))
    {
      fprintf (vect_dump, "vect_get_vec_def_for_operand: ");
      print_generic_expr (vect_dump, op, TDF_SLIM);
    }

  is_simple_use = vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt);
  gcc_assert (is_simple_use);
  if (vect_print_dump_info (REPORT_DETAILS))
    {
      if (def)
        {
          fprintf (vect_dump, "def = ");
          print_generic_expr (vect_dump, def, TDF_SLIM);
        }
      if (def_stmt)
        {
          fprintf (vect_dump, "  def_stmt = ");
          print_gimple_stmt (vect_dump, def_stmt, 0, TDF_SLIM);
        }
    }
1881 /* Case 1: operand is a constant. */
1882 case vect_constant_def:
1887 /* Create 'vect_cst_ = {cst,cst,...,cst}' */
1888 if (vect_print_dump_info (REPORT_DETAILS))
1889 fprintf (vect_dump, "Create vector_cst. nunits = %d", nunits);
1891 for (i = nunits - 1; i >= 0; --i)
1893 t = tree_cons (NULL_TREE, op, t);
1895 vector_type = get_vectype_for_scalar_type (TREE_TYPE (op));
1896 gcc_assert (vector_type);
1897 vec_cst = build_vector (vector_type, t);
1899 return vect_init_vector (stmt, vec_cst, vector_type, NULL);
1902 /* Case 2: operand is defined outside the loop - loop invariant. */
1903 case vect_invariant_def:
1908 /* Create 'vec_inv = {inv,inv,..,inv}' */
1909 if (vect_print_dump_info (REPORT_DETAILS))
1910 fprintf (vect_dump, "Create vector_inv.");
1912 for (i = nunits - 1; i >= 0; --i)
1914 t = tree_cons (NULL_TREE, def, t);
1917 /* FIXME: use build_constructor directly. */
1918 vector_type = get_vectype_for_scalar_type (TREE_TYPE (def));
1919 gcc_assert (vector_type);
1920 vec_inv = build_constructor_from_list (vector_type, t);
1921 return vect_init_vector (stmt, vec_inv, vector_type, NULL);
1924 /* Case 3: operand is defined inside the loop. */
1928 *scalar_def = NULL /* FIXME tuples: def_stmt */;
1930 /* Get the def from the vectorized stmt. */
1931 def_stmt_info = vinfo_for_stmt (def_stmt);
1932 vec_stmt = STMT_VINFO_VEC_STMT (def_stmt_info);
1933 gcc_assert (vec_stmt);
1934 if (gimple_code (vec_stmt) == GIMPLE_PHI)
1935 vec_oprnd = PHI_RESULT (vec_stmt);
1936 else if (is_gimple_call (vec_stmt))
1937 vec_oprnd = gimple_call_lhs (vec_stmt);
1939 vec_oprnd = gimple_assign_lhs (vec_stmt);
1943 /* Case 4: operand is defined by a loop header phi - reduction */
1944 case vect_reduction_def:
1948 gcc_assert (gimple_code (def_stmt) == GIMPLE_PHI);
1949 loop = (gimple_bb (def_stmt))->loop_father;
1951 /* Get the def before the loop */
1952 op = PHI_ARG_DEF_FROM_EDGE (def_stmt, loop_preheader_edge (loop));
1953 return get_initial_def_for_reduction (stmt, op, scalar_def);
1956 /* Case 5: operand is defined by loop-header phi - induction. */
1957 case vect_induction_def:
1959 gcc_assert (gimple_code (def_stmt) == GIMPLE_PHI);
1961 /* Get the def from the vectorized stmt. */
1962 def_stmt_info = vinfo_for_stmt (def_stmt);
1963 vec_stmt = STMT_VINFO_VEC_STMT (def_stmt_info);
1964 gcc_assert (vec_stmt && gimple_code (vec_stmt) == GIMPLE_PHI);
1965 vec_oprnd = PHI_RESULT (vec_stmt);
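/* Illustrative sketch (not vectorizer code): cases 1 and 2 above both
   amount to broadcasting one scalar across all lanes of a vector,
   shown here for 4 int lanes.  */

static void
splat_sketch (int scalar, int out[4])
{
  int i;
  for (i = 0; i < 4; i++)
    out[i] = scalar;		/* vect_cst_ = {cst,cst,cst,cst}  */
}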
1975 /* Function vect_get_vec_def_for_stmt_copy
1977 Return a vector-def for an operand. This function is used when the
1978 vectorized stmt to be created (by the caller to this function) is a "copy"
1979 created in case the vectorized result cannot fit in one vector, and several
1980 copies of the vector-stmt are required. In this case the vector-def is
1981 retrieved from the vector stmt recorded in the STMT_VINFO_RELATED_STMT field
1982 of the stmt that defines VEC_OPRND.
1983 DT is the type of the vector def VEC_OPRND.
1986 In case the vectorization factor (VF) is bigger than the number
1987 of elements that can fit in a vectype (nunits), we have to generate
1988 more than one vector stmt to vectorize the scalar stmt. This situation
1989 arises when there are multiple data-types operated upon in the loop; the
1990 smallest data-type determines the VF, and as a result, when vectorizing
1991 stmts operating on wider types we need to create 'VF/nunits' "copies" of the
1992 vector stmt (each computing a vector of 'nunits' results, and together
1993 computing 'VF' results in each iteration). This function is called when
1994 vectorizing such a stmt (e.g. vectorizing S2 in the illustration below, in
1995 which VF=16 and nunits=4, so the number of copies required is 4):
1997 scalar stmt: vectorized into: STMT_VINFO_RELATED_STMT
1999 S1: x = load VS1.0: vx.0 = memref0 VS1.1
2000 VS1.1: vx.1 = memref1 VS1.2
2001 VS1.2: vx.2 = memref2 VS1.3
2002 VS1.3: vx.3 = memref3
2004 S2: z = x + ... VSnew.0: vz0 = vx.0 + ... VSnew.1
2005 VSnew.1: vz1 = vx.1 + ... VSnew.2
2006 VSnew.2: vz2 = vx.2 + ... VSnew.3
2007 VSnew.3: vz3 = vx.3 + ...
2009 The vectorization of S1 is explained in vectorizable_load.
2010 The vectorization of S2:
2011 To create the first vector-stmt out of the 4 copies - VSnew.0 -
2012 the function 'vect_get_vec_def_for_operand' is called to
2013 get the relevant vector-def for each operand of S2. For operand x it
2014 returns the vector-def 'vx.0'.
2016 To create the remaining copies of the vector-stmt (VSnew.j), this
2017 function is called to get the relevant vector-def for each operand. It is
2018 obtained from the respective VS1.j stmt, which is recorded in the
2019 STMT_VINFO_RELATED_STMT field of the stmt that defines VEC_OPRND.
2021 For example, to obtain the vector-def 'vx.1' in order to create the
2022 vector stmt 'VSnew.1', this function is called with VEC_OPRND='vx.0'.
2023 Given 'vx.0' we obtain the stmt that defines it ('VS1.0'); from the
2024 STMT_VINFO_RELATED_STMT field of 'VS1.0' we obtain the next copy - 'VS1.1',
2025 and return its def ('vx.1').
2026 Overall, to create the above sequence this function will be called 3 times:
2027 vx.1 = vect_get_vec_def_for_stmt_copy (dt, vx.0);
2028 vx.2 = vect_get_vec_def_for_stmt_copy (dt, vx.1);
2029 vx.3 = vect_get_vec_def_for_stmt_copy (dt, vx.2); */
2032 vect_get_vec_def_for_stmt_copy (enum vect_def_type dt, tree vec_oprnd)
2034 gimple vec_stmt_for_operand;
2035 stmt_vec_info def_stmt_info;
2037 /* Do nothing; can reuse same def. */
2038 if (dt == vect_invariant_def || dt == vect_constant_def)
2039 return vec_oprnd;
2041 vec_stmt_for_operand = SSA_NAME_DEF_STMT (vec_oprnd);
2042 def_stmt_info = vinfo_for_stmt (vec_stmt_for_operand);
2043 gcc_assert (def_stmt_info);
2044 vec_stmt_for_operand = STMT_VINFO_RELATED_STMT (def_stmt_info);
2045 gcc_assert (vec_stmt_for_operand);
2046 vec_oprnd = gimple_get_lhs (vec_stmt_for_operand);
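/* Illustrative sketch (not vectorizer code): the RELATED_STMT chaining
   used above, modeled with a hypothetical singly linked 'struct vcopy'.
   Given the def produced by copy j, the def for copy j+1 is found on
   the next element of the chain, just as vect_get_vec_def_for_stmt_copy
   follows STMT_VINFO_RELATED_STMT.  */

struct vcopy
{
  struct vcopy *related;	/* the next copy of the vector stmt.  */
  int def;			/* the vector def this copy produces.  */
};

static int
next_copy_def_sketch (const struct vcopy *cur)
{
  /* vx.(j+1) is defined by the copy recorded after the one that
     defined vx.j.  */
  return cur->related->def;
}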
2051 /* Get vectorized definitions for the operands to create a copy of an original
2052 stmt. See vect_get_vec_def_for_stmt_copy() for details. */
2055 vect_get_vec_defs_for_stmt_copy (enum vect_def_type *dt,
2056 VEC(tree,heap) **vec_oprnds0,
2057 VEC(tree,heap) **vec_oprnds1)
2059 tree vec_oprnd = VEC_pop (tree, *vec_oprnds0);
2061 vec_oprnd = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd);
2062 VEC_quick_push (tree, *vec_oprnds0, vec_oprnd);
2064 if (vec_oprnds1 && *vec_oprnds1)
2066 vec_oprnd = VEC_pop (tree, *vec_oprnds1);
2067 vec_oprnd = vect_get_vec_def_for_stmt_copy (dt[1], vec_oprnd);
2068 VEC_quick_push (tree, *vec_oprnds1, vec_oprnd);
2073 /* Get vectorized definitions for OP0 and OP1; if SLP_NODE is not NULL, the definitions are taken from it. */
2076 vect_get_vec_defs (tree op0, tree op1, gimple stmt,
2077 VEC(tree,heap) **vec_oprnds0, VEC(tree,heap) **vec_oprnds1,
2081 vect_get_slp_defs (slp_node, vec_oprnds0, vec_oprnds1);
2086 *vec_oprnds0 = VEC_alloc (tree, heap, 1);
2087 vec_oprnd = vect_get_vec_def_for_operand (op0, stmt, NULL);
2088 VEC_quick_push (tree, *vec_oprnds0, vec_oprnd);
2092 *vec_oprnds1 = VEC_alloc (tree, heap, 1);
2093 vec_oprnd = vect_get_vec_def_for_operand (op1, stmt, NULL);
2094 VEC_quick_push (tree, *vec_oprnds1, vec_oprnd);
2100 /* Function vect_finish_stmt_generation.
2102 Insert the new stmt VEC_STMT before GSI, create a stmt_vec_info for it, and give it STMT's location. */
2105 vect_finish_stmt_generation (gimple stmt, gimple vec_stmt,
2106 gimple_stmt_iterator *gsi)
2108 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2109 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2111 gcc_assert (stmt == gsi_stmt (*gsi));
2112 gcc_assert (gimple_code (stmt) != GIMPLE_LABEL);
2114 gsi_insert_before (gsi, vec_stmt, GSI_SAME_STMT);
2116 set_vinfo_for_stmt (vec_stmt, new_stmt_vec_info (vec_stmt, loop_vinfo));
2118 if (vect_print_dump_info (REPORT_DETAILS))
2120 fprintf (vect_dump, "add new stmt: ");
2121 print_gimple_stmt (vect_dump, vec_stmt, 0, TDF_SLIM);
2124 /* Make sure gsi points to the stmt that is being vectorized. */
2125 gcc_assert (stmt == gsi_stmt (*gsi));
2127 gimple_set_location (vec_stmt, gimple_location (stmt));
2131 /* Function get_initial_def_for_reduction
2134 STMT - a stmt that performs a reduction operation in the loop.
2135 INIT_VAL - the initial value of the reduction variable
2138 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
2139 of the reduction (used for adjusting the epilog - see below).
2140 Return a vector variable, initialized according to the operation that STMT
2141 performs. This vector will be used as the initial value of the
2142 vector of partial results.
2144 Option1 (adjust in epilog): Initialize the vector as follows:
2145 add: [0,0,...,0,0]
2146 mult: [1,1,...,1,1]
2147 min/max: [init_val,init_val,..,init_val,init_val]
2148 bit and/or: [init_val,init_val,..,init_val,init_val]
2149 and when necessary (e.g. add/mult case) let the caller know
2150 that it needs to adjust the result by init_val.
2152 Option2: Initialize the vector as follows:
2153 add: [0,0,...,0,init_val]
2154 mult: [1,1,...,1,init_val]
2155 min/max: [init_val,init_val,...,init_val]
2156 bit and/or: [init_val,init_val,...,init_val]
2157 and no adjustments are needed.
2159 For example, for the following code:
2165 STMT is 's = s + a[i]', and the reduction variable is 's'.
2166 For a vector of 4 units, we want to return either [0,0,0,init_val],
2167 or [0,0,0,0] and let the caller know that it needs to adjust
2168 the result at the end by 'init_val'.
2170 FORNOW, we are using the 'adjust in epilog' scheme, because this way the
2171 initialization vector is simpler (same element in all entries).
2172 A cost model should help decide between these two schemes. */
2175 get_initial_def_for_reduction (gimple stmt, tree init_val, tree *adjustment_def)
2177 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
2178 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
2179 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2180 tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
2181 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
2182 enum tree_code code = gimple_assign_rhs_code (stmt);
2183 tree type = TREE_TYPE (init_val);
2190 bool nested_in_vect_loop = false;
2192 gcc_assert (POINTER_TYPE_P (type) || INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type));
2193 if (nested_in_vect_loop_p (loop, stmt))
2194 nested_in_vect_loop = true;
2195 else
2196 gcc_assert (loop == (gimple_bb (stmt))->loop_father);
2198 vecdef = vect_get_vec_def_for_operand (init_val, stmt, NULL);
2202 case WIDEN_SUM_EXPR:
2205 if (nested_in_vect_loop)
2206 *adjustment_def = vecdef;
2208 *adjustment_def = init_val;
2209 /* Create a vector of zeros for init_def. */
2210 if (SCALAR_FLOAT_TYPE_P (type))
2211 def_for_init = build_real (type, dconst0);
2213 def_for_init = build_int_cst (type, 0);
2214 for (i = nunits - 1; i >= 0; --i)
2215 t = tree_cons (NULL_TREE, def_for_init, t);
2216 vector_type = get_vectype_for_scalar_type (TREE_TYPE (def_for_init));
2217 gcc_assert (vector_type);
2218 init_def = build_vector (vector_type, t);
2223 *adjustment_def = NULL_TREE;
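/* Illustrative sketch (not vectorizer code): the two initialization
   schemes described above, for a 4-lane add reduction with initial
   value INIT_VAL.  Option 1 leaves the adjustment to the epilog;
   option 2 folds INIT_VAL into one lane up front.  */

static void
reduction_init_sketch (int init_val, int option1[4], int option2[4],
		       int *adjustment)
{
  int i;

  for (i = 0; i < 4; i++)
    option1[i] = 0;		/* Option 1: [0,0,0,0] ...  */
  *adjustment = init_val;	/* ... and adjust by init_val in the epilog.  */

  for (i = 0; i < 4; i++)
    option2[i] = 0;
  option2[3] = init_val;	/* Option 2: [0,0,0,init_val], no adjustment.  */
}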
2235 /* Function vect_create_epilog_for_reduction
2237 Create code at the loop-epilog to finalize the result of a reduction
2240 VECT_DEF is a vector of partial results.
2241 REDUC_CODE is the tree-code for the epilog reduction.
2242 STMT is the scalar reduction stmt that is being vectorized.
2243 REDUCTION_PHI is the phi-node that carries the reduction computation.
2246 1. Creates the reduction def-use cycle: sets the arguments for
2247 REDUCTION_PHI:
2248 The loop-entry argument is the vectorized initial-value of the reduction.
2249 The loop-latch argument is VECT_DEF - the vector of partial sums.
2250 2. "Reduces" the vector of partial results VECT_DEF into a single result,
2251 by applying the operation specified by REDUC_CODE if available, or by
2252 other means (whole-vector shifts or a scalar loop).
2253 The function also creates a new phi node at the loop exit to preserve
2254 loop-closed form, as illustrated below.
2256 The flow at the entry to this function:
2259 vec_def = phi <null, null> # REDUCTION_PHI
2260 VECT_DEF = vector_stmt # vectorized form of STMT
2261 s_loop = scalar_stmt # (scalar) STMT
2263 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
2267 The above is transformed by this function into:
2270 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
2271 VECT_DEF = vector_stmt # vectorized form of STMT
2272 s_loop = scalar_stmt # (scalar) STMT
2274 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
2275 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
2276 v_out2 = reduce <v_out1>
2277 s_out3 = extract_field <v_out2, 0>
2278 s_out4 = adjust_result <s_out3>
2284 vect_create_epilog_for_reduction (tree vect_def, gimple stmt,
2285 enum tree_code reduc_code,
2286 gimple reduction_phi)
2288 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2290 enum machine_mode mode;
2291 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2292 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2293 basic_block exit_bb;
2297 gimple_stmt_iterator exit_gsi;
2299 tree new_temp = NULL_TREE;
2301 gimple epilog_stmt = NULL;
2302 tree new_scalar_dest, new_dest;
2304 tree bitsize, bitpos, bytesize;
2305 enum tree_code code = gimple_assign_rhs_code (stmt);
2306 tree adjustment_def;
2307 tree vec_initial_def;
2309 imm_use_iterator imm_iter;
2310 use_operand_p use_p;
2311 bool extract_scalar_result = false;
2312 tree reduction_op, expr;
2315 bool nested_in_vect_loop = false;
2316 VEC(gimple,heap) *phis = NULL;
2319 if (nested_in_vect_loop_p (loop, stmt))
2322 nested_in_vect_loop = true;
2325 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
2327 case GIMPLE_SINGLE_RHS:
2328 gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt)) == ternary_op);
2329 reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), 2);
2331 case GIMPLE_UNARY_RHS:
2332 reduction_op = gimple_assign_rhs1 (stmt);
2334 case GIMPLE_BINARY_RHS:
2335 reduction_op = gimple_assign_rhs2 (stmt);
2341 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
2342 gcc_assert (vectype);
2343 mode = TYPE_MODE (vectype);
2345 /*** 1. Create the reduction def-use cycle ***/
2347 /* 1.1 set the loop-entry arg of the reduction-phi: */
2348 /* For the case of reduction, vect_get_vec_def_for_operand returns
2349 the scalar def before the loop, that defines the initial value
2350 of the reduction variable. */
2351 vec_initial_def = vect_get_vec_def_for_operand (reduction_op, stmt,
2352 &adjustment_def);
2353 add_phi_arg (reduction_phi, vec_initial_def, loop_preheader_edge (loop));
2355 /* 1.2 set the loop-latch arg for the reduction-phi: */
2356 add_phi_arg (reduction_phi, vect_def, loop_latch_edge (loop));
2358 if (vect_print_dump_info (REPORT_DETAILS))
2360 fprintf (vect_dump, "transform reduction: created def-use cycle:");
2361 print_gimple_stmt (vect_dump, reduction_phi, 0, TDF_SLIM);
2362 fprintf (vect_dump, "\n");
2363 print_gimple_stmt (vect_dump, SSA_NAME_DEF_STMT (vect_def), 0, TDF_SLIM);
2367 /*** 2. Create epilog code
2368 The reduction epilog code operates across the elements of the vector
2369 of partial results computed by the vectorized loop.
2370 The reduction epilog code consists of:
2371 step 1: compute the scalar result in a vector (v_out2)
2372 step 2: extract the scalar result (s_out3) from the vector (v_out2)
2373 step 3: adjust the scalar result (s_out3) if needed.
2375 Step 1 can be accomplished using one of the following three schemes:
2376 (scheme 1) using reduc_code, if available.
2377 (scheme 2) using whole-vector shifts, if available.
2378 (scheme 3) using a scalar loop. In this case steps 1+2 above are
2379 combined.
2381 The overall epilog code looks like this:
2383 s_out0 = phi <s_loop> # original EXIT_PHI
2384 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
2385 v_out2 = reduce <v_out1> # step 1
2386 s_out3 = extract_field <v_out2, 0> # step 2
2387 s_out4 = adjust_result <s_out3> # step 3
2389 (step 3 is optional, and steps 1 and 2 may be combined).
2390 Lastly, the uses of s_out0 are replaced by s_out4. */
2394 /* 2.1 Create new loop-exit-phi to preserve loop-closed form:
2395 v_out1 = phi <v_loop> */
2397 exit_bb = single_exit (loop)->dest;
2398 new_phi = create_phi_node (SSA_NAME_VAR (vect_def), exit_bb);
2399 SET_PHI_ARG_DEF (new_phi, single_exit (loop)->dest_idx, vect_def);
2400 exit_gsi = gsi_after_labels (exit_bb);
2402 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
2403 (i.e. when reduc_code is not available) and in the final adjustment
2404 code (if needed). Also get the original scalar reduction variable as
2405 defined in the loop. In case STMT is a "pattern-stmt" (i.e. it
2406 represents a reduction pattern), the tree-code and scalar-def are
2407 taken from the original stmt that the pattern-stmt (STMT) replaces.
2408 Otherwise (it is a regular reduction) - the tree-code and scalar-def
2409 are taken from STMT. */
2411 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
2414 /* Regular reduction */
2419 /* Reduction pattern */
2420 stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
2421 gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
2422 gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
2424 code = gimple_assign_rhs_code (orig_stmt);
2425 scalar_dest = gimple_assign_lhs (orig_stmt);
2426 scalar_type = TREE_TYPE (scalar_dest);
2427 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
2428 bitsize = TYPE_SIZE (scalar_type);
2429 bytesize = TYPE_SIZE_UNIT (scalar_type);
2432 /* In case this is a reduction in an inner-loop while vectorizing an outer
2433 loop - we don't need to extract a single scalar result at the end of the
2434 inner-loop. The final vector of partial results will be used in the
2435 vectorized outer-loop, or reduced to a scalar result at the end of the
2437 if (nested_in_vect_loop)
2438 goto vect_finalize_reduction;
2440 /* 2.3 Create the reduction code, using one of the three schemes described
2443 if (reduc_code < NUM_TREE_CODES)
2447 /*** Case 1: Create:
2448 v_out2 = reduc_expr <v_out1> */
2450 if (vect_print_dump_info (REPORT_DETAILS))
2451 fprintf (vect_dump, "Reduce using direct vector reduction.");
2453 vec_dest = vect_create_destination_var (scalar_dest, vectype);
2454 tmp = build1 (reduc_code, vectype, PHI_RESULT (new_phi));
2455 epilog_stmt = gimple_build_assign (vec_dest, tmp);
2456 new_temp = make_ssa_name (vec_dest, epilog_stmt);
2457 gimple_assign_set_lhs (epilog_stmt, new_temp);
2458 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
2460 extract_scalar_result = true;
2464 enum tree_code shift_code = ERROR_MARK;
2465 bool have_whole_vector_shift = true;
2467 int element_bitsize = tree_low_cst (bitsize, 1);
2468 int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
2471 if (optab_handler (vec_shr_optab, mode)->insn_code != CODE_FOR_nothing)
2472 shift_code = VEC_RSHIFT_EXPR;
2474 have_whole_vector_shift = false;
2476 /* Regardless of whether we have a whole vector shift, if we're
2477 emulating the operation via tree-vect-generic, we don't want
2478 to use it. Only the first round of the reduction is likely
2479 to still be profitable via emulation. */
2480 /* ??? It might be better to emit a reduction tree code here, so that
2481 tree-vect-generic can expand the first round via bit tricks. */
2482 if (!VECTOR_MODE_P (mode))
2483 have_whole_vector_shift = false;
2486 optab optab = optab_for_tree_code (code, vectype, optab_default);
2487 if (optab_handler (optab, mode)->insn_code == CODE_FOR_nothing)
2488 have_whole_vector_shift = false;
2491 if (have_whole_vector_shift)
2493 /*** Case 2: Create:
2494 for (offset = VS/2; offset >= element_size; offset/=2)
2496 Create: va' = vec_shift <va, offset>
2497 Create: va = vop <va, va'>
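Concretely, for a 4-element vector and vop = plus, each iteration
folds the upper half of the still-live lanes onto the lower half,
halving the number of live lanes; after log2(4) = 2 steps a single
lane holds the complete reduction, and it is extracted below.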
2500 if (vect_print_dump_info (REPORT_DETAILS))
2501 fprintf (vect_dump, "Reduce using vector shifts");
2503 vec_dest = vect_create_destination_var (scalar_dest, vectype);
2504 new_temp = PHI_RESULT (new_phi);
2506 for (bit_offset = vec_size_in_bits/2;
2507 bit_offset >= element_bitsize;
2508 bit_offset /= 2)
2510 tree bitpos = size_int (bit_offset);
2511 epilog_stmt = gimple_build_assign_with_ops (shift_code, vec_dest,
2512 new_temp, bitpos);
2513 new_name = make_ssa_name (vec_dest, epilog_stmt);
2514 gimple_assign_set_lhs (epilog_stmt, new_name);
2515 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
2517 epilog_stmt = gimple_build_assign_with_ops (code, vec_dest,
2518 new_name, new_temp);
2519 new_temp = make_ssa_name (vec_dest, epilog_stmt);
2520 gimple_assign_set_lhs (epilog_stmt, new_temp);
2521 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
2524 extract_scalar_result = true;
2530 /*** Case 3: Create:
2531 s = extract_field <v_out2, 0>
2532 for (offset = element_size;
2533 offset < vector_size;
2534 offset += element_size;)
2536 Create: s' = extract_field <v_out2, offset>
2537 Create: s = op <s, s'>
2540 if (vect_print_dump_info (REPORT_DETAILS))
2541 fprintf (vect_dump, "Reduce using scalar code. ");
2543 vec_temp = PHI_RESULT (new_phi);
2544 vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
2545 rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
2546 bitsize_zero_node);
2547 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
2548 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
2549 gimple_assign_set_lhs (epilog_stmt, new_temp);
2550 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
2552 for (bit_offset = element_bitsize;
2553 bit_offset < vec_size_in_bits;
2554 bit_offset += element_bitsize)
2556 tree bitpos = bitsize_int (bit_offset);
2557 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
2558 bitpos);
2560 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
2561 new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
2562 gimple_assign_set_lhs (epilog_stmt, new_name);
2563 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
2565 epilog_stmt = gimple_build_assign_with_ops (code,
2566 new_scalar_dest,
2567 new_name, new_temp);
2568 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
2569 gimple_assign_set_lhs (epilog_stmt, new_temp);
2570 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
2573 extract_scalar_result = false;
2577 /* 2.4 Extract the final scalar result. Create:
2578 s_out3 = extract_field <v_out2, bitpos> */
2580 if (extract_scalar_result)
2584 gcc_assert (!nested_in_vect_loop);
2585 if (vect_print_dump_info (REPORT_DETAILS))
2586 fprintf (vect_dump, "extract scalar result");
2588 if (BYTES_BIG_ENDIAN)
2589 bitpos = size_binop (MULT_EXPR,
2590 bitsize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1),
2591 TYPE_SIZE (scalar_type));
2592 else
2593 bitpos = bitsize_zero_node;
2595 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp, bitsize, bitpos);
2596 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
2597 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
2598 gimple_assign_set_lhs (epilog_stmt, new_temp);
2599 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
2602 vect_finalize_reduction:
2604 /* 2.5 Adjust the final result by the initial value of the reduction
2605 variable. (When such adjustment is not needed, then
2606 'adjustment_def' is zero). For example, if code is PLUS we create:
2607 new_temp = loop_exit_def + adjustment_def */
2611 if (nested_in_vect_loop)
2613 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
2614 expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
2615 new_dest = vect_create_destination_var (scalar_dest, vectype);
else
2619 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
2620 expr = build2 (code, scalar_type, new_temp, adjustment_def);
2621 new_dest = vect_create_destination_var (scalar_dest, scalar_type);
2623 epilog_stmt = gimple_build_assign (new_dest, expr);
2624 new_temp = make_ssa_name (new_dest, epilog_stmt);
2625 gimple_assign_set_lhs (epilog_stmt, new_temp);
2626 SSA_NAME_DEF_STMT (new_temp) = epilog_stmt;
2627 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
2631 /* 2.6 Handle the loop-exit phi */
2633 /* Replace uses of s_out0 with uses of s_out3:
2634 Find the loop-closed-use at the loop exit of the original scalar result.
2635 (The reduction result is expected to have two immediate uses - one at the
2636 latch block, and one at the loop exit). */
2637 phis = VEC_alloc (gimple, heap, 10);
2638 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
2640 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
2642 exit_phi = USE_STMT (use_p);
2643 VEC_quick_push (gimple, phis, exit_phi);
2646 /* We expect to have found an exit_phi because of loop-closed-ssa form. */
2647 gcc_assert (!VEC_empty (gimple, phis));
2649 for (i = 0; VEC_iterate (gimple, phis, i, exit_phi); i++)
2651 if (nested_in_vect_loop)
2653 stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
2655 /* FORNOW. We do not yet support the case in which an inner-loop reduction
2656 is used only outside the outer-loop (i.e. not in the outer-loop itself). */
2657 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
2658 && !STMT_VINFO_LIVE_P (stmt_vinfo));
2660 epilog_stmt = adjustment_def ? epilog_stmt : new_phi;
2661 STMT_VINFO_VEC_STMT (stmt_vinfo) = epilog_stmt;
2662 set_vinfo_for_stmt (epilog_stmt,
2663 new_stmt_vec_info (epilog_stmt, loop_vinfo));
2667 /* Replace the uses: */
2668 orig_name = PHI_RESULT (exit_phi);
2669 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
2670 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
2671 SET_USE (use_p, new_temp);
2673 VEC_free (gimple, heap, phis);
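/* Illustrative sketch (not vectorizer code): the whole reduction
   epilog for a 4-lane add reduction in plain C -- reduce the vector
   of partial sums to a scalar (steps 1+2) and adjust by the initial
   value (step 3), using the s_out3/s_out4 names from the comments
   above.  */

static int
reduction_epilog_sketch (const int v_out1[4], int adjustment_def)
{
  int s_out3 = v_out1[0] + v_out1[1] + v_out1[2] + v_out1[3];
  int s_out4 = s_out3 + adjustment_def;	/* s_out4 = adjust_result <s_out3>  */
  return s_out4;
}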
2677 /* Function vectorizable_reduction.
2679 Check if STMT performs a reduction operation that can be vectorized.
2680 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
2681 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
2682 Return FALSE if not a vectorizable STMT, TRUE otherwise.
2684 This function also handles reduction idioms (patterns) that have been
2685 recognized in advance during vect_pattern_recog. In this case, STMT may be
2687 X = pattern_expr (arg0, arg1, ..., X)
2688 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
2689 sequence that had been detected and replaced by the pattern-stmt (STMT).
2691 In some cases of reduction patterns, the type of the reduction variable X is
2692 different than the type of the other arguments of STMT.
2693 In such cases, the vectype that is used when transforming STMT into a vector
2694 stmt is different than the vectype that is used to determine the
2695 vectorization factor, because it consists of a different number of elements
2696 than the actual number of elements that are being operated upon in parallel.
2698 For example, consider an accumulation of shorts into an int accumulator.
2699 On some targets it's possible to vectorize this pattern operating on 8
2700 shorts at a time (hence, the vectype for purposes of determining the
2701 vectorization factor should be V8HI); on the other hand, the vectype that
2702 is used to create the vector form is actually V4SI (the type of the result).
2704 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
2705 indicates what is the actual level of parallelism (V8HI in the example), so
2706 that the right vectorization factor would be derived. This vectype
2707 corresponds to the type of arguments to the reduction stmt, and should *NOT*
2708 be used to create the vectorized stmt. The right vectype for the vectorized
2709 stmt is obtained from the type of the result X:
2710 get_vectype_for_scalar_type (TREE_TYPE (X))
2712 This means that, contrary to "regular" reductions (or "regular" stmts in
2713 general), the following equation:
2714 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
2715 does *NOT* necessarily hold for reduction patterns. */
2718 vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
2723 tree loop_vec_def0 = NULL_TREE, loop_vec_def1 = NULL_TREE;
2724 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2725 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2726 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2727 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2728 enum tree_code code, orig_code, epilog_reduc_code = 0;
2729 enum machine_mode vec_mode;
2731 optab optab, reduc_optab;
2732 tree new_temp = NULL_TREE;
2735 enum vect_def_type dt;
2740 stmt_vec_info orig_stmt_info;
2741 tree expr = NULL_TREE;
2743 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
2744 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
2745 stmt_vec_info prev_stmt_info;
2747 gimple new_stmt = NULL;
2751 if (nested_in_vect_loop_p (loop, stmt))
2754 /* FORNOW. This restriction should be relaxed. */
2757 if (vect_print_dump_info (REPORT_DETAILS))
2758 fprintf (vect_dump, "multiple types in nested loop.");
2763 gcc_assert (ncopies >= 1);
2765 /* FORNOW: SLP not supported. */
2766 if (STMT_SLP_TYPE (stmt_info))
2769 /* 1. Is vectorizable reduction? */
2771 /* Not supportable if the reduction variable is used in the loop. */
2772 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer)
2775 /* Reductions that are not used even in an enclosing outer-loop
2776 are expected to be "live" (used out of the loop). */
2777 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_loop
2778 && !STMT_VINFO_LIVE_P (stmt_info))
2781 /* Make sure it was already recognized as a reduction computation. */
2782 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def)
2785 /* 2. Has this been recognized as a reduction pattern?
2787 Check if STMT represents a pattern that has been recognized
2788 in earlier analysis stages. For stmts that represent a pattern,
2789 the STMT_VINFO_RELATED_STMT field records the last stmt in
2790 the original sequence that constitutes the pattern. */
2792 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
2795 orig_stmt_info = vinfo_for_stmt (orig_stmt);
2796 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt);
2797 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
2798 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
2801 /* 3. Check the operands of the operation. The first operands are defined
2802 inside the loop body. The last operand is the reduction variable,
2803 which is defined by the loop-header-phi. */
2805 gcc_assert (is_gimple_assign (stmt));
2808 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
2810 case GIMPLE_SINGLE_RHS:
2811 op_type = TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt));
2812 if (op_type == ternary_op)
2814 tree rhs = gimple_assign_rhs1 (stmt);
2815 ops[0] = TREE_OPERAND (rhs, 0);
2816 ops[1] = TREE_OPERAND (rhs, 1);
2817 ops[2] = TREE_OPERAND (rhs, 2);
2818 code = TREE_CODE (rhs);
2824 case GIMPLE_BINARY_RHS:
2825 code = gimple_assign_rhs_code (stmt);
2826 op_type = TREE_CODE_LENGTH (code);
2827 gcc_assert (op_type == binary_op);
2828 ops[0] = gimple_assign_rhs1 (stmt);
2829 ops[1] = gimple_assign_rhs2 (stmt);
2832 case GIMPLE_UNARY_RHS:
2839 scalar_dest = gimple_assign_lhs (stmt);
2840 scalar_type = TREE_TYPE (scalar_dest);
2841 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
2842 && !SCALAR_FLOAT_TYPE_P (scalar_type))
2845 /* All operands but the last are expected to be defined inside the loop.
2846 The last operand is the reduction variable. */
2847 for (i = 0; i < op_type-1; i++)
2849 is_simple_use = vect_is_simple_use (ops[i], loop_vinfo, &def_stmt,
2851 gcc_assert (is_simple_use);
2852 if (dt != vect_loop_def
2853 && dt != vect_invariant_def
2854 && dt != vect_constant_def
2855 && dt != vect_induction_def)
2859 is_simple_use = vect_is_simple_use (ops[i], loop_vinfo, &def_stmt, &def, &dt);
2860 gcc_assert (is_simple_use);
2861 gcc_assert (dt == vect_reduction_def);
2862 gcc_assert (gimple_code (def_stmt) == GIMPLE_PHI);
2863 if (orig_stmt)
2864 gcc_assert (orig_stmt == vect_is_simple_reduction (loop_vinfo, def_stmt));
2865 else
2866 gcc_assert (stmt == vect_is_simple_reduction (loop_vinfo, def_stmt));
2868 if (STMT_VINFO_LIVE_P (vinfo_for_stmt (def_stmt)))
2871 /* 4. Supportable by target? */
2873 /* 4.1. check support for the operation in the loop */
2874 optab = optab_for_tree_code (code, vectype, optab_default);
2877 if (vect_print_dump_info (REPORT_DETAILS))
2878 fprintf (vect_dump, "no optab.");
2881 vec_mode = TYPE_MODE (vectype);
2882 if (optab_handler (optab, vec_mode)->insn_code == CODE_FOR_nothing)
2884 if (vect_print_dump_info (REPORT_DETAILS))
2885 fprintf (vect_dump, "op not supported by target.");
2886 if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
2887 || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
2888 < vect_min_worthwhile_factor (code))
2890 if (vect_print_dump_info (REPORT_DETAILS))
2891 fprintf (vect_dump, "proceeding using word mode.");
2894 /* Worthwhile without SIMD support? */
2895 if (!VECTOR_MODE_P (TYPE_MODE (vectype))
2896 && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
2897 < vect_min_worthwhile_factor (code))
2899 if (vect_print_dump_info (REPORT_DETAILS))
2900 fprintf (vect_dump, "not worthwhile without SIMD support.");
2904 /* 4.2. Check support for the epilog operation.
2906 If STMT represents a reduction pattern, then the type of the
2907 reduction variable may be different than the type of the rest
2908 of the arguments. For example, consider the case of accumulation
2909 of shorts into an int accumulator. The original code:
2910 S1: int_a = (int) short_a;
2911 orig_stmt-> S2: int_acc = plus <int_a, int_acc>;
2913 was replaced with:
2914 STMT: int_acc = widen_sum <short_a, int_acc>
2917 1. The tree-code that is used to create the vector operation in the
2918 epilog code (that reduces the partial results) is not the
2919 tree-code of STMT, but is rather the tree-code of the original
2920 stmt from the pattern that STMT is replacing. I.e, in the example
2921 above we want to use 'widen_sum' in the loop, but 'plus' in the
2922 epilog.
2923 2. The type (mode) we use to check available target support
2924 for the vector operation to be created in the *epilog*, is
2925 determined by the type of the reduction variable (in the example
2926 above we'd check this: plus_optab[vect_int_mode]).
2927 However the type (mode) we use to check available target support
2928 for the vector operation to be created *inside the loop*, is
2929 determined by the type of the other arguments to STMT (in the
2930 example we'd check this: widen_sum_optab[vect_short_mode]).
2932 This is contrary to "regular" reductions, in which the types of all
2933 the arguments are the same as the type of the reduction variable.
2934 For "regular" reductions we can therefore use the same vector type
2935 (and also the same tree-code) when generating the epilog code and
2936 when generating the code inside the loop. */
2940 /* This is a reduction pattern: get the vectype from the type of the
2941 reduction variable, and get the tree-code from orig_stmt. */
2942 orig_code = gimple_assign_rhs_code (orig_stmt);
2943 vectype = get_vectype_for_scalar_type (TREE_TYPE (def));
2946 if (vect_print_dump_info (REPORT_DETAILS))
2948 fprintf (vect_dump, "unsupported data-type ");
2949 print_generic_expr (vect_dump, TREE_TYPE (def), TDF_SLIM);
2954 vec_mode = TYPE_MODE (vectype);
2958 /* Regular reduction: the same vectype and tree-code that are used for
2959 the vector code inside the loop can also be used for the epilog code. */
2963 if (!reduction_code_for_scalar_code (orig_code, &epilog_reduc_code))
2964 return false;
2965 reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype, optab_default);
2968 if (vect_print_dump_info (REPORT_DETAILS))
2969 fprintf (vect_dump, "no optab for reduction.");
2970 epilog_reduc_code = NUM_TREE_CODES;
2972 if (optab_handler (reduc_optab, vec_mode)->insn_code == CODE_FOR_nothing)
2974 if (vect_print_dump_info (REPORT_DETAILS))
2975 fprintf (vect_dump, "reduc op not supported by target.");
2976 epilog_reduc_code = NUM_TREE_CODES;
2979 if (!vec_stmt) /* transformation not required. */
2981 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
2982 if (!vect_model_reduction_cost (stmt_info, epilog_reduc_code, ncopies))
2989 if (vect_print_dump_info (REPORT_DETAILS))
2990 fprintf (vect_dump, "transform reduction.");
2992 /* Create the destination vector */
2993 vec_dest = vect_create_destination_var (scalar_dest, vectype);
2995 /* Create the reduction-phi that defines the reduction-operand. */
2996 new_phi = create_phi_node (vec_dest, loop->header);
2998 /* In case the vectorization factor (VF) is bigger than the number
2999 of elements that we can fit in a vectype (nunits), we have to generate
3000 more than one vector stmt - i.e - we need to "unroll" the
3001 vector stmt by a factor VF/nunits. For more details see documentation
3002 in vectorizable_operation. */
3004 prev_stmt_info = NULL;
3005 for (j = 0; j < ncopies; j++)
3010 loop_vec_def0 = vect_get_vec_def_for_operand (ops[0], stmt, NULL);
3011 if (op_type == ternary_op)
3013 loop_vec_def1 = vect_get_vec_def_for_operand (ops[1], stmt, NULL);
3016 /* Get the vector def for the reduction variable from the phi node */
3017 reduc_def = PHI_RESULT (new_phi);
3021 enum vect_def_type dt = vect_unknown_def_type; /* Dummy */
3022 loop_vec_def0 = vect_get_vec_def_for_stmt_copy (dt, loop_vec_def0);
3023 if (op_type == ternary_op)
3024 loop_vec_def1 = vect_get_vec_def_for_stmt_copy (dt, loop_vec_def1);
3026 /* Get the vector def for the reduction variable from the vectorized
3027 reduction operation generated in the previous iteration (j-1) */
3028 reduc_def = gimple_assign_lhs (new_stmt);
3031 /* Arguments are ready. Create the new vector stmt. */
3032 if (op_type == binary_op)
3033 expr = build2 (code, vectype, loop_vec_def0, reduc_def);
3035 expr = build3 (code, vectype, loop_vec_def0, loop_vec_def1,
3037 new_stmt = gimple_build_assign (vec_dest, expr);
3038 new_temp = make_ssa_name (vec_dest, new_stmt);
3039 gimple_assign_set_lhs (new_stmt, new_temp);
3040 vect_finish_stmt_generation (stmt, new_stmt, gsi);
3043 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
3045 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3046 prev_stmt_info = vinfo_for_stmt (new_stmt);
3049 /* Finalize the reduction-phi (set its arguments) and create the
3050 epilog reduction code. */
3051 vect_create_epilog_for_reduction (new_temp, stmt, epilog_reduc_code, new_phi);
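/* Illustrative sketch (not vectorizer code): the widen-sum pattern
   discussed above, in scalar form.  Eight shorts accumulate into one
   int, so the vectorization factor comes from the short side (8
   lanes) while the reduction stmt itself produces int lanes.  */

static int
widen_sum_sketch (const short short_a[8], int int_acc)
{
  int i;
  for (i = 0; i < 8; i++)
    int_acc += short_a[i];	/* int_acc = widen_sum <short_a, int_acc>  */
  return int_acc;
}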
3055 /* Checks if CALL can be vectorized in type VECTYPE. Returns
3056 a function declaration if the target has a vectorized version
3057 of the function, or NULL_TREE if the function cannot be vectorized. */
3060 vectorizable_function (gimple call, tree vectype_out, tree vectype_in)
3062 tree fndecl = gimple_call_fndecl (call);
3063 enum built_in_function code;
3065 /* We only handle functions that do not read or clobber memory -- i.e.
3066 const or novops ones. */
3067 if (!(gimple_call_flags (call) & (ECF_CONST | ECF_NOVOPS)))
3071 || TREE_CODE (fndecl) != FUNCTION_DECL
3072 || !DECL_BUILT_IN (fndecl))
3075 code = DECL_FUNCTION_CODE (fndecl);
3076 return targetm.vectorize.builtin_vectorized_function (code, vectype_out,
3080 /* Function vectorizable_call.
3082 Check if STMT performs a function call that can be vectorized.
3083 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
3084 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
3085 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
3088 vectorizable_call (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt)
3093 tree vec_oprnd0 = NULL_TREE, vec_oprnd1 = NULL_TREE;
3094 stmt_vec_info stmt_info = vinfo_for_stmt (stmt), prev_stmt_info;
3095 tree vectype_out, vectype_in;
3098 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3099 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3100 tree fndecl, new_temp, def, rhs_type, lhs_type;
3102 enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
3105 VEC(tree, heap) *vargs = NULL;
3106 enum { NARROW, NONE, WIDEN } modifier;
3109 if (!STMT_VINFO_RELEVANT_P (stmt_info))
3112 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
3115 /* FORNOW: SLP not supported. */
3116 if (STMT_SLP_TYPE (stmt_info))
3119 /* Is STMT a vectorizable call? */
3120 if (!is_gimple_call (stmt))
3123 if (TREE_CODE (gimple_call_lhs (stmt)) != SSA_NAME)
3126 /* Process function arguments. */
3127 rhs_type = NULL_TREE;
3128 nargs = gimple_call_num_args (stmt);
3130 /* Bail out if the function has more than two arguments; we do not
3131 have interesting builtin functions to vectorize with more than
3132 two arguments. Having no arguments is also not good. */
3133 if (nargs == 0 || nargs > 2)
3136 for (i = 0; i < nargs; i++)
3138 op = gimple_call_arg (stmt, i);
3140 /* We can only handle calls with arguments of the same type. */
3142 && rhs_type != TREE_TYPE (op))
3144 if (vect_print_dump_info (REPORT_DETAILS))
3145 fprintf (vect_dump, "argument types differ.");
3148 rhs_type = TREE_TYPE (op);
3150 if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt[i]))
3152 if (vect_print_dump_info (REPORT_DETAILS))
3153 fprintf (vect_dump, "use not simple.");
3158 vectype_in = get_vectype_for_scalar_type (rhs_type);
3161 nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
3163 lhs_type = TREE_TYPE (gimple_call_lhs (stmt));
3164 vectype_out = get_vectype_for_scalar_type (lhs_type);
3167 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
3170 if (nunits_in == nunits_out / 2)
3171 modifier = NARROW;
3172 else if (nunits_out == nunits_in)
3173 modifier = NONE;
3174 else if (nunits_out == nunits_in / 2)
3175 modifier = WIDEN;
3179 /* For now, we only vectorize functions if a target specific builtin
3180 is available. TODO -- in some cases, it might be profitable to
3181 insert the calls for pieces of the vector, in order to be able
3182 to vectorize other operations in the loop. */
3183 fndecl = vectorizable_function (stmt, vectype_out, vectype_in);
3184 if (fndecl == NULL_TREE)
3186 if (vect_print_dump_info (REPORT_DETAILS))
3187 fprintf (vect_dump, "function is not vectorizable.");
3192 gcc_assert (ZERO_SSA_OPERANDS (stmt, SSA_OP_ALL_VIRTUALS));
3194 if (modifier == NARROW)
3195 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_out;
3197 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in;
3199 /* Sanity check: make sure that at least one copy of the vectorized stmt
3200 needs to be generated. */
3201 gcc_assert (ncopies >= 1);
3203 /* FORNOW. This restriction should be relaxed. */
3204 if (nested_in_vect_loop_p (loop, stmt) && ncopies > 1)
3206 if (vect_print_dump_info (REPORT_DETAILS))
3207 fprintf (vect_dump, "multiple types in nested loop.");
3211 if (!vec_stmt) /* transformation not required. */
3213 STMT_VINFO_TYPE (stmt_info) = call_vec_info_type;
3214 if (vect_print_dump_info (REPORT_DETAILS))
3215 fprintf (vect_dump, "=== vectorizable_call ===");
3216 vect_model_simple_cost (stmt_info, ncopies, dt, NULL);
3222 if (vect_print_dump_info (REPORT_DETAILS))
3223 fprintf (vect_dump, "transform operation.");
3225 /* FORNOW. This restriction should be relaxed. */
3226 if (nested_in_vect_loop_p (loop, stmt) && ncopies > 1)
3228 if (vect_print_dump_info (REPORT_DETAILS))
3229 fprintf (vect_dump, "multiple types in nested loop.");
3234 scalar_dest = gimple_call_lhs (stmt);
3235 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
3237 prev_stmt_info = NULL;
3241 for (j = 0; j < ncopies; ++j)
3243 /* Build argument list for the vectorized call. */
3245 vargs = VEC_alloc (tree, heap, nargs);
3247 VEC_truncate (tree, vargs, 0);
3249 for (i = 0; i < nargs; i++)
3251 op = gimple_call_arg (stmt, i);
3254 = vect_get_vec_def_for_operand (op, stmt, NULL);
3257 = vect_get_vec_def_for_stmt_copy (dt[i], vec_oprnd0);
3259 VEC_quick_push (tree, vargs, vec_oprnd0);
3262 new_stmt = gimple_build_call_vec (fndecl, vargs);
3263 new_temp = make_ssa_name (vec_dest, new_stmt);
3264 gimple_call_set_lhs (new_stmt, new_temp);
3266 vect_finish_stmt_generation (stmt, new_stmt, gsi);
3269 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
3271 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3273 prev_stmt_info = vinfo_for_stmt (new_stmt);
3279 for (j = 0; j < ncopies; ++j)
3281 /* Build argument list for the vectorized call. */
3283 vargs = VEC_alloc (tree, heap, nargs * 2);
3285 VEC_truncate (tree, vargs, 0);
3287 for (i = 0; i < nargs; i++)
3289 op = gimple_call_arg (stmt, i);
3293 = vect_get_vec_def_for_operand (op, stmt, NULL);
3295 = vect_get_vec_def_for_stmt_copy (dt[i], vec_oprnd0);
3300 = vect_get_vec_def_for_stmt_copy (dt[i], vec_oprnd1);
3302 = vect_get_vec_def_for_stmt_copy (dt[i], vec_oprnd0);
3305 VEC_quick_push (tree, vargs, vec_oprnd0);
3306 VEC_quick_push (tree, vargs, vec_oprnd1);
3309 new_stmt = gimple_build_call_vec (fndecl, vargs);
3310 new_temp = make_ssa_name (vec_dest, new_stmt);
3311 gimple_call_set_lhs (new_stmt, new_temp);
3313 vect_finish_stmt_generation (stmt, new_stmt, gsi);
3316 STMT_VINFO_VEC_STMT (stmt_info) = new_stmt;
3318 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3320 prev_stmt_info = vinfo_for_stmt (new_stmt);
3323 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
3328 /* No current target implements this case. */
3332 VEC_free (tree, heap, vargs);
3334 /* The call in STMT might prevent it from being removed in dce.
3335 We however cannot remove it here, due to the way the ssa name
3336 it defines is mapped to the new definition. So just replace the
3337 rhs of the statement with something harmless. */
3339 type = TREE_TYPE (scalar_dest);
3340 new_stmt = gimple_build_assign (gimple_call_lhs (stmt),
3341 fold_convert (type, integer_zero_node));
3342 set_vinfo_for_stmt (new_stmt, stmt_info);
3343 set_vinfo_for_stmt (stmt, NULL);
3344 STMT_VINFO_STMT (stmt_info) = new_stmt;
3345 gsi_replace (gsi, new_stmt, false);
3346 SSA_NAME_DEF_STMT (gimple_assign_lhs (new_stmt)) = new_stmt;
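/* Illustrative sketch (not vectorizer code): how the lane-ratio test
   above classifies a call, with hypothetical names.  A result vector
   with twice the lanes of the input means the elements narrowed; half
   the lanes means they widened; other ratios are rejected.  */

enum modifier_sketch { SK_NARROW, SK_NONE, SK_WIDEN, SK_UNSUPPORTED };

static enum modifier_sketch
classify_modifier_sketch (int nunits_in, int nunits_out)
{
  if (nunits_in == nunits_out / 2)
    return SK_NARROW;		/* e.g. V4SI inputs, V8HI result.  */
  if (nunits_out == nunits_in)
    return SK_NONE;
  if (nunits_out == nunits_in / 2)
    return SK_WIDEN;		/* e.g. V8HI inputs, V4SI result.  */
  return SK_UNSUPPORTED;
}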
3352 /* Function vect_gen_widened_results_half
3354 Create a vector stmt whose code, type, number of arguments, and result
3355 variable are CODE, VECTYPE, OP_TYPE, and VEC_DEST, and its arguments are
3356 VEC_OPRND0 and VEC_OPRND1. The new vector stmt is to be inserted at BSI.
3357 In the case that CODE is a CALL_EXPR, this means that a call to DECL
3358 needs to be created (DECL is a function-decl of a target-builtin).
3359 STMT is the original scalar stmt that we are vectorizing. */
3362 vect_gen_widened_results_half (enum tree_code code,
3363 tree vectype ATTRIBUTE_UNUSED,
3365 tree vec_oprnd0, tree vec_oprnd1, int op_type,
3366 tree vec_dest, gimple_stmt_iterator *gsi,
3374 /* Generate half of the widened result: */
3375 if (code == CALL_EXPR)
3377 /* Target specific support */
3378 if (op_type == binary_op)
3379 new_stmt = gimple_build_call (decl, 2, vec_oprnd0, vec_oprnd1);
3381 new_stmt = gimple_build_call (decl, 1, vec_oprnd0);
3382 new_temp = make_ssa_name (vec_dest, new_stmt);
3383 gimple_call_set_lhs (new_stmt, new_temp);
3387 /* Generic support */
3388 gcc_assert (op_type == TREE_CODE_LENGTH (code));
3389 if (op_type != binary_op)
3390 vec_oprnd1 = NULL;
3391 new_stmt = gimple_build_assign_with_ops (code, vec_dest, vec_oprnd0,
3392 vec_oprnd1);
3393 new_temp = make_ssa_name (vec_dest, new_stmt);
3394 gimple_assign_set_lhs (new_stmt, new_temp);
3396 vect_finish_stmt_generation (stmt, new_stmt, gsi);
3398 if (code == CALL_EXPR)
3400 FOR_EACH_SSA_TREE_OPERAND (sym, new_stmt, iter, SSA_OP_ALL_VIRTUALS)
3402 if (TREE_CODE (sym) == SSA_NAME)
3403 sym = SSA_NAME_VAR (sym);
3404 mark_sym_for_renaming (sym);
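/* Illustrative sketch (not vectorizer code): why a widening operation
   is generated as two halves.  Widening 8 short lanes yields 8 int
   lanes, which no longer fit in one vector of the same size, so the
   result is produced as a low half and a high half (the actual lane
   ordering is target-dependent).  */

static void
widen_halves_sketch (const short in[8], int lo[4], int hi[4])
{
  int i;
  for (i = 0; i < 4; i++)
    lo[i] = (int) in[i];	/* first half of the widened result.  */
  for (i = 0; i < 4; i++)
    hi[i] = (int) in[i + 4];	/* second half.  */
}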
3412 /* Check if STMT performs a conversion operation that can be vectorized.
3413 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
3414 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
3415 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
3418 vectorizable_conversion (gimple stmt, gimple_stmt_iterator *gsi,
3419 gimple *vec_stmt, slp_tree slp_node)
3424 tree vec_oprnd0 = NULL_TREE, vec_oprnd1 = NULL_TREE;
3425 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3426 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3427 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3428 enum tree_code code, code1 = ERROR_MARK, code2 = ERROR_MARK;
3429 tree decl1 = NULL_TREE, decl2 = NULL_TREE;
3433 enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
3434 gimple new_stmt = NULL;
3435 stmt_vec_info prev_stmt_info;
3438 tree vectype_out, vectype_in;
3441 tree rhs_type, lhs_type;
3443 enum { NARROW, NONE, WIDEN } modifier;
3445 VEC(tree,heap) *vec_oprnds0 = NULL;
3448 /* Is STMT a vectorizable conversion? */
3450 if (!STMT_VINFO_RELEVANT_P (stmt_info))
3453 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
3456 if (!is_gimple_assign (stmt))
3459 if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
3462 code = gimple_assign_rhs_code (stmt);
3463 if (code != FIX_TRUNC_EXPR && code != FLOAT_EXPR)
3466 /* Check types of lhs and rhs. */
3467 op0 = gimple_assign_rhs1 (stmt);
3468 rhs_type = TREE_TYPE (op0);
3469 vectype_in = get_vectype_for_scalar_type (rhs_type);
3472 nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
3474 scalar_dest = gimple_assign_lhs (stmt);
3475 lhs_type = TREE_TYPE (scalar_dest);
3476 vectype_out = get_vectype_for_scalar_type (lhs_type);
3479 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
3482 if (nunits_in == nunits_out / 2)
3483 modifier = NARROW;
3484 else if (nunits_out == nunits_in)
3485 modifier = NONE;
3486 else if (nunits_out == nunits_in / 2)
3487 modifier = WIDEN;
3491 if (modifier == NONE)
3492 gcc_assert (STMT_VINFO_VECTYPE (stmt_info) == vectype_out);
3494 /* Bail out if the types are both integral or non-integral. */
3495 if ((INTEGRAL_TYPE_P (rhs_type) && INTEGRAL_TYPE_P (lhs_type))
3496 || (!INTEGRAL_TYPE_P (rhs_type) && !INTEGRAL_TYPE_P (lhs_type)))
3499 if (modifier == NARROW)
3500 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_out;
3502 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in;
3504 /* FORNOW: SLP with multiple types is not supported. The SLP analysis verifies
3505 this, so we can safely override NCOPIES with 1 here. */
3509 /* Sanity check: make sure that at least one copy of the vectorized stmt
3510 needs to be generated. */
3511 gcc_assert (ncopies >= 1);
3513 /* FORNOW. This restriction should be relaxed. */
3514 if (nested_in_vect_loop_p (loop, stmt) && ncopies > 1)
3516 if (vect_print_dump_info (REPORT_DETAILS))
3517 fprintf (vect_dump, "multiple types in nested loop.");
3521 /* Check the operands of the operation. */
3522 if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt[0]))
3524 if (vect_print_dump_info (REPORT_DETAILS))
3525 fprintf (vect_dump, "use not simple.");
3529 /* Supportable by target? */
3530 if ((modifier == NONE
3531 && !targetm.vectorize.builtin_conversion (code, vectype_in))
3532 || (modifier == WIDEN
3533 && !supportable_widening_operation (code, stmt, vectype_in,
3536 || (modifier == NARROW
3537 && !supportable_narrowing_operation (code, stmt, vectype_in,
3540 if (vect_print_dump_info (REPORT_DETAILS))
3541 fprintf (vect_dump, "op not supported by target.");
3545 if (modifier != NONE)
3547 STMT_VINFO_VECTYPE (stmt_info) = vectype_in;
3548 /* FORNOW: SLP not supported. */
3549 if (STMT_SLP_TYPE (stmt_info))
3553 if (!vec_stmt) /* transformation not required. */
3555 STMT_VINFO_TYPE (stmt_info) = type_conversion_vec_info_type;
3560 if (vect_print_dump_info (REPORT_DETAILS))
3561 fprintf (vect_dump, "transform conversion.");
3564 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
3566 if (modifier == NONE && !slp_node)
3567 vec_oprnds0 = VEC_alloc (tree, heap, 1);
3569 prev_stmt_info = NULL;
3573 for (j = 0; j < ncopies; j++)
3579 vect_get_vec_defs (op0, NULL, stmt, &vec_oprnds0, NULL, slp_node);
3581 vect_get_vec_defs_for_stmt_copy (dt, &vec_oprnds0, NULL);
3584 targetm.vectorize.builtin_conversion (code, vectype_in);
3585 for (i = 0; VEC_iterate (tree, vec_oprnds0, i, vop0); i++)
3587 /* Arguments are ready. Create the new vector stmt. */
3588 new_stmt = gimple_build_call (builtin_decl, 1, vop0);
3589 new_temp = make_ssa_name (vec_dest, new_stmt);
3590 gimple_call_set_lhs (new_stmt, new_temp);
3591 vect_finish_stmt_generation (stmt, new_stmt, gsi);
3592 FOR_EACH_SSA_TREE_OPERAND (sym, new_stmt, iter,
3593 SSA_OP_ALL_VIRTUALS)
3595 if (TREE_CODE (sym) == SSA_NAME)
3596 sym = SSA_NAME_VAR (sym);
3597 mark_sym_for_renaming (sym);
3600 VEC_quick_push (gimple, SLP_TREE_VEC_STMTS (slp_node), new_stmt);
3604 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
3606 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3607 prev_stmt_info = vinfo_for_stmt (new_stmt);
3612 /* In case the vectorization factor (VF) is bigger than the number
3613 of elements that we can fit in a vectype (nunits), we have to
3614 generate more than one vector stmt - i.e - we need to "unroll"
3615 the vector stmt by a factor VF/nunits. */
3616 for (j = 0; j < ncopies; j++)
3619 vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
3621 vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd0);
3623 STMT_VINFO_VECTYPE (stmt_info) = vectype_in;
3625 /* Generate first half of the widened result: */
3627 = vect_gen_widened_results_half (code1, vectype_out, decl1,
3628 vec_oprnd0, vec_oprnd1,
3629 unary_op, vec_dest, gsi, stmt);
3631 STMT_VINFO_VEC_STMT (stmt_info) = new_stmt;
3633 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3634 prev_stmt_info = vinfo_for_stmt (new_stmt);
3636 /* Generate second half of the widened result: */
3638 = vect_gen_widened_results_half (code2, vectype_out, decl2,
3639 vec_oprnd0, vec_oprnd1,
3640 unary_op, vec_dest, gsi, stmt);
3641 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3642 prev_stmt_info = vinfo_for_stmt (new_stmt);
3647 /* In case the vectorization factor (VF) is bigger than the number
3648 of elements that we can fit in a vectype (nunits), we have to
3649 generate more than one vector stmt - i.e - we need to "unroll"
3650 the vector stmt by a factor VF/nunits. */
3651 for (j = 0; j < ncopies; j++)
3656 vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
3657 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd0);
3661 vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd1);
3662 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd0);
3665 /* Arguments are ready. Create the new vector stmt. */
3666 expr = build2 (code1, vectype_out, vec_oprnd0, vec_oprnd1);
3667 new_stmt = gimple_build_assign_with_ops (code1, vec_dest, vec_oprnd0,
3668 vec_oprnd1);
3669 new_temp = make_ssa_name (vec_dest, new_stmt);
3670 gimple_assign_set_lhs (new_stmt, new_temp);
3671 vect_finish_stmt_generation (stmt, new_stmt, gsi);
3674 STMT_VINFO_VEC_STMT (stmt_info) = new_stmt;
3676 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3678 prev_stmt_info = vinfo_for_stmt (new_stmt);
3681 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
3685 VEC_free (tree, heap, vec_oprnds0);
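/* Illustrative sketch (not vectorizer code): the NARROW conversion
   case above combines two input vectors into one output vector, shown
   here as a FIX_TRUNC_EXPR-style float-to-int truncation that packs
   two 2-lane double halves into a 4-lane int result (the lane order
   is target-dependent).  */

static void
narrow_conversion_sketch (const double lo[2], const double hi[2], int out[4])
{
  int i;
  for (i = 0; i < 2; i++)
    out[i] = (int) lo[i];	/* lanes from vec_oprnd0 ...  */
  for (i = 0; i < 2; i++)
    out[i + 2] = (int) hi[i];	/* ... and from vec_oprnd1.  */
}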
3691 /* Function vectorizable_assignment.
3693 Check if STMT performs an assignment (copy) that can be vectorized.
3694 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
3695 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
3696 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
3699 vectorizable_assignment (gimple stmt, gimple_stmt_iterator *gsi,
3700 gimple *vec_stmt, slp_tree slp_node)
3705 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3706 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3707 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3711 enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
3712 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
3713 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
3715 VEC(tree,heap) *vec_oprnds = NULL;
3718 /* FORNOW: SLP with multiple types is not supported. The SLP analysis
3719 verifies this, so we can safely override NCOPIES with 1 here. */
3723 gcc_assert (ncopies >= 1);
3724 if (ncopies > 1)
3725 return false; /* FORNOW */
3727 if (!STMT_VINFO_RELEVANT_P (stmt_info))
3730 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
3733 /* Is vectorizable assignment? */
3734 if (!is_gimple_assign (stmt))
3737 scalar_dest = gimple_assign_lhs (stmt);
3738 if (TREE_CODE (scalar_dest) != SSA_NAME)
3741 if (gimple_assign_single_p (stmt)
3742 || gimple_assign_rhs_code (stmt) == PAREN_EXPR)
3743 op = gimple_assign_rhs1 (stmt);
3747 if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt[0]))
3749 if (vect_print_dump_info (REPORT_DETAILS))
3750 fprintf (vect_dump, "use not simple.");
3754 if (!vec_stmt) /* transformation not required. */
3756 STMT_VINFO_TYPE (stmt_info) = assignment_vec_info_type;
3757 if (vect_print_dump_info (REPORT_DETAILS))
3758 fprintf (vect_dump, "=== vectorizable_assignment ===");
3759 vect_model_simple_cost (stmt_info, ncopies, dt, NULL);
3764 if (vect_print_dump_info (REPORT_DETAILS))
3765 fprintf (vect_dump, "transform assignment.");
3768 vec_dest = vect_create_destination_var (scalar_dest, vectype);
3771 vect_get_vec_defs (op, NULL, stmt, &vec_oprnds, NULL, slp_node);
3773 /* Arguments are ready. Create the new vector stmt. */
3774 for (i = 0; VEC_iterate (tree, vec_oprnds, i, vop); i++)
3776 *vec_stmt = gimple_build_assign (vec_dest, vop);
3777 new_temp = make_ssa_name (vec_dest, *vec_stmt);
3778 gimple_assign_set_lhs (*vec_stmt, new_temp);
3779 vect_finish_stmt_generation (stmt, *vec_stmt, gsi);
3780 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt;
3783 VEC_quick_push (gimple, SLP_TREE_VEC_STMTS (slp_node), *vec_stmt);
3786 VEC_free (tree, heap, vec_oprnds);
3791 /* Function vect_min_worthwhile_factor.
3793 For a loop where we could vectorize the operation indicated by CODE,
3794 return the minimum vectorization factor that makes it worthwhile
3795 to use generic vectors. */
3797 vect_min_worthwhile_factor (enum tree_code code)
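/* An illustrative sketch of the idea behind vect_min_worthwhile_factor,
   with hypothetical thresholds (not GCC's actual values): cheap bitwise
   ops pay off at a smaller factor than arithmetic, and unknown codes
   are treated as never worthwhile.  */

static int
min_worthwhile_factor_sketch (enum tree_code code)
{
  switch (code)
    {
    case BIT_AND_EXPR:
    case BIT_IOR_EXPR:
    case BIT_XOR_EXPR:
      return 2;			/* hypothetical threshold.  */
    case PLUS_EXPR:
    case MINUS_EXPR:
      return 4;			/* hypothetical threshold.  */
    default:
      return INT_MAX;		/* never worthwhile.  */
    }
}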
3818 /* Function vectorizable_induction
3820 Check if PHI performs an induction computation that can be vectorized.
3821 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
3822 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
3823 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
3826 vectorizable_induction (gimple phi, gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
3829 stmt_vec_info stmt_info = vinfo_for_stmt (phi);
3830 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3831 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3832 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
3833 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
3836 gcc_assert (ncopies >= 1);
3838 if (!STMT_VINFO_RELEVANT_P (stmt_info))
3841 /* FORNOW: SLP not supported. */
3842 if (STMT_SLP_TYPE (stmt_info))
3845 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def);
3847 if (gimple_code (phi) != GIMPLE_PHI)
3850 if (!vec_stmt) /* transformation not required. */
3852 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
3853 if (vect_print_dump_info (REPORT_DETAILS))
3854 fprintf (vect_dump, "=== vectorizable_induction ===");
3855 vect_model_induction_cost (stmt_info, ncopies);
3861 if (vect_print_dump_info (REPORT_DETAILS))
3862 fprintf (vect_dump, "transform induction phi.");
3864 vec_def = get_initial_def_for_induction (phi);
3865 *vec_stmt = SSA_NAME_DEF_STMT (vec_def);
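/* Editor's illustration: the shape of the vectorized induction that
   get_initial_def_for_induction builds, modeled with a plain array
   standing in for a vector register and VF = 4 assumed.  The initial
   vector def holds {init, init+step, init+2*step, init+3*step}; each
   vector iteration then bumps every lane by VF*step at once, replacing
   four scalar IV updates.  */

static void
model_vector_iv (int init, int step, int nvec_iters, int lanes[4])
{
  int i;

  for (i = 0; i < 4; i++)
    lanes[i] = init + i * step;      /* the initial vector def */

  while (nvec_iters-- > 0)
    for (i = 0; i < 4; i++)
      lanes[i] += 4 * step;          /* one vector update per iteration */
}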
3870 /* Function vectorizable_operation.
3872 Check if STMT performs a binary or unary operation that can be vectorized.
3873 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
3874 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
3875 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
3878 vectorizable_operation (gimple stmt, gimple_stmt_iterator *gsi,
3879 gimple *vec_stmt, slp_tree slp_node)
3883 tree op0, op1 = NULL;
3884 tree vec_oprnd1 = NULL_TREE;
3885 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3886 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3887 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3888 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3889 enum tree_code code;
3890 enum machine_mode vec_mode;
3895 enum machine_mode optab_op2_mode;
3898 enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
3899 gimple new_stmt = NULL;
3900 stmt_vec_info prev_stmt_info;
3901 int nunits_in = TYPE_VECTOR_SUBPARTS (vectype);
3904 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in;
3906 VEC(tree,heap) *vec_oprnds0 = NULL, *vec_oprnds1 = NULL;
3909 bool shift_p = false;
3910 bool scalar_shift_arg = false;
3912 /* FORNOW: SLP with multiple types is not supported. The SLP analysis verifies
3913 this, so we can safely override NCOPIES with 1 here. */
3916 gcc_assert (ncopies >= 1);
3917 /* FORNOW. This restriction should be relaxed. */
3918 if (nested_in_vect_loop_p (loop, stmt) && ncopies > 1)
3920 if (vect_print_dump_info (REPORT_DETAILS))
3921 fprintf (vect_dump, "multiple types in nested loop.");
3925 if (!STMT_VINFO_RELEVANT_P (stmt_info))
3928 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
3931 /* Is STMT a vectorizable binary/unary operation? */
3932 if (!is_gimple_assign (stmt))
3935 if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
3938 scalar_dest = gimple_assign_lhs (stmt);
3939 vectype_out = get_vectype_for_scalar_type (TREE_TYPE (scalar_dest));
3942 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
3943 if (nunits_out != nunits_in)
3946 code = gimple_assign_rhs_code (stmt);
3948 /* For pointer addition, we should use the normal plus for
3949 the vector addition. */
3950 if (code == POINTER_PLUS_EXPR)
3953 /* Support only unary or binary operations. */
3954 op_type = TREE_CODE_LENGTH (code);
3955 if (op_type != unary_op && op_type != binary_op)
3957 if (vect_print_dump_info (REPORT_DETAILS))
3958 fprintf (vect_dump, "num. args = %d (not unary/binary op).", op_type);
3962 op0 = gimple_assign_rhs1 (stmt);
3963 if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt[0]))
3965 if (vect_print_dump_info (REPORT_DETAILS))
3966 fprintf (vect_dump, "use not simple.");
3970 if (op_type == binary_op)
3972 op1 = gimple_assign_rhs2 (stmt);
3973 if (!vect_is_simple_use (op1, loop_vinfo, &def_stmt, &def, &dt[1]))
3975 if (vect_print_dump_info (REPORT_DETAILS))
3976 fprintf (vect_dump, "use not simple.");
3981 /* If this is a shift/rotate, determine whether the shift amount is a vector
3982 or a scalar.  If the shift/rotate amount is a vector, use the vector/vector
3984 if (code == LSHIFT_EXPR || code == RSHIFT_EXPR || code == LROTATE_EXPR
3985 || code == RROTATE_EXPR)
3989 /* vector shifted by vector */
3990 if (dt[1] == vect_loop_def)
3992 optab = optab_for_tree_code (code, vectype, optab_vector);
3993 if (vect_print_dump_info (REPORT_DETAILS))
3994 fprintf (vect_dump, "vector/vector shift/rotate found.");
3997 /* See whether the machine has a vector-shift-by-scalar insn, and if
3998 not, whether it has a vector-shift-by-vector insn.  */
3999 else if (dt[1] == vect_constant_def || dt[1] == vect_invariant_def)
4001 optab = optab_for_tree_code (code, vectype, optab_scalar);
4003 && (optab_handler (optab, TYPE_MODE (vectype))->insn_code
4004 != CODE_FOR_nothing))
4006 scalar_shift_arg = true;
4007 if (vect_print_dump_info (REPORT_DETAILS))
4008 fprintf (vect_dump, "vector/scalar shift/rotate found.");
4012 optab = optab_for_tree_code (code, vectype, optab_vector);
4013 if (vect_print_dump_info (REPORT_DETAILS)
4015 && (optab_handler (optab, TYPE_MODE (vectype))->insn_code
4016 != CODE_FOR_nothing))
4017 fprintf (vect_dump, "vector/vector shift/rotate found.");
4023 if (vect_print_dump_info (REPORT_DETAILS))
4024 fprintf (vect_dump, "operand mode requires invariant argument.");
4029 optab = optab_for_tree_code (code, vectype, optab_default);
4031 /* Supportable by target? */
4034 if (vect_print_dump_info (REPORT_DETAILS))
4035 fprintf (vect_dump, "no optab.");
4038 vec_mode = TYPE_MODE (vectype);
4039 icode = (int) optab_handler (optab, vec_mode)->insn_code;
4040 if (icode == CODE_FOR_nothing)
4042 if (vect_print_dump_info (REPORT_DETAILS))
4043 fprintf (vect_dump, "op not supported by target.");
4044 /* Check only during analysis. */
4045 if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
4046 || (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4047 < vect_min_worthwhile_factor (code)
4050 if (vect_print_dump_info (REPORT_DETAILS))
4051 fprintf (vect_dump, "proceeding using word mode.");
4054 /* Worthwhile without SIMD support? Check only during analysis. */
4055 if (!VECTOR_MODE_P (TYPE_MODE (vectype))
4056 && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4057 < vect_min_worthwhile_factor (code)
4060 if (vect_print_dump_info (REPORT_DETAILS))
4061 fprintf (vect_dump, "not worthwhile without SIMD support.");
4065 if (!vec_stmt) /* transformation not required. */
4067 STMT_VINFO_TYPE (stmt_info) = op_vec_info_type;
4068 if (vect_print_dump_info (REPORT_DETAILS))
4069 fprintf (vect_dump, "=== vectorizable_operation ===");
4070 vect_model_simple_cost (stmt_info, ncopies, dt, NULL);
4076 if (vect_print_dump_info (REPORT_DETAILS))
4077 fprintf (vect_dump, "transform binary/unary operation.");
4080 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4082 /* Allocate VECs for vector operands. In case of SLP, vector operands are
4083 created in the previous stages of the recursion, so no allocation is
4084 needed, except for the case of shift with scalar shift argument. In that
4085 case we store the scalar operand in VEC_OPRNDS1 for every vector stmt to
4086 be created to vectorize the SLP group, i.e., SLP_NODE->VEC_STMTS_SIZE.
4087 In case of loop-based vectorization we allocate VECs of size 1. We
4088 allocate VEC_OPRNDS1 only in case of binary operation. */
4091 vec_oprnds0 = VEC_alloc (tree, heap, 1);
4092 if (op_type == binary_op)
4093 vec_oprnds1 = VEC_alloc (tree, heap, 1);
4095 else if (scalar_shift_arg)
4096 vec_oprnds1 = VEC_alloc (tree, heap, slp_node->vec_stmts_size);
4098 /* In case the vectorization factor (VF) is bigger than the number
4099 of elements that we can fit in a vectype (nunits), we have to generate
4100 more than one vector stmt - i.e. - we need to "unroll" the
4101 vector stmt by a factor VF/nunits. In doing so, we record a pointer
4102 from one copy of the vector stmt to the next, in the field
4103 STMT_VINFO_RELATED_STMT. This is necessary in order to allow following
4104 stages to find the correct vector defs to be used when vectorizing
4105 stmts that use the defs of the current stmt. The example below illustrates
4106 the vectorization process when VF=16 and nunits=4 (i.e., we need to create
4107 4 vectorized stmts):
4109 before vectorization:
4110 RELATED_STMT VEC_STMT
4114 step 1: vectorize stmt S1 (done in vectorizable_load. See more details
4116 RELATED_STMT VEC_STMT
4117 VS1_0: vx0 = memref0 VS1_1 -
4118 VS1_1: vx1 = memref1 VS1_2 -
4119 VS1_2: vx2 = memref2 VS1_3 -
4120 VS1_3: vx3 = memref3 - -
4121 S1: x = load - VS1_0
4124 step 2: vectorize stmt S2 (done here):
4125 To vectorize stmt S2 we first need to find the relevant vector
4126 def for the first operand 'x'. This is, as usual, obtained from
4127 the vector stmt recorded in the STMT_VINFO_VEC_STMT of the stmt
4128 that defines 'x' (S1). This way we find the stmt VS1_0, and the
4129 relevant vector def 'vx0'. Having found 'vx0' we can generate
4130 the vector stmt VS2_0, and as usual, record it in the
4131 STMT_VINFO_VEC_STMT of stmt S2.
4132 When creating the second copy (VS2_1), we obtain the relevant vector
4133 def from the vector stmt recorded in the STMT_VINFO_RELATED_STMT of
4134 stmt VS1_0. This way we find the stmt VS1_1 and the relevant
4135 vector def 'vx1'. Using 'vx1' we create stmt VS2_1 and record a
4136 pointer to it in the STMT_VINFO_RELATED_STMT of the vector stmt VS2_0.
4137 Similarly when creating stmts VS2_2 and VS2_3. This is the resulting
4138 chain of stmts and pointers:
4139 RELATED_STMT VEC_STMT
4140 VS1_0: vx0 = memref0 VS1_1 -
4141 VS1_1: vx1 = memref1 VS1_2 -
4142 VS1_2: vx2 = memref2 VS1_3 -
4143 VS1_3: vx3 = memref3 - -
4144 S1: x = load - VS1_0
4145 VS2_0: vz0 = vx0 + v1 VS2_1 -
4146 VS2_1: vz1 = vx1 + v1 VS2_2 -
4147 VS2_2: vz2 = vx2 + v1 VS2_3 -
4148 VS2_3: vz3 = vx3 + v1 - -
4149 S2: z = x + 1 - VS2_0 */
4151 prev_stmt_info = NULL;
4152 for (j = 0; j < ncopies; j++)
4157 if (op_type == binary_op && scalar_shift_arg)
4159 /* Vector shl and shr insn patterns can be defined with scalar
4160 operand 2 (shift operand). In this case, use constant or loop
4161 invariant op1 directly, without extending it to vector mode
4163 optab_op2_mode = insn_data[icode].operand[2].mode;
4164 if (!VECTOR_MODE_P (optab_op2_mode))
4166 if (vect_print_dump_info (REPORT_DETAILS))
4167 fprintf (vect_dump, "operand 1 using scalar mode.");
4169 VEC_quick_push (tree, vec_oprnds1, vec_oprnd1);
4172 /* Store vec_oprnd1 for every vector stmt to be created
4173 for SLP_NODE. We check during the analysis that all the
4174 shift arguments are the same.
4175 TODO: Allow different constants for different vector
4176 stmts generated for an SLP instance. */
4177 for (k = 0; k < slp_node->vec_stmts_size - 1; k++)
4178 VEC_quick_push (tree, vec_oprnds1, vec_oprnd1);
4183 /* vec_oprnd1 is available if operand 1 should be of a scalar type
4184 (a special case for certain kinds of vector shifts); otherwise,
4185 operand 1 should be of a vector type (the usual case). */
4186 if (op_type == binary_op && !vec_oprnd1)
4187 vect_get_vec_defs (op0, op1, stmt, &vec_oprnds0, &vec_oprnds1,
4190 vect_get_vec_defs (op0, NULL_TREE, stmt, &vec_oprnds0, NULL,
4194 vect_get_vec_defs_for_stmt_copy (dt, &vec_oprnds0, &vec_oprnds1);
4196 /* Arguments are ready. Create the new vector stmt. */
4197 for (i = 0; VEC_iterate (tree, vec_oprnds0, i, vop0); i++)
4199 vop1 = ((op_type == binary_op)
4200 ? VEC_index (tree, vec_oprnds1, i) : NULL);
4201 new_stmt = gimple_build_assign_with_ops (code, vec_dest, vop0, vop1);
4202 new_temp = make_ssa_name (vec_dest, new_stmt);
4203 gimple_assign_set_lhs (new_stmt, new_temp);
4204 vect_finish_stmt_generation (stmt, new_stmt, gsi);
4206 VEC_quick_push (gimple, SLP_TREE_VEC_STMTS (slp_node), new_stmt);
4210 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
4212 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
4213 prev_stmt_info = vinfo_for_stmt (new_stmt);
4216 VEC_free (tree, heap, vec_oprnds0);
4218 VEC_free (tree, heap, vec_oprnds1);
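/* Editor's illustrations for the function above (standalone sketches;
   the model types stand in for stmt_vec_info and the vector operands).

   First, the RELATED_STMT chaining documented in the big comment above:
   the ncopies vector stmts form a singly linked list, and a consumer
   finds the def for its J-th copy by walking J links from the copy
   recorded in STMT_VINFO_VEC_STMT.  */

struct model_vec_stmt
{
  int def;                           /* stands in for the vector def */
  struct model_vec_stmt *related;    /* next copy, or NULL for the last */
};

static int
model_def_for_copy (const struct model_vec_stmt *first, int j)
{
  while (j-- > 0)
    first = first->related;          /* VS1_0 -> VS1_1 -> ...  */
  return first->def;
}

/* Second, the two shift flavors selected above: a loop-invariant amount
   shifts every lane by the same scalar count (the vector/scalar form),
   while a variant amount gives each lane its own count (the
   vector/vector form).  Plain arrays model the vectors.  */

static void
model_shift_by_scalar (unsigned v[4], int amount)
{
  int i;
  for (i = 0; i < 4; i++)
    v[i] >>= amount;                 /* one count for all lanes */
}

static void
model_shift_by_vector (unsigned v[4], const int amounts[4])
{
  int i;
  for (i = 0; i < 4; i++)
    v[i] >>= amounts[i];             /* per-lane counts */
}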
4224 /* Function vectorizable_type_demotion
4226 Check if STMT performs a binary or unary operation that involves
4227 type demotion, and if it can be vectorized.
4228 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
4229 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
4230 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
4233 vectorizable_type_demotion (gimple stmt, gimple_stmt_iterator *gsi,
4239 tree vec_oprnd0=NULL, vec_oprnd1=NULL;
4240 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4241 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4242 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4243 enum tree_code code, code1 = ERROR_MARK;
4247 enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
4249 stmt_vec_info prev_stmt_info;
4257 if (!STMT_VINFO_RELEVANT_P (stmt_info))
4260 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
4263 /* Is STMT a vectorizable type-demotion operation? */
4264 if (!is_gimple_assign (stmt))
4267 if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
4270 code = gimple_assign_rhs_code (stmt);
4271 if (code != NOP_EXPR && code != CONVERT_EXPR)
4274 op0 = gimple_assign_rhs1 (stmt);
4275 vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op0));
4278 nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
4280 scalar_dest = gimple_assign_lhs (stmt);
4281 vectype_out = get_vectype_for_scalar_type (TREE_TYPE (scalar_dest));
4284 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
4285 if (nunits_in != nunits_out / 2) /* FORNOW */
4288 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_out;
4289 gcc_assert (ncopies >= 1);
4290 /* FORNOW. This restriction should be relaxed. */
4291 if (nested_in_vect_loop_p (loop, stmt) && ncopies > 1)
4293 if (vect_print_dump_info (REPORT_DETAILS))
4294 fprintf (vect_dump, "multiple types in nested loop.");
4298 if (! ((INTEGRAL_TYPE_P (TREE_TYPE (scalar_dest))
4299 && INTEGRAL_TYPE_P (TREE_TYPE (op0)))
4300 || (SCALAR_FLOAT_TYPE_P (TREE_TYPE (scalar_dest))
4301 && SCALAR_FLOAT_TYPE_P (TREE_TYPE (op0))
4302 && (code == NOP_EXPR || code == CONVERT_EXPR))))
4305 /* Check the operands of the operation. */
4306 if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt[0]))
4308 if (vect_print_dump_info (REPORT_DETAILS))
4309 fprintf (vect_dump, "use not simple.");
4313 /* Supportable by target? */
4314 if (!supportable_narrowing_operation (code, stmt, vectype_in, &code1))
4317 STMT_VINFO_VECTYPE (stmt_info) = vectype_in;
4319 if (!vec_stmt) /* transformation not required. */
4321 STMT_VINFO_TYPE (stmt_info) = type_demotion_vec_info_type;
4322 if (vect_print_dump_info (REPORT_DETAILS))
4323 fprintf (vect_dump, "=== vectorizable_demotion ===");
4324 vect_model_simple_cost (stmt_info, ncopies, dt, NULL);
4329 if (vect_print_dump_info (REPORT_DETAILS))
4330 fprintf (vect_dump, "transform type demotion operation. ncopies = %d.",
4334 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
4336 /* In case the vectorization factor (VF) is bigger than the number
4337 of elements that we can fit in a vectype (nunits), we have to generate
4338 more than one vector stmt - i.e. - we need to "unroll" the
4339 vector stmt by a factor VF/nunits. */
4340 prev_stmt_info = NULL;
4341 for (j = 0; j < ncopies; j++)
4346 vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
4347 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd0);
4351 vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd1);
4352 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd0);
4355 /* Arguments are ready. Create the new vector stmt. */
4356 new_stmt = gimple_build_assign_with_ops (code1, vec_dest, vec_oprnd0,
4358 new_temp = make_ssa_name (vec_dest, new_stmt);
4359 gimple_assign_set_lhs (new_stmt, new_temp);
4360 vect_finish_stmt_generation (stmt, new_stmt, gsi);
4363 STMT_VINFO_VEC_STMT (stmt_info) = new_stmt;
4365 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
4367 prev_stmt_info = vinfo_for_stmt (new_stmt);
4370 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
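/* Editor's illustration of the narrowing loop above, with nunits_in = 4
   and nunits_out = 8 assumed: each demotion stmt consumes two input
   vectors (vec_oprnd0 and vec_oprnd1) and packs them into one vector of
   the narrower type, which is why ncopies is computed from nunits_out.  */

static void
model_pack_demote (const int in0[4], const int in1[4], short out[8])
{
  int i;
  for (i = 0; i < 4; i++)
    {
      out[i]     = (short) in0[i];   /* elements from the first input */
      out[i + 4] = (short) in1[i];   /* elements from the second input */
    }
}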
4375 /* Function vectorizable_type_promotion
4377 Check if STMT performs a binary or unary operation that involves
4378 type promotion, and if it can be vectorized.
4379 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
4380 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
4381 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
4384 vectorizable_type_promotion (gimple stmt, gimple_stmt_iterator *gsi,
4389 tree op0, op1 = NULL;
4390 tree vec_oprnd0=NULL, vec_oprnd1=NULL;
4391 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4392 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4393 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4394 enum tree_code code, code1 = ERROR_MARK, code2 = ERROR_MARK;
4395 tree decl1 = NULL_TREE, decl2 = NULL_TREE;
4399 enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
4401 stmt_vec_info prev_stmt_info;
4409 if (!STMT_VINFO_RELEVANT_P (stmt_info))
4412 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
4415 /* Is STMT a vectorizable type-promotion operation? */
4416 if (!is_gimple_assign (stmt))
4419 if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
4422 code = gimple_assign_rhs_code (stmt);
4423 if (code != NOP_EXPR && code != CONVERT_EXPR
4424 && code != WIDEN_MULT_EXPR)
4427 op0 = gimple_assign_rhs1 (stmt);
4428 vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op0));
4431 nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
4433 scalar_dest = gimple_assign_lhs (stmt);
4434 vectype_out = get_vectype_for_scalar_type (TREE_TYPE (scalar_dest));
4437 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
4438 if (nunits_out != nunits_in / 2) /* FORNOW */
4441 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in;
4442 gcc_assert (ncopies >= 1);
4443 /* FORNOW. This restriction should be relaxed. */
4444 if (nested_in_vect_loop_p (loop, stmt) && ncopies > 1)
4446 if (vect_print_dump_info (REPORT_DETAILS))
4447 fprintf (vect_dump, "multiple types in nested loop.");
4451 if (! ((INTEGRAL_TYPE_P (TREE_TYPE (scalar_dest))
4452 && INTEGRAL_TYPE_P (TREE_TYPE (op0)))
4453 || (SCALAR_FLOAT_TYPE_P (TREE_TYPE (scalar_dest))
4454 && SCALAR_FLOAT_TYPE_P (TREE_TYPE (op0))
4455 && (code == CONVERT_EXPR || code == NOP_EXPR))))
4458 /* Check the operands of the operation. */
4459 if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt[0]))
4461 if (vect_print_dump_info (REPORT_DETAILS))
4462 fprintf (vect_dump, "use not simple.");
4466 op_type = TREE_CODE_LENGTH (code);
4467 if (op_type == binary_op)
4469 op1 = gimple_assign_rhs2 (stmt);
4470 if (!vect_is_simple_use (op1, loop_vinfo, &def_stmt, &def, &dt[1]))
4472 if (vect_print_dump_info (REPORT_DETAILS))
4473 fprintf (vect_dump, "use not simple.");
4478 /* Supportable by target? */
4479 if (!supportable_widening_operation (code, stmt, vectype_in,
4480 &decl1, &decl2, &code1, &code2))
4483 STMT_VINFO_VECTYPE (stmt_info) = vectype_in;
4485 if (!vec_stmt) /* transformation not required. */
4487 STMT_VINFO_TYPE (stmt_info) = type_promotion_vec_info_type;
4488 if (vect_print_dump_info (REPORT_DETAILS))
4489 fprintf (vect_dump, "=== vectorizable_promotion ===");
4490 vect_model_simple_cost (stmt_info, 2*ncopies, dt, NULL);
4496 if (vect_print_dump_info (REPORT_DETAILS))
4497 fprintf (vect_dump, "transform type promotion operation. ncopies = %d.",
4501 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
4503 /* In case the vectorization factor (VF) is bigger than the number
4504 of elements that we can fit in a vectype (nunits), we have to generate
4505 more than one vector stmt - i.e. - we need to "unroll" the
4506 vector stmt by a factor VF/nunits. */
4508 prev_stmt_info = NULL;
4509 for (j = 0; j < ncopies; j++)
4514 vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
4515 if (op_type == binary_op)
4516 vec_oprnd1 = vect_get_vec_def_for_operand (op1, stmt, NULL);
4520 vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd0);
4521 if (op_type == binary_op)
4522 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt[1], vec_oprnd1);
4525 /* Arguments are ready. Create the new vector stmt. We are creating
4526 two vector defs because the widened result does not fit in one vector.
4527 The vectorized stmt can be expressed as a call to a target builtin,
4528 or by using a tree-code. */
4529 /* Generate first half of the widened result: */
4530 new_stmt = vect_gen_widened_results_half (code1, vectype_out, decl1,
4531 vec_oprnd0, vec_oprnd1, op_type, vec_dest, gsi, stmt);
4533 STMT_VINFO_VEC_STMT (stmt_info) = new_stmt;
4535 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
4536 prev_stmt_info = vinfo_for_stmt (new_stmt);
4538 /* Generate second half of the widened result: */
4539 new_stmt = vect_gen_widened_results_half (code2, vectype_out, decl2,
4540 vec_oprnd0, vec_oprnd1, op_type, vec_dest, gsi, stmt);
4541 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
4542 prev_stmt_info = vinfo_for_stmt (new_stmt);
4546 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
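/* Editor's illustration of the widening loop above, with nunits_in = 8
   and nunits_out = 4 assumed: the widened result of one input vector no
   longer fits in a single vector, so each copy emits two stmts - one
   (code1/decl1) for the first half and one (code2/decl2) for the second.  */

static void
model_unpack_promote (const short in[8], int out_lo[4], int out_hi[4])
{
  int i;
  for (i = 0; i < 4; i++)
    {
      out_lo[i] = in[i];             /* first half of the widened result */
      out_hi[i] = in[i + 4];         /* second half of the widened result */
    }
}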
4551 /* Function vect_strided_store_supported.
4553 Returns TRUE if INTERLEAVE_HIGH and INTERLEAVE_LOW operations are supported,
4554 and FALSE otherwise. */
4557 vect_strided_store_supported (tree vectype)
4559 optab interleave_high_optab, interleave_low_optab;
4562 mode = (int) TYPE_MODE (vectype);
4564 /* Check that the operation is supported. */
4565 interleave_high_optab = optab_for_tree_code (VEC_INTERLEAVE_HIGH_EXPR,
4566 vectype, optab_default);
4567 interleave_low_optab = optab_for_tree_code (VEC_INTERLEAVE_LOW_EXPR,
4568 vectype, optab_default);
4569 if (!interleave_high_optab || !interleave_low_optab)
4571 if (vect_print_dump_info (REPORT_DETAILS))
4572 fprintf (vect_dump, "no optab for interleave.");
4576 if (optab_handler (interleave_high_optab, mode)->insn_code
4578 || optab_handler (interleave_low_optab, mode)->insn_code
4579 == CODE_FOR_nothing)
4581 if (vect_print_dump_info (REPORT_DETAILS))
4582 fprintf (vect_dump, "interleave op not supported by target.");
4590 /* Function vect_permute_store_chain.
4592 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
4593 a power of 2, generate interleave_high/low stmts to reorder the data
4594 correctly for the stores. Return the final references for stores in
4597 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4598 The input is 4 vectors, each containing 8 elements.  We assign a number to
4599 each element; the input sequence is:
4601 1st vec: 0 1 2 3 4 5 6 7
4602 2nd vec: 8 9 10 11 12 13 14 15
4603 3rd vec: 16 17 18 19 20 21 22 23
4604 4th vec: 24 25 26 27 28 29 30 31
4606 The output sequence should be:
4608 1st vec: 0 8 16 24 1 9 17 25
4609 2nd vec: 2 10 18 26 3 11 19 27
4610 3rd vec: 4 12 20 28 5 13 21 29
4611 4th vec: 6 14 22 30 7 15 23 31
4613 i.e., we interleave the contents of the four vectors in their order.
4615 We use interleave_high/low instructions to create such output. The input of
4616 each interleave_high/low operation is two vectors:
4619 the even elements of the result vector are obtained left-to-right from the
4620 high/low elements of the first vector. The odd elements of the result are
4621 obtained left-to-right from the high/low elements of the second vector.
4622 The output of interleave_high will be: 0 4 1 5
4623 and of interleave_low: 2 6 3 7
4626 The permutation is done in log2 (LENGTH) stages.  In each stage interleave_high
4627 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
4628 where the first argument is taken from the first half of DR_CHAIN and the
4629 second argument from its second half.
4632 I1: interleave_high (1st vec, 3rd vec)
4633 I2: interleave_low (1st vec, 3rd vec)
4634 I3: interleave_high (2nd vec, 4th vec)
4635 I4: interleave_low (2nd vec, 4th vec)
4637 The output for the first stage is:
4639 I1: 0 16 1 17 2 18 3 19
4640 I2: 4 20 5 21 6 22 7 23
4641 I3: 8 24 9 25 10 26 11 27
4642 I4: 12 28 13 29 14 30 15 31
4644 The output of the second stage, i.e. the final result, is:
4646 I1: 0 8 16 24 1 9 17 25
4647 I2: 2 10 18 26 3 11 19 27
4648 I3: 4 12 20 28 5 13 21 29
4649 I4: 6 14 22 30 7 15 23 31. */
4652 vect_permute_store_chain (VEC(tree,heap) *dr_chain,
4653 unsigned int length,
4655 gimple_stmt_iterator *gsi,
4656 VEC(tree,heap) **result_chain)
4658 tree perm_dest, vect1, vect2, high, low;
4660 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
4664 enum tree_code high_code, low_code;
4666 scalar_dest = gimple_assign_lhs (stmt);
4668 /* Check that the operation is supported. */
4669 if (!vect_strided_store_supported (vectype))
4672 *result_chain = VEC_copy (tree, heap, dr_chain);
4674 for (i = 0; i < exact_log2 (length); i++)
4676 for (j = 0; j < length/2; j++)
4678 vect1 = VEC_index (tree, dr_chain, j);
4679 vect2 = VEC_index (tree, dr_chain, j+length/2);
4681 /* Create interleaving stmt:
4682 in the case of big endian:
4683 high = interleave_high (vect1, vect2)
4684 and in the case of little endian:
4685 high = interleave_low (vect1, vect2). */
4686 perm_dest = create_tmp_var (vectype, "vect_inter_high");
4687 DECL_GIMPLE_REG_P (perm_dest) = 1;
4688 add_referenced_var (perm_dest);
4689 if (BYTES_BIG_ENDIAN)
4691 high_code = VEC_INTERLEAVE_HIGH_EXPR;
4692 low_code = VEC_INTERLEAVE_LOW_EXPR;
4696 low_code = VEC_INTERLEAVE_HIGH_EXPR;
4697 high_code = VEC_INTERLEAVE_LOW_EXPR;
4699 perm_stmt = gimple_build_assign_with_ops (high_code, perm_dest,
4701 high = make_ssa_name (perm_dest, perm_stmt);
4702 gimple_assign_set_lhs (perm_stmt, high);
4703 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4704 VEC_replace (tree, *result_chain, 2*j, high);
4706 /* Create interleaving stmt:
4707 in the case of big endian:
4708 low = interleave_low (vect1, vect2)
4709 and in the case of little endian:
4710 low = interleave_high (vect1, vect2). */
4711 perm_dest = create_tmp_var (vectype, "vect_inter_low");
4712 DECL_GIMPLE_REG_P (perm_dest) = 1;
4713 add_referenced_var (perm_dest);
4714 perm_stmt = gimple_build_assign_with_ops (low_code, perm_dest,
4716 low = make_ssa_name (perm_dest, perm_stmt);
4717 gimple_assign_set_lhs (perm_stmt, low);
4718 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4719 VEC_replace (tree, *result_chain, 2*j+1, low);
4721 dr_chain = VEC_copy (tree, heap, *result_chain);
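/* Editor's illustration: a scalar model of the permutation network
   above for LENGTH = 4 vectors of 8 elements (rows of a flat array
   stand in for vector registers; the endianness handling is omitted).
   Each of the exact_log2 (LENGTH) stages pairs row J with row
   J + LENGTH/2 and writes their interleave-high/low into rows 2J and
   2J+1, reproducing the element orders worked out in the comment.  */

static void
model_permute_store_chain (int chain[4][8])
{
  int tmp[4][8];
  int stage, j, k;

  for (stage = 0; stage < 2; stage++)        /* log2 (4) == 2 stages */
    {
      for (j = 0; j < 2; j++)
        for (k = 0; k < 4; k++)
          {
            /* interleave_high of rows j and j+2 -> row 2j ...  */
            tmp[2 * j][2 * k]     = chain[j][k];
            tmp[2 * j][2 * k + 1] = chain[j + 2][k];
            /* ... and interleave_low -> row 2j+1.  */
            tmp[2 * j + 1][2 * k]     = chain[j][k + 4];
            tmp[2 * j + 1][2 * k + 1] = chain[j + 2][k + 4];
          }
      for (j = 0; j < 4; j++)
        for (k = 0; k < 8; k++)
          chain[j][k] = tmp[j][k];
    }
}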
4727 /* Function vectorizable_store.
4729 Check if STMT defines a non-scalar data-ref (array/pointer/structure) that
4731 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
4732 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
4733 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
4736 vectorizable_store (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt,
4742 tree vec_oprnd = NULL_TREE;
4743 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4744 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info), *first_dr = NULL;
4745 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4746 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4747 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4748 enum machine_mode vec_mode;
4750 enum dr_alignment_support alignment_support_scheme;
4753 enum vect_def_type dt;
4754 stmt_vec_info prev_stmt_info = NULL;
4755 tree dataref_ptr = NULL_TREE;
4756 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
4757 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
4759 gimple next_stmt, first_stmt = NULL;
4760 bool strided_store = false;
4761 unsigned int group_size, i;
4762 VEC(tree,heap) *dr_chain = NULL, *oprnds = NULL, *result_chain = NULL;
4764 VEC(tree,heap) *vec_oprnds = NULL;
4765 bool slp = (slp_node != NULL);
4766 stmt_vec_info first_stmt_vinfo;
4767 unsigned int vec_num;
4769 /* FORNOW: SLP with multiple types is not supported. The SLP analysis verifies
4770 this, so we can safely override NCOPIES with 1 here. */
4774 gcc_assert (ncopies >= 1);
4776 /* FORNOW. This restriction should be relaxed. */
4777 if (nested_in_vect_loop_p (loop, stmt) && ncopies > 1)
4779 if (vect_print_dump_info (REPORT_DETAILS))
4780 fprintf (vect_dump, "multiple types in nested loop.");
4784 if (!STMT_VINFO_RELEVANT_P (stmt_info))
4787 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
4790 /* Is vectorizable store? */
4792 if (!is_gimple_assign (stmt))
4795 scalar_dest = gimple_assign_lhs (stmt);
4796 if (TREE_CODE (scalar_dest) != ARRAY_REF
4797 && TREE_CODE (scalar_dest) != INDIRECT_REF
4798 && !STMT_VINFO_STRIDED_ACCESS (stmt_info))
4801 gcc_assert (gimple_assign_single_p (stmt));
4802 op = gimple_assign_rhs1 (stmt);
4803 if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt))
4805 if (vect_print_dump_info (REPORT_DETAILS))
4806 fprintf (vect_dump, "use not simple.");
4810 /* If accesses through a pointer to vectype do not alias the original
4811 memory reference, we have a problem.  */
4812 if (get_alias_set (vectype) != get_alias_set (TREE_TYPE (scalar_dest))
4813 && !alias_set_subset_of (get_alias_set (vectype),
4814 get_alias_set (TREE_TYPE (scalar_dest))))
4816 if (vect_print_dump_info (REPORT_DETAILS))
4817 fprintf (vect_dump, "vector type does not alias scalar type");
4821 if (!useless_type_conversion_p (TREE_TYPE (op), TREE_TYPE (scalar_dest)))
4823 if (vect_print_dump_info (REPORT_DETAILS))
4824 fprintf (vect_dump, "operands of different types");
4828 vec_mode = TYPE_MODE (vectype);
4829 /* FORNOW. In some cases can vectorize even if data-type not supported
4830 (e.g. - array initialization with 0). */
4831 if (optab_handler (mov_optab, (int)vec_mode)->insn_code == CODE_FOR_nothing)
4834 if (!STMT_VINFO_DATA_REF (stmt_info))
4837 if (STMT_VINFO_STRIDED_ACCESS (stmt_info))
4839 strided_store = true;
4840 first_stmt = DR_GROUP_FIRST_DR (stmt_info);
4841 if (!vect_strided_store_supported (vectype)
4842 && !PURE_SLP_STMT (stmt_info) && !slp)
4845 if (first_stmt == stmt)
4847 /* STMT is the leader of the group. Check the operands of all the
4848 stmts of the group. */
4849 next_stmt = DR_GROUP_NEXT_DR (stmt_info);
4852 gcc_assert (gimple_assign_single_p (next_stmt));
4853 op = gimple_assign_rhs1 (next_stmt);
4854 if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt))
4856 if (vect_print_dump_info (REPORT_DETAILS))
4857 fprintf (vect_dump, "use not simple.");
4860 next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt));
4865 if (!vec_stmt) /* transformation not required. */
4867 STMT_VINFO_TYPE (stmt_info) = store_vec_info_type;
4868 if (!PURE_SLP_STMT (stmt_info))
4869 vect_model_store_cost (stmt_info, ncopies, dt, NULL);
4877 first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
4878 group_size = DR_GROUP_SIZE (vinfo_for_stmt (first_stmt));
4880 DR_GROUP_STORE_COUNT (vinfo_for_stmt (first_stmt))++;
4883 gcc_assert (!nested_in_vect_loop_p (loop, stmt));
4885 /* We vectorize all the stmts of the interleaving group when we
4886 reach the last stmt in the group. */
4887 if (DR_GROUP_STORE_COUNT (vinfo_for_stmt (first_stmt))
4888 < DR_GROUP_SIZE (vinfo_for_stmt (first_stmt))
4896 strided_store = false;
4898 /* VEC_NUM is the number of vect stmts to be created for this group. */
4899 if (slp && SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node) < group_size)
4900 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
4902 vec_num = group_size;
4908 group_size = vec_num = 1;
4909 first_stmt_vinfo = stmt_info;
4912 if (vect_print_dump_info (REPORT_DETAILS))
4913 fprintf (vect_dump, "transform store. ncopies = %d",ncopies);
4915 dr_chain = VEC_alloc (tree, heap, group_size);
4916 oprnds = VEC_alloc (tree, heap, group_size);
4918 alignment_support_scheme = vect_supportable_dr_alignment (first_dr);
4919 gcc_assert (alignment_support_scheme);
4920 gcc_assert (alignment_support_scheme == dr_aligned); /* FORNOW */
4922 /* In case the vectorization factor (VF) is bigger than the number
4923 of elements that we can fit in a vectype (nunits), we have to generate
4924 more than one vector stmt - i.e. - we need to "unroll" the
4925 vector stmt by a factor VF/nunits. For more details see documentation in
4926 vect_get_vec_def_for_copy_stmt. */
4928 /* In case of interleaving (non-unit strided access):
4935 We create vectorized stores starting from the base address (the access of
4936 the first stmt in the chain, S2 in the above example) when the last store
4937 stmt of the chain (S4) is reached:
4940 VS2: &base + vec_size*1 = vx0
4941 VS3: &base + vec_size*2 = vx1
4942 VS4: &base + vec_size*3 = vx3
4944 Then permutation statements are generated:
4946 VS5: vx5 = VEC_INTERLEAVE_HIGH_EXPR < vx0, vx3 >
4947 VS6: vx6 = VEC_INTERLEAVE_LOW_EXPR < vx0, vx3 >
4950 And they are put in STMT_VINFO_VEC_STMT of the corresponding scalar stmts
4951 (the order of the data-refs in the output of vect_permute_store_chain
4952 corresponds to the order of scalar stmts in the interleaving chain - see
4953 the documentation of vect_permute_store_chain()).
4955 In case of both multiple types and interleaving, the above vector stores and
4956 permutation stmts are created for every copy. The result vector stmts are
4957 put in STMT_VINFO_VEC_STMT for the first copy and in the corresponding
4958 STMT_VINFO_RELATED_STMT for the next copies.
4961 prev_stmt_info = NULL;
4962 for (j = 0; j < ncopies; j++)
4971 /* Get vectorized arguments for SLP_NODE. */
4972 vect_get_slp_defs (slp_node, &vec_oprnds, NULL);
4974 vec_oprnd = VEC_index (tree, vec_oprnds, 0);
4978 /* For interleaved stores we collect vectorized defs for all the
4979 stores in the group in DR_CHAIN and OPRNDS. DR_CHAIN is then
4980 used as an input to vect_permute_store_chain(), and OPRNDS as
4981 an input to vect_get_vec_def_for_stmt_copy() for the next copy.
4983 If the store is not strided, GROUP_SIZE is 1, and DR_CHAIN and
4984 OPRNDS are of size 1. */
4985 next_stmt = first_stmt;
4986 for (i = 0; i < group_size; i++)
4988 /* Since gaps are not supported for interleaved stores,
4989 GROUP_SIZE is the exact number of stmts in the chain.
4990 Therefore, NEXT_STMT can't be NULL.  In case that
4991 there is no interleaving, GROUP_SIZE is 1, and only one
4992 iteration of the loop will be executed. */
4993 gcc_assert (next_stmt);
4994 gcc_assert (gimple_assign_single_p (next_stmt));
4995 op = gimple_assign_rhs1 (next_stmt);
4997 vec_oprnd = vect_get_vec_def_for_operand (op, next_stmt,
4999 VEC_quick_push(tree, dr_chain, vec_oprnd);
5000 VEC_quick_push(tree, oprnds, vec_oprnd);
5001 next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt));
5005 dataref_ptr = vect_create_data_ref_ptr (first_stmt, NULL, NULL_TREE,
5006 &dummy, &ptr_incr, false,
5008 gcc_assert (!inv_p);
5012 /* FORNOW SLP doesn't work for multiple types. */
5015 /* For interleaved stores we created vectorized defs for all the
5016 defs stored in OPRNDS in the previous iteration (previous copy).
5017 DR_CHAIN is then used as an input to vect_permute_store_chain(),
5018 and OPRNDS as an input to vect_get_vec_def_for_stmt_copy() for the
5020 If the store is not strided, GROUP_SIZE is 1, and DR_CHAIN and
5021 OPRNDS are of size 1. */
5022 for (i = 0; i < group_size; i++)
5024 op = VEC_index (tree, oprnds, i);
5025 vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt);
5026 vec_oprnd = vect_get_vec_def_for_stmt_copy (dt, op);
5027 VEC_replace(tree, dr_chain, i, vec_oprnd);
5028 VEC_replace(tree, oprnds, i, vec_oprnd);
5031 bump_vector_ptr (dataref_ptr, ptr_incr, gsi, stmt, NULL_TREE);
5036 result_chain = VEC_alloc (tree, heap, group_size);
5038 if (!vect_permute_store_chain (dr_chain, group_size, stmt, gsi,
5043 next_stmt = first_stmt;
5044 for (i = 0; i < vec_num; i++)
5047 /* Bump the vector pointer. */
5048 dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, gsi, stmt,
5052 vec_oprnd = VEC_index (tree, vec_oprnds, i);
5053 else if (strided_store)
5054 /* For strided stores vectorized defs are interleaved in
5055 vect_permute_store_chain(). */
5056 vec_oprnd = VEC_index (tree, result_chain, i);
5058 data_ref = build_fold_indirect_ref (dataref_ptr);
5059 /* Arguments are ready. Create the new vector stmt. */
5060 new_stmt = gimple_build_assign (data_ref, vec_oprnd);
5061 vect_finish_stmt_generation (stmt, new_stmt, gsi);
5062 mark_symbols_for_renaming (new_stmt);
5065 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
5067 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
5069 prev_stmt_info = vinfo_for_stmt (new_stmt);
5070 next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt));
5076 VEC_free (tree, heap, dr_chain);
5077 VEC_free (tree, heap, oprnds);
5079 VEC_free (tree, heap, result_chain);
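/* Editor's illustration of the run-time effect of the code above for a
   group of two interleaved stores (GROUP_SIZE = 2, four elements per
   vector assumed): after vect_permute_store_chain, the vectors written
   at consecutive addresses leave memory interleaved as a0 b0 a1 b1 ...,
   matching the original scalar store order.  */

static void
model_interleaved_store (const int a[4], const int b[4], int out[8])
{
  int i;
  for (i = 0; i < 4; i++)
    {
      out[2 * i]     = a[i];   /* element from the first scalar stmt */
      out[2 * i + 1] = b[i];   /* element from the second scalar stmt */
    }
}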
5085 /* Function vect_setup_realignment
5087 This function is called when vectorizing an unaligned load using
5088 the dr_explicit_realign[_optimized] scheme.
5089 This function generates the following code at the loop prolog:
5092 x msq_init = *(floor(p)); # prolog load
5093 realignment_token = call target_builtin;
5095 x msq = phi (msq_init, ---)
5097 The stmts marked with x are generated only for the case of
5098 dr_explicit_realign_optimized.
5100 The code above sets up a new (vector) pointer, pointing to the first
5101 location accessed by STMT, and a "floor-aligned" load using that pointer.
5102 It also generates code to compute the "realignment-token" (if the relevant
5103 target hook was defined), and creates a phi-node at the loop-header bb
5104 whose arguments are the result of the prolog-load (created by this
5105 function) and the result of a load that takes place in the loop (to be
5106 created by the caller to this function).
5108 For the case of dr_explicit_realign_optimized:
5109 The caller to this function uses the phi-result (msq) to create the
5110 realignment code inside the loop, and sets up the missing phi argument,
5113 msq = phi (msq_init, lsq)
5114 lsq = *(floor(p')); # load in loop
5115 result = realign_load (msq, lsq, realignment_token);
5117 For the case of dr_explicit_realign:
5119 msq = *(floor(p)); # load in loop
5121 lsq = *(floor(p')); # load in loop
5122 result = realign_load (msq, lsq, realignment_token);
5125 STMT - (scalar) load stmt to be vectorized. This load accesses
5126 a memory location that may be unaligned.
5127 BSI - place where new code is to be inserted.
5128 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
5132 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
5133 target hook, if defined.
5134 Return value - the result of the loop-header phi node. */
5137 vect_setup_realignment (gimple stmt, gimple_stmt_iterator *gsi,
5138 tree *realignment_token,
5139 enum dr_alignment_support alignment_support_scheme,
5141 struct loop **at_loop)
5143 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5144 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5145 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5146 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5148 tree scalar_dest = gimple_assign_lhs (stmt);
5155 tree msq_init = NULL_TREE;
5158 tree msq = NULL_TREE;
5159 gimple_seq stmts = NULL;
5161 bool compute_in_loop = false;
5162 bool nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
5163 struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
5164 struct loop *loop_for_initial_load;
5166 gcc_assert (alignment_support_scheme == dr_explicit_realign
5167 || alignment_support_scheme == dr_explicit_realign_optimized);
5169 /* We need to generate three things:
5170 1. the misalignment computation
5171 2. the extra vector load (for the optimized realignment scheme).
5172 3. the phi node for the two vectors from which the realignment is
5173 done (for the optimized realignment scheme).
5176 /* 1. Determine where to generate the misalignment computation.
5178 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
5179 calculation will be generated by this function, outside the loop (in the
5180 preheader).  Otherwise, INIT_ADDR has already been computed for us by the
5181 caller, inside the loop.
5183 Background: If the misalignment remains fixed throughout the iterations of
5184 the loop, then both realignment schemes are applicable, and also the
5185 misalignment computation can be done outside LOOP. This is because we are
5186 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
5187 are a multiple of VS (the Vector Size), and therefore the misalignment in
5188 different vectorized LOOP iterations is always the same.
5189 The problem arises only if the memory access is in an inner-loop nested
5190 inside LOOP, which is now being vectorized using outer-loop vectorization.
5191 This is the only case when the misalignment of the memory access may not
5192 remain fixed throughout the iterations of the inner-loop (as explained in
5193 detail in vect_supportable_dr_alignment). In this case, not only is the
5194 optimized realignment scheme not applicable, but also the misalignment
5195 computation (and generation of the realignment token that is passed to
5196 REALIGN_LOAD) has to be done inside the loop.
5198 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
5199 or not, which in turn determines if the misalignment is computed inside
5200 the inner-loop, or outside LOOP. */
5202 if (init_addr != NULL_TREE)
5204 compute_in_loop = true;
5205 gcc_assert (alignment_support_scheme == dr_explicit_realign);
5209 /* 2. Determine where to generate the extra vector load.
5211 For the optimized realignment scheme, instead of generating two vector
5212 loads in each iteration, we generate a single extra vector load in the
5213 preheader of the loop, and in each iteration reuse the result of the
5214 vector load from the previous iteration. In case the memory access is in
5215 an inner-loop nested inside LOOP, which is now being vectorized using
5216 outer-loop vectorization, we need to determine whether this initial vector
5217 load should be generated at the preheader of the inner-loop, or can be
5218 generated at the preheader of LOOP. If the memory access has no evolution
5219 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
5220 to be generated inside LOOP (in the preheader of the inner-loop). */
5222 if (nested_in_vect_loop)
5224 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
5225 bool invariant_in_outerloop =
5226 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
5227 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
5230 loop_for_initial_load = loop;
5232 *at_loop = loop_for_initial_load;
5234 /* 3. For the case of the optimized realignment, create the first vector
5235 load at the loop preheader. */
5237 if (alignment_support_scheme == dr_explicit_realign_optimized)
5239 /* Create msq_init = *(floor(p1)) in the loop preheader */
5241 gcc_assert (!compute_in_loop);
5242 pe = loop_preheader_edge (loop_for_initial_load);
5243 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5244 ptr = vect_create_data_ref_ptr (stmt, loop_for_initial_load, NULL_TREE,
5245 &init_addr, &inc, true, &inv_p);
5246 data_ref = build1 (ALIGN_INDIRECT_REF, vectype, ptr);
5247 new_stmt = gimple_build_assign (vec_dest, data_ref);
5248 new_temp = make_ssa_name (vec_dest, new_stmt);
5249 gimple_assign_set_lhs (new_stmt, new_temp);
5250 mark_symbols_for_renaming (new_stmt);
5251 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5252 gcc_assert (!new_bb);
5253 msq_init = gimple_assign_lhs (new_stmt);
5256 /* 4. Create realignment token using a target builtin, if available.
5257 It is done either inside the containing loop, or before LOOP (as
5258 determined above). */
5260 if (targetm.vectorize.builtin_mask_for_load)
5264 /* Compute INIT_ADDR - the initial address accessed by this memref. */
5265 if (compute_in_loop)
5266 gcc_assert (init_addr); /* already computed by the caller. */
5269 /* Generate the INIT_ADDR computation outside LOOP. */
5270 init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
5272 pe = loop_preheader_edge (loop);
5273 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
5274 gcc_assert (!new_bb);
5277 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5278 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
5280 vect_create_destination_var (scalar_dest,
5281 gimple_call_return_type (new_stmt));
5282 new_temp = make_ssa_name (vec_dest, new_stmt);
5283 gimple_call_set_lhs (new_stmt, new_temp);
5285 if (compute_in_loop)
5286 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5289 /* Generate the misalignment computation outside LOOP. */
5290 pe = loop_preheader_edge (loop);
5291 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5292 gcc_assert (!new_bb);
5295 *realignment_token = gimple_call_lhs (new_stmt);
5297 /* The result of the CALL_EXPR to this builtin is determined from
5298 the value of the parameter, and no global variables are touched,
5299 which makes the builtin a "const" function.  Requiring the
5300 builtin to have the "const" attribute makes it unnecessary
5301 to call mark_call_clobbered. */
5302 gcc_assert (TREE_READONLY (builtin_decl));
5305 if (alignment_support_scheme == dr_explicit_realign)
5308 gcc_assert (!compute_in_loop);
5309 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5312 /* 5. Create msq = phi <msq_init, lsq> in loop */
5314 pe = loop_preheader_edge (containing_loop);
5315 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5316 msq = make_ssa_name (vec_dest, NULL);
5317 phi_stmt = create_phi_node (msq, containing_loop->header);
5318 SSA_NAME_DEF_STMT (msq) = phi_stmt;
5319 add_phi_arg (phi_stmt, msq_init, pe);
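/* Editor's illustration: a byte-level model of what the generated code
   computes at run time (a 4-byte vector size assumed; the real
   realignment token and permute semantics are target-specific).  The
   two floor-aligned loads MSQ and LSQ straddle the unaligned address,
   and REALIGN_LOAD selects the vector-size consecutive bytes starting
   at the misalignment offset.  */

static void
model_realign_load (const unsigned char msq[4], const unsigned char lsq[4],
                    int misalign, unsigned char result[4])
{
  int i;
  for (i = 0; i < 4; i++)
    result[i] = (misalign + i < 4
                 ? msq[misalign + i]        /* bytes from the first load */
                 : lsq[misalign + i - 4]);  /* bytes from the second load */
}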
5325 /* Function vect_strided_load_supported.
5327 Returns TRUE if EXTRACT_EVEN and EXTRACT_ODD operations are supported,
5328 and FALSE otherwise. */
5331 vect_strided_load_supported (tree vectype)
5333 optab perm_even_optab, perm_odd_optab;
5336 mode = (int) TYPE_MODE (vectype);
5338 perm_even_optab = optab_for_tree_code (VEC_EXTRACT_EVEN_EXPR, vectype,
5340 if (!perm_even_optab)
5342 if (vect_print_dump_info (REPORT_DETAILS))
5343 fprintf (vect_dump, "no optab for perm_even.");
5347 if (optab_handler (perm_even_optab, mode)->insn_code == CODE_FOR_nothing)
5349 if (vect_print_dump_info (REPORT_DETAILS))
5350 fprintf (vect_dump, "perm_even op not supported by target.");
5354 perm_odd_optab = optab_for_tree_code (VEC_EXTRACT_ODD_EXPR, vectype,
5356 if (!perm_odd_optab)
5358 if (vect_print_dump_info (REPORT_DETAILS))
5359 fprintf (vect_dump, "no optab for perm_odd.");
5363 if (optab_handler (perm_odd_optab, mode)->insn_code == CODE_FOR_nothing)
5365 if (vect_print_dump_info (REPORT_DETAILS))
5366 fprintf (vect_dump, "perm_odd op not supported by target.");
5373 /* Function vect_permute_load_chain.
5375 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5376 a power of 2, generate extract_even/odd stmts to reorder the input data
5377 correctly. Return the final references for loads in RESULT_CHAIN.
5379 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5380 The input is 4 vectors, each containing 8 elements.  We assign a number to
5381 each element; the input sequence is:
5383 1st vec: 0 1 2 3 4 5 6 7
5384 2nd vec: 8 9 10 11 12 13 14 15
5385 3rd vec: 16 17 18 19 20 21 22 23
5386 4th vec: 24 25 26 27 28 29 30 31
5388 The output sequence should be:
5390 1st vec: 0 4 8 12 16 20 24 28
5391 2nd vec: 1 5 9 13 17 21 25 29
5392 3rd vec: 2 6 10 14 18 22 26 30
5393 4th vec: 3 7 11 15 19 23 27 31
5395 i.e., the first output vector should contain the first elements of each
5396 interleaving group, etc.
5398 We use extract_even/odd instructions to create such output. The input of each
5399 extract_even/odd operation is two vectors
5403 and the output is the vector of extracted even/odd elements. The output of
5404 extract_even will be: 0 2 4 6
5405 and of extract_odd: 1 3 5 7
5408 The permutation is done in log2 (LENGTH) stages.  In each stage extract_even and
5409 extract_odd stmts are created for each pair of vectors in DR_CHAIN in their
5410 order. In our example,
5412 E1: extract_even (1st vec, 2nd vec)
5413 E2: extract_odd (1st vec, 2nd vec)
5414 E3: extract_even (3rd vec, 4th vec)
5415 E4: extract_odd (3rd vec, 4th vec)
5417 The output for the first stage will be:
5419 E1: 0 2 4 6 8 10 12 14
5420 E2: 1 3 5 7 9 11 13 15
5421 E3: 16 18 20 22 24 26 28 30
5422 E4: 17 19 21 23 25 27 29 31
5424 In order to proceed and create the correct sequence for the next stage (or
5425 for the correct output, if the second stage is the last one, as in our
5426 example), we first put the output of the extract_even operation and then the
5427 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5428 The input for the second stage is:
5430 1st vec (E1): 0 2 4 6 8 10 12 14
5431 2nd vec (E3): 16 18 20 22 24 26 28 30
5432 3rd vec (E2): 1 3 5 7 9 11 13 15
5433 4th vec (E4): 17 19 21 23 25 27 29 31
5435 The output of the second stage:
5437 E1: 0 4 8 12 16 20 24 28
5438 E2: 2 6 10 14 18 22 26 30
5439 E3: 1 5 9 13 17 21 25 29
5440 E4: 3 7 11 15 19 23 27 31
5442 And RESULT_CHAIN after reordering:
5444 1st vec (E1): 0 4 8 12 16 20 24 28
5445 2nd vec (E3): 1 5 9 13 17 21 25 29
5446 3rd vec (E2): 2 6 10 14 18 22 26 30
5447 4th vec (E4): 3 7 11 15 19 23 27 31. */
5450 vect_permute_load_chain (VEC(tree,heap) *dr_chain,
5451 unsigned int length,
5453 gimple_stmt_iterator *gsi,
5454 VEC(tree,heap) **result_chain)
5456 tree perm_dest, data_ref, first_vect, second_vect;
5458 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5462 /* Check that the operation is supported. */
5463 if (!vect_strided_load_supported (vectype))
5466 *result_chain = VEC_copy (tree, heap, dr_chain);
5467 for (i = 0; i < exact_log2 (length); i++)
5469 for (j = 0; j < length; j +=2)
5471 first_vect = VEC_index (tree, dr_chain, j);
5472 second_vect = VEC_index (tree, dr_chain, j+1);
5474 /* data_ref = permute_even (first_data_ref, second_data_ref); */
5475 perm_dest = create_tmp_var (vectype, "vect_perm_even");
5476 DECL_GIMPLE_REG_P (perm_dest) = 1;
5477 add_referenced_var (perm_dest);
5479 perm_stmt = gimple_build_assign_with_ops (VEC_EXTRACT_EVEN_EXPR,
5480 perm_dest, first_vect,
5483 data_ref = make_ssa_name (perm_dest, perm_stmt);
5484 gimple_assign_set_lhs (perm_stmt, data_ref);
5485 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5486 mark_symbols_for_renaming (perm_stmt);
5488 VEC_replace (tree, *result_chain, j/2, data_ref);
5490 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
5491 perm_dest = create_tmp_var (vectype, "vect_perm_odd");
5492 DECL_GIMPLE_REG_P (perm_dest) = 1;
5493 add_referenced_var (perm_dest);
5495 perm_stmt = gimple_build_assign_with_ops (VEC_EXTRACT_ODD_EXPR,
5496 perm_dest, first_vect,
5498 data_ref = make_ssa_name (perm_dest, perm_stmt);
5499 gimple_assign_set_lhs (perm_stmt, data_ref);
5500 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5501 mark_symbols_for_renaming (perm_stmt);
5503 VEC_replace (tree, *result_chain, j/2+length/2, data_ref);
5505 dr_chain = VEC_copy (tree, heap, *result_chain);
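/* Editor's illustration: a scalar model of the load permutation above
   for LENGTH = 4 vectors of 8 elements (rows stand in for vector
   registers).  Each stage walks the rows in pairs; the extract_even
   results fill the first half of the output chain and the extract_odd
   results the second half, matching the VEC_replace calls above.  */

static void
model_permute_load_chain (int chain[4][8])
{
  int tmp[4][8];
  int stage, j, k;

  for (stage = 0; stage < 2; stage++)        /* log2 (4) == 2 stages */
    {
      for (j = 0; j < 4; j += 2)
        for (k = 0; k < 4; k++)
          {
            /* extract_even of rows j and j+1 -> row j/2 ...  */
            tmp[j / 2][k]     = chain[j][2 * k];
            tmp[j / 2][k + 4] = chain[j + 1][2 * k];
            /* ... and extract_odd -> row j/2 + LENGTH/2.  */
            tmp[j / 2 + 2][k]     = chain[j][2 * k + 1];
            tmp[j / 2 + 2][k + 4] = chain[j + 1][2 * k + 1];
          }
      for (j = 0; j < 4; j++)
        for (k = 0; k < 8; k++)
          chain[j][k] = tmp[j][k];
    }
}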
5511 /* Function vect_transform_strided_load.
5513 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
5514 to perform their permutation and ascribe the resulting vectorized statements to
5515 the scalar statements.
5519 vect_transform_strided_load (gimple stmt, VEC(tree,heap) *dr_chain, int size,
5520 gimple_stmt_iterator *gsi)
5522 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5523 gimple first_stmt = DR_GROUP_FIRST_DR (stmt_info);
5524 gimple next_stmt, new_stmt;
5525 VEC(tree,heap) *result_chain = NULL;
5526 unsigned int i, gap_count;
5529 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
5530 RESULT_CHAIN is the output of vect_permute_load_chain; it contains permuted
5531 vectors that are ready for vector computation.  */
5532 result_chain = VEC_alloc (tree, heap, size);
5534 if (!vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain))
5537 /* Put a permuted data-ref in the VECTORIZED_STMT field.
5538 Since we scan the chain starting from its first node, their order
5539 corresponds to the order of data-refs in RESULT_CHAIN.  */
5540 next_stmt = first_stmt;
5542 for (i = 0; VEC_iterate (tree, result_chain, i, tmp_data_ref); i++)
5547 /* Skip the gaps.  Loads created for the gaps will be removed by the dead
5548 code elimination pass later.  No need to check for the first stmt in
5549 the group, since it always exists.
5550 DR_GROUP_GAP is the number of steps in elements from the previous
5551 access (if there is no gap DR_GROUP_GAP is 1). We skip loads that
5552 correspond to the gaps.
5554 if (next_stmt != first_stmt
5555 && gap_count < DR_GROUP_GAP (vinfo_for_stmt (next_stmt)))
5563 new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
5564 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
5565 copies, and we put the new vector statement in the first available
5567 if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
5568 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
5572 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
5574 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
5577 prev_stmt = rel_stmt;
5578 rel_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
5580 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) = new_stmt;
5582 next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt));
5584 /* If NEXT_STMT accesses the same DR as the previous statement,
5585 put the same TMP_DATA_REF as its vectorized statement; otherwise
5586 get the next data-ref from RESULT_CHAIN. */
5587 if (!next_stmt || !DR_GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5592 VEC_free (tree, heap, result_chain);
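/* Editor's illustration of the gap handling above, with model arrays:
   GAPS[S] plays the role of DR_GROUP_GAP for the S-th stmt in the chain
   (1 means adjacent to the previous access; GAPS[0] is taken to be 1),
   and a gap of G skips the G-1 permuted vectors that were only loaded
   to cover the holes before taking the next one as the stmt's
   vectorized def.  */

static void
model_assign_permuted_refs (const int gaps[], int nstmts,
                            const int result_chain[], int vec_stmts[])
{
  int i = 0, s;

  for (s = 0; s < nstmts; s++)
    {
      i += gaps[s] - 1;              /* skip loads created for the gaps */
      vec_stmts[s] = result_chain[i++];
    }
}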
5597 /* Function vectorizable_load.
5599 Check if STMT reads a non-scalar data-ref (array/pointer/structure) that
5601 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
5602 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
5603 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
5606 vectorizable_load (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt,
5610 tree vec_dest = NULL;
5611 tree data_ref = NULL;
5612 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5613 stmt_vec_info prev_stmt_info;
5614 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5615 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5616 struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
5617 bool nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
5618 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info), *first_dr;
5619 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5622 gimple new_stmt = NULL;
5624 enum dr_alignment_support alignment_support_scheme;
5625 tree dataref_ptr = NULL_TREE;
5627 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
5628 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
5629 int i, j, group_size;
5630 tree msq = NULL_TREE, lsq;
5631 tree offset = NULL_TREE;
5632 tree realignment_token = NULL_TREE;
5634 VEC(tree,heap) *dr_chain = NULL;
5635 bool strided_load = false;
5639 bool compute_in_loop = false;
5640 struct loop *at_loop;
5642 bool slp = (slp_node != NULL);
5643 enum tree_code code;
5645 /* FORNOW: SLP with multiple types is not supported. The SLP analysis verifies
5646 this, so we can safely override NCOPIES with 1 here. */
5650 gcc_assert (ncopies >= 1);
5652 /* FORNOW. This restriction should be relaxed. */
5653 if (nested_in_vect_loop && ncopies > 1)
5655 if (vect_print_dump_info (REPORT_DETAILS))
5656 fprintf (vect_dump, "multiple types in nested loop.");
5660 if (!STMT_VINFO_RELEVANT_P (stmt_info))
5663 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
5666 /* Is vectorizable load? */
5667 if (!is_gimple_assign (stmt))
5670 scalar_dest = gimple_assign_lhs (stmt);
5671 if (TREE_CODE (scalar_dest) != SSA_NAME)
5674 code = gimple_assign_rhs_code (stmt);
5675 if (code != ARRAY_REF
5676 && code != INDIRECT_REF
5677 && !STMT_VINFO_STRIDED_ACCESS (stmt_info))
5680 if (!STMT_VINFO_DATA_REF (stmt_info))
5683 scalar_type = TREE_TYPE (DR_REF (dr));
5684 mode = (int) TYPE_MODE (vectype);
5686 /* FORNOW. In some cases can vectorize even if data-type not supported
5687 (e.g. - data copies). */
5688 if (optab_handler (mov_optab, mode)->insn_code == CODE_FOR_nothing)
5690 if (vect_print_dump_info (REPORT_DETAILS))
5691 fprintf (vect_dump, "Aligned load, but unsupported type.");
5695 /* If accesses through a pointer to vectype do not alias the original
5696 memory reference, we have a problem.  */
5697 if (get_alias_set (vectype) != get_alias_set (scalar_type)
5698 && !alias_set_subset_of (get_alias_set (vectype),
5699 get_alias_set (scalar_type)))
5701 if (vect_print_dump_info (REPORT_DETAILS))
5702 fprintf (vect_dump, "vector type does not alias scalar type");
5706 /* Check if the load is a part of an interleaving chain. */
5707 if (STMT_VINFO_STRIDED_ACCESS (stmt_info))
5709 strided_load = true;
5711 gcc_assert (! nested_in_vect_loop);
5713 /* Check if interleaving is supported. */
5714 if (!vect_strided_load_supported (vectype)
5715 && !PURE_SLP_STMT (stmt_info) && !slp)
5719 if (!vec_stmt) /* transformation not required. */
5721 STMT_VINFO_TYPE (stmt_info) = load_vec_info_type;
5722 vect_model_load_cost (stmt_info, ncopies, NULL);
5726 if (vect_print_dump_info (REPORT_DETAILS))
5727 fprintf (vect_dump, "transform load.");
5733 first_stmt = DR_GROUP_FIRST_DR (stmt_info);
5734 /* Check if the chain of loads is already vectorized. */
5735 if (STMT_VINFO_VEC_STMT (vinfo_for_stmt (first_stmt)))
5737 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
5740 first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
5741 group_size = DR_GROUP_SIZE (vinfo_for_stmt (first_stmt));
5742 dr_chain = VEC_alloc (tree, heap, group_size);
5744 /* VEC_NUM is the number of vect stmts to be created for this group. */
5747 strided_load = false;
5748 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
5751 vec_num = group_size;
5757 group_size = vec_num = 1;
5760 alignment_support_scheme = vect_supportable_dr_alignment (first_dr);
5761 gcc_assert (alignment_support_scheme);
  /* In case the vectorization factor (VF) is bigger than the number
     of elements that we can fit in a vectype (nunits), we have to generate
     more than one vector stmt - i.e., we need to "unroll" the
     vector stmt by a factor VF/nunits.  In doing so, we record a pointer
     from one copy of the vector stmt to the next, in the field
     STMT_VINFO_RELATED_STMT.  This is necessary in order to allow following
     stages to find the correct vector defs to be used when vectorizing
     stmts that use the defs of the current stmt.  The example below
     illustrates the vectorization process when VF=16 and nunits=4 (i.e.,
     we need to create 4 vectorized stmts):

     before vectorization:
                                RELATED_STMT    VEC_STMT
        S1:     x = memref      -               -
        S2:     z = x + 1       -               -

     step 1: vectorize stmt S1:
     We first create the vector stmt VS1_0, and, as usual, record a
     pointer to it in the STMT_VINFO_VEC_STMT of the scalar stmt S1.
     Next, we create the vector stmt VS1_1, and record a pointer to
     it in the STMT_VINFO_RELATED_STMT of the vector stmt VS1_0.
     Similarly, for VS1_2 and VS1_3.  This is the resulting chain of
     stmts and pointers:
                                RELATED_STMT    VEC_STMT
        VS1_0:  vx0 = memref0   VS1_1           -
        VS1_1:  vx1 = memref1   VS1_2           -
        VS1_2:  vx2 = memref2   VS1_3           -
        VS1_3:  vx3 = memref3   -               -
        S1:     x = load        -               VS1_0
        S2:     z = x + 1       -               -

     See the documentation in vect_get_vec_def_for_stmt_copy for how the
     information we recorded in the RELATED_STMT field is used to
     vectorize stmt S2.  */
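  /* A minimal model of the copy chain above (an illustrative sketch only;
     struct vstmt and nth_copy are hypothetical stand-ins for stmt_vec_info
     and the STMT_VINFO_RELATED_STMT accessor, not GCC internals):

        struct vstmt
        {
          struct vstmt *related_stmt;
        };

        static struct vstmt *
        nth_copy (struct vstmt *vec_stmt, int j)
        {
          while (j-- > 0)
            vec_stmt = vec_stmt->related_stmt;
          return vec_stmt;
        }

     Starting from VS1_0 (recorded in the VEC_STMT field of S1), nth_copy
     returns VS1_j, which is how later stages locate the j-th vector def
     of an unrolled stmt.  */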
  /* In case of interleaving (non-unit strided access):

     S1:  x2 = &base + 2
     S2:  x0 = &base
     S3:  x1 = &base + 1
     S4:  x3 = &base + 3

     Vectorized loads are created in the order of memory accesses
     starting from the access of the first stmt of the chain:

     VS1: vx0 = &base
     VS2: vx1 = &base + vec_size*1
     VS3: vx3 = &base + vec_size*2
     VS4: vx4 = &base + vec_size*3

     Then permutation statements are generated:

     VS5: vx5 = VEC_EXTRACT_EVEN_EXPR < vx0, vx1 >
     VS6: vx6 = VEC_EXTRACT_ODD_EXPR < vx0, vx1 >
     VS7: vx7 = VEC_EXTRACT_EVEN_EXPR < vx3, vx4 >
     VS8: vx8 = VEC_EXTRACT_ODD_EXPR < vx3, vx4 >
5819 And they are put in STMT_VINFO_VEC_STMT of the corresponding scalar stmts
5820 (the order of the data-refs in the output of vect_permute_load_chain
5821 corresponds to the order of scalar stmts in the interleaving chain - see
5822 the documentation of vect_permute_load_chain()).
5823 The generation of permutation stmts and recording them in
5824 STMT_VINFO_VEC_STMT is done in vect_transform_strided_load().
5826 In case of both multiple types and interleaving, the vector loads and
5827 permutation stmts above are created for every copy. The result vector stmts
5828 are put in STMT_VINFO_VEC_STMT for the first copy and in the corresponding
5829 STMT_VINFO_RELATED_STMT for the next copies. */
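  /* For intuition, a scalar model of the extract-even/odd permutation used
     above (an illustrative sketch only, not GCC internals).  Two interleaved
     streams laid out in memory as {a0,b0,a1,b1,...} are split back into
     {a0,a1,...} and {b0,b1,...}:

        static void
        deinterleave2 (const int in[8], int even[4], int odd[4])
        {
          int i;
          for (i = 0; i < 4; i++)
            {
              even[i] = in[2 * i];
              odd[i] = in[2 * i + 1];
            }
        }

     vect_permute_load_chain applies log2 (group_size) such even/odd splits
     to recover each of the interleaved streams.  */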
5831 /* If the data reference is aligned (dr_aligned) or potentially unaligned
5832 on a target that supports unaligned accesses (dr_unaligned_supported)
     we generate the following code:

         p = initial_addr;
         indx = 0;
         loop {
           p = p + indx * vectype_size;
           vec_dest = *(p);
           indx = indx + 1;
         }
5842 Otherwise, the data reference is potentially unaligned on a target that
5843 does not support unaligned accesses (dr_explicit_realign_optimized) -
5844 then generate the following code, in which the data in each iteration is
5845 obtained by two vector loads, one from the previous iteration, and one
5846 from the current iteration:
         p1 = initial_addr;
         msq_init = *(floor(p1))
         p2 = initial_addr + VS - 1;
         realignment_token = call target_builtin;
         indx = 0;
         loop {
           p2 = p2 + indx * vectype_size
           lsq = *(floor(p2))
           vec_dest = realign_load (msq, lsq, realignment_token)
           indx = indx + 1
           msq = lsq;
         }  */
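  /* A scalar model of the realign_load operation used above (an
     illustrative sketch only; realign_load_model is a hypothetical helper,
     not a GCC or target function).  Given the two aligned vectors MSQ and
     LSQ that straddle the unaligned address, the desired vector consists of
     the VS consecutive elements starting at the element misalignment MIS;
     here for 4-element vectors:

        static void
        realign_load_model (const int msq[4], const int lsq[4],
                            int mis, int dest[4])
        {
          int i;
          for (i = 0; i < 4; i++)
            dest[i] = (i + mis < 4) ? msq[i + mis] : lsq[i + mis - 4];
        }

     The realignment_token allows the target to encode MIS implicitly
     (e.g. as a permute mask computed from the address).  */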
5860 /* If the misalignment remains the same throughout the execution of the
5861 loop, we can create the init_addr and permutation mask at the loop
5862 preheader. Otherwise, it needs to be created inside the loop.
5863 This can only occur when vectorizing memory accesses in the inner-loop
5864 nested within an outer-loop that is being vectorized. */
5866 if (nested_in_vect_loop_p (loop, stmt)
5867 && (TREE_INT_CST_LOW (DR_STEP (dr))
5868 % GET_MODE_SIZE (TYPE_MODE (vectype)) != 0))
5870 gcc_assert (alignment_support_scheme != dr_explicit_realign_optimized);
5871 compute_in_loop = true;
5874 if ((alignment_support_scheme == dr_explicit_realign_optimized
5875 || alignment_support_scheme == dr_explicit_realign)
5876 && !compute_in_loop)
      msq = vect_setup_realignment (first_stmt, gsi, &realignment_token,
                                    alignment_support_scheme, NULL_TREE,
                                    &at_loop);
5881 if (alignment_support_scheme == dr_explicit_realign_optimized)
5883 phi = SSA_NAME_DEF_STMT (msq);
5884 offset = size_int (TYPE_VECTOR_SUBPARTS (vectype) - 1);
5890 prev_stmt_info = NULL;
5891 for (j = 0; j < ncopies; j++)
5893 /* 1. Create the vector pointer update chain. */
5895 dataref_ptr = vect_create_data_ref_ptr (first_stmt,
5897 &dummy, &ptr_incr, false,
5901 bump_vector_ptr (dataref_ptr, ptr_incr, gsi, stmt, NULL_TREE);
5903 for (i = 0; i < vec_num; i++)
5906 dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, gsi, stmt,
5909 /* 2. Create the vector-load in the loop. */
5910 switch (alignment_support_scheme)
5913 gcc_assert (aligned_access_p (first_dr));
5914 data_ref = build_fold_indirect_ref (dataref_ptr);
5916 case dr_unaligned_supported:
5918 int mis = DR_MISALIGNMENT (first_dr);
5919 tree tmis = (mis == -1 ? size_zero_node : size_int (mis));
            tmis = size_binop (MULT_EXPR, tmis, size_int (BITS_PER_UNIT));
            data_ref =
              build2 (MISALIGNED_INDIRECT_REF, vectype, dataref_ptr, tmis);
5926 case dr_explicit_realign:
5929 tree vs_minus_1 = size_int (TYPE_VECTOR_SUBPARTS (vectype) - 1);
5931 if (compute_in_loop)
              msq = vect_setup_realignment (first_stmt, gsi,
                                            &realignment_token,
                                            dr_explicit_realign,
                                            dataref_ptr, NULL);
5937 data_ref = build1 (ALIGN_INDIRECT_REF, vectype, dataref_ptr);
5938 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5939 new_stmt = gimple_build_assign (vec_dest, data_ref);
5940 new_temp = make_ssa_name (vec_dest, new_stmt);
5941 gimple_assign_set_lhs (new_stmt, new_temp);
5942 vect_finish_stmt_generation (stmt, new_stmt, gsi);
5943 copy_virtual_operands (new_stmt, stmt);
5944 mark_symbols_for_renaming (new_stmt);
5947 bump = size_binop (MULT_EXPR, vs_minus_1,
5948 TYPE_SIZE_UNIT (scalar_type));
5949 ptr = bump_vector_ptr (dataref_ptr, NULL, gsi, stmt, bump);
5950 data_ref = build1 (ALIGN_INDIRECT_REF, vectype, ptr);
5953 case dr_explicit_realign_optimized:
5954 data_ref = build1 (ALIGN_INDIRECT_REF, vectype, dataref_ptr);
5959 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5960 new_stmt = gimple_build_assign (vec_dest, data_ref);
5961 new_temp = make_ssa_name (vec_dest, new_stmt);
5962 gimple_assign_set_lhs (new_stmt, new_temp);
5963 vect_finish_stmt_generation (stmt, new_stmt, gsi);
5964 mark_symbols_for_renaming (new_stmt);
5966 /* 3. Handle explicit realignment if necessary/supported. Create in
5967 loop: vec_dest = realign_load (msq, lsq, realignment_token) */
5968 if (alignment_support_scheme == dr_explicit_realign_optimized
5969 || alignment_support_scheme == dr_explicit_realign)
5973 lsq = gimple_assign_lhs (new_stmt);
5974 if (!realignment_token)
5975 realignment_token = dataref_ptr;
5976 vec_dest = vect_create_destination_var (scalar_dest, vectype);
              tmp = build3 (REALIGN_LOAD_EXPR, vectype, msq, lsq,
                            realignment_token);
5979 new_stmt = gimple_build_assign (vec_dest, tmp);
5980 new_temp = make_ssa_name (vec_dest, new_stmt);
5981 gimple_assign_set_lhs (new_stmt, new_temp);
5982 vect_finish_stmt_generation (stmt, new_stmt, gsi);
5984 if (alignment_support_scheme == dr_explicit_realign_optimized)
5987 if (i == vec_num - 1 && j == ncopies - 1)
5988 add_phi_arg (phi, lsq, loop_latch_edge (containing_loop));
5993 /* 4. Handle invariant-load. */
5996 gcc_assert (!strided_load);
5997 gcc_assert (nested_in_vect_loop_p (loop, stmt));
6002 tree vec_inv, bitpos, bitsize = TYPE_SIZE (scalar_type);
              /* CHECKME: bitpos depends on endianness?  */
6005 bitpos = bitsize_zero_node;
              vec_inv = build3 (BIT_FIELD_REF, scalar_type, new_temp,
                                bitsize, bitpos);
              vec_dest =
                vect_create_destination_var (scalar_dest, NULL_TREE);
6010 new_stmt = gimple_build_assign (vec_dest, vec_inv);
6011 new_temp = make_ssa_name (vec_dest, new_stmt);
6012 gimple_assign_set_lhs (new_stmt, new_temp);
6013 vect_finish_stmt_generation (stmt, new_stmt, gsi);
6015 for (k = nunits - 1; k >= 0; --k)
6016 t = tree_cons (NULL_TREE, new_temp, t);
6017 /* FIXME: use build_constructor directly. */
6018 vec_inv = build_constructor_from_list (vectype, t);
6019 new_temp = vect_init_vector (stmt, vec_inv, vectype, gsi);
6020 new_stmt = SSA_NAME_DEF_STMT (new_temp);
6023 gcc_unreachable (); /* FORNOW. */
6026 /* Collect vector loads and later create their permutation in
6027 vect_transform_strided_load (). */
6029 VEC_quick_push (tree, dr_chain, new_temp);
6031 /* Store vector loads in the corresponding SLP_NODE. */
6033 VEC_quick_push (gimple, SLP_TREE_VEC_STMTS (slp_node), new_stmt);
6036 /* FORNOW: SLP with multiple types is unsupported. */
6042 if (!vect_transform_strided_load (stmt, dr_chain, group_size, gsi))
6044 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
6045 VEC_free (tree, heap, dr_chain);
6046 dr_chain = VEC_alloc (tree, heap, group_size);
6051 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
6053 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
6054 prev_stmt_info = vinfo_for_stmt (new_stmt);
6059 VEC_free (tree, heap, dr_chain);
6065 /* Function vectorizable_live_operation.
6067 STMT computes a value that is used outside the loop. Check if
6068 it can be supported. */
6071 vectorizable_live_operation (gimple stmt,
6072 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
6073 gimple *vec_stmt ATTRIBUTE_UNUSED)
6075 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6076 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6077 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6083 enum vect_def_type dt;
6084 enum tree_code code;
6085 enum gimple_rhs_class rhs_class;
6087 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
6089 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
6092 if (!is_gimple_assign (stmt))
6095 if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
6098 /* FORNOW. CHECKME. */
6099 if (nested_in_vect_loop_p (loop, stmt))
6102 code = gimple_assign_rhs_code (stmt);
6103 op_type = TREE_CODE_LENGTH (code);
6104 rhs_class = get_gimple_rhs_class (code);
6105 gcc_assert (rhs_class != GIMPLE_UNARY_RHS || op_type == unary_op);
6106 gcc_assert (rhs_class != GIMPLE_BINARY_RHS || op_type == binary_op);
6108 /* FORNOW: support only if all uses are invariant. This means
6109 that the scalar operations can remain in place, unvectorized.
6110 The original last scalar value that they compute will be used. */
6112 for (i = 0; i < op_type; i++)
6114 if (rhs_class == GIMPLE_SINGLE_RHS)
6115 op = TREE_OPERAND (gimple_op (stmt, 1), i);
6117 op = gimple_op (stmt, i + 1);
6118 if (op && !vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt))
6120 if (vect_print_dump_info (REPORT_DETAILS))
6121 fprintf (vect_dump, "use not simple.");
6125 if (dt != vect_invariant_def && dt != vect_constant_def)
6129 /* No transformation is required for the cases we currently support. */
/* Function vect_is_simple_cond.

   Input:
   LOOP - the loop that is being vectorized.
   COND - Condition that is checked for simple use.

   Returns whether a COND can be vectorized.  Checks whether the
   condition operands are supportable using vect_is_simple_use.  */
6144 vect_is_simple_cond (tree cond, loop_vec_info loop_vinfo)
6148 enum vect_def_type dt;
6150 if (!COMPARISON_CLASS_P (cond))
6153 lhs = TREE_OPERAND (cond, 0);
6154 rhs = TREE_OPERAND (cond, 1);
6156 if (TREE_CODE (lhs) == SSA_NAME)
6158 gimple lhs_def_stmt = SSA_NAME_DEF_STMT (lhs);
6159 if (!vect_is_simple_use (lhs, loop_vinfo, &lhs_def_stmt, &def, &dt))
6162 else if (TREE_CODE (lhs) != INTEGER_CST && TREE_CODE (lhs) != REAL_CST
6163 && TREE_CODE (lhs) != FIXED_CST)
6166 if (TREE_CODE (rhs) == SSA_NAME)
6168 gimple rhs_def_stmt = SSA_NAME_DEF_STMT (rhs);
6169 if (!vect_is_simple_use (rhs, loop_vinfo, &rhs_def_stmt, &def, &dt))
6172 else if (TREE_CODE (rhs) != INTEGER_CST && TREE_CODE (rhs) != REAL_CST
6173 && TREE_CODE (rhs) != FIXED_CST)
/* vectorizable_condition.

   Check if STMT is a conditional modify expression that can be vectorized.
   If VEC_STMT is also passed, vectorize the STMT: create a vectorized
   stmt using VEC_COND_EXPR to replace it, put it in VEC_STMT, and insert
   it at BSI.

   Return FALSE if not a vectorizable STMT, TRUE otherwise.  */
6189 vectorizable_condition (gimple stmt, gimple_stmt_iterator *gsi,
6192 tree scalar_dest = NULL_TREE;
6193 tree vec_dest = NULL_TREE;
6194 tree op = NULL_TREE;
6195 tree cond_expr, then_clause, else_clause;
6196 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6197 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6198 tree vec_cond_lhs, vec_cond_rhs, vec_then_clause, vec_else_clause;
6199 tree vec_compare, vec_cond_expr;
6201 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6202 enum machine_mode vec_mode;
6204 enum vect_def_type dt;
6205 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
6206 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
6207 enum tree_code code;
6209 gcc_assert (ncopies >= 1);
  if (ncopies > 1)
    return false; /* FORNOW */
6213 if (!STMT_VINFO_RELEVANT_P (stmt_info))
6216 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
6219 /* FORNOW: SLP not supported. */
6220 if (STMT_SLP_TYPE (stmt_info))
6223 /* FORNOW: not yet supported. */
6224 if (STMT_VINFO_LIVE_P (stmt_info))
6226 if (vect_print_dump_info (REPORT_DETAILS))
6227 fprintf (vect_dump, "value used after loop.");
6231 /* Is vectorizable conditional operation? */
6232 if (!is_gimple_assign (stmt))
6235 code = gimple_assign_rhs_code (stmt);
6237 if (code != COND_EXPR)
6240 gcc_assert (gimple_assign_single_p (stmt));
6241 op = gimple_assign_rhs1 (stmt);
6242 cond_expr = TREE_OPERAND (op, 0);
6243 then_clause = TREE_OPERAND (op, 1);
6244 else_clause = TREE_OPERAND (op, 2);
6246 if (!vect_is_simple_cond (cond_expr, loop_vinfo))
  /* We do not handle two different vector types for the condition
     and the values.  */
  if (TREE_TYPE (TREE_OPERAND (cond_expr, 0)) != TREE_TYPE (vectype))
    return false;
6254 if (TREE_CODE (then_clause) == SSA_NAME)
6256 gimple then_def_stmt = SSA_NAME_DEF_STMT (then_clause);
6257 if (!vect_is_simple_use (then_clause, loop_vinfo,
6258 &then_def_stmt, &def, &dt))
6261 else if (TREE_CODE (then_clause) != INTEGER_CST
6262 && TREE_CODE (then_clause) != REAL_CST
6263 && TREE_CODE (then_clause) != FIXED_CST)
6266 if (TREE_CODE (else_clause) == SSA_NAME)
6268 gimple else_def_stmt = SSA_NAME_DEF_STMT (else_clause);
6269 if (!vect_is_simple_use (else_clause, loop_vinfo,
6270 &else_def_stmt, &def, &dt))
6273 else if (TREE_CODE (else_clause) != INTEGER_CST
6274 && TREE_CODE (else_clause) != REAL_CST
6275 && TREE_CODE (else_clause) != FIXED_CST)
6279 vec_mode = TYPE_MODE (vectype);
6283 STMT_VINFO_TYPE (stmt_info) = condition_vec_info_type;
6284 return expand_vec_cond_expr_p (op, vec_mode);
6290 scalar_dest = gimple_assign_lhs (stmt);
6291 vec_dest = vect_create_destination_var (scalar_dest, vectype);
  /* Handle cond expr.  */
  vec_cond_lhs =
    vect_get_vec_def_for_operand (TREE_OPERAND (cond_expr, 0), stmt, NULL);
  vec_cond_rhs =
    vect_get_vec_def_for_operand (TREE_OPERAND (cond_expr, 1), stmt, NULL);
6298 vec_then_clause = vect_get_vec_def_for_operand (then_clause, stmt, NULL);
6299 vec_else_clause = vect_get_vec_def_for_operand (else_clause, stmt, NULL);
6301 /* Arguments are ready. Create the new vector stmt. */
6302 vec_compare = build2 (TREE_CODE (cond_expr), vectype,
6303 vec_cond_lhs, vec_cond_rhs);
6304 vec_cond_expr = build3 (VEC_COND_EXPR, vectype,
6305 vec_compare, vec_then_clause, vec_else_clause);
6307 *vec_stmt = gimple_build_assign (vec_dest, vec_cond_expr);
6308 new_temp = make_ssa_name (vec_dest, *vec_stmt);
6309 gimple_assign_set_lhs (*vec_stmt, new_temp);
6310 vect_finish_stmt_generation (stmt, *vec_stmt, gsi);
6316 /* Function vect_transform_stmt.
6318 Create a vectorized stmt to replace STMT, and insert it at BSI. */
6321 vect_transform_stmt (gimple stmt, gimple_stmt_iterator *gsi,
6322 bool *strided_store, slp_tree slp_node)
6324 bool is_store = false;
6325 gimple vec_stmt = NULL;
6326 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6327 gimple orig_stmt_in_pattern;
6330 switch (STMT_VINFO_TYPE (stmt_info))
6332 case type_demotion_vec_info_type:
6333 gcc_assert (!slp_node);
6334 done = vectorizable_type_demotion (stmt, gsi, &vec_stmt);
6338 case type_promotion_vec_info_type:
6339 gcc_assert (!slp_node);
6340 done = vectorizable_type_promotion (stmt, gsi, &vec_stmt);
6344 case type_conversion_vec_info_type:
6345 done = vectorizable_conversion (stmt, gsi, &vec_stmt, slp_node);
6349 case induc_vec_info_type:
6350 gcc_assert (!slp_node);
6351 done = vectorizable_induction (stmt, gsi, &vec_stmt);
6355 case op_vec_info_type:
6356 done = vectorizable_operation (stmt, gsi, &vec_stmt, slp_node);
6360 case assignment_vec_info_type:
6361 done = vectorizable_assignment (stmt, gsi, &vec_stmt, slp_node);
6365 case load_vec_info_type:
6366 done = vectorizable_load (stmt, gsi, &vec_stmt, slp_node);
6370 case store_vec_info_type:
6371 done = vectorizable_store (stmt, gsi, &vec_stmt, slp_node);
6373 if (STMT_VINFO_STRIDED_ACCESS (stmt_info))
          /* In case of interleaving, the whole chain is vectorized when the
             last store in the chain is reached.  Store stmts before the last
             one are skipped, and their vec_stmt_info shouldn't be freed
             meanwhile.  */
          *strided_store = true;
6380 if (STMT_VINFO_VEC_STMT (stmt_info))
6387 case condition_vec_info_type:
6388 gcc_assert (!slp_node);
6389 done = vectorizable_condition (stmt, gsi, &vec_stmt);
6393 case call_vec_info_type:
6394 gcc_assert (!slp_node);
6395 done = vectorizable_call (stmt, gsi, &vec_stmt);
6398 case reduc_vec_info_type:
6399 gcc_assert (!slp_node);
6400 done = vectorizable_reduction (stmt, gsi, &vec_stmt);
6405 if (!STMT_VINFO_LIVE_P (stmt_info))
6407 if (vect_print_dump_info (REPORT_DETAILS))
6408 fprintf (vect_dump, "stmt not supported.");
6413 if (STMT_VINFO_LIVE_P (stmt_info)
6414 && STMT_VINFO_TYPE (stmt_info) != reduc_vec_info_type)
6416 done = vectorizable_live_operation (stmt, gsi, &vec_stmt);
6422 STMT_VINFO_VEC_STMT (stmt_info) = vec_stmt;
6423 orig_stmt_in_pattern = STMT_VINFO_RELATED_STMT (stmt_info);
6424 if (orig_stmt_in_pattern)
6426 stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt_in_pattern);
6427 /* STMT was inserted by the vectorizer to replace a computation idiom.
6428 ORIG_STMT_IN_PATTERN is a stmt in the original sequence that
6429 computed this idiom. We need to record a pointer to VEC_STMT in
6430 the stmt_info of ORIG_STMT_IN_PATTERN. See more details in the
6431 documentation of vect_pattern_recog. */
6432 if (STMT_VINFO_IN_PATTERN_P (stmt_vinfo))
6434 gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
6435 STMT_VINFO_VEC_STMT (stmt_vinfo) = vec_stmt;
/* This function builds ni_name = the number of iterations the loop
   executes, on the loop preheader.  */
6448 vect_build_loop_niters (loop_vec_info loop_vinfo)
6451 gimple_seq stmts = NULL;
6453 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6454 tree ni = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo));
6456 var = create_tmp_var (TREE_TYPE (ni), "niters");
6457 add_referenced_var (var);
6458 ni_name = force_gimple_operand (ni, &stmts, false, var);
6460 pe = loop_preheader_edge (loop);
6463 basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
6464 gcc_assert (!new_bb);
/* This function generates the following statements:

   ni_name = number of iterations the loop executes
   ratio = ni_name / vf
   ratio_mult_vf_name = ratio * vf

   and places them at the loop preheader edge.  */
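/* For example (illustrative numbers only): with vf = 4 and ni_name = 103,
   ratio = 103 / 4 = 25 and ratio_mult_vf_name = 25 * 4 = 100, so the
   vectorized loop executes 25 iterations covering the first 100 scalar
   iterations, leaving 103 - 100 = 3 iterations for the scalar epilog.  */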
6480 vect_generate_tmps_on_preheader (loop_vec_info loop_vinfo,
6482 tree *ratio_mult_vf_name_ptr,
6483 tree *ratio_name_ptr)
6492 tree ratio_mult_vf_name;
6493 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6494 tree ni = LOOP_VINFO_NITERS (loop_vinfo);
6495 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
6498 pe = loop_preheader_edge (loop);
  /* Generate a temporary variable that contains
     the number of iterations the loop executes.  */
6503 ni_name = vect_build_loop_niters (loop_vinfo);
6504 log_vf = build_int_cst (TREE_TYPE (ni), exact_log2 (vf));
6506 /* Create: ratio = ni >> log2(vf) */
6508 ratio_name = fold_build2 (RSHIFT_EXPR, TREE_TYPE (ni_name), ni_name, log_vf);
6509 if (!is_gimple_val (ratio_name))
6511 var = create_tmp_var (TREE_TYPE (ni), "bnd");
6512 add_referenced_var (var);
6515 ratio_name = force_gimple_operand (ratio_name, &stmts, true, var);
6516 pe = loop_preheader_edge (loop);
6517 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
6518 gcc_assert (!new_bb);
6521 /* Create: ratio_mult_vf = ratio << log2 (vf). */
6523 ratio_mult_vf_name = fold_build2 (LSHIFT_EXPR, TREE_TYPE (ratio_name),
6524 ratio_name, log_vf);
6525 if (!is_gimple_val (ratio_mult_vf_name))
6527 var = create_tmp_var (TREE_TYPE (ni), "ratio_mult_vf");
6528 add_referenced_var (var);
6531 ratio_mult_vf_name = force_gimple_operand (ratio_mult_vf_name, &stmts,
6533 pe = loop_preheader_edge (loop);
6534 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
6535 gcc_assert (!new_bb);
6538 *ni_name_ptr = ni_name;
6539 *ratio_mult_vf_name_ptr = ratio_mult_vf_name;
6540 *ratio_name_ptr = ratio_name;
6546 /* Function vect_update_ivs_after_vectorizer.
6548 "Advance" the induction variables of LOOP to the value they should take
6549 after the execution of LOOP. This is currently necessary because the
6550 vectorizer does not handle induction variables that are used after the
   loop.  Such a situation occurs when the last iterations of LOOP are
   peeled, because:
   1. We introduced new uses after LOOP for IVs that were not originally used
      after LOOP: the IVs of LOOP are now used by an epilog loop.
   2. LOOP is going to be vectorized; this means that it will iterate N/VF
      times, whereas the loop IVs should be bumped N times.

   Input:
6559 - LOOP - a loop that is going to be vectorized. The last few iterations
6560 of LOOP were peeled.
   - NITERS - the number of iterations that LOOP executes (before it is
     vectorized), i.e., the number of times the ivs should be bumped.
6563 - UPDATE_E - a successor edge of LOOP->exit that is on the (only) path
6564 coming out from LOOP on which there are uses of the LOOP ivs
6565 (this is the path from LOOP->exit to epilog_loop->preheader).
6567 The new definitions of the ivs are placed in LOOP->exit.
6568 The phi args associated with the edge UPDATE_E in the bb
6569 UPDATE_E->dest are updated accordingly.
6571 Assumption 1: Like the rest of the vectorizer, this function assumes
6572 a single loop exit that has a single predecessor.
6574 Assumption 2: The phi nodes in the LOOP header and in update_bb are
6575 organized in the same order.
6577 Assumption 3: The access function of the ivs is simple enough (see
6578 vect_can_advance_ivs_p). This assumption will be relaxed in the future.
   Assumption 4: Exactly one of the successors of LOOP exit-bb is on a path
   coming out of LOOP on which the ivs of LOOP are used (this is the path
   that leads to the epilog loop; other paths skip the epilog loop).  This
   path starts with the edge UPDATE_E, and its destination (denoted
   update_bb) needs to have its phis updated.  */
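/* For example (illustrative numbers only): for an iv with access function
   {init, +, step}, the value it should have after the loop is

      ni = init + NITERS * step

   e.g. init = 0, step = 4 and NITERS = 100 give ni = 400; ni becomes the
   phi argument on edge UPDATE_E.  */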
6588 vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo, tree niters,
6591 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6592 basic_block exit_bb = single_exit (loop)->dest;
6594 gimple_stmt_iterator gsi, gsi1;
6595 basic_block update_bb = update_e->dest;
6597 /* gcc_assert (vect_can_advance_ivs_p (loop_vinfo)); */
6599 /* Make sure there exists a single-predecessor exit bb: */
6600 gcc_assert (single_pred_p (exit_bb));
6602 for (gsi = gsi_start_phis (loop->header), gsi1 = gsi_start_phis (update_bb);
6603 !gsi_end_p (gsi) && !gsi_end_p (gsi1);
6604 gsi_next (&gsi), gsi_next (&gsi1))
6606 tree access_fn = NULL;
6607 tree evolution_part;
6610 tree var, ni, ni_name;
6611 gimple_stmt_iterator last_gsi;
6613 phi = gsi_stmt (gsi);
6614 phi1 = gsi_stmt (gsi1);
6615 if (vect_print_dump_info (REPORT_DETAILS))
6617 fprintf (vect_dump, "vect_update_ivs_after_vectorizer: phi: ");
6618 print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
6621 /* Skip virtual phi's. */
6622 if (!is_gimple_reg (SSA_NAME_VAR (PHI_RESULT (phi))))
6624 if (vect_print_dump_info (REPORT_DETAILS))
6625 fprintf (vect_dump, "virtual phi. skip.");
6629 /* Skip reduction phis. */
6630 if (STMT_VINFO_DEF_TYPE (vinfo_for_stmt (phi)) == vect_reduction_def)
6632 if (vect_print_dump_info (REPORT_DETAILS))
6633 fprintf (vect_dump, "reduc phi. skip.");
6637 access_fn = analyze_scalar_evolution (loop, PHI_RESULT (phi));
6638 gcc_assert (access_fn);
      evolution_part
        = unshare_expr (evolution_part_in_loop_num (access_fn, loop->num));
6641 gcc_assert (evolution_part != NULL_TREE);
6643 /* FORNOW: We do not support IVs whose evolution function is a polynomial
6644 of degree >= 2 or exponential. */
6645 gcc_assert (!tree_is_chrec (evolution_part));
6647 step_expr = evolution_part;
6648 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn,
      if (POINTER_TYPE_P (TREE_TYPE (init_expr)))
        ni = fold_build2 (POINTER_PLUS_EXPR, TREE_TYPE (init_expr),
                          init_expr,
                          fold_convert (sizetype,
                                        fold_build2 (MULT_EXPR,
                                                     TREE_TYPE (niters),
                                                     niters, step_expr)));
      else
        ni = fold_build2 (PLUS_EXPR, TREE_TYPE (init_expr),
                          fold_build2 (MULT_EXPR, TREE_TYPE (init_expr),
                                       fold_convert (TREE_TYPE (init_expr),
                                                     niters),
                                       step_expr),
                          init_expr);
6667 var = create_tmp_var (TREE_TYPE (init_expr), "tmp");
6668 add_referenced_var (var);
6670 last_gsi = gsi_last_bb (exit_bb);
6671 ni_name = force_gimple_operand_gsi (&last_gsi, ni, false, var,
6672 true, GSI_SAME_STMT);
6674 /* Fix phi expressions in the successor bb. */
6675 SET_PHI_ARG_DEF (phi1, update_e->dest_idx, ni_name);
/* Return the more conservative threshold between the
   min_profitable_iters returned by the cost model and the
   user-specified threshold, if provided.  */
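/* For example (illustrative numbers only): with --param
   min-vect-loop-bound=2 and VF = 4, min_scalar_loop_bound = 2 * 4 - 1 = 7;
   if the cost model returns min_profitable_iters = 10, the more
   conservative threshold 10 is returned.  */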
6684 conservative_cost_threshold (loop_vec_info loop_vinfo,
6685 int min_profitable_iters)
6688 int min_scalar_loop_bound;
6690 min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
6691 * LOOP_VINFO_VECT_FACTOR (loop_vinfo)) - 1);
  /* Use the cost model only if it is more conservative than the
     user-specified threshold.  */
6696 if (min_profitable_iters
6697 && (!min_scalar_loop_bound
6698 || min_profitable_iters > min_scalar_loop_bound))
6699 th = (unsigned) min_profitable_iters;
6701 if (th && vect_print_dump_info (REPORT_COST))
6702 fprintf (vect_dump, "Vectorization may not be profitable.");
6707 /* Function vect_do_peeling_for_loop_bound
6709 Peel the last iterations of the loop represented by LOOP_VINFO.
6710 The peeled iterations form a new epilog loop. Given that the loop now
6711 iterates NITERS times, the new epilog loop iterates
6712 NITERS % VECTORIZATION_FACTOR times.
   The original loop will later be made to iterate
   NITERS / VECTORIZATION_FACTOR times (this value is placed into RATIO).  */
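/* Schematically (an illustrative sketch only; the vector statement is
   written in pseudo array-slice notation), with VF = 4:

      for (i = 0; i < n; i++)                  before peeling
        a[i] = b[i] + c[i];

   becomes

      for (i = 0; i < (n / 4) * 4; i += 4)     vectorized loop
        a[i:i+3] = b[i:i+3] + c[i:i+3];
      for (; i < n; i++)                       scalar epilog loop
        a[i] = b[i] + c[i];  */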
6718 vect_do_peeling_for_loop_bound (loop_vec_info loop_vinfo, tree *ratio)
6720 tree ni_name, ratio_mult_vf_name;
6721 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6722 struct loop *new_loop;
6724 basic_block preheader;
6726 bool check_profitability = false;
6727 unsigned int th = 0;
6728 int min_profitable_iters;
6730 if (vect_print_dump_info (REPORT_DETAILS))
6731 fprintf (vect_dump, "=== vect_do_peeling_for_loop_bound ===");
6733 initialize_original_copy_tables ();
  /* Generate the following variables on the preheader of the original loop:

     ni_name = number of iterations the original loop executes
     ratio = ni_name / vf
     ratio_mult_vf_name = ratio * vf  */
6740 vect_generate_tmps_on_preheader (loop_vinfo, &ni_name,
6741 &ratio_mult_vf_name, ratio);
6743 loop_num = loop->num;
  /* If the cost model check was not done during versioning or during
     peeling for alignment, do it here.  */
6747 if (!VEC_length (gimple, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo))
6748 && !VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo))
6749 && !LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
6751 check_profitability = true;
6753 /* Get profitability threshold for vectorized loop. */
6754 min_profitable_iters = LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo);
6756 th = conservative_cost_threshold (loop_vinfo,
6757 min_profitable_iters);
6760 new_loop = slpeel_tree_peel_loop_to_edge (loop, single_exit (loop),
6761 ratio_mult_vf_name, ni_name, false,
6762 th, check_profitability);
6763 gcc_assert (new_loop);
6764 gcc_assert (loop_num == loop->num);
6765 #ifdef ENABLE_CHECKING
6766 slpeel_verify_cfg_after_peeling (loop, new_loop);
6769 /* A guard that controls whether the new_loop is to be executed or skipped
6770 is placed in LOOP->exit. LOOP->exit therefore has two successors - one
6771 is the preheader of NEW_LOOP, where the IVs from LOOP are used. The other
6772 is a bb after NEW_LOOP, where these IVs are not used. Find the edge that
6773 is on the path where the LOOP IVs are used and need to be updated. */
6775 preheader = loop_preheader_edge (new_loop)->src;
6776 if (EDGE_PRED (preheader, 0)->src == single_exit (loop)->dest)
6777 update_e = EDGE_PRED (preheader, 0);
6779 update_e = EDGE_PRED (preheader, 1);
6781 /* Update IVs of original loop as if they were advanced
6782 by ratio_mult_vf_name steps. */
6783 vect_update_ivs_after_vectorizer (loop_vinfo, ratio_mult_vf_name, update_e);
  /* After peeling we have to reset the scalar evolution analyzer.  */
  scev_reset ();
6788 free_original_copy_tables ();
6792 /* Function vect_gen_niters_for_prolog_loop
6794 Set the number of iterations for the loop represented by LOOP_VINFO
6795 to the minimum between LOOP_NITERS (the original iteration count of the loop)
6796 and the misalignment of DR - the data reference recorded in
6797 LOOP_VINFO_UNALIGNED_DR (LOOP_VINFO). As a result, after the execution of
6798 this loop, the data reference DR will refer to an aligned location.
   The following computation is generated:

   If the misalignment of DR is known at compile time:
     addr_mis = DR_MISALIGNMENT (dr);
   Else, compute the address misalignment in bytes:
     addr_mis = addr & (vectype_size - 1)

   prolog_niters = min (LOOP_NITERS, ((VF - addr_mis/elem_size)&(VF-1))/step)
6809 (elem_size = element type size; an element is the scalar element whose type
6810 is the inner type of the vectype)
   When the step of the data-ref in the loop is not 1 (as in interleaved data
   and SLP), the number of iterations of the prolog must be divided by the
   step (which is equal to the size of the interleaved group).
6816 The above formulas assume that VF == number of elements in the vector. This
6817 may not hold when there are multiple-types in the loop.
6818 In this case, for some data-references in the loop the VF does not represent
6819 the number of elements that fit in the vector. Therefore, instead of VF we
6820 use TYPE_VECTOR_SUBPARTS. */
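/* For example (illustrative numbers only): for 16-byte vectors of 4-byte
   elements (VF = 4, elem_size = 4), step = 1 and a known byte misalignment
   of 8:

      elem_misalign = 8 / 4 = 2
      prolog_niters = (4 - 2) & (4 - 1) = 2

   i.e. peeling 2 scalar iterations advances the access by 8 bytes, after
   which DR refers to a 16-byte aligned address.  */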
6823 vect_gen_niters_for_prolog_loop (loop_vec_info loop_vinfo, tree loop_niters)
6825 struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
6826 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6829 tree iters, iters_name;
6832 gimple dr_stmt = DR_STMT (dr);
6833 stmt_vec_info stmt_info = vinfo_for_stmt (dr_stmt);
6834 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6835 int vectype_align = TYPE_ALIGN (vectype) / BITS_PER_UNIT;
6836 tree niters_type = TREE_TYPE (loop_niters);
6838 int element_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
6839 int nelements = TYPE_VECTOR_SUBPARTS (vectype);
6841 if (STMT_VINFO_STRIDED_ACCESS (stmt_info))
6842 step = DR_GROUP_SIZE (vinfo_for_stmt (DR_GROUP_FIRST_DR (stmt_info)));
6844 pe = loop_preheader_edge (loop);
6846 if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
6848 int byte_misalign = LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo);
6849 int elem_misalign = byte_misalign / element_size;
6851 if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "known misalignment = %d.", byte_misalign);
6854 iters = build_int_cst (niters_type,
6855 (((nelements - elem_misalign) & (nelements - 1)) / step));
6859 gimple_seq new_stmts = NULL;
6860 tree start_addr = vect_create_addr_base_for_vector_ref (dr_stmt,
6861 &new_stmts, NULL_TREE, loop);
6862 tree ptr_type = TREE_TYPE (start_addr);
6863 tree size = TYPE_SIZE (ptr_type);
6864 tree type = lang_hooks.types.type_for_size (tree_low_cst (size, 1), 1);
6865 tree vectype_size_minus_1 = build_int_cst (type, vectype_align - 1);
6866 tree elem_size_log =
6867 build_int_cst (type, exact_log2 (vectype_align/nelements));
6868 tree nelements_minus_1 = build_int_cst (type, nelements - 1);
6869 tree nelements_tree = build_int_cst (type, nelements);
6873 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmts);
6874 gcc_assert (!new_bb);
      /* Create:  byte_misalign = addr & (vectype_size - 1)  */
      byte_misalign
        = fold_build2 (BIT_AND_EXPR, type, fold_convert (type, start_addr),
                       vectype_size_minus_1);

      /* Create:  elem_misalign = byte_misalign / element_size  */
      elem_misalign
        = fold_build2 (RSHIFT_EXPR, type, byte_misalign, elem_size_log);
6884 /* Create: (niters_type) (nelements - elem_misalign)&(nelements - 1) */
6885 iters = fold_build2 (MINUS_EXPR, type, nelements_tree, elem_misalign);
6886 iters = fold_build2 (BIT_AND_EXPR, type, iters, nelements_minus_1);
6887 iters = fold_convert (niters_type, iters);
6890 /* Create: prolog_loop_niters = min (iters, loop_niters) */
6891 /* If the loop bound is known at compile time we already verified that it is
6892 greater than vf; since the misalignment ('iters') is at most vf, there's
6893 no need to generate the MIN_EXPR in this case. */
6894 if (TREE_CODE (loop_niters) != INTEGER_CST)
6895 iters = fold_build2 (MIN_EXPR, niters_type, iters, loop_niters);
6897 if (vect_print_dump_info (REPORT_DETAILS))
6899 fprintf (vect_dump, "niters for prolog loop: ");
6900 print_generic_expr (vect_dump, iters, TDF_SLIM);
6903 var = create_tmp_var (niters_type, "prolog_loop_niters");
6904 add_referenced_var (var);
6906 iters_name = force_gimple_operand (iters, &stmts, false, var);
6908 /* Insert stmt on loop preheader edge. */
6911 basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
6912 gcc_assert (!new_bb);
6919 /* Function vect_update_init_of_dr
6921 NITERS iterations were peeled from LOOP. DR represents a data reference
6922 in LOOP. This function updates the information recorded in DR to
6923 account for the fact that the first NITERS iterations had already been
6924 executed. Specifically, it updates the OFFSET field of DR. */
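/* For example (illustrative numbers only): if DR_STEP is 4 bytes and
   NITERS = 3 iterations were peeled, OFFSET grows by 3 * 4 = 12 bytes,
   reflecting that the first access of the remaining loop starts three
   elements later.  */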
6927 vect_update_init_of_dr (struct data_reference *dr, tree niters)
6929 tree offset = DR_OFFSET (dr);
6931 niters = fold_build2 (MULT_EXPR, TREE_TYPE (niters), niters, DR_STEP (dr));
6932 offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset), offset, niters);
6933 DR_OFFSET (dr) = offset;
6937 /* Function vect_update_inits_of_drs
6939 NITERS iterations were peeled from the loop represented by LOOP_VINFO.
6940 This function updates the information recorded for the data references in
6941 the loop to account for the fact that the first NITERS iterations had
6942 already been executed. Specifically, it updates the initial_condition of
6943 the access_function of all the data_references in the loop. */
6946 vect_update_inits_of_drs (loop_vec_info loop_vinfo, tree niters)
6949 VEC (data_reference_p, heap) *datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
6950 struct data_reference *dr;
  if (vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump, "=== vect_update_inits_of_drs ===");
6955 for (i = 0; VEC_iterate (data_reference_p, datarefs, i, dr); i++)
6956 vect_update_init_of_dr (dr, niters);
6960 /* Function vect_do_peeling_for_alignment
6962 Peel the first 'niters' iterations of the loop represented by LOOP_VINFO.
6963 'niters' is set to the misalignment of one of the data references in the
6964 loop, thereby forcing it to refer to an aligned location at the beginning
6965 of the execution of this loop. The data reference for which we are
6966 peeling is recorded in LOOP_VINFO_UNALIGNED_DR. */
6969 vect_do_peeling_for_alignment (loop_vec_info loop_vinfo)
6971 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6972 tree niters_of_prolog_loop, ni_name;
6974 struct loop *new_loop;
6975 bool check_profitability = false;
6976 unsigned int th = 0;
6977 int min_profitable_iters;
6979 if (vect_print_dump_info (REPORT_DETAILS))
6980 fprintf (vect_dump, "=== vect_do_peeling_for_alignment ===");
6982 initialize_original_copy_tables ();
6984 ni_name = vect_build_loop_niters (loop_vinfo);
6985 niters_of_prolog_loop = vect_gen_niters_for_prolog_loop (loop_vinfo, ni_name);
  /* If the cost model check was not done during versioning, do it here.  */
6989 if (!VEC_length (gimple, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo))
6990 && !VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
6992 check_profitability = true;
6994 /* Get profitability threshold for vectorized loop. */
6995 min_profitable_iters = LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo);
6997 th = conservative_cost_threshold (loop_vinfo,
6998 min_profitable_iters);
  /* Peel the prolog loop and iterate it niters_of_prolog_loop times.  */
  new_loop =
    slpeel_tree_peel_loop_to_edge (loop, loop_preheader_edge (loop),
7004 niters_of_prolog_loop, ni_name, true,
7005 th, check_profitability);
7007 gcc_assert (new_loop);
7008 #ifdef ENABLE_CHECKING
7009 slpeel_verify_cfg_after_peeling (new_loop, loop);
  /* Update the number of times the loop executes.  */
7013 n_iters = LOOP_VINFO_NITERS (loop_vinfo);
7014 LOOP_VINFO_NITERS (loop_vinfo) = fold_build2 (MINUS_EXPR,
7015 TREE_TYPE (n_iters), n_iters, niters_of_prolog_loop);
7017 /* Update the init conditions of the access functions of all data refs. */
7018 vect_update_inits_of_drs (loop_vinfo, niters_of_prolog_loop);
  /* After peeling we have to reset the scalar evolution analyzer.  */
  scev_reset ();
7023 free_original_copy_tables ();
/* Function vect_create_cond_for_align_checks.

   Create a conditional expression that represents the alignment checks for
   all of the data references (array element references) whose alignment
   must be checked at runtime.

   Input:
   COND_EXPR  - input conditional expression.  New conditions will be chained
                with logical AND operation.
   LOOP_VINFO - two fields of the loop information are used.
                LOOP_VINFO_PTR_MASK is the mask used to check the alignment.
                LOOP_VINFO_MAY_MISALIGN_STMTS contains the refs to be checked.

   Output:
   COND_EXPR_STMT_LIST - statements needed to construct the conditional
                         expression.
   The returned value is the conditional expression to be used in the if
   statement that controls which version of the loop gets executed at
   runtime.

   The algorithm makes two assumptions:
     1) The number of bytes "n" in a vector is a power of 2.
     2) An address "a" is aligned if a%n is zero, and this test can be
        done as a&(n-1) == 0.  For example, for 16 byte vectors the test
        is a&0xf == 0.  */
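/* In plain C the generated test is equivalent to the following (an
   illustrative sketch only; all_aligned_p, and the use of unsigned long
   as the pointer-sized integer, are assumptions, not GCC internals):

      static int
      all_aligned_p (const void *a, const void *b, unsigned long mask)
      {
        unsigned long ored = (unsigned long) a | (unsigned long) b;
        return (ored & mask) == 0;
      }

   e.g. mask = 0xf for 16-byte vectors.  OR-ing the addresses first means
   a single AND and compare covers all of the checked references.  */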
7053 vect_create_cond_for_align_checks (loop_vec_info loop_vinfo,
7055 gimple_seq *cond_expr_stmt_list)
7057 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7058 VEC(gimple,heap) *may_misalign_stmts
7059 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
7061 int mask = LOOP_VINFO_PTR_MASK (loop_vinfo);
7065 tree int_ptrsize_type;
7067 tree or_tmp_name = NULL_TREE;
7068 tree and_tmp, and_tmp_name;
7071 tree part_cond_expr;
7073 /* Check that mask is one less than a power of 2, i.e., mask is
7074 all zeros followed by all ones. */
7075 gcc_assert ((mask != 0) && ((mask & (mask+1)) == 0));
7077 /* CHECKME: what is the best integer or unsigned type to use to hold a
7078 cast from a pointer value? */
  psize = TYPE_SIZE (ptr_type_node);
  int_ptrsize_type
    = lang_hooks.types.type_for_size (tree_low_cst (psize, 1), 0);
  /* Create expression (mask & (dr_1 | ... | dr_n)) where dr_i is the address
     of the first vector of the i'th data reference.  */
7086 for (i = 0; VEC_iterate (gimple, may_misalign_stmts, i, ref_stmt); i++)
7088 gimple_seq new_stmt_list = NULL;
7090 tree addr_tmp, addr_tmp_name;
7091 tree or_tmp, new_or_tmp_name;
7092 gimple addr_stmt, or_stmt;
      /* create: addr_tmp = (int)(address_of_first_vector) */
      addr_base =
        vect_create_addr_base_for_vector_ref (ref_stmt, &new_stmt_list,
                                              NULL_TREE, loop);
7098 if (new_stmt_list != NULL)
7099 gimple_seq_add_seq (cond_expr_stmt_list, new_stmt_list);
7101 sprintf (tmp_name, "%s%d", "addr2int", i);
7102 addr_tmp = create_tmp_var (int_ptrsize_type, tmp_name);
7103 add_referenced_var (addr_tmp);
7104 addr_tmp_name = make_ssa_name (addr_tmp, NULL);
7105 addr_stmt = gimple_build_assign (addr_tmp_name, addr_base);
7106 SSA_NAME_DEF_STMT (addr_tmp_name) = addr_stmt;
7107 gimple_seq_add_stmt (cond_expr_stmt_list, addr_stmt);
      /* The addresses are ORed together.  */
7111 if (or_tmp_name != NULL_TREE)
7113 /* create: or_tmp = or_tmp | addr_tmp */
7114 sprintf (tmp_name, "%s%d", "orptrs", i);
7115 or_tmp = create_tmp_var (int_ptrsize_type, tmp_name);
7116 add_referenced_var (or_tmp);
7117 new_or_tmp_name = make_ssa_name (or_tmp, NULL);
7118 or_stmt = gimple_build_assign_with_ops (BIT_IOR_EXPR,
7120 or_tmp_name, addr_tmp_name);
7121 SSA_NAME_DEF_STMT (new_or_tmp_name) = or_stmt;
7122 gimple_seq_add_stmt (cond_expr_stmt_list, or_stmt);
7123 or_tmp_name = new_or_tmp_name;
7126 or_tmp_name = addr_tmp_name;
7130 mask_cst = build_int_cst (int_ptrsize_type, mask);
7132 /* create: and_tmp = or_tmp & mask */
7133 and_tmp = create_tmp_var (int_ptrsize_type, "andmask" );
7134 add_referenced_var (and_tmp);
7135 and_tmp_name = make_ssa_name (and_tmp, NULL);
7137 and_stmt = gimple_build_assign_with_ops (BIT_AND_EXPR, and_tmp_name,
7138 or_tmp_name, mask_cst);
7139 SSA_NAME_DEF_STMT (and_tmp_name) = and_stmt;
7140 gimple_seq_add_stmt (cond_expr_stmt_list, and_stmt);
  /* Make and_tmp the left operand of the conditional test against zero.
     If and_tmp has a nonzero bit then some address is unaligned.  */
7144 ptrsize_zero = build_int_cst (int_ptrsize_type, 0);
7145 part_cond_expr = fold_build2 (EQ_EXPR, boolean_type_node,
7146 and_tmp_name, ptrsize_zero);
  if (*cond_expr)
    *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
                              *cond_expr, part_cond_expr);
  else
    *cond_expr = part_cond_expr;
}
/* Function vect_vfa_segment_size.

   Create an expression that computes the size of the segment that will be
   accessed for a data reference.  The function takes into account that
   realignment loads may access one more vector.

   Input:
     DR: The data reference.
     VECT_FACTOR: vectorization factor.

   Return an expression whose value is the size of the segment which will
   be accessed by DR.  */
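/* For example (illustrative numbers only): DR_STEP = 4 bytes and
   VECT_FACTOR = 4 give a 16-byte segment; under
   dr_explicit_realign_optimized one extra vector size (e.g. 16 bytes) is
   added, because that scheme loads one vector beyond the last accessed
   element.  */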
7168 vect_vfa_segment_size (struct data_reference *dr, tree vect_factor)
7170 tree segment_length = fold_build2 (MULT_EXPR, integer_type_node,
7171 DR_STEP (dr), vect_factor);
7173 if (vect_supportable_dr_alignment (dr) == dr_explicit_realign_optimized)
7175 tree vector_size = TYPE_SIZE_UNIT
7176 (STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr))));
7178 segment_length = fold_build2 (PLUS_EXPR, integer_type_node,
7179 segment_length, vector_size);
7181 return fold_convert (sizetype, segment_length);
/* Function vect_create_cond_for_alias_checks.

   Create a conditional expression that represents the run-time checks for
   overlapping of address ranges represented by a list of data reference
   relations passed as input.

   Input:
   COND_EXPR  - input conditional expression.  New conditions will be chained
                with logical AND operation.
   LOOP_VINFO - field LOOP_VINFO_MAY_ALIAS_DDRS contains the list of ddrs
                to be checked.

   Output:
   COND_EXPR - conditional expression.
   COND_EXPR_STMT_LIST - statements needed to construct the conditional
                         expression.
   The returned value is the conditional expression to be used in the if
   statement that controls which version of the loop gets executed at
   runtime.  */
7207 vect_create_cond_for_alias_checks (loop_vec_info loop_vinfo,
7209 gimple_seq * cond_expr_stmt_list)
7211 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7212 VEC (ddr_p, heap) * may_alias_ddrs =
7213 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
7215 build_int_cst (integer_type_node, LOOP_VINFO_VECT_FACTOR (loop_vinfo));
7219 tree part_cond_expr;
  /* Create the expression

     (((store_ptr_0 + store_segment_length_0) < load_ptr_0)
      || ((load_ptr_0 + load_segment_length_0) < store_ptr_0))
     &&
     ...
     &&
     (((store_ptr_n + store_segment_length_n) < load_ptr_n)
      || ((load_ptr_n + load_segment_length_n) < store_ptr_n))  */
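  /* For one pair, the generated check is equivalent to the following (an
     illustrative sketch only; no_overlap_p is a hypothetical helper, not
     a GCC function):

        static int
        no_overlap_p (const char *store_ptr, unsigned long store_len,
                      const char *load_ptr, unsigned long load_len)
        {
          return store_ptr + store_len < load_ptr
                 || load_ptr + load_len < store_ptr;
        }

     i.e. the two segments are independent if one ends before the other
     begins.  */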
7230 if (VEC_empty (ddr_p, may_alias_ddrs))
7233 for (i = 0; VEC_iterate (ddr_p, may_alias_ddrs, i, ddr); i++)
7235 struct data_reference *dr_a, *dr_b;
7236 gimple dr_group_first_a, dr_group_first_b;
7237 tree addr_base_a, addr_base_b;
7238 tree segment_length_a, segment_length_b;
7239 gimple stmt_a, stmt_b;
7242 stmt_a = DR_STMT (DDR_A (ddr));
7243 dr_group_first_a = DR_GROUP_FIRST_DR (vinfo_for_stmt (stmt_a));
7244 if (dr_group_first_a)
7246 stmt_a = dr_group_first_a;
7247 dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
7251 stmt_b = DR_STMT (DDR_B (ddr));
7252 dr_group_first_b = DR_GROUP_FIRST_DR (vinfo_for_stmt (stmt_b));
7253 if (dr_group_first_b)
7255 stmt_b = dr_group_first_b;
7256 dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
      addr_base_a =
        vect_create_addr_base_for_vector_ref (stmt_a, cond_expr_stmt_list,
                                              NULL_TREE, loop);
      addr_base_b =
        vect_create_addr_base_for_vector_ref (stmt_b, cond_expr_stmt_list,
                                              NULL_TREE, loop);
7266 segment_length_a = vect_vfa_segment_size (dr_a, vect_factor);
7267 segment_length_b = vect_vfa_segment_size (dr_b, vect_factor);
      if (vect_print_dump_info (REPORT_DR_DETAILS))
        {
          fprintf (vect_dump,
                   "create runtime check for data references ");
          print_generic_expr (vect_dump, DR_REF (dr_a), TDF_SLIM);
          fprintf (vect_dump, " and ");
          print_generic_expr (vect_dump, DR_REF (dr_b), TDF_SLIM);
        }
      part_cond_expr =
        fold_build2 (TRUTH_OR_EXPR, boolean_type_node,
          fold_build2 (LT_EXPR, boolean_type_node,
            fold_build2 (POINTER_PLUS_EXPR, TREE_TYPE (addr_base_a),
              addr_base_a,
              segment_length_a),
            addr_base_b),
          fold_build2 (LT_EXPR, boolean_type_node,
            fold_build2 (POINTER_PLUS_EXPR, TREE_TYPE (addr_base_b),
              addr_base_b,
              segment_length_b),
            addr_base_a));
      if (*cond_expr)
        *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
                                  *cond_expr, part_cond_expr);
      else
        *cond_expr = part_cond_expr;
    }
7298 if (vect_print_dump_info (REPORT_VECTORIZED_LOOPS))
7299 fprintf (vect_dump, "created %u versioning for alias checks.\n",
7300 VEC_length (ddr_p, may_alias_ddrs));
/* Function vect_loop_versioning.

   If the loop has data references that may or may not be aligned and/or
   has data reference relations whose independence was not proven, then
   two versions of the loop need to be generated, one which is vectorized
   and one which isn't.  A test is then generated to control which of the
   loops is executed.  The test checks for the alignment of all of the
   data references that may or may not be aligned.  An additional
   sequence of runtime tests is generated for each pair of DDRs whose
   independence was not proven.  The vectorized version of the loop is
   executed only if both the alias and the alignment tests pass.

   The test generated to check which version of the loop is executed
   is also modified to check for profitability, as indicated by the
   cost model initially.  */
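/* Schematically (an illustrative sketch only), the generated guard is:

      if (scalar_loop_iters > th && all_refs_aligned_p && no_aliasing_p)
        vectorized version of the loop
      else
        original scalar loop  */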
7321 vect_loop_versioning (loop_vec_info loop_vinfo)
7323 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7325 tree cond_expr = NULL_TREE;
7326 gimple_seq cond_expr_stmt_list = NULL;
7327 basic_block condition_bb;
7328 gimple_stmt_iterator gsi, cond_exp_gsi;
7329 basic_block merge_bb;
7330 basic_block new_exit_bb;
7332 gimple orig_phi, new_phi;
7334 unsigned prob = 4 * REG_BR_PROB_BASE / 5;
7335 gimple_seq gimplify_stmt_list = NULL;
7336 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
7337 int min_profitable_iters = 0;
7340 /* Get profitability threshold for vectorized loop. */
7341 min_profitable_iters = LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo);
7343 th = conservative_cost_threshold (loop_vinfo,
7344 min_profitable_iters);
7347 build2 (GT_EXPR, boolean_type_node, scalar_loop_iters,
7348 build_int_cst (TREE_TYPE (scalar_loop_iters), th));
7350 cond_expr = force_gimple_operand (cond_expr, &cond_expr_stmt_list,
7353 if (VEC_length (gimple, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo)))
7354 vect_create_cond_for_align_checks (loop_vinfo, &cond_expr,
7355 &cond_expr_stmt_list);
7357 if (VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
7358 vect_create_cond_for_alias_checks (loop_vinfo, &cond_expr,
7359 &cond_expr_stmt_list);
7362 fold_build2 (NE_EXPR, boolean_type_node, cond_expr, integer_zero_node);
7364 force_gimple_operand (cond_expr, &gimplify_stmt_list, true, NULL_TREE);
7365 gimple_seq_add_seq (&cond_expr_stmt_list, gimplify_stmt_list);
7367 initialize_original_copy_tables ();
7368 nloop = loop_version (loop, cond_expr, &condition_bb,
7369 prob, prob, REG_BR_PROB_BASE - prob, true);
7370 free_original_copy_tables();
7372 /* Loop versioning violates an assumption we try to maintain during
7373 vectorization - that the loop exit block has a single predecessor.
7374 After versioning, the exit block of both loop versions is the same
7375 basic block (i.e. it has two predecessors). Just in order to simplify
7376 following transformations in the vectorizer, we fix this situation
7377 here by adding a new (empty) block on the exit-edge of the loop,
7378 with the proper loop-exit phis to maintain loop-closed-form. */
7380 merge_bb = single_exit (loop)->dest;
7381 gcc_assert (EDGE_COUNT (merge_bb->preds) == 2);
7382 new_exit_bb = split_edge (single_exit (loop));
7383 new_exit_e = single_exit (loop);
7384 e = EDGE_SUCC (new_exit_bb, 0);
7386 for (gsi = gsi_start_phis (merge_bb); !gsi_end_p (gsi); gsi_next (&gsi))
7388 orig_phi = gsi_stmt (gsi);
      new_phi = create_phi_node (SSA_NAME_VAR (PHI_RESULT (orig_phi)),
                                 new_exit_bb);
      arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, e);
      add_phi_arg (new_phi, arg, new_exit_e);
7393 SET_PHI_ARG_DEF (orig_phi, e->dest_idx, PHI_RESULT (new_phi));
7396 /* End loop-exit-fixes after versioning. */
7398 update_ssa (TODO_update_ssa);
7399 if (cond_expr_stmt_list)
7401 cond_exp_gsi = gsi_last_bb (condition_bb);
7402 gsi_insert_seq_before (&cond_exp_gsi, cond_expr_stmt_list, GSI_SAME_STMT);
/* Remove a group of stores (for SLP or interleaving), free their
   stmt_vec_info.  */
7410 vect_remove_stores (gimple first_stmt)
7412 gimple next = first_stmt;
7414 gimple_stmt_iterator next_si;
7418 /* Free the attached stmt_vec_info and remove the stmt. */
7419 next_si = gsi_for_stmt (next);
7420 gsi_remove (&next_si, true);
7421 tmp = DR_GROUP_NEXT_DR (vinfo_for_stmt (next));
7422 free_stmt_vec_info (next);
7428 /* Vectorize SLP instance tree in postorder. */
7431 vect_schedule_slp_instance (slp_tree node, unsigned int vec_stmts_size)
7434 bool strided_store, is_store;
7435 gimple_stmt_iterator si;
7436 stmt_vec_info stmt_info;
7441 vect_schedule_slp_instance (SLP_TREE_LEFT (node), vec_stmts_size);
7442 vect_schedule_slp_instance (SLP_TREE_RIGHT (node), vec_stmts_size);
7444 stmt = VEC_index(gimple, SLP_TREE_SCALAR_STMTS (node), 0);
7445 stmt_info = vinfo_for_stmt (stmt);
7446 SLP_TREE_VEC_STMTS (node) = VEC_alloc (gimple, heap, vec_stmts_size);
7447 SLP_TREE_NUMBER_OF_VEC_STMTS (node) = vec_stmts_size;
7449 if (vect_print_dump_info (REPORT_DETAILS))
7451 fprintf (vect_dump, "------>vectorizing SLP node starting from: ");
7452 print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
7455 si = gsi_for_stmt (stmt);
7456 is_store = vect_transform_stmt (stmt, &si, &strided_store, node);
7459 if (DR_GROUP_FIRST_DR (stmt_info))
        /* If IS_STORE is TRUE, the vectorization of the
           interleaving chain was completed - free all the stores in
           the chain.  */
        vect_remove_stores (DR_GROUP_FIRST_DR (stmt_info));
      else
        /* FORNOW: SLP originates only from strided stores.  */
        gcc_unreachable ();

      return true;
    }

  /* FORNOW: SLP originates only from strided stores.  */
  return false;
}
7477 vect_schedule_slp (loop_vec_info loop_vinfo, unsigned int nunits)
7479 VEC (slp_instance, heap) *slp_instances =
7480 LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
7481 slp_instance instance;
7482 unsigned int vec_stmts_size;
7483 unsigned int group_size, i;
7484 unsigned int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
7485 bool is_store = false;
7487 for (i = 0; VEC_iterate (slp_instance, slp_instances, i, instance); i++)
7489 group_size = SLP_INSTANCE_GROUP_SIZE (instance);
      /* For each SLP instance calculate the number of vector stmts to be
         created for the scalar stmts in each node of the SLP tree.  The
         number of vector elements in one vector iteration is the number of
         scalar elements in one scalar iteration (GROUP_SIZE) multiplied by
         VF divided by the vector size.  */
7495 vec_stmts_size = vectorization_factor * group_size / nunits;
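      /* For example (illustrative numbers only): GROUP_SIZE = 8, VF = 4
         and nunits = 4 give vec_stmts_size = 4 * 8 / 4 = 8 vector stmts
         for each node of this SLP tree.  */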
7497 /* Schedule the tree of INSTANCE. */
7498 is_store = vect_schedule_slp_instance (SLP_INSTANCE_TREE (instance),
7501 if (vect_print_dump_info (REPORT_VECTORIZED_LOOPS)
7502 || vect_print_dump_info (REPORT_UNVECTORIZED_LOOPS))
7503 fprintf (vect_dump, "vectorizing stmts using SLP.");
7509 /* Function vect_transform_loop.
   The analysis phase has determined that the loop is vectorizable.
   Vectorize the loop - create vectorized stmts to replace the scalar
   stmts in the loop, and update the loop exit condition.  */
7516 vect_transform_loop (loop_vec_info loop_vinfo)
7518 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7519 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
7520 int nbbs = loop->num_nodes;
7521 gimple_stmt_iterator si;
7524 int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
7526 bool slp_scheduled = false;
7527 unsigned int nunits;
7529 if (vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump, "=== vect_transform_loop ===");
7532 if (VEC_length (gimple, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo))
7533 || VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
7534 vect_loop_versioning (loop_vinfo);
  /* CHECKME: we wouldn't need this if we called update_ssa once
     for all loops.  */
  bitmap_zero (vect_memsyms_to_rename);
  /* Peel the loop if there are data refs with unknown alignment.
     Only one data ref with unknown alignment is allowed.  */
7543 if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
7544 vect_do_peeling_for_alignment (loop_vinfo);
  /* If the loop has a symbolic number of iterations 'n' (i.e. it's not a
     compile time constant), or if it is a constant that is not divisible by
     the vectorization factor, then an epilog loop needs to be created.
     We therefore duplicate the loop: the original loop will be vectorized,
     and will compute the first (n/VF) iterations.  The second copy of the
     loop will remain scalar and will compute the remaining (n%VF)
     iterations.  (VF is the vectorization factor.)  */
7554 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
7555 || (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
7556 && LOOP_VINFO_INT_NITERS (loop_vinfo) % vectorization_factor != 0))
7557 vect_do_peeling_for_loop_bound (loop_vinfo, &ratio);
  else
    ratio = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
              LOOP_VINFO_INT_NITERS (loop_vinfo) / vectorization_factor);
7562 /* 1) Make sure the loop header has exactly two entries
7563 2) Make sure we have a preheader basic block. */
7565 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
7567 split_edge (loop_preheader_edge (loop));
  /* FORNOW: the vectorizer supports only loops whose body consists
     of one basic block (header + empty latch).  When the vectorizer
     supports more involved loop forms, the order by which the BBs are
     traversed will need to be reconsidered.  */
7574 for (i = 0; i < nbbs; i++)
7576 basic_block bb = bbs[i];
7577 stmt_vec_info stmt_info;
7580 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
7582 phi = gsi_stmt (si);
7583 if (vect_print_dump_info (REPORT_DETAILS))
7585 fprintf (vect_dump, "------>vectorizing phi: ");
7586 print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
7588 stmt_info = vinfo_for_stmt (phi);
7592 if (!STMT_VINFO_RELEVANT_P (stmt_info)
7593 && !STMT_VINFO_LIVE_P (stmt_info))
7596 if ((TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
7597 != (unsigned HOST_WIDE_INT) vectorization_factor)
7598 && vect_print_dump_info (REPORT_DETAILS))
7599 fprintf (vect_dump, "multiple-types.");
7601 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
7603 if (vect_print_dump_info (REPORT_DETAILS))
7604 fprintf (vect_dump, "transform phi.");
7605 vect_transform_stmt (phi, NULL, NULL, NULL);
7609 for (si = gsi_start_bb (bb); !gsi_end_p (si);)
7611 gimple stmt = gsi_stmt (si);
7614 if (vect_print_dump_info (REPORT_DETAILS))
7616 fprintf (vect_dump, "------>vectorizing statement: ");
7617 print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
7620 stmt_info = vinfo_for_stmt (stmt);
7622 /* vector stmts created in the outer-loop during vectorization of
7623 stmts in an inner-loop may not have a stmt_info, and do not
7624 need to be vectorized. */
7631 if (!STMT_VINFO_RELEVANT_P (stmt_info)
7632 && !STMT_VINFO_LIVE_P (stmt_info))
7638 gcc_assert (STMT_VINFO_VECTYPE (stmt_info));
7640 (unsigned int) TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
7641 if (!STMT_SLP_TYPE (stmt_info)
7642 && nunits != (unsigned int) vectorization_factor
7643 && vect_print_dump_info (REPORT_DETAILS))
            /* For SLP, VF is set according to the unrolling factor, and
               not to the vector size; hence for SLP this print is not
               valid.  */
            fprintf (vect_dump, "multiple-types.");
          /* SLP.  Schedule all the SLP instances when the first SLP stmt
             is reached.  */
          if (STMT_SLP_TYPE (stmt_info))
7654 slp_scheduled = true;
7656 if (vect_print_dump_info (REPORT_DETAILS))
7657 fprintf (vect_dump, "=== scheduling SLP instances ===");
7659 is_store = vect_schedule_slp (loop_vinfo, nunits);
7661 /* IS_STORE is true if STMT is a store. Stores cannot be of
7662 hybrid SLP type. They are removed in
7663 vect_schedule_slp_instance and their vinfo is destroyed. */
7671 /* Hybrid SLP stmts must be vectorized in addition to SLP. */
7672 if (PURE_SLP_STMT (stmt_info))
7679 /* -------- vectorize statement ------------ */
7680 if (vect_print_dump_info (REPORT_DETAILS))
7681 fprintf (vect_dump, "transform statement.");
7683 strided_store = false;
7684 is_store = vect_transform_stmt (stmt, &si, &strided_store, NULL);
7687 if (STMT_VINFO_STRIDED_ACCESS (stmt_info))
              /* Interleaving.  If IS_STORE is TRUE, the vectorization of
                 the interleaving chain was completed - free all the stores
                 in the chain.  */
              vect_remove_stores (DR_GROUP_FIRST_DR (stmt_info));
              gsi_remove (&si, true);
7698 /* Free the attached stmt_vec_info and remove the stmt. */
7699 free_stmt_vec_info (stmt);
7700 gsi_remove (&si, true);
7708 slpeel_make_loop_iterate_ntimes (loop, ratio);
7710 mark_set_for_renaming (vect_memsyms_to_rename);
7712 /* The memory tags and pointers in vectorized statements need to
7713 have their SSA forms updated. FIXME, why can't this be delayed
7714 until all the loops have been transformed? */
7715 update_ssa (TODO_update_ssa);
7717 if (vect_print_dump_info (REPORT_VECTORIZED_LOOPS))
7718 fprintf (vect_dump, "LOOP VECTORIZED.");
7719 if (loop->inner && vect_print_dump_info (REPORT_VECTORIZED_LOOPS))
7720 fprintf (vect_dump, "OUTER LOOP VECTORIZED.");