/* Transformation Utilities for Loop Vectorization.
   Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008 Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */
#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "tm.h"
#include "ggc.h"
#include "tree.h"
#include "target.h"
#include "rtl.h"
#include "basic-block.h"
#include "diagnostic.h"
#include "tree-flow.h"
#include "tree-dump.h"
#include "timevar.h"
#include "cfgloop.h"
#include "expr.h"
#include "optabs.h"
#include "params.h"
#include "recog.h"
#include "tree-data-ref.h"
#include "tree-chrec.h"
#include "tree-scalar-evolution.h"
#include "tree-vectorizer.h"
#include "langhooks.h"
#include "tree-pass.h"
#include "toplev.h"
#include "real.h"
/* Utility functions for the code transformation.  */
static bool vect_transform_stmt (gimple, gimple_stmt_iterator *, bool *,
                                 slp_tree, slp_instance);
static tree vect_create_destination_var (tree, tree);
static tree vect_create_data_ref_ptr
  (gimple, struct loop*, tree, tree *, gimple *, bool, bool *);
static tree vect_create_addr_base_for_vector_ref
  (gimple, gimple_seq *, tree, struct loop *);
static tree vect_get_new_vect_var (tree, enum vect_var_kind, const char *);
static tree vect_get_vec_def_for_operand (tree, gimple, tree *);
static tree vect_init_vector (gimple, tree, tree, gimple_stmt_iterator *);
static void vect_finish_stmt_generation
  (gimple stmt, gimple vec_stmt, gimple_stmt_iterator *);
static bool vect_is_simple_cond (tree, loop_vec_info);
static void vect_create_epilog_for_reduction
  (tree, gimple, int, enum tree_code, gimple);
static tree get_initial_def_for_reduction (gimple, tree, tree *);

/* Utility functions dealing with loop peeling (not peeling itself).  */
static void vect_generate_tmps_on_preheader
  (loop_vec_info, tree *, tree *, tree *);
static tree vect_build_loop_niters (loop_vec_info);
static void vect_update_ivs_after_vectorizer (loop_vec_info, tree, edge);
static tree vect_gen_niters_for_prolog_loop (loop_vec_info, tree);
static void vect_update_init_of_dr (struct data_reference *, tree niters);
static void vect_update_inits_of_drs (loop_vec_info, tree);
static int vect_min_worthwhile_factor (enum tree_code);
static int
cost_for_stmt (gimple stmt)
{
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);

  switch (STMT_VINFO_TYPE (stmt_info))
    {
    case load_vec_info_type:
      return TARG_SCALAR_LOAD_COST;
    case store_vec_info_type:
      return TARG_SCALAR_STORE_COST;
    case op_vec_info_type:
    case condition_vec_info_type:
    case assignment_vec_info_type:
    case reduc_vec_info_type:
    case induc_vec_info_type:
    case type_promotion_vec_info_type:
    case type_demotion_vec_info_type:
    case type_conversion_vec_info_type:
    case call_vec_info_type:
      return TARG_SCALAR_STMT_COST;
    case undef_vec_info_type:
    default:
      gcc_unreachable ();
    }
}
/* Function vect_estimate_min_profitable_iters

   Return the number of iterations required for the vector version of the
   loop to be profitable relative to the cost of the scalar version of the
   loop.

   TODO: Take profile info into account before making vectorization
   decisions, if available.  */

int
vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo)
{
  int i;
  int min_profitable_iters;
  int peel_iters_prologue;
  int peel_iters_epilogue;
  int vec_inside_cost = 0;
  int vec_outside_cost = 0;
  int scalar_single_iter_cost = 0;
  int scalar_outside_cost = 0;
  bool runtime_test = false;
  int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
  int nbbs = loop->num_nodes;
  int byte_misalign = LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo);
  int peel_guard_costs = 0;
  int innerloop_iters = 0, factor;
  VEC (slp_instance, heap) *slp_instances;
  slp_instance instance;
  /* Cost model disabled.  */
  if (!flag_vect_cost_model)
    {
      if (vect_print_dump_info (REPORT_COST))
        fprintf (vect_dump, "cost model disabled.");
      return 0;
    }
  /* If the number of iterations is unknown, or the
     peeling-for-misalignment amount is unknown, we will have to generate
     a runtime test to test the loop count against the threshold.  */
  if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
      || (byte_misalign < 0))
    runtime_test = true;

  /* Requires loop versioning tests to handle misalignment.  */
  if (VEC_length (gimple, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo)))
    {
      /* FIXME: Make cost depend on complexity of individual check.  */
      vec_outside_cost +=
        VEC_length (gimple, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));
      if (vect_print_dump_info (REPORT_COST))
        fprintf (vect_dump, "cost model: Adding cost of checks for loop "
                 "versioning to treat misalignment.\n");
    }

  if (VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
    {
      /* FIXME: Make cost depend on complexity of individual check.  */
      vec_outside_cost +=
        VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo));
      if (vect_print_dump_info (REPORT_COST))
        fprintf (vect_dump, "cost model: Adding cost of checks for loop "
                 "versioning aliasing.\n");
    }

  if (VEC_length (gimple, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo))
      || VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
    vec_outside_cost += TARG_COND_TAKEN_BRANCH_COST;
  /* Count statements in scalar loop.  Using this as scalar cost for a single
     iteration for now.

     TODO: Add outer loop support.

     TODO: Consider assigning different costs to different scalar
     statements.  */

  /* FORNOW.  */
  if (loop->inner)
    innerloop_iters = 50; /* FIXME */

  for (i = 0; i < nbbs; i++)
    {
      gimple_stmt_iterator si;
      basic_block bb = bbs[i];

      if (bb->loop_father == loop->inner)
        factor = innerloop_iters;
      else
        factor = 1;

      for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
        {
          gimple stmt = gsi_stmt (si);
          stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
          /* Skip stmts that are not vectorized inside the loop.  */
          if (!STMT_VINFO_RELEVANT_P (stmt_info)
              && (!STMT_VINFO_LIVE_P (stmt_info)
                  || STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def))
            continue;
          scalar_single_iter_cost += cost_for_stmt (stmt) * factor;
          vec_inside_cost += STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) * factor;
          /* FIXME: for stmts in the inner-loop in outer-loop vectorization,
             some of the "outside" costs are generated inside the outer-loop.  */
          vec_outside_cost += STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info);
        }
    }
  /* Add additional cost for the peeled instructions in prologue and epilogue
     loop.

     FORNOW: If we don't know the value of peel_iters for prologue or epilogue
     at compile-time - we assume it's vf/2 (the worst would be vf-1).

     TODO: Build an expression that represents peel_iters for prologue and
     epilogue to be used in a run-time test.  */

  if (byte_misalign < 0)
    {
      peel_iters_prologue = vf/2;
      if (vect_print_dump_info (REPORT_COST))
        fprintf (vect_dump, "cost model: "
                 "prologue peel iters set to vf/2.");

      /* If peeling for alignment is unknown, the loop bound of the main loop
         becomes unknown.  */
      peel_iters_epilogue = vf/2;
      if (vect_print_dump_info (REPORT_COST))
        fprintf (vect_dump, "cost model: "
                 "epilogue peel iters set to vf/2 because "
                 "peeling for alignment is unknown.");

      /* If peeled iterations are unknown, count a taken branch and a not taken
         branch per peeled loop.  Even if scalar loop iterations are known,
         vector iterations are not known since peeled prologue iterations are
         not known.  Hence guards remain the same.  */
      peel_guard_costs += 2 * (TARG_COND_TAKEN_BRANCH_COST
                               + TARG_COND_NOT_TAKEN_BRANCH_COST);
    }
  else
    {
      if (byte_misalign)
        {
          struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
          int element_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
          tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr)));
          int nelements = TYPE_VECTOR_SUBPARTS (vectype);

          peel_iters_prologue = nelements - (byte_misalign / element_size);
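          /* For example (values assumed for illustration, not real target
             numbers): with V4SI elements, element_size == 4 and
             nelements == 4, so byte_misalign == 8 yields
             peel_iters_prologue = 4 - (8 / 4) == 2 scalar iterations
             before the access becomes aligned.  */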
        }
      else
        peel_iters_prologue = 0;

      if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
        {
          peel_iters_epilogue = vf/2;
          if (vect_print_dump_info (REPORT_COST))
            fprintf (vect_dump, "cost model: "
                     "epilogue peel iters set to vf/2 because "
                     "loop iterations are unknown.");

          /* If peeled iterations are known but the number of scalar loop
             iterations is unknown, count a taken branch per peeled loop.  */
          peel_guard_costs += 2 * TARG_COND_TAKEN_BRANCH_COST;
        }
      else
        {
          int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
          peel_iters_prologue = niters < peel_iters_prologue ?
                                niters : peel_iters_prologue;
          peel_iters_epilogue = (niters - peel_iters_prologue) % vf;
        }
    }
  vec_outside_cost += (peel_iters_prologue * scalar_single_iter_cost)
                      + (peel_iters_epilogue * scalar_single_iter_cost)
                      + peel_guard_costs;
  /* FORNOW: The scalar outside cost is incremented in one of the
     following ways:

     1. The vectorizer checks for alignment and aliasing and generates
     a condition that allows dynamic vectorization.  A cost model
     check is ANDed with the versioning condition.  Hence scalar code
     path now has the added cost of the versioning check.

       if (cost > th & versioning_check)
         jmp to vector code

     Hence run-time scalar is incremented by not-taken branch cost.

     2. The vectorizer then checks if a prologue is required.  If the
     cost model check was not done before during versioning, it has to
     be done before the prologue check.

       if (cost <= th)
         prologue = scalar_iters
       if (prologue == 0)
         jmp to vector code
       else
         execute prologue
       if (prologue == num_iters)
         go to exit

     Hence the run-time scalar cost is incremented by a taken branch,
     plus a not-taken branch, plus a taken branch cost.

     3. The vectorizer then checks if an epilogue is required.  If the
     cost model check was not done before during the prologue check, it
     has to be done with the epilogue check.

       if (prologue == 0)
         jmp to vector code
       else
         execute prologue
       if (prologue == num_iters)
         go to exit
       vector code:
         if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
           jmp to epilogue

     Hence the run-time scalar cost should be incremented by 2 taken
     branches.

     TODO: The back end may reorder the BBs differently and reverse
     conditions/branch directions.  Change the estimates below to
     something more reasonable.  */
  if (runtime_test)
    {
      /* Cost model check occurs at versioning.  */
      if (VEC_length (gimple, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo))
          || VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
        scalar_outside_cost += TARG_COND_NOT_TAKEN_BRANCH_COST;
      else
        {
          /* Cost model check occurs at prologue generation.  */
          if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
            scalar_outside_cost += 2 * TARG_COND_TAKEN_BRANCH_COST
              + TARG_COND_NOT_TAKEN_BRANCH_COST;
          /* Cost model check occurs at epilogue generation.  */
          else
            scalar_outside_cost += 2 * TARG_COND_TAKEN_BRANCH_COST;
        }
    }
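  /* Worked example with assumed branch costs (say TARG_COND_TAKEN_BRANCH_COST
     == 3 and TARG_COND_NOT_TAKEN_BRANCH_COST == 1; both are hypothetical
     values, not any particular target's): a cost check folded into the
     versioning test adds 1 to the scalar path; one done at prologue
     generation adds 2*3 + 1 == 7; one deferred to epilogue generation
     adds 2*3 == 6.  */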
  slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
  for (i = 0; VEC_iterate (slp_instance, slp_instances, i, instance); i++)
    {
      vec_outside_cost += SLP_INSTANCE_OUTSIDE_OF_LOOP_COST (instance);
      vec_inside_cost += SLP_INSTANCE_INSIDE_OF_LOOP_COST (instance);
    }
  /* Calculate number of iterations required to make the vector version
     profitable, relative to the loop bodies only.  The following condition
     must hold true:
     SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
     where
     SIC = scalar iteration cost, VIC = vector iteration cost,
     VOC = vector outside cost, VF = vectorization factor,
     PL_ITERS = prologue iterations, EP_ITERS = epilogue iterations,
     SOC = scalar outside cost for run time cost model check.  */
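  /* Plugging in assumed numbers (illustrative only, not target costs):
     SIC == 4, VIC == 6, VOC == 20, SOC == 0, VF == 4 and no peeling.  Then
     4 * niters > 6 * (niters / 4) + 20 first holds at niters == 9, so the
     vector loop would only pay off from about that trip count on.  */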
  if ((scalar_single_iter_cost * vf) > vec_inside_cost)
    {
      if (vec_outside_cost <= 0)
        min_profitable_iters = 1;
      else
        {
          min_profitable_iters = ((vec_outside_cost - scalar_outside_cost) * vf
                                  - vec_inside_cost * peel_iters_prologue
                                  - vec_inside_cost * peel_iters_epilogue)
                                 / ((scalar_single_iter_cost * vf)
                                    - vec_inside_cost);

          if ((scalar_single_iter_cost * vf * min_profitable_iters)
              <= ((vec_inside_cost * min_profitable_iters)
                  + ((vec_outside_cost - scalar_outside_cost) * vf)))
            min_profitable_iters++;
        }
    }
  /* vector version will never be profitable.  */
  else
    {
      if (vect_print_dump_info (REPORT_COST))
        fprintf (vect_dump, "cost model: vector iteration cost = %d "
                 "is divisible by scalar iteration cost = %d by a factor "
                 "greater than or equal to the vectorization factor = %d.",
                 vec_inside_cost, scalar_single_iter_cost, vf);
      return -1;
    }
  if (vect_print_dump_info (REPORT_COST))
    {
      fprintf (vect_dump, "Cost model analysis: \n");
      fprintf (vect_dump, "  Vector inside of loop cost: %d\n",
               vec_inside_cost);
      fprintf (vect_dump, "  Vector outside of loop cost: %d\n",
               vec_outside_cost);
      fprintf (vect_dump, "  Scalar iteration cost: %d\n",
               scalar_single_iter_cost);
      fprintf (vect_dump, "  Scalar outside cost: %d\n", scalar_outside_cost);
      fprintf (vect_dump, "  prologue iterations: %d\n",
               peel_iters_prologue);
      fprintf (vect_dump, "  epilogue iterations: %d\n",
               peel_iters_epilogue);
      fprintf (vect_dump, "  Calculated minimum iters for profitability: %d\n",
               min_profitable_iters);
    }
  min_profitable_iters =
        min_profitable_iters < vf ? vf : min_profitable_iters;

  /* Because the condition we create is:
     if (niters <= min_profitable_iters)
       then skip the vectorized loop.  */
  min_profitable_iters--;

  if (vect_print_dump_info (REPORT_COST))
    fprintf (vect_dump, "  Profitability threshold = %d\n",
             min_profitable_iters);

  return min_profitable_iters;
}
/* TODO: Close dependency between vect_model_*_cost and vectorizable_*
   functions.  Design better to avoid maintenance issues.  */

/* Function vect_model_reduction_cost.

   Models cost for a reduction operation, including the vector ops
   generated within the strip-mine loop, the initial definition before
   the loop, and the epilogue code that must be generated.  */
static void
vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
                           int ncopies)
{
  int outer_cost = 0;
  enum tree_code code;
  optab optab;
  tree vectype;
  gimple stmt, orig_stmt;
  tree reduction_op;
  enum machine_mode mode;
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);

  /* Cost of reduction op inside loop.  */
  STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) += ncopies * TARG_VEC_STMT_COST;

  stmt = STMT_VINFO_STMT (stmt_info);

  switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
    {
    case GIMPLE_SINGLE_RHS:
      gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt)) == ternary_op);
      reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), 2);
      break;
    case GIMPLE_UNARY_RHS:
      reduction_op = gimple_assign_rhs1 (stmt);
      break;
    case GIMPLE_BINARY_RHS:
      reduction_op = gimple_assign_rhs2 (stmt);
      break;
    default:
      gcc_unreachable ();
    }
  vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
  if (!vectype)
    {
      if (vect_print_dump_info (REPORT_COST))
        {
          fprintf (vect_dump, "unsupported data-type ");
          print_generic_expr (vect_dump, TREE_TYPE (reduction_op), TDF_SLIM);
        }
      return;
    }

  mode = TYPE_MODE (vectype);
  orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);

  if (!orig_stmt)
    orig_stmt = STMT_VINFO_STMT (stmt_info);

  code = gimple_assign_rhs_code (orig_stmt);
  /* Add in cost for initial definition.  */
  outer_cost += TARG_SCALAR_TO_VEC_COST;

  /* Determine cost of epilogue code.

     We have a reduction operator that will reduce the vector in one statement.
     Also requires scalar extract.  */

  if (!nested_in_vect_loop_p (loop, orig_stmt))
    {
      if (reduc_code < NUM_TREE_CODES)
        outer_cost += TARG_VEC_STMT_COST + TARG_VEC_TO_SCALAR_COST;
      else
        {
          int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
          tree bitsize =
            TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
          int element_bitsize = tree_low_cst (bitsize, 1);
          int nelements = vec_size_in_bits / element_bitsize;

          optab = optab_for_tree_code (code, vectype, optab_default);

          /* We have a whole vector shift available.  */
          if (VECTOR_MODE_P (mode)
              && optab_handler (optab, mode)->insn_code != CODE_FOR_nothing
              && optab_handler (vec_shr_optab, mode)->insn_code != CODE_FOR_nothing)
            /* Final reduction via vector shifts and the reduction operator.
               Also requires scalar extract.  */
            outer_cost += ((exact_log2 (nelements) * 2) * TARG_VEC_STMT_COST
                           + TARG_VEC_TO_SCALAR_COST);
          else
            /* Use extracts and reduction op for final reduction.  For N
               elements, we have N extracts and N-1 reduction ops.  */
            outer_cost += ((nelements + nelements - 1) * TARG_VEC_STMT_COST);
        }
    }
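  /* Worked example (assumed unit costs, not target values): for a vector of
     nelements == 8, the shift-based epilogue costs exact_log2 (8) * 2 == 6
     vector stmts plus one extract, whereas the extract-based fallback costs
     8 + 7 == 15 vector stmts.  */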
  STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info) = outer_cost;

  if (vect_print_dump_info (REPORT_COST))
    fprintf (vect_dump, "vect_model_reduction_cost: inside_cost = %d, "
             "outside_cost = %d.", STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info),
             STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info));
}
/* Function vect_model_induction_cost.

   Models cost for induction operations.  */

static void
vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
{
  /* loop cost for vec_loop.  */
  STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) = ncopies * TARG_VEC_STMT_COST;
  /* prologue cost for vec_init and vec_step.  */
  STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info) = 2 * TARG_SCALAR_TO_VEC_COST;

  if (vect_print_dump_info (REPORT_COST))
    fprintf (vect_dump, "vect_model_induction_cost: inside_cost = %d, "
             "outside_cost = %d.", STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info),
             STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info));
}
/* Function vect_model_simple_cost.

   Models cost for simple operations, i.e. those that only emit ncopies of a
   single op.  Right now, this does not account for multiple insns that could
   be generated for the single vector op.  We will handle that shortly.  */

void
vect_model_simple_cost (stmt_vec_info stmt_info, int ncopies,
                        enum vect_def_type *dt, slp_tree slp_node)
{
  int i;
  int inside_cost = 0, outside_cost = 0;

  /* The SLP costs were already calculated during SLP tree build.  */
  if (PURE_SLP_STMT (stmt_info))
    return;

  inside_cost = ncopies * TARG_VEC_STMT_COST;

  /* FORNOW: Assuming maximum 2 args per stmt.  */
  for (i = 0; i < 2; i++)
    {
      if (dt[i] == vect_constant_def || dt[i] == vect_invariant_def)
        outside_cost += TARG_SCALAR_TO_VEC_COST;
    }

  if (vect_print_dump_info (REPORT_COST))
    fprintf (vect_dump, "vect_model_simple_cost: inside_cost = %d, "
             "outside_cost = %d.", inside_cost, outside_cost);

  /* Set the costs either in STMT_INFO or SLP_NODE (if exists).  */
  stmt_vinfo_set_inside_of_loop_cost (stmt_info, slp_node, inside_cost);
  stmt_vinfo_set_outside_of_loop_cost (stmt_info, slp_node, outside_cost);
}
/* Function vect_cost_strided_group_size

   For strided load or store, return the group_size only if it is the first
   load or store of a group, else return 1.  This ensures that group size is
   only returned once per group.  */

static int
vect_cost_strided_group_size (stmt_vec_info stmt_info)
{
  gimple first_stmt = DR_GROUP_FIRST_DR (stmt_info);

  if (first_stmt == STMT_VINFO_STMT (stmt_info))
    return DR_GROUP_SIZE (stmt_info);

  return 1;
}
/* Function vect_model_store_cost

   Models cost for stores.  In the case of strided accesses, one access
   has the overhead of the strided access attributed to it.  */

void
vect_model_store_cost (stmt_vec_info stmt_info, int ncopies,
                       enum vect_def_type dt, slp_tree slp_node)
{
  int group_size;
  int inside_cost = 0, outside_cost = 0;

  /* The SLP costs were already calculated during SLP tree build.  */
  if (PURE_SLP_STMT (stmt_info))
    return;

  if (dt == vect_constant_def || dt == vect_invariant_def)
    outside_cost = TARG_SCALAR_TO_VEC_COST;

  /* Strided access?  */
  if (DR_GROUP_FIRST_DR (stmt_info) && !slp_node)
    group_size = vect_cost_strided_group_size (stmt_info);
  /* Not a strided access.  */
  else
    group_size = 1;

  /* Is this an access in a group of stores, which provide strided access?
     If so, add in the cost of the permutes.  */
  if (group_size > 1)
    {
      /* Uses a high and a low interleave operation for each needed permute.  */
      inside_cost = ncopies * exact_log2 (group_size) * group_size
        * TARG_VEC_STMT_COST;
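      /* Illustrative numbers (not from the source): a group of 4 interleaved
         stores with ncopies == 2 adds 2 * log2 (4) * 4 == 16 vector permute
         stmts on top of the stores themselves.  */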
      if (vect_print_dump_info (REPORT_COST))
        fprintf (vect_dump, "vect_model_store_cost: strided group_size = %d.",
                 group_size);
    }

  /* Costs of the stores.  */
  inside_cost += ncopies * TARG_VEC_STORE_COST;

  if (vect_print_dump_info (REPORT_COST))
    fprintf (vect_dump, "vect_model_store_cost: inside_cost = %d, "
             "outside_cost = %d.", inside_cost, outside_cost);

  /* Set the costs either in STMT_INFO or SLP_NODE (if exists).  */
  stmt_vinfo_set_inside_of_loop_cost (stmt_info, slp_node, inside_cost);
  stmt_vinfo_set_outside_of_loop_cost (stmt_info, slp_node, outside_cost);
}
/* Function vect_model_load_cost

   Models cost for loads.  In the case of strided accesses, the last access
   has the overhead of the strided access attributed to it.  Since unaligned
   accesses are supported for loads, we also account for the costs of the
   access scheme chosen.  */

void
vect_model_load_cost (stmt_vec_info stmt_info, int ncopies, slp_tree slp_node)
{
  int group_size;
  int alignment_support_scheme;
  gimple first_stmt;
  struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info), *first_dr;
  int inside_cost = 0, outside_cost = 0;

  /* The SLP costs were already calculated during SLP tree build.  */
  if (PURE_SLP_STMT (stmt_info))
    return;
  /* Strided accesses?  */
  first_stmt = DR_GROUP_FIRST_DR (stmt_info);
  if (first_stmt && !slp_node)
    {
      group_size = vect_cost_strided_group_size (stmt_info);
      first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
    }
  /* Not a strided access.  */
  else
    {
      group_size = 1;
      first_dr = dr;
    }

  alignment_support_scheme = vect_supportable_dr_alignment (first_dr);
  /* Is this an access in a group of loads providing strided access?
     If so, add in the cost of the permutes.  */
  if (group_size > 1)
    {
      /* Uses an even and odd extract operation for each needed permute.  */
      inside_cost = ncopies * exact_log2 (group_size) * group_size
        * TARG_VEC_STMT_COST;

      if (vect_print_dump_info (REPORT_COST))
        fprintf (vect_dump, "vect_model_load_cost: strided group_size = %d.",
                 group_size);
    }
  /* The loads themselves.  */
  switch (alignment_support_scheme)
    {
    case dr_aligned:
      {
        inside_cost += ncopies * TARG_VEC_LOAD_COST;

        if (vect_print_dump_info (REPORT_COST))
          fprintf (vect_dump, "vect_model_load_cost: aligned.");

        break;
      }
    case dr_unaligned_supported:
      {
        /* Here, we assign an additional cost for the unaligned load.  */
        inside_cost += ncopies * TARG_VEC_UNALIGNED_LOAD_COST;

        if (vect_print_dump_info (REPORT_COST))
          fprintf (vect_dump, "vect_model_load_cost: unaligned supported by "
                   "hardware.");

        break;
      }
    case dr_explicit_realign:
      {
        inside_cost += ncopies * (2 * TARG_VEC_LOAD_COST + TARG_VEC_STMT_COST);

        /* FIXME: If the misalignment remains fixed across the iterations of
           the containing loop, the following cost should be added to the
           outside costs.  */
        if (targetm.vectorize.builtin_mask_for_load)
          inside_cost += TARG_VEC_STMT_COST;

        break;
      }
    case dr_explicit_realign_optimized:
      {
        if (vect_print_dump_info (REPORT_COST))
          fprintf (vect_dump, "vect_model_load_cost: unaligned software "
                   "pipelined.");

        /* Unaligned software pipeline has a load of an address, an initial
           load, and possibly a mask operation to "prime" the loop.  However,
           if this is an access in a group of loads, which provide strided
           access, then the above cost should only be considered for one
           access in the group.  Inside the loop, there is a load op
           and a realignment op.  */

        if ((!DR_GROUP_FIRST_DR (stmt_info)) || group_size > 1 || slp_node)
          {
            outside_cost = 2 * TARG_VEC_STMT_COST;
            if (targetm.vectorize.builtin_mask_for_load)
              outside_cost += TARG_VEC_STMT_COST;
          }

        inside_cost += ncopies * (TARG_VEC_LOAD_COST + TARG_VEC_STMT_COST);

        break;
      }
    default:
      gcc_unreachable ();
    }
  if (vect_print_dump_info (REPORT_COST))
    fprintf (vect_dump, "vect_model_load_cost: inside_cost = %d, "
             "outside_cost = %d.", inside_cost, outside_cost);

  /* Set the costs either in STMT_INFO or SLP_NODE (if exists).  */
  stmt_vinfo_set_inside_of_loop_cost (stmt_info, slp_node, inside_cost);
  stmt_vinfo_set_outside_of_loop_cost (stmt_info, slp_node, outside_cost);
}
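/* Recap of the per-copy inside cost by alignment scheme above, using assumed
   unit costs (vector load 1, unaligned vector load 2, vector stmt 1; all
   hypothetical): dr_aligned costs 1; dr_unaligned_supported costs 2;
   dr_explicit_realign costs 2*1 + 1 (plus 1 more if a mask load is needed);
   dr_explicit_realign_optimized costs 1 + 1 per copy, plus a one-time
   outside setup of 2 (or 3) vector stmts.  */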
/* Function vect_get_new_vect_var.

   Returns a name for a new variable.  The current naming scheme prepends the
   prefix "vect_" or "vect_p" (depending on the value of VAR_KIND) to
   the name of vectorizer generated variables, prepending it to NAME if
   provided.  */
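/* For example (illustrative): given NAME "in", vect_simple_var produces a
   variable named "vect_in" and vect_pointer_var produces "vect_pin"; with a
   NULL NAME just the prefix is used.  */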
static tree
vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
{
  const char *prefix;
  tree new_vect_var;

  switch (var_kind)
    {
    case vect_simple_var:
      prefix = "vect_";
      break;
    case vect_scalar_var:
      prefix = "stmp_";
      break;
    case vect_pointer_var:
      prefix = "vect_p";
      break;
    default:
      gcc_unreachable ();
    }

  if (name)
    {
      char* tmp = concat (prefix, name, NULL);
      new_vect_var = create_tmp_var (type, tmp);
      free (tmp);
    }
  else
    new_vect_var = create_tmp_var (type, prefix);

  /* Mark vector typed variable as a gimple register variable.  */
  if (TREE_CODE (type) == VECTOR_TYPE)
    DECL_GIMPLE_REG_P (new_vect_var) = true;

  return new_vect_var;
}
/* Function vect_create_addr_base_for_vector_ref.

   Create an expression that computes the address of the first memory location
   that will be accessed for a data reference.

   Input:
   STMT: The statement containing the data reference.
   NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
   OFFSET: Optional.  If supplied, it is added to the initial address.
   LOOP:    Specify relative to which loop-nest should the address be computed.
            For example, when the dataref is in an inner-loop nested in an
            outer-loop that is now being vectorized, LOOP can be either the
            outer-loop, or the inner-loop.  The first memory location accessed
            by the following dataref ('in' points to short):

                for (i=0; i<N; i++)
                  for (j=0; j<M; j++)
                    s += in[i+j]

            is as follows:
                if LOOP=i_loop: &in            (relative to i_loop)
                if LOOP=j_loop: &in+i*2B       (relative to j_loop)

   Output:
   1. Return an SSA_NAME whose value is the address of the memory location of
      the first vector of the data reference.
   2. If new_stmt_list is not NULL_TREE after return then the caller must insert
      these statement(s) which define the returned SSA_NAME.

   FORNOW: We are only handling array accesses with step 1.  */
static tree
vect_create_addr_base_for_vector_ref (gimple stmt,
                                      gimple_seq *new_stmt_list,
                                      tree offset,
                                      struct loop *loop)
{
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
  struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
  tree data_ref_base = unshare_expr (DR_BASE_ADDRESS (dr));
  tree base_name;
  tree data_ref_base_var;
  tree vec_stmt;
  tree addr_base, addr_expr;
  tree dest;
  gimple_seq seq = NULL;
  tree base_offset = unshare_expr (DR_OFFSET (dr));
  tree init = unshare_expr (DR_INIT (dr));
  tree vect_ptr_type, addr_expr2;
  tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));

  if (loop != containing_loop)
    {
      loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
      struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);

      gcc_assert (nested_in_vect_loop_p (loop, stmt));

      data_ref_base = unshare_expr (STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
      base_offset = unshare_expr (STMT_VINFO_DR_OFFSET (stmt_info));
      init = unshare_expr (STMT_VINFO_DR_INIT (stmt_info));
    }

  /* Create data_ref_base */
  base_name = build_fold_indirect_ref (data_ref_base);
  data_ref_base_var = create_tmp_var (TREE_TYPE (data_ref_base), "batmp");
  add_referenced_var (data_ref_base_var);
  data_ref_base = force_gimple_operand (data_ref_base, &seq, true,
                                        data_ref_base_var);
  gimple_seq_add_seq (new_stmt_list, seq);

  /* Create base_offset */
  base_offset = size_binop (PLUS_EXPR, base_offset, init);
  base_offset = fold_convert (sizetype, base_offset);
  dest = create_tmp_var (TREE_TYPE (base_offset), "base_off");
  add_referenced_var (dest);
  base_offset = force_gimple_operand (base_offset, &seq, true, dest);
  gimple_seq_add_seq (new_stmt_list, seq);

  if (offset)
    {
      tree tmp = create_tmp_var (sizetype, "offset");

      add_referenced_var (tmp);
      offset = fold_build2 (MULT_EXPR, TREE_TYPE (offset), offset, step);
      base_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (base_offset),
                                 base_offset, offset);
      base_offset = force_gimple_operand (base_offset, &seq, false, tmp);
      gimple_seq_add_seq (new_stmt_list, seq);
    }

  /* base + base_offset */
  addr_base = fold_build2 (POINTER_PLUS_EXPR, TREE_TYPE (data_ref_base),
                           data_ref_base, base_offset);

  vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));

  /* addr_expr = addr_base */
  addr_expr = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var,
                                     get_name (base_name));
  add_referenced_var (addr_expr);
  vec_stmt = fold_convert (vect_ptr_type, addr_base);
  addr_expr2 = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var,
                                      get_name (base_name));
  add_referenced_var (addr_expr2);
  vec_stmt = force_gimple_operand (vec_stmt, &seq, false, addr_expr2);
  gimple_seq_add_seq (new_stmt_list, seq);

  if (vect_print_dump_info (REPORT_DETAILS))
    {
      fprintf (vect_dump, "created ");
      print_generic_expr (vect_dump, vec_stmt, TDF_SLIM);
    }
  return vec_stmt;
}
/* Function vect_create_data_ref_ptr.

   Create a new pointer to vector type (vp), that points to the first location
   accessed in the loop by STMT, along with the def-use update chain to
   appropriately advance the pointer through the loop iterations.  Also set
   aliasing information for the pointer.  This vector pointer is used by the
   callers to this function to create a memory reference expression for vector
   load/store access.

   Input:
   1. STMT: a stmt that references memory.  Expected to be of the form
         GIMPLE_ASSIGN <name, data-ref> or
         GIMPLE_ASSIGN <data-ref, name>.
   2. AT_LOOP: the loop where the vector memref is to be created.
   3. OFFSET (optional): an offset to be added to the initial address accessed
        by the data-ref in STMT.
   4. ONLY_INIT: indicate if vp is to be updated in the loop, or remain
        pointing to the initial address.

   Output:
   1. Declare a new ptr to vector_type, and have it point to the base of the
      data reference (initial address accessed by the data reference).
      For example, for vector of type V8HI, the following code is generated:

      v8hi *vp;
      vp = (v8hi *)initial_address;

      if OFFSET is not supplied:
         initial_address = &a[init];
      if OFFSET is supplied:
         initial_address = &a[init + OFFSET];

      Return the initial_address in INITIAL_ADDRESS.

   2. If ONLY_INIT is true, just return the initial pointer.  Otherwise, also
      update the pointer in each iteration of the loop.

      Return the increment stmt that updates the pointer in PTR_INCR.

   3. Set INV_P to true if the access pattern of the data reference in the
      vectorized loop is invariant.  Set it to false otherwise.

   4. Return the pointer.  */
static tree
vect_create_data_ref_ptr (gimple stmt, struct loop *at_loop,
                          tree offset, tree *initial_address, gimple *ptr_incr,
                          bool only_init, bool *inv_p)
{
  tree base_name;
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  bool nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
  struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
  tree vect_ptr_type;
  tree vect_ptr;
  tree tag;
  tree new_temp;
  gimple vec_stmt;
  gimple_seq new_stmt_list = NULL;
  edge pe;
  basic_block new_bb;
  tree vect_ptr_init;
  struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
  tree vptr;
  gimple_stmt_iterator incr_gsi;
  bool insert_after;
  tree indx_before_incr, indx_after_incr;
  gimple incr;
  tree step;

  /* Check the step (evolution) of the load in LOOP, and record
     whether it's invariant.  */
  if (nested_in_vect_loop)
    step = STMT_VINFO_DR_STEP (stmt_info);
  else
    step = DR_STEP (STMT_VINFO_DATA_REF (stmt_info));

  if (tree_int_cst_compare (step, size_zero_node) == 0)
    *inv_p = true;
  else
    *inv_p = false;

  /* Create an expression for the first address accessed by this load
     in LOOP.  */
  base_name = build_fold_indirect_ref (unshare_expr (DR_BASE_ADDRESS (dr)));

  if (vect_print_dump_info (REPORT_DETAILS))
    {
      tree data_ref_base = base_name;
      fprintf (vect_dump, "create vector-pointer variable to type: ");
      print_generic_expr (vect_dump, vectype, TDF_SLIM);
      if (TREE_CODE (data_ref_base) == VAR_DECL)
        fprintf (vect_dump, "  vectorizing a one dimensional array ref: ");
      else if (TREE_CODE (data_ref_base) == ARRAY_REF)
        fprintf (vect_dump, "  vectorizing a multidimensional array ref: ");
      else if (TREE_CODE (data_ref_base) == COMPONENT_REF)
        fprintf (vect_dump, "  vectorizing a record based array ref: ");
      else if (TREE_CODE (data_ref_base) == SSA_NAME)
        fprintf (vect_dump, "  vectorizing a pointer ref: ");
      print_generic_expr (vect_dump, base_name, TDF_SLIM);
    }

  /** (1) Create the new vector-pointer variable:  **/
  vect_ptr_type = build_pointer_type (vectype);
  vect_ptr = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var,
                                    get_name (base_name));
  add_referenced_var (vect_ptr);

  /** (2) Add aliasing information to the new vector-pointer:
          (The points-to info (DR_PTR_INFO) may be defined later.)  **/

  tag = DR_SYMBOL_TAG (dr);

  /* If tag is a variable (and NOT_A_TAG) then a new symbol memory
     tag must be created with tag added to its may alias list.  */
  if (!MTAG_P (tag))
    new_type_alias (vect_ptr, tag, DR_REF (dr));
  else
    set_symbol_mem_tag (vect_ptr, tag);
  /** Note: If the dataref is in an inner-loop nested in LOOP, and we are
      vectorizing LOOP (i.e. outer-loop vectorization), we need to create two
      def-use update cycles for the pointer: one relative to the outer-loop
      (LOOP), which is what steps (3) and (4) below do.  The other is relative
      to the inner-loop (which is the inner-most loop containing the dataref),
      and this is done by step (5) below.

      When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
      inner-most loop, and so steps (3),(4) work the same, and step (5) is
      redundant.  Steps (3),(4) create the following:

        vp0 = &base_addr;
        LOOP:   vp1 = phi(vp0,vp2)
                ...
                ...
                vp2 = vp1 + step
                goto LOOP

      If there is an inner-loop nested in loop, then step (5) will also be
      applied, and an additional update in the inner-loop will be created:

        vp0 = &base_addr;
        LOOP:   vp1 = phi(vp0,vp2)
                ...
        inner:     vp3 = phi(vp1,vp4)
                   vp4 = vp3 + inner_step
                   if () goto inner
                ...
                vp2 = vp1 + step
                if () goto LOOP   */
  /** (3) Calculate the initial address of the vector-pointer, and set
          the vector-pointer to point to it before the loop:  **/

  /* Create: (&(base[init_val+offset]) in the loop preheader.  */

  new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
                                                   offset, loop);
  pe = loop_preheader_edge (loop);
  if (new_stmt_list)
    {
      new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
      gcc_assert (!new_bb);
    }

  *initial_address = new_temp;

  /* Create: p = (vectype *) initial_base  */
  vec_stmt = gimple_build_assign (vect_ptr,
                                  fold_convert (vect_ptr_type, new_temp));
  vect_ptr_init = make_ssa_name (vect_ptr, vec_stmt);
  gimple_assign_set_lhs (vec_stmt, vect_ptr_init);
  new_bb = gsi_insert_on_edge_immediate (pe, vec_stmt);
  gcc_assert (!new_bb);
  /** (4) Handle the updating of the vector-pointer inside the loop.
          This is needed when ONLY_INIT is false, and also when AT_LOOP
          is the inner-loop nested in LOOP (during outer-loop vectorization).
   **/

  if (only_init && at_loop == loop) /* No update in loop is required.  */
    {
      /* Copy the points-to information if it exists.  */
      if (DR_PTR_INFO (dr))
        duplicate_ssa_name_ptr_info (vect_ptr_init, DR_PTR_INFO (dr));
      vptr = vect_ptr_init;
    }
  else
    {
      /* The step of the vector pointer is the Vector Size.  */
      tree step = TYPE_SIZE_UNIT (vectype);
      /* One exception to the above is when the scalar step of the load in
         LOOP is zero.  In this case the step here is also zero.  */
      if (*inv_p)
        step = size_zero_node;

      standard_iv_increment_position (loop, &incr_gsi, &insert_after);

      create_iv (vect_ptr_init,
                 fold_convert (vect_ptr_type, step),
                 NULL_TREE, loop, &incr_gsi, insert_after,
                 &indx_before_incr, &indx_after_incr);
      incr = gsi_stmt (incr_gsi);
      set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));

      /* Copy the points-to information if it exists.  */
      if (DR_PTR_INFO (dr))
        {
          duplicate_ssa_name_ptr_info (indx_before_incr, DR_PTR_INFO (dr));
          duplicate_ssa_name_ptr_info (indx_after_incr, DR_PTR_INFO (dr));
        }
      merge_alias_info (vect_ptr_init, indx_before_incr);
      merge_alias_info (vect_ptr_init, indx_after_incr);
      if (ptr_incr)
        *ptr_incr = incr;

      vptr = indx_before_incr;
    }
  if (!nested_in_vect_loop || only_init)
    return vptr;

  /** (5) Handle the updating of the vector-pointer inside the inner-loop
          nested in LOOP, if it exists:  **/

  gcc_assert (nested_in_vect_loop);
  if (!only_init)
    {
      standard_iv_increment_position (containing_loop, &incr_gsi,
                                      &insert_after);
      create_iv (vptr, fold_convert (vect_ptr_type, DR_STEP (dr)), NULL_TREE,
                 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
                 &indx_after_incr);
      incr = gsi_stmt (incr_gsi);
      set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));

      /* Copy the points-to information if it exists.  */
      if (DR_PTR_INFO (dr))
        {
          duplicate_ssa_name_ptr_info (indx_before_incr, DR_PTR_INFO (dr));
          duplicate_ssa_name_ptr_info (indx_after_incr, DR_PTR_INFO (dr));
        }
      merge_alias_info (vect_ptr_init, indx_before_incr);
      merge_alias_info (vect_ptr_init, indx_after_incr);
      if (ptr_incr)
        *ptr_incr = incr;

      return indx_before_incr;
    }
  else
    gcc_unreachable ();
}
/* Function bump_vector_ptr

   Increment a pointer (to a vector type) by vector-size.  If requested,
   i.e. if PTR-INCR is given, then also connect the new increment stmt
   to the existing def-use update-chain of the pointer, by modifying
   the PTR_INCR as illustrated below:

   The pointer def-use update-chain before this function:
                        DATAREF_PTR = phi (p_0, p_2)
                        ....
        PTR_INCR:       p_2 = DATAREF_PTR + step

   The pointer def-use update-chain after this function:
                        DATAREF_PTR = phi (p_0, p_2)
                        ....
        NEW_DATAREF_PTR = DATAREF_PTR + BUMP
                        ....
        PTR_INCR:       p_2 = NEW_DATAREF_PTR + step

   Input:
   DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
                 in the loop.
   PTR_INCR - optional.  The stmt that updates the pointer in each iteration of
              the loop.  The increment amount across iterations is expected
              to be vector_size.
   BSI - location where the new update stmt is to be placed.
   STMT - the original scalar memory-access stmt that is being vectorized.
   BUMP - optional.  The offset by which to bump the pointer.  If not given,
          the offset is assumed to be vector_size.

   Output: Return NEW_DATAREF_PTR as illustrated above.  */
static tree
bump_vector_ptr (tree dataref_ptr, gimple ptr_incr, gimple_stmt_iterator *gsi,
                 gimple stmt, tree bump)
{
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
  tree ptr_var = SSA_NAME_VAR (dataref_ptr);
  tree update = TYPE_SIZE_UNIT (vectype);
  gimple incr_stmt;
  ssa_op_iter iter;
  use_operand_p use_p;
  tree new_dataref_ptr;

  if (bump)
    update = bump;

  incr_stmt = gimple_build_assign_with_ops (POINTER_PLUS_EXPR, ptr_var,
                                            dataref_ptr, update);
  new_dataref_ptr = make_ssa_name (ptr_var, incr_stmt);
  gimple_assign_set_lhs (incr_stmt, new_dataref_ptr);
  vect_finish_stmt_generation (stmt, incr_stmt, gsi);

  /* Copy the points-to information if it exists.  */
  if (DR_PTR_INFO (dr))
    duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
  merge_alias_info (new_dataref_ptr, dataref_ptr);

  if (!ptr_incr)
    return new_dataref_ptr;

  /* Update the vector-pointer's cross-iteration increment.  */
  FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
    {
      tree use = USE_FROM_PTR (use_p);

      if (use == dataref_ptr)
        SET_USE (use_p, new_dataref_ptr);
      else
        gcc_assert (tree_int_cst_compare (use, update) == 0);
    }

  return new_dataref_ptr;
}
/* Function vect_create_destination_var.

   Create a new temporary of type VECTYPE.  */

static tree
vect_create_destination_var (tree scalar_dest, tree vectype)
{
  tree vec_dest;
  const char *new_name;
  tree type;
  enum vect_var_kind kind;

  kind = vectype ? vect_simple_var : vect_scalar_var;
  type = vectype ? vectype : TREE_TYPE (scalar_dest);

  gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);

  new_name = get_name (scalar_dest);
  if (!new_name)
    new_name = "var_";
  vec_dest = vect_get_new_vect_var (type, kind, new_name);
  add_referenced_var (vec_dest);

  return vec_dest;
}
/* Function vect_init_vector.

   Insert a new stmt (INIT_STMT) that initializes a new vector variable with
   the vector elements of VECTOR_VAR.  Place the initialization at BSI if it
   is not NULL.  Otherwise, place the initialization at the loop preheader.
   Return the DEF of INIT_STMT.
   It will be used in the vectorization of STMT.  */

static tree
vect_init_vector (gimple stmt, tree vector_var, tree vector_type,
                  gimple_stmt_iterator *gsi)
{
  stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
  tree new_var;
  gimple init_stmt;
  tree vec_oprnd;
  edge pe;
  tree new_temp;
  basic_block new_bb;

  new_var = vect_get_new_vect_var (vector_type, vect_simple_var, "cst_");
  add_referenced_var (new_var);
  init_stmt = gimple_build_assign (new_var, vector_var);
  new_temp = make_ssa_name (new_var, init_stmt);
  gimple_assign_set_lhs (init_stmt, new_temp);

  if (gsi)
    vect_finish_stmt_generation (stmt, init_stmt, gsi);
  else
    {
      loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
      struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);

      if (nested_in_vect_loop_p (loop, stmt))
        loop = loop->inner;
      pe = loop_preheader_edge (loop);
      new_bb = gsi_insert_on_edge_immediate (pe, init_stmt);
      gcc_assert (!new_bb);
    }

  if (vect_print_dump_info (REPORT_DETAILS))
    {
      fprintf (vect_dump, "created new init_stmt: ");
      print_gimple_stmt (vect_dump, init_stmt, 0, TDF_SLIM);
    }

  vec_oprnd = gimple_assign_lhs (init_stmt);
  return vec_oprnd;
}
/* For constant and loop invariant defs of SLP_NODE this function returns
   (vector) defs (VEC_OPRNDS) that will be used in the vectorized stmts.
   OP_NUM determines if we gather defs for operand 0 or operand 1 of the scalar
   stmts.  */

static void
vect_get_constant_vectors (slp_tree slp_node, VEC(tree,heap) **vec_oprnds,
                           unsigned int op_num)
{
  VEC (gimple, heap) *stmts = SLP_TREE_SCALAR_STMTS (slp_node);
  gimple stmt = VEC_index (gimple, stmts, 0);
  stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
  tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
  int nunits = TYPE_VECTOR_SUBPARTS (vectype);
  tree vec_cst;
  tree t = NULL_TREE;
  int j, number_of_places_left_in_vector;
  tree vector_type;
  tree op, vop;
  int group_size = VEC_length (gimple, stmts);
  unsigned int vec_num, i;
  int number_of_copies = 1;
  bool is_store = false;
  unsigned int number_of_vectors = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
  VEC (tree, heap) *voprnds = VEC_alloc (tree, heap, number_of_vectors);
  bool constant_p = true;

  if (STMT_VINFO_DATA_REF (stmt_vinfo))
    is_store = true;

  /* NUMBER_OF_COPIES is the number of times we need to use the same values in
     created vectors.  It is greater than 1 if unrolling is performed.

     For example, we have two scalar operands, s1 and s2 (e.g., group of
     strided accesses of size two), while NUNITS is four (i.e., four scalars
     of this type can be packed in a vector).  The output vector will contain
     two copies of each scalar operand: {s1, s2, s1, s2}.  (NUMBER_OF_COPIES
     will be 2).

     If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
     containing the operands.

     For example, NUNITS is four as before, and the group size is 8
     (s1, s2, ..., s8).  We will create two vectors {s1, s2, s3, s4} and
     {s5, s6, s7, s8}.  */
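  /* A case where the group size does not divide NUNITS evenly
     (illustrative, not from the original comment): NUNITS == 4 and
     GROUP_SIZE == 6 give least_common_multiple (4, 6) / 6 == 2 copies,
     i.e. three vectors {s1,s2,s3,s4}, {s5,s6,s1,s2}, {s3,s4,s5,s6}.  */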
  number_of_copies = least_common_multiple (nunits, group_size) / group_size;

  number_of_places_left_in_vector = nunits;
  for (j = 0; j < number_of_copies; j++)
    {
      for (i = group_size - 1; VEC_iterate (gimple, stmts, i, stmt); i--)
        {
          if (is_store)
            op = gimple_assign_rhs1 (stmt);
          else
            op = gimple_op (stmt, op_num + 1);
          if (!CONSTANT_CLASS_P (op))
            constant_p = false;

          /* Create 'vect_ = {op0,op1,...,opn}'.  */
          t = tree_cons (NULL_TREE, op, t);
          number_of_places_left_in_vector--;

          if (number_of_places_left_in_vector == 0)
            {
              number_of_places_left_in_vector = nunits;

              vector_type = get_vectype_for_scalar_type (TREE_TYPE (op));
              gcc_assert (vector_type);
              if (constant_p)
                vec_cst = build_vector (vector_type, t);
              else
                vec_cst = build_constructor_from_list (vector_type, t);
              VEC_quick_push (tree, voprnds,
                              vect_init_vector (stmt, vec_cst, vector_type,
                                                NULL));
              t = NULL_TREE;
            }
        }
    }

  /* Since the vectors are created in the reverse order, we should invert
     them.  */
  vec_num = VEC_length (tree, voprnds);
  for (j = vec_num - 1; j >= 0; j--)
    {
      vop = VEC_index (tree, voprnds, j);
      VEC_quick_push (tree, *vec_oprnds, vop);
    }

  VEC_free (tree, heap, voprnds);

  /* In case that VF is greater than the unrolling factor needed for the SLP
     group of stmts, NUMBER_OF_VECTORS to be created is greater than
     NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
     to replicate the vectors.  */
  while (number_of_vectors > VEC_length (tree, *vec_oprnds))
    {
      for (i = 0; VEC_iterate (tree, *vec_oprnds, i, vop) && i < vec_num; i++)
        VEC_quick_push (tree, *vec_oprnds, vop);
    }
}
/* Get vectorized definitions from SLP_NODE that contains corresponding
   vectorized def-stmts.  */

static void
vect_get_slp_vect_defs (slp_tree slp_node, VEC (tree,heap) **vec_oprnds)
{
  tree vec_oprnd;
  gimple vec_def_stmt;
  unsigned int i;

  gcc_assert (SLP_TREE_VEC_STMTS (slp_node));

  for (i = 0;
       VEC_iterate (gimple, SLP_TREE_VEC_STMTS (slp_node), i, vec_def_stmt);
       i++)
    {
      gcc_assert (vec_def_stmt);
      vec_oprnd = gimple_get_lhs (vec_def_stmt);
      VEC_quick_push (tree, *vec_oprnds, vec_oprnd);
    }
}
/* Get vectorized definitions for SLP_NODE.
   If the scalar definitions are loop invariants or constants, collect them and
   call vect_get_constant_vectors() to create vector stmts.
   Otherwise, the def-stmts must be already vectorized and the vectorized stmts
   must be stored in the LEFT/RIGHT node of SLP_NODE, and we call
   vect_get_slp_vect_defs() to retrieve them.
   If VEC_OPRNDS1 is NULL, don't get vector defs for the second operand (from
   the right node).  This is used when the second operand must remain scalar.  */

static void
vect_get_slp_defs (slp_tree slp_node, VEC (tree,heap) **vec_oprnds0,
                   VEC (tree,heap) **vec_oprnds1)
{
  gimple first_stmt;
  enum tree_code code;

  /* Allocate memory for vectorized defs.  */
  *vec_oprnds0 = VEC_alloc (tree, heap,
                            SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node));
  /* SLP_NODE corresponds either to a group of stores or to a group of
     unary/binary operations.  We don't call this function for loads.  */
  if (SLP_TREE_LEFT (slp_node))
    /* The defs are already vectorized.  */
    vect_get_slp_vect_defs (SLP_TREE_LEFT (slp_node), vec_oprnds0);
  else
    /* Build vectors from scalar defs.  */
    vect_get_constant_vectors (slp_node, vec_oprnds0, 0);

  first_stmt = VEC_index (gimple, SLP_TREE_SCALAR_STMTS (slp_node), 0);
  if (STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt)))
    /* Since we don't call this function with loads, this is a group of
       stores.  */
    return;

  code = gimple_assign_rhs_code (first_stmt);
  if (get_gimple_rhs_class (code) != GIMPLE_BINARY_RHS || !vec_oprnds1)
    return;

  *vec_oprnds1 = VEC_alloc (tree, heap,
                            SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node));

  if (SLP_TREE_RIGHT (slp_node))
    /* The defs are already vectorized.  */
    vect_get_slp_vect_defs (SLP_TREE_RIGHT (slp_node), vec_oprnds1);
  else
    /* Build vectors from scalar defs.  */
    vect_get_constant_vectors (slp_node, vec_oprnds1, 1);
}
/* Function get_initial_def_for_induction

   Input:
   STMT - a stmt that performs an induction operation in the loop.
   IV_PHI - the initial value of the induction variable

   Output:
   Return a vector variable, initialized with the first VF values of
   the induction variable.  E.g., for an iv with IV_PHI='X' and
   evolution S, for a vector of 4 units, we want to return:
   [X, X + S, X + 2*S, X + 3*S].  */
static tree
get_initial_def_for_induction (gimple iv_phi)
{
  stmt_vec_info stmt_vinfo = vinfo_for_stmt (iv_phi);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  tree scalar_type = TREE_TYPE (gimple_phi_result (iv_phi));
  tree vectype;
  int nunits;
  edge pe = loop_preheader_edge (loop);
  struct loop *iv_loop;
  basic_block new_bb;
  tree vec, vec_init, vec_step, t;
  tree access_fn;
  tree new_var;
  tree new_name;
  gimple init_stmt, induction_phi, new_stmt;
  tree induc_def, vec_def, vec_dest;
  tree init_expr, step_expr;
  int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
  int i;
  bool ok;
  int ncopies;
  tree expr;
  stmt_vec_info phi_info = vinfo_for_stmt (iv_phi);
  bool nested_in_vect_loop = false;
  gimple_seq stmts = NULL;
  imm_use_iterator imm_iter;
  use_operand_p use_p;
  gimple exit_phi;
  edge latch_e;
  tree loop_arg;
  gimple_stmt_iterator si;
  basic_block bb = gimple_bb (iv_phi);

  vectype = get_vectype_for_scalar_type (scalar_type);
  gcc_assert (vectype);
  nunits = TYPE_VECTOR_SUBPARTS (vectype);
  ncopies = vf / nunits;

  gcc_assert (phi_info);
  gcc_assert (ncopies >= 1);

  /* Find the first insertion point in the BB.  */
  si = gsi_after_labels (bb);
  if (INTEGRAL_TYPE_P (scalar_type) || POINTER_TYPE_P (scalar_type))
    step_expr = build_int_cst (scalar_type, 0);
  else
    step_expr = build_real (scalar_type, dconst0);

  /* Is phi in an inner-loop, while vectorizing an enclosing outer-loop?  */
  if (nested_in_vect_loop_p (loop, iv_phi))
    {
      nested_in_vect_loop = true;
      iv_loop = loop->inner;
    }
  else
    iv_loop = loop;
  gcc_assert (iv_loop == (gimple_bb (iv_phi))->loop_father);

  latch_e = loop_latch_edge (iv_loop);
  loop_arg = PHI_ARG_DEF_FROM_EDGE (iv_phi, latch_e);

  access_fn = analyze_scalar_evolution (iv_loop, PHI_RESULT (iv_phi));
  gcc_assert (access_fn);
  ok = vect_is_simple_iv_evolution (iv_loop->num, access_fn,
                                    &init_expr, &step_expr);
  gcc_assert (ok);
  pe = loop_preheader_edge (iv_loop);
  /* Create the vector that holds the initial_value of the induction.  */
  if (nested_in_vect_loop)
    {
      /* iv_loop is nested in the loop to be vectorized.  init_expr had already
         been created during vectorization of previous stmts; We obtain it from
         the STMT_VINFO_VEC_STMT of the defining stmt.  */
      tree iv_def = PHI_ARG_DEF_FROM_EDGE (iv_phi, loop_preheader_edge (iv_loop));
      vec_init = vect_get_vec_def_for_operand (iv_def, iv_phi, NULL);
    }
  else
    {
      /* iv_loop is the loop to be vectorized.  Create:
         vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr)  */
      new_var = vect_get_new_vect_var (scalar_type, vect_scalar_var, "var_");
      add_referenced_var (new_var);

      new_name = force_gimple_operand (init_expr, &stmts, false, new_var);
      if (stmts)
        {
          new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
          gcc_assert (!new_bb);
        }

      t = NULL_TREE;
      t = tree_cons (NULL_TREE, init_expr, t);
      for (i = 1; i < nunits; i++)
        {
          /* Create: new_name_i = new_name + step_expr  */
          enum tree_code code = POINTER_TYPE_P (scalar_type)
                                ? POINTER_PLUS_EXPR : PLUS_EXPR;
          init_stmt = gimple_build_assign_with_ops (code, new_var,
                                                    new_name, step_expr);
          new_name = make_ssa_name (new_var, init_stmt);
          gimple_assign_set_lhs (init_stmt, new_name);

          new_bb = gsi_insert_on_edge_immediate (pe, init_stmt);
          gcc_assert (!new_bb);

          if (vect_print_dump_info (REPORT_DETAILS))
            {
              fprintf (vect_dump, "created new init_stmt: ");
              print_gimple_stmt (vect_dump, init_stmt, 0, TDF_SLIM);
            }
          t = tree_cons (NULL_TREE, new_name, t);
        }
      /* Create a vector from [new_name_0, new_name_1, ..., new_name_nunits-1]  */
      vec = build_constructor_from_list (vectype, nreverse (t));
      vec_init = vect_init_vector (iv_phi, vec, vectype, NULL);
    }
  /* Create the vector that holds the step of the induction.  */
  if (nested_in_vect_loop)
    /* iv_loop is nested in the loop to be vectorized.  Generate:
       vec_step = [S, S, S, S]  */
    new_name = step_expr;
  else
    {
      /* iv_loop is the loop to be vectorized.  Generate:
         vec_step = [VF*S, VF*S, VF*S, VF*S]  */
      expr = build_int_cst (scalar_type, vf);
      new_name = fold_build2 (MULT_EXPR, scalar_type, expr, step_expr);
    }

  t = NULL_TREE;
  for (i = 0; i < nunits; i++)
    t = tree_cons (NULL_TREE, unshare_expr (new_name), t);
  gcc_assert (CONSTANT_CLASS_P (new_name));
  vec = build_vector (vectype, t);
  vec_step = vect_init_vector (iv_phi, vec, vectype, NULL);
  /* Create the following def-use cycle:
     loop prolog:
         vec_init = ...
         vec_step = ...
     loop:
         vec_iv = PHI <vec_init, vec_loop>
         ...
         STMT
         ...
         vec_loop = vec_iv + vec_step;  */

  /* Create the induction-phi that defines the induction-operand.  */
  vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
  add_referenced_var (vec_dest);
  induction_phi = create_phi_node (vec_dest, iv_loop->header);
  set_vinfo_for_stmt (induction_phi,
                      new_stmt_vec_info (induction_phi, loop_vinfo));
  induc_def = PHI_RESULT (induction_phi);

  /* Create the iv update inside the loop  */
  new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, vec_dest,
                                           induc_def, vec_step);
  vec_def = make_ssa_name (vec_dest, new_stmt);
  gimple_assign_set_lhs (new_stmt, vec_def);
  gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
  set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo));

  /* Set the arguments of the phi node:  */
  add_phi_arg (induction_phi, vec_init, pe);
  add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop));
  /* In case that vectorization factor (VF) is bigger than the number
     of elements that we can fit in a vectype (nunits), we have to generate
     more than one vector stmt - i.e - we need to "unroll" the
     vector stmt by a factor VF/nunits.  For more details see documentation
     in vectorizable_operation.  */
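  /* Illustration (assumed sizes, not from the source): with VF == 8 and
     nunits == 4, ncopies == 2, so one extra vector of inductions is
     generated below, offset from the first by a step of nunits * S per
     element.  */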
  if (ncopies > 1)
    {
      stmt_vec_info prev_stmt_vinfo;
      /* FORNOW. This restriction should be relaxed.  */
      gcc_assert (!nested_in_vect_loop);

      /* Create the vector that holds the step of the induction.  */
      expr = build_int_cst (scalar_type, nunits);
      new_name = fold_build2 (MULT_EXPR, scalar_type, expr, step_expr);
      t = NULL_TREE;
      for (i = 0; i < nunits; i++)
        t = tree_cons (NULL_TREE, unshare_expr (new_name), t);
      gcc_assert (CONSTANT_CLASS_P (new_name));
      vec = build_vector (vectype, t);
      vec_step = vect_init_vector (iv_phi, vec, vectype, NULL);

      vec_def = induc_def;
      prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
      for (i = 1; i < ncopies; i++)
        {
          /* vec_i = vec_prev + vec_step  */
          new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, vec_dest,
                                                   vec_def, vec_step);
          vec_def = make_ssa_name (vec_dest, new_stmt);
          gimple_assign_set_lhs (new_stmt, vec_def);

          gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
          set_vinfo_for_stmt (new_stmt,
                              new_stmt_vec_info (new_stmt, loop_vinfo));
          STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
          prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
        }
    }
  if (nested_in_vect_loop)
    {
      /* Find the loop-closed exit-phi of the induction, and record
         the final vector of induction results:  */
      exit_phi = NULL;
      FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
        {
          if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (USE_STMT (use_p))))
            {
              exit_phi = USE_STMT (use_p);
              break;
            }
        }
      if (exit_phi)
        {
          stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
          /* FORNOW. Currently not supporting the case that an inner-loop
             induction is not used in the outer-loop (i.e. only outside the
             outer-loop).  */
          gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
                      && !STMT_VINFO_LIVE_P (stmt_vinfo));

          STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
          if (vect_print_dump_info (REPORT_DETAILS))
            {
              fprintf (vect_dump, "vector of inductions after inner-loop:");
              print_gimple_stmt (vect_dump, new_stmt, 0, TDF_SLIM);
            }
        }
    }
  if (vect_print_dump_info (REPORT_DETAILS))
    {
      fprintf (vect_dump, "transform induction: created def-use cycle: ");
      print_gimple_stmt (vect_dump, induction_phi, 0, TDF_SLIM);
      fprintf (vect_dump, "\n");
      print_gimple_stmt (vect_dump, SSA_NAME_DEF_STMT (vec_def), 0, TDF_SLIM);
    }

  STMT_VINFO_VEC_STMT (phi_info) = induction_phi;
  return induc_def;
}
/* Function vect_get_vec_def_for_operand.

   OP is an operand in STMT.  This function returns a (vector) def that will be
   used in the vectorized stmt for STMT.

   In the case that OP is an SSA_NAME which is defined in the loop, then
   STMT_VINFO_VEC_STMT of the defining stmt holds the relevant def.

   In case OP is an invariant or constant, a new stmt that creates a vector def
   needs to be introduced.  */
static tree
vect_get_vec_def_for_operand (tree op, gimple stmt, tree *scalar_def)
{
  tree vec_oprnd;
  gimple vec_stmt;
  gimple def_stmt;
  stmt_vec_info def_stmt_info = NULL;
  stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
  tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
  int nunits = TYPE_VECTOR_SUBPARTS (vectype);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
  tree vec_inv;
  tree vec_cst;
  tree t = NULL_TREE;
  tree def;
  int i;
  enum vect_def_type dt;
  bool is_simple_use;
  tree vector_type;

  if (vect_print_dump_info (REPORT_DETAILS))
    {
      fprintf (vect_dump, "vect_get_vec_def_for_operand: ");
      print_generic_expr (vect_dump, op, TDF_SLIM);
    }

  is_simple_use = vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt);
  gcc_assert (is_simple_use);
  if (vect_print_dump_info (REPORT_DETAILS))
    {
      if (def)
        {
          fprintf (vect_dump, "def = ");
          print_generic_expr (vect_dump, def, TDF_SLIM);
        }
      if (def_stmt)
        {
          fprintf (vect_dump, "  def_stmt = ");
          print_gimple_stmt (vect_dump, def_stmt, 0, TDF_SLIM);
        }
    }
1893 /* Case 1: operand is a constant. */
1894 case vect_constant_def:
1899 /* Create 'vect_cst_ = {cst,cst,...,cst}' */
1900 if (vect_print_dump_info (REPORT_DETAILS))
1901 fprintf (vect_dump, "Create vector_cst. nunits = %d", nunits);
1903 for (i = nunits - 1; i >= 0; --i)
1905 t = tree_cons (NULL_TREE, op, t);
1907 vector_type = get_vectype_for_scalar_type (TREE_TYPE (op));
1908 gcc_assert (vector_type);
1909 vec_cst = build_vector (vector_type, t);
1911 return vect_init_vector (stmt, vec_cst, vector_type, NULL);
1914 /* Case 2: operand is defined outside the loop - loop invariant. */
1915 case vect_invariant_def:
1920 /* Create 'vec_inv = {inv,inv,..,inv}' */
1921 if (vect_print_dump_info (REPORT_DETAILS))
1922 fprintf (vect_dump, "Create vector_inv.");
1924 for (i = nunits - 1; i >= 0; --i)
1926 t = tree_cons (NULL_TREE, def, t);
1929 /* FIXME: use build_constructor directly. */
1930 vector_type = get_vectype_for_scalar_type (TREE_TYPE (def));
1931 gcc_assert (vector_type);
1932 vec_inv = build_constructor_from_list (vector_type, t);
1933 return vect_init_vector (stmt, vec_inv, vector_type, NULL);
1936 /* Case 3: operand is defined inside the loop. */
1937 case vect_loop_def:
1939 if (scalar_def)
1940 *scalar_def = NULL; /* FIXME tuples: def_stmt. */
1942 /* Get the def from the vectorized stmt. */
1943 def_stmt_info = vinfo_for_stmt (def_stmt);
1944 vec_stmt = STMT_VINFO_VEC_STMT (def_stmt_info);
1945 gcc_assert (vec_stmt);
1946 if (gimple_code (vec_stmt) == GIMPLE_PHI)
1947 vec_oprnd = PHI_RESULT (vec_stmt);
1948 else if (is_gimple_call (vec_stmt))
1949 vec_oprnd = gimple_call_lhs (vec_stmt);
1951 vec_oprnd = gimple_assign_lhs (vec_stmt);
1952 return vec_oprnd;
1955 /* Case 4: operand is defined by a loop header phi - reduction */
1956 case vect_reduction_def:
1960 gcc_assert (gimple_code (def_stmt) == GIMPLE_PHI);
1961 loop = (gimple_bb (def_stmt))->loop_father;
1963 /* Get the def before the loop */
1964 op = PHI_ARG_DEF_FROM_EDGE (def_stmt, loop_preheader_edge (loop));
1965 return get_initial_def_for_reduction (stmt, op, scalar_def);
1968 /* Case 5: operand is defined by loop-header phi - induction. */
1969 case vect_induction_def:
1971 gcc_assert (gimple_code (def_stmt) == GIMPLE_PHI);
1973 /* Get the def from the vectorized stmt. */
1974 def_stmt_info = vinfo_for_stmt (def_stmt);
1975 vec_stmt = STMT_VINFO_VEC_STMT (def_stmt_info);
1976 gcc_assert (vec_stmt && gimple_code (vec_stmt) == GIMPLE_PHI);
1977 vec_oprnd = PHI_RESULT (vec_stmt);
1978 return vec_oprnd;
1980 default:
1981 gcc_unreachable ();
1987 /* Function vect_get_vec_def_for_stmt_copy
1989 Return a vector-def for an operand. This function is used when the
1990 vectorized stmt to be created (by the caller to this function) is a "copy"
1991 created in case the vectorized result cannot fit in one vector, and several
1992 copies of the vector-stmt are required. In this case the vector-def is
1993 retrieved from the vector stmt recorded in the STMT_VINFO_RELATED_STMT field
1994 of the stmt that defines VEC_OPRND.
1995 DT is the type of the vector def VEC_OPRND.
1998 In case the vectorization factor (VF) is bigger than the number
1999 of elements that can fit in a vectype (nunits), we have to generate
2000 more than one vector stmt to vectorize the scalar stmt. This situation
2001 arises when there are multiple data-types operated upon in the loop; the
2002 smallest data-type determines the VF, and as a result, when vectorizing
2003 stmts operating on wider types we need to create 'VF/nunits' "copies" of the
2004 vector stmt (each computing a vector of 'nunits' results, and together
2005 computing 'VF' results in each iteration). This function is called when
2006 vectorizing such a stmt (e.g. vectorizing S2 in the illustration below, in
2007 which VF=16 and nunits=4, so the number of copies required is 4):
2009 scalar stmt:          vectorized into:             STMT_VINFO_RELATED_STMT
2011 S1: x = load          VS1.0: vx.0 = memref0        VS1.1
2012                       VS1.1: vx.1 = memref1        VS1.2
2013                       VS1.2: vx.2 = memref2        VS1.3
2014                       VS1.3: vx.3 = memref3
2016 S2: z = x + ...       VSnew.0: vz0 = vx.0 + ...    VSnew.1
2017                       VSnew.1: vz1 = vx.1 + ...    VSnew.2
2018                       VSnew.2: vz2 = vx.2 + ...    VSnew.3
2019                       VSnew.3: vz3 = vx.3 + ...
2021 The vectorization of S1 is explained in vectorizable_load.
2022 The vectorization of S2:
2023 To create the first vector-stmt out of the 4 copies - VSnew.0 -
2024 the function 'vect_get_vec_def_for_operand' is called to
2025 get the relevant vector-def for each operand of S2. For operand x it
2026 returns the vector-def 'vx.0'.
2028 To create the remaining copies of the vector-stmt (VSnew.j), this
2029 function is called to get the relevant vector-def for each operand. It is
2030 obtained from the respective VS1.j stmt, which is recorded in the
2031 STMT_VINFO_RELATED_STMT field of the stmt that defines VEC_OPRND.
2033 For example, to obtain the vector-def 'vx.1' in order to create the
2034 vector stmt 'VSnew.1', this function is called with VEC_OPRND='vx.0'.
2035 Given 'vx.0' we obtain the stmt that defines it ('VS1.0'); from the
2036 STMT_VINFO_RELATED_STMT field of 'VS1.0' we obtain the next copy - 'VS1.1',
2037 and return its def ('vx.1').
2038 Overall, to create the above sequence this function will be called 3 times:
2039 vx.1 = vect_get_vec_def_for_stmt_copy (dt, vx.0);
2040 vx.2 = vect_get_vec_def_for_stmt_copy (dt, vx.1);
2041 vx.3 = vect_get_vec_def_for_stmt_copy (dt, vx.2); */
2044 vect_get_vec_def_for_stmt_copy (enum vect_def_type dt, tree vec_oprnd)
2046 gimple vec_stmt_for_operand;
2047 stmt_vec_info def_stmt_info;
2049 /* Do nothing; can reuse same def. */
2050 if (dt == vect_invariant_def || dt == vect_constant_def)
2051 return vec_oprnd;
2053 vec_stmt_for_operand = SSA_NAME_DEF_STMT (vec_oprnd);
2054 def_stmt_info = vinfo_for_stmt (vec_stmt_for_operand);
2055 gcc_assert (def_stmt_info);
2056 vec_stmt_for_operand = STMT_VINFO_RELATED_STMT (def_stmt_info);
2057 gcc_assert (vec_stmt_for_operand);
2059 if (gimple_code (vec_stmt_for_operand) == GIMPLE_PHI)
2060 vec_oprnd = PHI_RESULT (vec_stmt_for_operand);
2061 else
2062 vec_oprnd = gimple_get_lhs (vec_stmt_for_operand);
2063 return vec_oprnd;
2067 /* Get vectorized definitions for the operands to create a copy of an original
2068 stmt. See vect_get_vec_def_for_stmt_copy() for details. */
2071 vect_get_vec_defs_for_stmt_copy (enum vect_def_type *dt,
2072 VEC(tree,heap) **vec_oprnds0,
2073 VEC(tree,heap) **vec_oprnds1)
2075 tree vec_oprnd = VEC_pop (tree, *vec_oprnds0);
2077 vec_oprnd = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd);
2078 VEC_quick_push (tree, *vec_oprnds0, vec_oprnd);
2080 if (vec_oprnds1 && *vec_oprnds1)
2082 vec_oprnd = VEC_pop (tree, *vec_oprnds1);
2083 vec_oprnd = vect_get_vec_def_for_stmt_copy (dt[1], vec_oprnd);
2084 VEC_quick_push (tree, *vec_oprnds1, vec_oprnd);
2089 /* Get vectorized definitions for OP0 and OP1, or SLP_NODE if it is not NULL. */
2091 static void
2092 vect_get_vec_defs (tree op0, tree op1, gimple stmt,
2093 VEC(tree,heap) **vec_oprnds0, VEC(tree,heap) **vec_oprnds1,
2094 slp_tree slp_node)
2096 if (slp_node)
2097 vect_get_slp_defs (slp_node, vec_oprnds0, vec_oprnds1);
2098 else
2100 tree vec_oprnd;
2102 *vec_oprnds0 = VEC_alloc (tree, heap, 1);
2103 vec_oprnd = vect_get_vec_def_for_operand (op0, stmt, NULL);
2104 VEC_quick_push (tree, *vec_oprnds0, vec_oprnd);
2106 if (op1)
2108 *vec_oprnds1 = VEC_alloc (tree, heap, 1);
2109 vec_oprnd = vect_get_vec_def_for_operand (op1, stmt, NULL);
2110 VEC_quick_push (tree, *vec_oprnds1, vec_oprnd);
2116 /* Function vect_finish_stmt_generation.
2118 Insert a new stmt. */
2120 static void
2121 vect_finish_stmt_generation (gimple stmt, gimple vec_stmt,
2122 gimple_stmt_iterator *gsi)
2124 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2125 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2127 gcc_assert (stmt == gsi_stmt (*gsi));
2128 gcc_assert (gimple_code (stmt) != GIMPLE_LABEL);
2130 gsi_insert_before (gsi, vec_stmt, GSI_SAME_STMT);
2132 set_vinfo_for_stmt (vec_stmt, new_stmt_vec_info (vec_stmt, loop_vinfo));
2134 if (vect_print_dump_info (REPORT_DETAILS))
2136 fprintf (vect_dump, "add new stmt: ");
2137 print_gimple_stmt (vect_dump, vec_stmt, 0, TDF_SLIM);
2140 /* Make sure gsi points to the stmt that is being vectorized. */
2141 gcc_assert (stmt == gsi_stmt (*gsi));
2143 gimple_set_location (vec_stmt, gimple_location (stmt));
2147 /* Function get_initial_def_for_reduction
2148 Input:
2150 STMT - a stmt that performs a reduction operation in the loop.
2151 INIT_VAL - the initial value of the reduction variable
2153 Output:
2154 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
2155 of the reduction (used for adjusting the epilog - see below).
2156 Return a vector variable, initialized according to the operation that STMT
2157 performs. This vector will be used as the initial value of the
2158 vector of partial results.
2160 Option1 (adjust in epilog): Initialize the vector as follows:
2161 add: [0,0,...,0,0]
2162 mult: [1,1,...,1,1]
2163 min/max: [init_val,init_val,..,init_val,init_val]
2164 bit and/or: [init_val,init_val,..,init_val,init_val]
2165 and when necessary (e.g. add/mult case) let the caller know
2166 that it needs to adjust the result by init_val.
2168 Option2: Initialize the vector as follows:
2169 add: [0,0,...,0,init_val]
2170 mult: [1,1,...,1,init_val]
2171 min/max: [init_val,init_val,...,init_val]
2172 bit and/or: [init_val,init_val,...,init_val]
2173 and no adjustments are needed.
2175 For example, for the following code:
2177 s = init_val;
2178 for (i=0;i<n;i++)
2179 s = s + a[i];
2181 STMT is 's = s + a[i]', and the reduction variable is 's'.
2182 For a vector of 4 units, we want to return either [0,0,0,init_val],
2183 or [0,0,0,0] and let the caller know that it needs to adjust
2184 the result at the end by 'init_val'.
2186 FORNOW, we are using the 'adjust in epilog' scheme, because this way the
2187 initialization vector is simpler (same element in all entries).
2188 A cost model should help decide between these two schemes. */
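/* Worked example of the 'adjust in epilog' scheme (for the sum above with
   init_val = 10 and a 4-element vectype): the vector of partial sums starts
   as {0,0,0,0}, ADJUSTMENT_DEF is set to 10, and the epilog computes
   s = (v[0] + v[1] + v[2] + v[3]) + 10.  Under Option2 the initial vector
   would instead be {0,0,0,10}, with no final adjustment.  */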
2190 static tree
2191 get_initial_def_for_reduction (gimple stmt, tree init_val, tree *adjustment_def)
2193 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
2194 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
2195 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2196 tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
2197 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
2198 enum tree_code code = gimple_assign_rhs_code (stmt);
2199 tree type = TREE_TYPE (init_val);
2206 bool nested_in_vect_loop = false;
2208 gcc_assert (POINTER_TYPE_P (type) || INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type));
2209 if (nested_in_vect_loop_p (loop, stmt))
2210 nested_in_vect_loop = true;
2211 else
2212 gcc_assert (loop == (gimple_bb (stmt))->loop_father);
2214 vecdef = vect_get_vec_def_for_operand (init_val, stmt, NULL);
2216 switch (code)
2218 case WIDEN_SUM_EXPR:
2219 case DOT_PROD_EXPR:
2220 case PLUS_EXPR:
2221 if (nested_in_vect_loop)
2222 *adjustment_def = vecdef;
2223 else
2224 *adjustment_def = init_val;
2225 /* Create a vector of zeros for init_def. */
2226 if (SCALAR_FLOAT_TYPE_P (type))
2227 def_for_init = build_real (type, dconst0);
2229 def_for_init = build_int_cst (type, 0);
2230 for (i = nunits - 1; i >= 0; --i)
2231 t = tree_cons (NULL_TREE, def_for_init, t);
2232 vector_type = get_vectype_for_scalar_type (TREE_TYPE (def_for_init));
2233 gcc_assert (vector_type);
2234 init_def = build_vector (vector_type, t);
2235 break;
2237 case MIN_EXPR:
2238 case MAX_EXPR:
2239 *adjustment_def = NULL_TREE;
2240 init_def = vecdef;
2241 break;
2243 default:
2244 gcc_unreachable ();
2248 return init_def;
2251 /* Function vect_create_epilog_for_reduction
2253 Create code at the loop-epilog to finalize the result of a reduction
2254 computation.
2255 Input:
2256 VECT_DEF is a vector of partial results.
2257 REDUC_CODE is the tree-code for the epilog reduction.
2258 NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
2259 number of elements that we can fit in a vectype (nunits). In this case
2260 we have to generate more than one vector stmt - i.e - we need to "unroll"
2261 the vector stmt by a factor VF/nunits. For more details see documentation
2262 in vectorizable_operation.
2263 STMT is the scalar reduction stmt that is being vectorized.
2264 REDUCTION_PHI is the phi-node that carries the reduction computation.
2267 1. Creates the reduction def-use cycle: sets the arguments for
2268 REDUCTION_PHI:
2269 The loop-entry argument is the vectorized initial-value of the reduction.
2270 The loop-latch argument is VECT_DEF - the vector of partial sums.
2271 2. "Reduces" the vector of partial results VECT_DEF into a single result,
2272 by applying the operation specified by REDUC_CODE if available, or by
2273 other means (whole-vector shifts or a scalar loop).
2274 The function also creates a new phi node at the loop exit to preserve
2275 loop-closed form, as illustrated below.
2277 The flow at the entry to this function:
2279 loop:
2280 vec_def = phi <null, null> # REDUCTION_PHI
2281 VECT_DEF = vector_stmt # vectorized form of STMT
2282 s_loop = scalar_stmt # (scalar) STMT
2283 loop_exit:
2284 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
2285 use <s_out0>
2288 The above is transformed by this function into:
2290 loop:
2291 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
2292 VECT_DEF = vector_stmt # vectorized form of STMT
2293 s_loop = scalar_stmt # (scalar) STMT
2294 loop_exit:
2295 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
2296 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
2297 v_out2 = reduce <v_out1>
2298 s_out3 = extract_field <v_out2, 0>
2299 s_out4 = adjust_result <s_out3>
2300 use <s_out4>
2304 static void
2305 vect_create_epilog_for_reduction (tree vect_def, gimple stmt,
2306 int ncopies,
2307 enum tree_code reduc_code,
2308 gimple reduction_phi)
2310 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2311 stmt_vec_info prev_phi_info;
2313 enum machine_mode mode;
2314 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2315 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2316 basic_block exit_bb;
2319 gimple new_phi = NULL, phi;
2320 gimple_stmt_iterator exit_gsi;
2322 tree new_temp = NULL_TREE;
2324 gimple epilog_stmt = NULL;
2325 tree new_scalar_dest, new_dest;
2327 tree bitsize, bitpos, bytesize;
2328 enum tree_code code = gimple_assign_rhs_code (stmt);
2329 tree adjustment_def;
2330 tree vec_initial_def, def;
2332 imm_use_iterator imm_iter;
2333 use_operand_p use_p;
2334 bool extract_scalar_result = false;
2335 tree reduction_op, expr;
2338 bool nested_in_vect_loop = false;
2339 VEC(gimple,heap) *phis = NULL;
2340 enum vect_def_type dt = vect_unknown_def_type;
2343 if (nested_in_vect_loop_p (loop, stmt))
2345 loop = loop->inner;
2346 nested_in_vect_loop = true;
2349 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
2351 case GIMPLE_SINGLE_RHS:
2352 gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt)) == ternary_op);
2353 reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), 2);
2355 case GIMPLE_UNARY_RHS:
2356 reduction_op = gimple_assign_rhs1 (stmt);
2358 case GIMPLE_BINARY_RHS:
2359 reduction_op = gimple_assign_rhs2 (stmt);
2365 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
2366 gcc_assert (vectype);
2367 mode = TYPE_MODE (vectype);
2369 /*** 1. Create the reduction def-use cycle ***/
2371 /* For the case of reduction, vect_get_vec_def_for_operand returns
2372 the scalar def before the loop, that defines the initial value
2373 of the reduction variable. */
2374 vec_initial_def = vect_get_vec_def_for_operand (reduction_op, stmt,
2377 phi = reduction_phi;
2379 for (j = 0; j < ncopies; j++)
2381 /* 1.1 set the loop-entry arg of the reduction-phi: */
2382 add_phi_arg (phi, vec_initial_def, loop_preheader_edge (loop));
2384 /* 1.2 set the loop-latch arg for the reduction-phi: */
2385 if (j == 0) def = vect_def; else
2386 def = vect_get_vec_def_for_stmt_copy (dt, def);
2387 add_phi_arg (phi, def, loop_latch_edge (loop));
2389 if (vect_print_dump_info (REPORT_DETAILS))
2391 fprintf (vect_dump, "transform reduction: created def-use cycle: ");
2392 print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
2393 fprintf (vect_dump, "\n");
2394 print_gimple_stmt (vect_dump, SSA_NAME_DEF_STMT (def), 0, TDF_SLIM);
2397 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
2400 /*** 2. Create epilog code
2401 The reduction epilog code operates across the elements of the vector
2402 of partial results computed by the vectorized loop.
2403 The reduction epilog code consists of:
2404 step 1: compute the scalar result in a vector (v_out2)
2405 step 2: extract the scalar result (s_out3) from the vector (v_out2)
2406 step 3: adjust the scalar result (s_out3) if needed.
2408 Step 1 can be accomplished using one of the following three schemes:
2409 (scheme 1) using reduc_code, if available.
2410 (scheme 2) using whole-vector shifts, if available.
2411 (scheme 3) using a scalar loop. In this case steps 1+2 above are
2414 The overall epilog code looks like this:
2416 s_out0 = phi <s_loop> # original EXIT_PHI
2417 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
2418 v_out2 = reduce <v_out1> # step 1
2419 s_out3 = extract_field <v_out2, 0> # step 2
2420 s_out4 = adjust_result <s_out3> # step 3
2422 (step 3 is optional, and steps 1 and 2 may be combined).
2423 Lastly, the uses of s_out0 are replaced by s_out4.
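   For example (a sketch of scheme 2, assuming a sum reduction over a
   4-element vector of 32-bit ints, so vector-size = 128 bits):

   v_out1 = {a0, a1, a2, a3}
   va' = vec_shift <v_out1, 64> # {a2, a3, _, _}
   va = va' + v_out1 # {a0+a2, a1+a3, _, _}
   va' = vec_shift <va, 32> # {a1+a3, _, _, _}
   v_out2 = va' + va # {a0+a1+a2+a3, _, _, _}
   s_out3 = extract_field <v_out2, 0>

   i.e. log2(nunits) shift/op pairs, after which one element of v_out2
   holds the scalar result.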
2427 /* 2.1 Create new loop-exit-phi to preserve loop-closed form:
2428 v_out1 = phi <v_loop> */
2430 exit_bb = single_exit (loop)->dest;
2432 prev_phi_info = NULL;
2433 for (j = 0; j < ncopies; j++)
2435 phi = create_phi_node (SSA_NAME_VAR (vect_def), exit_bb);
2436 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo));
2438 if (j == 0)
2439 def = vect_def;
2440 else
2441 def = vect_get_vec_def_for_stmt_copy (dt, def);
2442 STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
2444 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
2445 prev_phi_info = vinfo_for_stmt (phi);
2447 exit_gsi = gsi_after_labels (exit_bb);
2449 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
2450 (i.e. when reduc_code is not available) and in the final adjustment
2451 code (if needed). Also get the original scalar reduction variable as
2452 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
2453 represents a reduction pattern), the tree-code and scalar-def are
2454 taken from the original stmt that the pattern-stmt (STMT) replaces.
2455 Otherwise (it is a regular reduction) - the tree-code and scalar-def
2456 are taken from STMT. */
2458 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
2459 if (!orig_stmt)
2461 /* Regular reduction */
2462 orig_stmt = stmt;
2464 else
2466 /* Reduction pattern */
2467 stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
2468 gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
2469 gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
2471 code = gimple_assign_rhs_code (orig_stmt);
2472 scalar_dest = gimple_assign_lhs (orig_stmt);
2473 scalar_type = TREE_TYPE (scalar_dest);
2474 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
2475 bitsize = TYPE_SIZE (scalar_type);
2476 bytesize = TYPE_SIZE_UNIT (scalar_type);
2479 /* In case this is a reduction in an inner-loop while vectorizing an outer
2480 loop - we don't need to extract a single scalar result at the end of the
2481 inner-loop. The final vector of partial results will be used in the
2482 vectorized outer-loop, or reduced to a scalar result at the end of the
2484 if (nested_in_vect_loop)
2485 goto vect_finalize_reduction;
2488 gcc_assert (ncopies == 1);
2490 /* 2.3 Create the reduction code, using one of the three schemes described
2493 if (reduc_code < NUM_TREE_CODES)
2497 /*** Case 1: Create:
2498 v_out2 = reduc_expr <v_out1> */
2500 if (vect_print_dump_info (REPORT_DETAILS))
2501 fprintf (vect_dump, "Reduce using direct vector reduction.");
2503 vec_dest = vect_create_destination_var (scalar_dest, vectype);
2504 tmp = build1 (reduc_code, vectype, PHI_RESULT (new_phi));
2505 epilog_stmt = gimple_build_assign (vec_dest, tmp);
2506 new_temp = make_ssa_name (vec_dest, epilog_stmt);
2507 gimple_assign_set_lhs (epilog_stmt, new_temp);
2508 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
2510 extract_scalar_result = true;
2514 enum tree_code shift_code = 0;
2515 bool have_whole_vector_shift = true;
2517 int element_bitsize = tree_low_cst (bitsize, 1);
2518 int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
2521 if (optab_handler (vec_shr_optab, mode)->insn_code != CODE_FOR_nothing)
2522 shift_code = VEC_RSHIFT_EXPR;
2523 else
2524 have_whole_vector_shift = false;
2526 /* Regardless of whether we have a whole vector shift, if we're
2527 emulating the operation via tree-vect-generic, we don't want
2528 to use it. Only the first round of the reduction is likely
2529 to still be profitable via emulation. */
2530 /* ??? It might be better to emit a reduction tree code here, so that
2531 tree-vect-generic can expand the first round via bit tricks. */
2532 if (!VECTOR_MODE_P (mode))
2533 have_whole_vector_shift = false;
2534 else
2536 optab optab = optab_for_tree_code (code, vectype, optab_default);
2537 if (optab_handler (optab, mode)->insn_code == CODE_FOR_nothing)
2538 have_whole_vector_shift = false;
2541 if (have_whole_vector_shift)
2543 /*** Case 2: Create:
2544 for (offset = VS/2; offset >= element_size; offset/=2)
2546 Create: va' = vec_shift <va, offset>
2547 Create: va = vop <va, va'>
2550 if (vect_print_dump_info (REPORT_DETAILS))
2551 fprintf (vect_dump, "Reduce using vector shifts");
2553 vec_dest = vect_create_destination_var (scalar_dest, vectype);
2554 new_temp = PHI_RESULT (new_phi);
2556 for (bit_offset = vec_size_in_bits/2;
2557 bit_offset >= element_bitsize;
2560 tree bitpos = size_int (bit_offset);
2561 epilog_stmt = gimple_build_assign_with_ops (shift_code, vec_dest,
2562 new_temp, bitpos);
2563 new_name = make_ssa_name (vec_dest, epilog_stmt);
2564 gimple_assign_set_lhs (epilog_stmt, new_name);
2565 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
2567 epilog_stmt = gimple_build_assign_with_ops (code, vec_dest,
2568 new_name, new_temp);
2569 new_temp = make_ssa_name (vec_dest, epilog_stmt);
2570 gimple_assign_set_lhs (epilog_stmt, new_temp);
2571 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
2574 extract_scalar_result = true;
2580 /*** Case 3: Create:
2581 s = extract_field <v_out2, 0>
2582 for (offset = element_size;
2583 offset < vector_size;
2584 offset += element_size;)
2586 Create: s' = extract_field <v_out2, offset>
2587 Create: s = op <s, s'>
2590 if (vect_print_dump_info (REPORT_DETAILS))
2591 fprintf (vect_dump, "Reduce using scalar code. ");
2593 vec_temp = PHI_RESULT (new_phi);
2594 vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
2595 rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
2596 bitsize_zero_node);
2597 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
2598 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
2599 gimple_assign_set_lhs (epilog_stmt, new_temp);
2600 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
2602 for (bit_offset = element_bitsize;
2603 bit_offset < vec_size_in_bits;
2604 bit_offset += element_bitsize)
2606 tree bitpos = bitsize_int (bit_offset);
2607 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
2608 bitpos);
2610 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
2611 new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
2612 gimple_assign_set_lhs (epilog_stmt, new_name);
2613 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
2615 epilog_stmt = gimple_build_assign_with_ops (code,
2616 new_scalar_dest,
2617 new_name, new_temp);
2618 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
2619 gimple_assign_set_lhs (epilog_stmt, new_temp);
2620 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
2623 extract_scalar_result = false;
2627 /* 2.4 Extract the final scalar result. Create:
2628 s_out3 = extract_field <v_out2, bitpos> */
2630 if (extract_scalar_result)
2634 gcc_assert (!nested_in_vect_loop);
2635 if (vect_print_dump_info (REPORT_DETAILS))
2636 fprintf (vect_dump, "extract scalar result");
2638 if (BYTES_BIG_ENDIAN)
2639 bitpos = size_binop (MULT_EXPR,
2640 bitsize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1),
2641 TYPE_SIZE (scalar_type));
2643 bitpos = bitsize_zero_node;
2645 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp, bitsize, bitpos);
2646 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
2647 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
2648 gimple_assign_set_lhs (epilog_stmt, new_temp);
2649 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
2652 vect_finalize_reduction:
2654 /* 2.5 Adjust the final result by the initial value of the reduction
2655 variable. (When such adjustment is not needed, then
2656 'adjustment_def' is zero). For example, if code is PLUS we create:
2657 new_temp = loop_exit_def + adjustment_def */
2659 if (adjustment_def)
2661 if (nested_in_vect_loop)
2663 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
2664 expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
2665 new_dest = vect_create_destination_var (scalar_dest, vectype);
2669 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
2670 expr = build2 (code, scalar_type, new_temp, adjustment_def);
2671 new_dest = vect_create_destination_var (scalar_dest, scalar_type);
2673 epilog_stmt = gimple_build_assign (new_dest, expr);
2674 new_temp = make_ssa_name (new_dest, epilog_stmt);
2675 gimple_assign_set_lhs (epilog_stmt, new_temp);
2676 SSA_NAME_DEF_STMT (new_temp) = epilog_stmt;
2677 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
2681 /* 2.6 Handle the loop-exit phi */
2683 /* Replace uses of s_out0 with uses of s_out3:
2684 Find the loop-closed-use at the loop exit of the original scalar result.
2685 (The reduction result is expected to have two immediate uses - one at the
2686 latch block, and one at the loop exit). */
2687 phis = VEC_alloc (gimple, heap, 10);
2688 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
2690 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
2692 exit_phi = USE_STMT (use_p);
2693 VEC_quick_push (gimple, phis, exit_phi);
2696 /* We expect to have found an exit_phi because of loop-closed-ssa form. */
2697 gcc_assert (!VEC_empty (gimple, phis));
2699 for (i = 0; VEC_iterate (gimple, phis, i, exit_phi); i++)
2701 if (nested_in_vect_loop)
2703 stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
2705 /* FORNOW. Currently not supporting the case that an inner-loop
2706 reduction is not used in the outer-loop (but only outside the
2707 outer-loop). */
2708 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
2709 && !STMT_VINFO_LIVE_P (stmt_vinfo));
2711 epilog_stmt = adjustment_def ? epilog_stmt : new_phi;
2712 STMT_VINFO_VEC_STMT (stmt_vinfo) = epilog_stmt;
2713 set_vinfo_for_stmt (epilog_stmt,
2714 new_stmt_vec_info (epilog_stmt, loop_vinfo));
2716 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
2717 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
2721 /* Replace the uses: */
2722 orig_name = PHI_RESULT (exit_phi);
2723 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
2724 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
2725 SET_USE (use_p, new_temp);
2727 VEC_free (gimple, heap, phis);
2731 /* Function vectorizable_reduction.
2733 Check if STMT performs a reduction operation that can be vectorized.
2734 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
2735 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
2736 Return FALSE if not a vectorizable STMT, TRUE otherwise.
2738 This function also handles reduction idioms (patterns) that have been
2739 recognized in advance during vect_pattern_recog. In this case, STMT may be
2741 X = pattern_expr (arg0, arg1, ..., X)
2742 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
2743 sequence that had been detected and replaced by the pattern-stmt (STMT).
2745 In some cases of reduction patterns, the type of the reduction variable X is
2746 different than the type of the other arguments of STMT.
2747 In such cases, the vectype that is used when transforming STMT into a vector
2748 stmt is different than the vectype that is used to determine the
2749 vectorization factor, because it consists of a different number of elements
2750 than the actual number of elements that are being operated upon in parallel.
2752 For example, consider an accumulation of shorts into an int accumulator.
2753 On some targets it's possible to vectorize this pattern operating on 8
2754 shorts at a time (hence, the vectype for purposes of determining the
2755 vectorization factor should be V8HI); on the other hand, the vectype that
2756 is used to create the vector form is actually V4SI (the type of the result).
2758 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
2759 indicates what is the actual level of parallelism (V8HI in the example), so
2760 that the right vectorization factor would be derived. This vectype
2761 corresponds to the type of arguments to the reduction stmt, and should *NOT*
2762 be used to create the vectorized stmt. The right vectype for the vectorized
2763 stmt is obtained from the type of the result X:
2764 get_vectype_for_scalar_type (TREE_TYPE (X))
2766 This means that, contrary to "regular" reductions (or "regular" stmts in
2767 general), the following equation:
2768 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
2769 does *NOT* necessarily hold for reduction patterns. */
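/* Concretely (a sketch, assuming 128-bit vectors): when accumulating shorts
   into an int through the widen_sum pattern, STMT_VINFO_VECTYPE is V8HI
   (8 halfword elements, giving VF = 8), whereas the vectorized stmt itself
   is created with vectype V4SI, obtained as
   get_vectype_for_scalar_type (TREE_TYPE (X)) for the int accumulator X.  */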
2771 bool
2772 vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
2773 gimple *vec_stmt)
2777 tree loop_vec_def0 = NULL_TREE, loop_vec_def1 = NULL_TREE;
2778 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2779 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2780 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2781 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2782 enum tree_code code, orig_code, epilog_reduc_code = 0;
2783 enum machine_mode vec_mode;
2785 optab optab, reduc_optab;
2786 tree new_temp = NULL_TREE;
2789 enum vect_def_type dt;
2790 gimple new_phi = NULL;
2794 stmt_vec_info orig_stmt_info;
2795 tree expr = NULL_TREE;
2797 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
2798 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
2800 stmt_vec_info prev_stmt_info, prev_phi_info;
2801 gimple first_phi = NULL;
2802 bool single_defuse_cycle = false;
2804 gimple new_stmt = NULL;
2808 if (nested_in_vect_loop_p (loop, stmt))
2809 loop = loop->inner;
2811 gcc_assert (ncopies >= 1);
2813 /* FORNOW: SLP not supported. */
2814 if (STMT_SLP_TYPE (stmt_info))
2815 return false;
2817 /* 1. Is vectorizable reduction? */
2819 /* Not supportable if the reduction variable is used in the loop. */
2820 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer)
2821 return false;
2823 /* Reductions that are not used even in an enclosing outer-loop,
2824 are expected to be "live" (used out of the loop). */
2825 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_loop
2826 && !STMT_VINFO_LIVE_P (stmt_info))
2827 return false;
2829 /* Make sure it was already recognized as a reduction computation. */
2830 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def)
2831 return false;
2833 /* 2. Has this been recognized as a reduction pattern?
2835 Check if STMT represents a pattern that has been recognized
2836 in earlier analysis stages. For stmts that represent a pattern,
2837 the STMT_VINFO_RELATED_STMT field records the last stmt in
2838 the original sequence that constitutes the pattern. */
2840 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
2841 if (orig_stmt)
2843 orig_stmt_info = vinfo_for_stmt (orig_stmt);
2844 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt);
2845 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
2846 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
2849 /* 3. Check the operands of the operation. The first operands are defined
2850 inside the loop body. The last operand is the reduction variable,
2851 which is defined by the loop-header-phi. */
2853 gcc_assert (is_gimple_assign (stmt));
2856 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
2858 case GIMPLE_SINGLE_RHS:
2859 op_type = TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt));
2860 if (op_type == ternary_op)
2862 tree rhs = gimple_assign_rhs1 (stmt);
2863 ops[0] = TREE_OPERAND (rhs, 0);
2864 ops[1] = TREE_OPERAND (rhs, 1);
2865 ops[2] = TREE_OPERAND (rhs, 2);
2866 code = TREE_CODE (rhs);
2868 else
2869 return false;
2870 break;
2872 case GIMPLE_BINARY_RHS:
2873 code = gimple_assign_rhs_code (stmt);
2874 op_type = TREE_CODE_LENGTH (code);
2875 gcc_assert (op_type == binary_op);
2876 ops[0] = gimple_assign_rhs1 (stmt);
2877 ops[1] = gimple_assign_rhs2 (stmt);
2878 break;
2880 case GIMPLE_UNARY_RHS:
2881 return false;
2883 default:
2884 gcc_unreachable ();
2887 scalar_dest = gimple_assign_lhs (stmt);
2888 scalar_type = TREE_TYPE (scalar_dest);
2889 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
2890 && !SCALAR_FLOAT_TYPE_P (scalar_type))
2891 return false;
2893 /* All uses but the last are expected to be defined in the loop.
2894 The last use is the reduction variable. */
2895 for (i = 0; i < op_type-1; i++)
2897 is_simple_use = vect_is_simple_use (ops[i], loop_vinfo, &def_stmt,
2899 gcc_assert (is_simple_use);
2900 if (dt != vect_loop_def
2901 && dt != vect_invariant_def
2902 && dt != vect_constant_def
2903 && dt != vect_induction_def)
2904 return false;
2907 is_simple_use = vect_is_simple_use (ops[i], loop_vinfo, &def_stmt, &def, &dt);
2908 gcc_assert (is_simple_use);
2909 gcc_assert (dt == vect_reduction_def);
2910 gcc_assert (gimple_code (def_stmt) == GIMPLE_PHI);
2911 if (orig_stmt)
2912 gcc_assert (orig_stmt == vect_is_simple_reduction (loop_vinfo, def_stmt));
2913 else
2914 gcc_assert (stmt == vect_is_simple_reduction (loop_vinfo, def_stmt));
2916 if (STMT_VINFO_LIVE_P (vinfo_for_stmt (def_stmt)))
2917 return false;
2919 /* 4. Supportable by target? */
2921 /* 4.1. check support for the operation in the loop */
2922 optab = optab_for_tree_code (code, vectype, optab_default);
2923 if (!optab)
2925 if (vect_print_dump_info (REPORT_DETAILS))
2926 fprintf (vect_dump, "no optab.");
2927 return false;
2929 vec_mode = TYPE_MODE (vectype);
2930 if (optab_handler (optab, vec_mode)->insn_code == CODE_FOR_nothing)
2932 if (vect_print_dump_info (REPORT_DETAILS))
2933 fprintf (vect_dump, "op not supported by target.");
2934 if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
2935 || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
2936 < vect_min_worthwhile_factor (code))
2937 return false;
2938 if (vect_print_dump_info (REPORT_DETAILS))
2939 fprintf (vect_dump, "proceeding using word mode.");
2942 /* Worthwhile without SIMD support? */
2943 if (!VECTOR_MODE_P (TYPE_MODE (vectype))
2944 && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
2945 < vect_min_worthwhile_factor (code))
2947 if (vect_print_dump_info (REPORT_DETAILS))
2948 fprintf (vect_dump, "not worthwhile without SIMD support.");
2949 return false;
2952 /* 4.2. Check support for the epilog operation.
2954 If STMT represents a reduction pattern, then the type of the
2955 reduction variable may be different than the type of the rest
2956 of the arguments. For example, consider the case of accumulation
2957 of shorts into an int accumulator; The original code:
2958 S1: int_a = (int) short_a;
2959 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
2962 STMT: int_acc = widen_sum <short_a, int_acc>
2965 1. The tree-code that is used to create the vector operation in the
2966 epilog code (that reduces the partial results) is not the
2967 tree-code of STMT, but is rather the tree-code of the original
2968 stmt from the pattern that STMT is replacing. I.e, in the example
2969 above we want to use 'widen_sum' in the loop, but 'plus' in the
2971 2. The type (mode) we use to check available target support
2972 for the vector operation to be created in the *epilog*, is
2973 determined by the type of the reduction variable (in the example
2974 above we'd check this: plus_optab[vect_int_mode]).
2975 However the type (mode) we use to check available target support
2976 for the vector operation to be created *inside the loop*, is
2977 determined by the type of the other arguments to STMT (in the
2978 example we'd check this: widen_sum_optab[vect_short_mode]).
2980 This is contrary to "regular" reductions, in which the types of all
2981 the arguments are the same as the type of the reduction variable.
2982 For "regular" reductions we can therefore use the same vector type
2983 (and also the same tree-code) when generating the epilog code and
2984 when generating the code inside the loop. */
2986 if (orig_stmt)
2988 /* This is a reduction pattern: get the vectype from the type of the
2989 reduction variable, and get the tree-code from orig_stmt. */
2990 orig_code = gimple_assign_rhs_code (orig_stmt);
2991 vectype = get_vectype_for_scalar_type (TREE_TYPE (def));
2992 if (!vectype)
2994 if (vect_print_dump_info (REPORT_DETAILS))
2996 fprintf (vect_dump, "unsupported data-type ");
2997 print_generic_expr (vect_dump, TREE_TYPE (def), TDF_SLIM);
2999 return false;
3002 vec_mode = TYPE_MODE (vectype);
3004 else
3006 /* Regular reduction: use the same vectype and tree-code as used for
3007 the vector code inside the loop can be used for the epilog code. */
3011 if (!reduction_code_for_scalar_code (orig_code, &epilog_reduc_code))
3012 return false;
3013 reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype, optab_default);
3014 if (!reduc_optab)
3016 if (vect_print_dump_info (REPORT_DETAILS))
3017 fprintf (vect_dump, "no optab for reduction.");
3018 epilog_reduc_code = NUM_TREE_CODES;
3020 if (optab_handler (reduc_optab, vec_mode)->insn_code == CODE_FOR_nothing)
3022 if (vect_print_dump_info (REPORT_DETAILS))
3023 fprintf (vect_dump, "reduc op not supported by target.");
3024 epilog_reduc_code = NUM_TREE_CODES;
3027 if (!vec_stmt) /* transformation not required. */
3029 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
3030 if (!vect_model_reduction_cost (stmt_info, epilog_reduc_code, ncopies))
3031 return false;
3032 return true;
3035 /** Transform. **/
3037 if (vect_print_dump_info (REPORT_DETAILS))
3038 fprintf (vect_dump, "transform reduction.");
3040 /* Create the destination vector */
3041 vec_dest = vect_create_destination_var (scalar_dest, vectype);
3043 /* In case the vectorization factor (VF) is bigger than the number
3044 of elements that we can fit in a vectype (nunits), we have to generate
3045 more than one vector stmt - i.e - we need to "unroll" the
3046 vector stmt by a factor VF/nunits. For more details see documentation
3047 in vectorizable_operation. */
3049 /* If the reduction is used in an outer loop we need to generate
3050 VF intermediate results, like so (e.g. for ncopies=2):
3051 r0 = phi (init, r0)
3052 r1 = phi (init, r1)
3053 r0 = x0 + r0;
3054 r1 = x1 + r1;
3055 (i.e. we generate VF results in 2 registers).
3056 In this case we have a separate def-use cycle for each copy, and therefore
3057 for each copy we get the vector def for the reduction variable from the
3058 respective phi node created for this copy.
3060 Otherwise (the reduction is unused in the loop nest), we can combine
3061 together intermediate results, like so (e.g. for ncopies=2):
3062 r = phi (init, r)
3063 r = x0 + r;
3064 r = x1 + r;
3065 (i.e. we generate VF/2 results in a single register).
3066 In this case for each copy we get the vector def for the reduction variable
3067 from the vectorized reduction operation generated in the previous iteration.
3070 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_loop)
3072 single_defuse_cycle = true;
3073 epilog_copies = 1;
3075 else
3076 epilog_copies = ncopies;
3078 prev_stmt_info = NULL;
3079 prev_phi_info = NULL;
3080 for (j = 0; j < ncopies; j++)
3082 if (j == 0 || !single_defuse_cycle)
3084 /* Create the reduction-phi that defines the reduction-operand. */
3085 new_phi = create_phi_node (vec_dest, loop->header);
3086 set_vinfo_for_stmt (new_phi, new_stmt_vec_info (new_phi, loop_vinfo));
3091 if (j == 0)
3092 loop_vec_def0 = vect_get_vec_def_for_operand (ops[0], stmt, NULL);
3093 if (op_type == ternary_op)
3095 loop_vec_def1 = vect_get_vec_def_for_operand (ops[1], stmt, NULL);
3098 /* Get the vector def for the reduction variable from the phi node */
3099 reduc_def = PHI_RESULT (new_phi);
3100 first_phi = new_phi;
3102 else
3104 enum vect_def_type dt = vect_unknown_def_type; /* Dummy */
3105 loop_vec_def0 = vect_get_vec_def_for_stmt_copy (dt, loop_vec_def0);
3106 if (op_type == ternary_op)
3107 loop_vec_def1 = vect_get_vec_def_for_stmt_copy (dt, loop_vec_def1);
3109 if (single_defuse_cycle)
3110 reduc_def = gimple_assign_lhs (new_stmt);
3112 reduc_def = PHI_RESULT (new_phi);
3114 STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
3117 /* Arguments are ready. Create the new vector stmt. */
3118 if (op_type == binary_op)
3119 expr = build2 (code, vectype, loop_vec_def0, reduc_def);
3121 expr = build3 (code, vectype, loop_vec_def0, loop_vec_def1,
3123 new_stmt = gimple_build_assign (vec_dest, expr);
3124 new_temp = make_ssa_name (vec_dest, new_stmt);
3125 gimple_assign_set_lhs (new_stmt, new_temp);
3126 vect_finish_stmt_generation (stmt, new_stmt, gsi);
3128 if (j == 0)
3129 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
3130 else
3131 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3132 prev_stmt_info = vinfo_for_stmt (new_stmt);
3133 prev_phi_info = vinfo_for_stmt (new_phi);
3136 /* Finalize the reduction-phi (set its arguments) and create the
3137 epilog reduction code. */
3138 if (!single_defuse_cycle)
3139 new_temp = gimple_assign_lhs (*vec_stmt);
3140 vect_create_epilog_for_reduction (new_temp, stmt, epilog_copies,
3141 epilog_reduc_code, first_phi);
3143 return true;
3145 /* Checks if CALL can be vectorized in type VECTYPE. Returns
3146 a function declaration if the target has a vectorized version
3147 of the function, or NULL_TREE if the function cannot be vectorized. */
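/* For example (a sketch; the mapping is entirely target-defined): a call to
   the const builtin 'sqrt' in a loop over doubles may be mapped by
   targetm.vectorize.builtin_vectorized_function to a target builtin that
   computes two square roots at once on a V2DF operand, and that builtin's
   decl is what gets returned here.  */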
3150 vectorizable_function (gimple call, tree vectype_out, tree vectype_in)
3152 tree fndecl = gimple_call_fndecl (call);
3153 enum built_in_function code;
3155 /* We only handle functions that do not read or clobber memory -- i.e.
3156 const or novops ones. */
3157 if (!(gimple_call_flags (call) & (ECF_CONST | ECF_NOVOPS)))
3158 return NULL_TREE;
3160 if (!fndecl
3161 || TREE_CODE (fndecl) != FUNCTION_DECL
3162 || !DECL_BUILT_IN (fndecl))
3163 return NULL_TREE;
3165 code = DECL_FUNCTION_CODE (fndecl);
3166 return targetm.vectorize.builtin_vectorized_function (code, vectype_out,
3167 vectype_in);
3170 /* Function vectorizable_call.
3172 Check if STMT performs a function call that can be vectorized.
3173 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
3174 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
3175 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
3178 vectorizable_call (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt)
3183 tree vec_oprnd0 = NULL_TREE, vec_oprnd1 = NULL_TREE;
3184 stmt_vec_info stmt_info = vinfo_for_stmt (stmt), prev_stmt_info;
3185 tree vectype_out, vectype_in;
3188 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3189 tree fndecl, new_temp, def, rhs_type, lhs_type;
3191 enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
3194 VEC(tree, heap) *vargs = NULL;
3195 enum { NARROW, NONE, WIDEN } modifier;
3198 if (!STMT_VINFO_RELEVANT_P (stmt_info))
3199 return false;
3201 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
3202 return false;
3204 /* FORNOW: SLP not supported. */
3205 if (STMT_SLP_TYPE (stmt_info))
3206 return false;
3208 /* Is STMT a vectorizable call? */
3209 if (!is_gimple_call (stmt))
3210 return false;
3212 if (TREE_CODE (gimple_call_lhs (stmt)) != SSA_NAME)
3213 return false;
3215 /* Process function arguments. */
3216 rhs_type = NULL_TREE;
3217 nargs = gimple_call_num_args (stmt);
3219 /* Bail out if the function has more than two arguments; we do not have
3220 interesting builtin functions to vectorize with more than two arguments,
3221 and a call with no arguments is not handled either. */
3222 if (nargs == 0 || nargs > 2)
3223 return false;
3225 for (i = 0; i < nargs; i++)
3227 op = gimple_call_arg (stmt, i);
3229 /* We can only handle calls with arguments of the same type. */
3230 if (rhs_type
3231 && rhs_type != TREE_TYPE (op))
3233 if (vect_print_dump_info (REPORT_DETAILS))
3234 fprintf (vect_dump, "argument types differ.");
3235 return false;
3237 rhs_type = TREE_TYPE (op);
3239 if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt[i]))
3241 if (vect_print_dump_info (REPORT_DETAILS))
3242 fprintf (vect_dump, "use not simple.");
3243 return false;
3247 vectype_in = get_vectype_for_scalar_type (rhs_type);
3248 if (!vectype_in)
3249 return false;
3250 nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
3252 lhs_type = TREE_TYPE (gimple_call_lhs (stmt));
3253 vectype_out = get_vectype_for_scalar_type (lhs_type);
3254 if (!vectype_out)
3255 return false;
3256 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
3259 if (nunits_in == nunits_out / 2)
3260 modifier = NARROW;
3261 else if (nunits_out == nunits_in)
3262 modifier = NONE;
3263 else if (nunits_out == nunits_in / 2)
3264 modifier = WIDEN;
3265 else
3266 return false;
3268 /* For now, we only vectorize functions if a target specific builtin
3269 is available. TODO -- in some cases, it might be profitable to
3270 insert the calls for pieces of the vector, in order to be able
3271 to vectorize other operations in the loop. */
3272 fndecl = vectorizable_function (stmt, vectype_out, vectype_in);
3273 if (fndecl == NULL_TREE)
3275 if (vect_print_dump_info (REPORT_DETAILS))
3276 fprintf (vect_dump, "function is not vectorizable.");
3277 return false;
3281 gcc_assert (ZERO_SSA_OPERANDS (stmt, SSA_OP_ALL_VIRTUALS));
3283 if (modifier == NARROW)
3284 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_out;
3286 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in;
3288 /* Sanity check: make sure that at least one copy of the vectorized stmt
3289 needs to be generated. */
3290 gcc_assert (ncopies >= 1);
3292 if (!vec_stmt) /* transformation not required. */
3294 STMT_VINFO_TYPE (stmt_info) = call_vec_info_type;
3295 if (vect_print_dump_info (REPORT_DETAILS))
3296 fprintf (vect_dump, "=== vectorizable_call ===");
3297 vect_model_simple_cost (stmt_info, ncopies, dt, NULL);
3298 return true;
3301 /** Transform. **/
3303 if (vect_print_dump_info (REPORT_DETAILS))
3304 fprintf (vect_dump, "transform operation.");
3307 scalar_dest = gimple_call_lhs (stmt);
3308 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
3310 prev_stmt_info = NULL;
3311 switch (modifier)
3313 case NONE:
3314 for (j = 0; j < ncopies; ++j)
3316 /* Build argument list for the vectorized call. */
3318 vargs = VEC_alloc (tree, heap, nargs);
3320 VEC_truncate (tree, vargs, 0);
3322 for (i = 0; i < nargs; i++)
3324 op = gimple_call_arg (stmt, i);
3325 if (j == 0)
3326 vec_oprnd0
3327 = vect_get_vec_def_for_operand (op, stmt, NULL);
3328 else
3329 vec_oprnd0
3330 = vect_get_vec_def_for_stmt_copy (dt[i], vec_oprnd0);
3332 VEC_quick_push (tree, vargs, vec_oprnd0);
3335 new_stmt = gimple_build_call_vec (fndecl, vargs);
3336 new_temp = make_ssa_name (vec_dest, new_stmt);
3337 gimple_call_set_lhs (new_stmt, new_temp);
3339 vect_finish_stmt_generation (stmt, new_stmt, gsi);
3341 if (j == 0)
3342 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
3343 else
3344 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3346 prev_stmt_info = vinfo_for_stmt (new_stmt);
3349 break;
3351 case NARROW:
3352 for (j = 0; j < ncopies; ++j)
3354 /* Build argument list for the vectorized call. */
3356 vargs = VEC_alloc (tree, heap, nargs * 2);
3358 VEC_truncate (tree, vargs, 0);
3360 for (i = 0; i < nargs; i++)
3362 op = gimple_call_arg (stmt, i);
3363 if (j == 0)
3365 vec_oprnd0
3366 = vect_get_vec_def_for_operand (op, stmt, NULL);
3367 vec_oprnd1
3368 = vect_get_vec_def_for_stmt_copy (dt[i], vec_oprnd0);
3370 else
3372 vec_oprnd0
3373 = vect_get_vec_def_for_stmt_copy (dt[i], vec_oprnd1);
3374 vec_oprnd1
3375 = vect_get_vec_def_for_stmt_copy (dt[i], vec_oprnd0);
3378 VEC_quick_push (tree, vargs, vec_oprnd0);
3379 VEC_quick_push (tree, vargs, vec_oprnd1);
3382 new_stmt = gimple_build_call_vec (fndecl, vargs);
3383 new_temp = make_ssa_name (vec_dest, new_stmt);
3384 gimple_call_set_lhs (new_stmt, new_temp);
3386 vect_finish_stmt_generation (stmt, new_stmt, gsi);
3388 if (j == 0)
3389 STMT_VINFO_VEC_STMT (stmt_info) = new_stmt;
3390 else
3391 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3393 prev_stmt_info = vinfo_for_stmt (new_stmt);
3396 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
3398 break;
3400 case WIDEN:
3401 /* No current target implements this case. */
3402 return false;
3405 VEC_free (tree, heap, vargs);
3407 /* The call in STMT might prevent it from being removed in dce.
3408 We however cannot remove it here, due to the way the ssa name
3409 it defines is mapped to the new definition. So just replace
3410 rhs of the statement with something harmless. */
3412 type = TREE_TYPE (scalar_dest);
3413 new_stmt = gimple_build_assign (gimple_call_lhs (stmt),
3414 fold_convert (type, integer_zero_node));
3415 set_vinfo_for_stmt (new_stmt, stmt_info);
3416 set_vinfo_for_stmt (stmt, NULL);
3417 STMT_VINFO_STMT (stmt_info) = new_stmt;
3418 gsi_replace (gsi, new_stmt, false);
3419 SSA_NAME_DEF_STMT (gimple_assign_lhs (new_stmt)) = new_stmt;
3421 return true;
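/* For illustration (a sketch, with nargs == 1, modifier == NONE and
   ncopies == 2), the scalar call 'x = foo (a)' is transformed into:

     vx.0 = vfoo (va.0);
     vx.1 = vfoo (va.1);
     x = 0;   <-- harmless replacement of the original call

   where vfoo is the target builtin returned by vectorizable_function; the
   'x = 0' copy preserves the SSA definition of x until dce cleans it up.  */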
3425 /* Function vect_gen_widened_results_half
3427 Create a vector stmt whose code, type, number of arguments, and result
3428 variable are CODE, VECTYPE, OP_TYPE, and VEC_DEST, and its arguments are
3429 VEC_OPRND0 and VEC_OPRND1. The new vector stmt is to be inserted at BSI.
3430 In the case that CODE is a CALL_EXPR, this means that a call to DECL
3431 needs to be created (DECL is a function-decl of a target-builtin).
3432 STMT is the original scalar stmt that we are vectorizing. */
3435 vect_gen_widened_results_half (enum tree_code code,
3436 tree vectype ATTRIBUTE_UNUSED,
3438 tree vec_oprnd0, tree vec_oprnd1, int op_type,
3439 tree vec_dest, gimple_stmt_iterator *gsi,
3447 /* Generate half of the widened result: */
3448 if (code == CALL_EXPR)
3450 /* Target specific support */
3451 if (op_type == binary_op)
3452 new_stmt = gimple_build_call (decl, 2, vec_oprnd0, vec_oprnd1);
3454 new_stmt = gimple_build_call (decl, 1, vec_oprnd0);
3455 new_temp = make_ssa_name (vec_dest, new_stmt);
3456 gimple_call_set_lhs (new_stmt, new_temp);
3460 /* Generic support */
3461 gcc_assert (op_type == TREE_CODE_LENGTH (code));
3462 if (op_type != binary_op)
3463 vec_oprnd1 = NULL_TREE;
3464 new_stmt = gimple_build_assign_with_ops (code, vec_dest, vec_oprnd0,
3465 vec_oprnd1);
3466 new_temp = make_ssa_name (vec_dest, new_stmt);
3467 gimple_assign_set_lhs (new_stmt, new_temp);
3469 vect_finish_stmt_generation (stmt, new_stmt, gsi);
3471 if (code == CALL_EXPR)
3473 FOR_EACH_SSA_TREE_OPERAND (sym, new_stmt, iter, SSA_OP_ALL_VIRTUALS)
3475 if (TREE_CODE (sym) == SSA_NAME)
3476 sym = SSA_NAME_VAR (sym);
3477 mark_sym_for_renaming (sym);
3481 return new_stmt;
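/* For example (a sketch for a generic widening conversion, assuming V8HI
   input widened to V4SI): the caller invokes this function twice on the
   same operand, once with CODE1 (e.g. VEC_UNPACK_LO_EXPR) and once with
   CODE2 (e.g. VEC_UNPACK_HI_EXPR), obtaining the two V4SI halves of the
   widened result.  */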
3485 /* Check if STMT performs a conversion operation that can be vectorized.
3486 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
3487 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
3488 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
3491 vectorizable_conversion (gimple stmt, gimple_stmt_iterator *gsi,
3492 gimple *vec_stmt, slp_tree slp_node)
3497 tree vec_oprnd0 = NULL_TREE, vec_oprnd1 = NULL_TREE;
3498 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3499 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3500 enum tree_code code, code1 = ERROR_MARK, code2 = ERROR_MARK;
3501 tree decl1 = NULL_TREE, decl2 = NULL_TREE;
3505 enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
3506 gimple new_stmt = NULL;
3507 stmt_vec_info prev_stmt_info;
3510 tree vectype_out, vectype_in;
3513 tree rhs_type, lhs_type;
3515 enum { NARROW, NONE, WIDEN } modifier;
3517 VEC(tree,heap) *vec_oprnds0 = NULL;
3523 /* Is STMT a vectorizable conversion? */
3525 if (!STMT_VINFO_RELEVANT_P (stmt_info))
3526 return false;
3528 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
3529 return false;
3531 if (!is_gimple_assign (stmt))
3532 return false;
3534 if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
3535 return false;
3537 code = gimple_assign_rhs_code (stmt);
3538 if (code != FIX_TRUNC_EXPR && code != FLOAT_EXPR)
3539 return false;
3541 /* Check types of lhs and rhs. */
3542 op0 = gimple_assign_rhs1 (stmt);
3543 rhs_type = TREE_TYPE (op0);
3544 vectype_in = get_vectype_for_scalar_type (rhs_type);
3545 if (!vectype_in)
3546 return false;
3547 nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
3549 scalar_dest = gimple_assign_lhs (stmt);
3550 lhs_type = TREE_TYPE (scalar_dest);
3551 vectype_out = get_vectype_for_scalar_type (lhs_type);
3552 if (!vectype_out)
3553 return false;
3554 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
3557 if (nunits_in == nunits_out / 2)
3558 modifier = NARROW;
3559 else if (nunits_out == nunits_in)
3560 modifier = NONE;
3561 else if (nunits_out == nunits_in / 2)
3562 modifier = WIDEN;
3563 else
3564 return false;
3566 if (modifier == NONE)
3567 gcc_assert (STMT_VINFO_VECTYPE (stmt_info) == vectype_out);
3569 /* Bail out if the types are both integral or non-integral. */
3570 if ((INTEGRAL_TYPE_P (rhs_type) && INTEGRAL_TYPE_P (lhs_type))
3571 || (!INTEGRAL_TYPE_P (rhs_type) && !INTEGRAL_TYPE_P (lhs_type)))
3572 return false;
3574 integral_type = INTEGRAL_TYPE_P (rhs_type) ? vectype_in : vectype_out;
3576 if (modifier == NARROW)
3577 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_out;
3579 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in;
3581 /* FORNOW: SLP with multiple types is not supported. The SLP analysis verifies
3582 this, so we can safely override NCOPIES with 1 here. */
3586 /* Sanity check: make sure that at least one copy of the vectorized stmt
3587 needs to be generated. */
3588 gcc_assert (ncopies >= 1);
3590 /* Check the operands of the operation. */
3591 if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt[0]))
3593 if (vect_print_dump_info (REPORT_DETAILS))
3594 fprintf (vect_dump, "use not simple.");
3595 return false;
3598 /* Supportable by target? */
3599 if ((modifier == NONE
3600 && !targetm.vectorize.builtin_conversion (code, integral_type))
3601 || (modifier == WIDEN
3602 && !supportable_widening_operation (code, stmt, vectype_in,
3605 &dummy_bool, &dummy))
3606 || (modifier == NARROW
3607 && !supportable_narrowing_operation (code, stmt, vectype_in,
3608 &code1, &dummy_bool, &dummy)))
3610 if (vect_print_dump_info (REPORT_DETAILS))
3611 fprintf (vect_dump, "conversion not supported by target.");
3612 return false;
3615 if (modifier != NONE)
3617 STMT_VINFO_VECTYPE (stmt_info) = vectype_in;
3618 /* FORNOW: SLP not supported. */
3619 if (STMT_SLP_TYPE (stmt_info))
3620 return false;
3623 if (!vec_stmt) /* transformation not required. */
3625 STMT_VINFO_TYPE (stmt_info) = type_conversion_vec_info_type;
3626 return true;
3629 /** Transform. **/
3630 if (vect_print_dump_info (REPORT_DETAILS))
3631 fprintf (vect_dump, "transform conversion.");
3634 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
3636 if (modifier == NONE && !slp_node)
3637 vec_oprnds0 = VEC_alloc (tree, heap, 1);
3639 prev_stmt_info = NULL;
3640 switch (modifier)
3642 case NONE:
3643 for (j = 0; j < ncopies; j++)
3648 if (j == 0)
3649 vect_get_vec_defs (op0, NULL, stmt, &vec_oprnds0, NULL, slp_node);
3650 else
3651 vect_get_vec_defs_for_stmt_copy (dt, &vec_oprnds0, NULL);
3653 builtin_decl =
3654 targetm.vectorize.builtin_conversion (code, integral_type);
3655 for (i = 0; VEC_iterate (tree, vec_oprnds0, i, vop0); i++)
3657 /* Arguments are ready. Create the new vector stmt. */
3658 new_stmt = gimple_build_call (builtin_decl, 1, vop0);
3659 new_temp = make_ssa_name (vec_dest, new_stmt);
3660 gimple_call_set_lhs (new_stmt, new_temp);
3661 vect_finish_stmt_generation (stmt, new_stmt, gsi);
3662 FOR_EACH_SSA_TREE_OPERAND (sym, new_stmt, iter,
3663 SSA_OP_ALL_VIRTUALS)
3665 if (TREE_CODE (sym) == SSA_NAME)
3666 sym = SSA_NAME_VAR (sym);
3667 mark_sym_for_renaming (sym);
3669 if (slp_node)
3670 VEC_quick_push (gimple, SLP_TREE_VEC_STMTS (slp_node), new_stmt);
3673 if (j == 0)
3674 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
3675 else
3676 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3677 prev_stmt_info = vinfo_for_stmt (new_stmt);
3679 break;
3681 case WIDEN:
3682 /* In case the vectorization factor (VF) is bigger than the number
3683 of elements that we can fit in a vectype (nunits), we have to
3684 generate more than one vector stmt - i.e - we need to "unroll"
3685 the vector stmt by a factor VF/nunits. */
3686 for (j = 0; j < ncopies; j++)
3688 if (j == 0)
3689 vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
3690 else
3691 vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd0);
3693 STMT_VINFO_VECTYPE (stmt_info) = vectype_in;
3695 /* Generate first half of the widened result: */
3697 = vect_gen_widened_results_half (code1, vectype_out, decl1,
3698 vec_oprnd0, vec_oprnd1,
3699 unary_op, vec_dest, gsi, stmt);
3700 if (j == 0)
3701 STMT_VINFO_VEC_STMT (stmt_info) = new_stmt;
3702 else
3703 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3704 prev_stmt_info = vinfo_for_stmt (new_stmt);
3706 /* Generate second half of the widened result: */
3708 = vect_gen_widened_results_half (code2, vectype_out, decl2,
3709 vec_oprnd0, vec_oprnd1,
3710 unary_op, vec_dest, gsi, stmt);
3711 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3712 prev_stmt_info = vinfo_for_stmt (new_stmt);
3713 break;
3715 case NARROW:
3717 /* In case the vectorization factor (VF) is bigger than the number
3718 of elements that we can fit in a vectype (nunits), we have to
3719 generate more than one vector stmt - i.e - we need to "unroll"
3720 the vector stmt by a factor VF/nunits. */
3721 for (j = 0; j < ncopies; j++)
3725 if (j == 0)
3726 vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
3727 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd0);
3730 else
3731 vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd1);
3732 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd0);
3735 /* Arguments are ready. Create the new vector stmt. */
3736 expr = build2 (code1, vectype_out, vec_oprnd0, vec_oprnd1);
3737 new_stmt = gimple_build_assign_with_ops (code1, vec_dest, vec_oprnd0,
3738 vec_oprnd1);
3739 new_temp = make_ssa_name (vec_dest, new_stmt);
3740 gimple_assign_set_lhs (new_stmt, new_temp);
3741 vect_finish_stmt_generation (stmt, new_stmt, gsi);
3743 if (j == 0)
3744 STMT_VINFO_VEC_STMT (stmt_info) = new_stmt;
3745 else
3746 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3748 prev_stmt_info = vinfo_for_stmt (new_stmt);
3751 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
3755 VEC_free (tree, heap, vec_oprnds0);
3757 return true;
3761 /* Function vectorizable_assignment.
3763 Check if STMT performs an assignment (copy) that can be vectorized.
3764 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
3765 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
3766 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
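/* For example (a sketch, with a 4-element vectype): a plain copy
   'x_1 = y_2', or a PAREN_EXPR wrapper 'x_1 = ((y_2))' that only fences
   reassociation, is vectorized as a single vector copy 'vx = vy', with a
   vector destination created for x.  */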
3769 vectorizable_assignment (gimple stmt, gimple_stmt_iterator *gsi,
3770 gimple *vec_stmt, slp_tree slp_node)
3775 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3776 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3777 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3781 enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
3782 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
3783 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
3785 VEC(tree,heap) *vec_oprnds = NULL;
3788 /* FORNOW: SLP with multiple types is not supported. The SLP analysis
3789 verifies this, so we can safely override NCOPIES with 1 here. */
3793 gcc_assert (ncopies >= 1);
3794 if (ncopies > 1)
3795 return false; /* FORNOW */
3797 if (!STMT_VINFO_RELEVANT_P (stmt_info))
3798 return false;
3800 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
3801 return false;
3803 /* Is vectorizable assignment? */
3804 if (!is_gimple_assign (stmt))
3805 return false;
3807 scalar_dest = gimple_assign_lhs (stmt);
3808 if (TREE_CODE (scalar_dest) != SSA_NAME)
3809 return false;
3811 if (gimple_assign_single_p (stmt)
3812 || gimple_assign_rhs_code (stmt) == PAREN_EXPR)
3813 op = gimple_assign_rhs1 (stmt);
3817 if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt[0]))
3819 if (vect_print_dump_info (REPORT_DETAILS))
3820 fprintf (vect_dump, "use not simple.");
3824 if (!vec_stmt) /* transformation not required. */
3826 STMT_VINFO_TYPE (stmt_info) = assignment_vec_info_type;
3827 if (vect_print_dump_info (REPORT_DETAILS))
3828 fprintf (vect_dump, "=== vectorizable_assignment ===");
3829 vect_model_simple_cost (stmt_info, ncopies, dt, NULL);
3834 if (vect_print_dump_info (REPORT_DETAILS))
3835 fprintf (vect_dump, "transform assignment.");
3838 vec_dest = vect_create_destination_var (scalar_dest, vectype);
3841 vect_get_vec_defs (op, NULL, stmt, &vec_oprnds, NULL, slp_node);
3843 /* Arguments are ready. Create the new vector stmt. */
3844 for (i = 0; VEC_iterate (tree, vec_oprnds, i, vop); i++)
3846 *vec_stmt = gimple_build_assign (vec_dest, vop);
3847 new_temp = make_ssa_name (vec_dest, *vec_stmt);
3848 gimple_assign_set_lhs (*vec_stmt, new_temp);
3849 vect_finish_stmt_generation (stmt, *vec_stmt, gsi);
3850 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt;
3853 VEC_quick_push (gimple, SLP_TREE_VEC_STMTS (slp_node), *vec_stmt);
3856 VEC_free (tree, heap, vec_oprnds);
3861 /* Function vect_min_worthwhile_factor.
3863 For a loop where we could vectorize the operation indicated by CODE,
3864 return the minimum vectorization factor that makes it worthwhile
3865 to use generic vectors. */
3867 vect_min_worthwhile_factor (enum tree_code code)
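  /* A sketch of a plausible body (illustrative thresholds, not
     authoritative): simple bitwise operations are cheap enough in
     generic word-mode vectors to pay off at a smaller factor than
     arithmetic, and other codes never pay off:

        switch (code)
          {
          case PLUS_EXPR: case MINUS_EXPR: case NEGATE_EXPR:
            return 4;
          case BIT_AND_EXPR: case BIT_IOR_EXPR:
          case BIT_XOR_EXPR: case BIT_NOT_EXPR:
            return 2;
          default:
            return INT_MAX;
          }  */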
3888 /* Function vectorizable_induction
3890 Check if PHI performs an induction computation that can be vectorized.
3891 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
3892 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
3893 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
3896 vectorizable_induction (gimple phi, gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
3899 stmt_vec_info stmt_info = vinfo_for_stmt (phi);
3900 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3901 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3902 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3903 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
3904 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
3907 gcc_assert (ncopies >= 1);
3908 /* FORNOW. This restriction should be relaxed. */
3909 if (nested_in_vect_loop_p (loop, phi) && ncopies > 1)
3911 if (vect_print_dump_info (REPORT_DETAILS))
3912 fprintf (vect_dump, "multiple types in nested loop.");
3916 if (!STMT_VINFO_RELEVANT_P (stmt_info))
3919 /* FORNOW: SLP not supported. */
3920 if (STMT_SLP_TYPE (stmt_info))
3923 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def);
3925 if (gimple_code (phi) != GIMPLE_PHI)
3928 if (!vec_stmt) /* transformation not required. */
3930 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
3931 if (vect_print_dump_info (REPORT_DETAILS))
3932 fprintf (vect_dump, "=== vectorizable_induction ===");
3933 vect_model_induction_cost (stmt_info, ncopies);
3939 if (vect_print_dump_info (REPORT_DETAILS))
3940 fprintf (vect_dump, "transform induction phi.");
3942 vec_def = get_initial_def_for_induction (phi);
3943 *vec_stmt = SSA_NAME_DEF_STMT (vec_def);
3948 /* Function vectorizable_operation.
3950 Check if STMT performs a binary or unary operation that can be vectorized.
3951 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
3952 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
3953 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
3956 vectorizable_operation (gimple stmt, gimple_stmt_iterator *gsi,
3957 gimple *vec_stmt, slp_tree slp_node)
3961 tree op0, op1 = NULL;
3962 tree vec_oprnd1 = NULL_TREE;
3963 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3964 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3965 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3966 enum tree_code code;
3967 enum machine_mode vec_mode;
3972 enum machine_mode optab_op2_mode;
3975 enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
3976 gimple new_stmt = NULL;
3977 stmt_vec_info prev_stmt_info;
3978 int nunits_in = TYPE_VECTOR_SUBPARTS (vectype);
3981 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in;
3983 VEC(tree,heap) *vec_oprnds0 = NULL, *vec_oprnds1 = NULL;
3986 bool shift_p = false;
3987 bool scalar_shift_arg = false;
3989 /* FORNOW: SLP with multiple types is not supported. The SLP analysis verifies
3990 this, so we can safely override NCOPIES with 1 here. */
3993 gcc_assert (ncopies >= 1);
3995 if (!STMT_VINFO_RELEVANT_P (stmt_info))
3998 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
4001 /* Is STMT a vectorizable binary/unary operation? */
4002 if (!is_gimple_assign (stmt))
4005 if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
4008 scalar_dest = gimple_assign_lhs (stmt);
4009 vectype_out = get_vectype_for_scalar_type (TREE_TYPE (scalar_dest));
4012 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
4013 if (nunits_out != nunits_in)
4016 code = gimple_assign_rhs_code (stmt);
4018 /* For pointer addition, we should use the normal plus for
4019 the vector addition. */
4020 if (code == POINTER_PLUS_EXPR)
4023 /* Support only unary or binary operations. */
4024 op_type = TREE_CODE_LENGTH (code);
4025 if (op_type != unary_op && op_type != binary_op)
4027 if (vect_print_dump_info (REPORT_DETAILS))
4028 fprintf (vect_dump, "num. args = %d (not unary/binary op).", op_type);
4032 op0 = gimple_assign_rhs1 (stmt);
4033 if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt[0]))
4035 if (vect_print_dump_info (REPORT_DETAILS))
4036 fprintf (vect_dump, "use not simple.");
4040 if (op_type == binary_op)
4042 op1 = gimple_assign_rhs2 (stmt);
4043 if (!vect_is_simple_use (op1, loop_vinfo, &def_stmt, &def, &dt[1]))
4045 if (vect_print_dump_info (REPORT_DETAILS))
4046 fprintf (vect_dump, "use not simple.");
4051 /* If this is a shift/rotate, determine whether the shift amount is a vector
4052 or a scalar. If the shift/rotate amount is a vector, use the vector/vector
4054 if (code == LSHIFT_EXPR || code == RSHIFT_EXPR || code == LROTATE_EXPR
4055 || code == RROTATE_EXPR)
4059 /* vector shifted by vector */
4060 if (dt[1] == vect_loop_def)
4062 optab = optab_for_tree_code (code, vectype, optab_vector);
4063 if (vect_print_dump_info (REPORT_DETAILS))
4064 fprintf (vect_dump, "vector/vector shift/rotate found.");
4067 /* See if the machine has a vector shifted by scalar insn, and if not,
4068 then see if it has a vector shifted by vector insn. */
4069 else if (dt[1] == vect_constant_def || dt[1] == vect_invariant_def)
4071 optab = optab_for_tree_code (code, vectype, optab_scalar);
4073 && (optab_handler (optab, TYPE_MODE (vectype))->insn_code
4074 != CODE_FOR_nothing))
4076 scalar_shift_arg = true;
4077 if (vect_print_dump_info (REPORT_DETAILS))
4078 fprintf (vect_dump, "vector/scalar shift/rotate found.");
4082 optab = optab_for_tree_code (code, vectype, optab_vector);
4083 if (vect_print_dump_info (REPORT_DETAILS)
4085 && (optab_handler (optab, TYPE_MODE (vectype))->insn_code
4086 != CODE_FOR_nothing))
4087 fprintf (vect_dump, "vector/vector shift/rotate found.");
4093 if (vect_print_dump_info (REPORT_DETAILS))
4094 fprintf (vect_dump, "operand mode requires invariant argument.");
4099 optab = optab_for_tree_code (code, vectype, optab_default);
4101 /* Supportable by target? */
4104 if (vect_print_dump_info (REPORT_DETAILS))
4105 fprintf (vect_dump, "no optab.");
4108 vec_mode = TYPE_MODE (vectype);
4109 icode = (int) optab_handler (optab, vec_mode)->insn_code;
4110 if (icode == CODE_FOR_nothing)
4112 if (vect_print_dump_info (REPORT_DETAILS))
4113 fprintf (vect_dump, "op not supported by target.");
4114 /* Check only during analysis. */
4115 if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
4116 || (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4117 < vect_min_worthwhile_factor (code)
4120 if (vect_print_dump_info (REPORT_DETAILS))
4121 fprintf (vect_dump, "proceeding using word mode.");
4124 /* Worthwhile without SIMD support? Check only during analysis. */
4125 if (!VECTOR_MODE_P (TYPE_MODE (vectype))
4126 && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4127 < vect_min_worthwhile_factor (code)
4130 if (vect_print_dump_info (REPORT_DETAILS))
4131 fprintf (vect_dump, "not worthwhile without SIMD support.");
4135 if (!vec_stmt) /* transformation not required. */
4137 STMT_VINFO_TYPE (stmt_info) = op_vec_info_type;
4138 if (vect_print_dump_info (REPORT_DETAILS))
4139 fprintf (vect_dump, "=== vectorizable_operation ===");
4140 vect_model_simple_cost (stmt_info, ncopies, dt, NULL);
4146 if (vect_print_dump_info (REPORT_DETAILS))
4147 fprintf (vect_dump, "transform binary/unary operation.");
4150 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4152 /* Allocate VECs for vector operands. In case of SLP, vector operands are
4153 created in the previous stages of the recursion, so no allocation is
4154 needed, except for the case of shift with scalar shift argument. In that
4155 case we store the scalar operand in VEC_OPRNDS1 for every vector stmt to
4156 be created to vectorize the SLP group, i.e., SLP_NODE->VEC_STMTS_SIZE.
4157 In case of loop-based vectorization we allocate VECs of size 1. We
4158 allocate VEC_OPRNDS1 only in case of binary operation. */
4161 vec_oprnds0 = VEC_alloc (tree, heap, 1);
4162 if (op_type == binary_op)
4163 vec_oprnds1 = VEC_alloc (tree, heap, 1);
4165 else if (scalar_shift_arg)
4166 vec_oprnds1 = VEC_alloc (tree, heap, slp_node->vec_stmts_size);
4168 /* In case the vectorization factor (VF) is bigger than the number
4169 of elements that we can fit in a vectype (nunits), we have to generate
4170 more than one vector stmt - i.e - we need to "unroll" the
4171 vector stmt by a factor VF/nunits. In doing so, we record a pointer
4172 from one copy of the vector stmt to the next, in the field
4173 STMT_VINFO_RELATED_STMT. This is necessary in order to allow following
4174 stages to find the correct vector defs to be used when vectorizing
4175 stmts that use the defs of the current stmt. The example below illustrates
4176 the vectorization process when VF=16 and nunits=4 (i.e - we need to create
4177 4 vectorized stmts):
4179 before vectorization:
4180 RELATED_STMT VEC_STMT
4184 step 1: vectorize stmt S1 (done in vectorizable_load. See more details
4186 RELATED_STMT VEC_STMT
4187 VS1_0: vx0 = memref0 VS1_1 -
4188 VS1_1: vx1 = memref1 VS1_2 -
4189 VS1_2: vx2 = memref2 VS1_3 -
4190 VS1_3: vx3 = memref3 - -
4191 S1: x = load - VS1_0
4194 step 2: vectorize stmt S2 (done here):
4195 To vectorize stmt S2 we first need to find the relevant vector
4196 def for the first operand 'x'. This is, as usual, obtained from
4197 the vector stmt recorded in the STMT_VINFO_VEC_STMT of the stmt
4198 that defines 'x' (S1). This way we find the stmt VS1_0, and the
4199 relevant vector def 'vx0'. Having found 'vx0' we can generate
4200 the vector stmt VS2_0, and as usual, record it in the
4201 STMT_VINFO_VEC_STMT of stmt S2.
4202 When creating the second copy (VS2_1), we obtain the relevant vector
4203 def from the vector stmt recorded in the STMT_VINFO_RELATED_STMT of
4204 stmt VS1_0. This way we find the stmt VS1_1 and the relevant
4205 vector def 'vx1'. Using 'vx1' we create stmt VS2_1 and record a
4206 pointer to it in the STMT_VINFO_RELATED_STMT of the vector stmt VS2_0.
4207 Similarly when creating stmts VS2_2 and VS2_3. This is the resulting
4208 chain of stmts and pointers:
4209 RELATED_STMT VEC_STMT
4210 VS1_0: vx0 = memref0 VS1_1 -
4211 VS1_1: vx1 = memref1 VS1_2 -
4212 VS1_2: vx2 = memref2 VS1_3 -
4213 VS1_3: vx3 = memref3 - -
4214 S1: x = load - VS1_0
4215 VS2_0: vz0 = vx0 + v1 VS2_1 -
4216 VS2_1: vz1 = vx1 + v1 VS2_2 -
4217 VS2_2: vz2 = vx2 + v1 VS2_3 -
4218 VS2_3: vz3 = vx3 + v1 - -
4219 S2: z = x + 1 - VS2_0 */
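  /* Illustrative helper (hypothetical, not part of the vectorizer): the
     J-th copy of the vector stmt created for a scalar stmt is reached by
     starting at STMT_VINFO_VEC_STMT and following STMT_VINFO_RELATED_STMT
     J times:

        static gimple
        get_jth_vec_stmt_sketch (gimple scalar_stmt, int j)
        {
          gimple vs = STMT_VINFO_VEC_STMT (vinfo_for_stmt (scalar_stmt));

          while (j-- > 0 && vs)
            vs = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (vs));
          return vs;
        }

     In the example above, get_jth_vec_stmt_sketch (S1, 1) would yield
     VS1_1.  */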
4221 prev_stmt_info = NULL;
4222 for (j = 0; j < ncopies; j++)
4227 if (op_type == binary_op && scalar_shift_arg)
4229 /* Vector shl and shr insn patterns can be defined with scalar
4230 operand 2 (shift operand). In this case, use constant or loop
4231 invariant op1 directly, without extending it to vector mode
4233 optab_op2_mode = insn_data[icode].operand[2].mode;
4234 if (!VECTOR_MODE_P (optab_op2_mode))
4236 if (vect_print_dump_info (REPORT_DETAILS))
4237 fprintf (vect_dump, "operand 1 using scalar mode.");
4239 VEC_quick_push (tree, vec_oprnds1, vec_oprnd1);
4242 /* Store vec_oprnd1 for every vector stmt to be created
4243 for SLP_NODE. We check during the analysis that all the
4244 shift arguments are the same.
4245 TODO: Allow different constants for different vector
4246 stmts generated for an SLP instance. */
4247 for (k = 0; k < slp_node->vec_stmts_size - 1; k++)
4248 VEC_quick_push (tree, vec_oprnds1, vec_oprnd1);
4253 /* vec_oprnd1 is available if operand 1 should be of a scalar-type
4254 (a special case for certain kinds of vector shifts); otherwise,
4255 operand 1 should be of a vector type (the usual case). */
4256 if (op_type == binary_op && !vec_oprnd1)
4257 vect_get_vec_defs (op0, op1, stmt, &vec_oprnds0, &vec_oprnds1,
4260 vect_get_vec_defs (op0, NULL_TREE, stmt, &vec_oprnds0, NULL,
4264 vect_get_vec_defs_for_stmt_copy (dt, &vec_oprnds0, &vec_oprnds1);
4266 /* Arguments are ready. Create the new vector stmt. */
4267 for (i = 0; VEC_iterate (tree, vec_oprnds0, i, vop0); i++)
4269 vop1 = ((op_type == binary_op)
4270 ? VEC_index (tree, vec_oprnds1, i) : NULL);
4271 new_stmt = gimple_build_assign_with_ops (code, vec_dest, vop0, vop1);
4272 new_temp = make_ssa_name (vec_dest, new_stmt);
4273 gimple_assign_set_lhs (new_stmt, new_temp);
4274 vect_finish_stmt_generation (stmt, new_stmt, gsi);
4276 VEC_quick_push (gimple, SLP_TREE_VEC_STMTS (slp_node), new_stmt);
4280 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
4282 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
4283 prev_stmt_info = vinfo_for_stmt (new_stmt);
4286 VEC_free (tree, heap, vec_oprnds0);
4288 VEC_free (tree, heap, vec_oprnds1);
4294 /* Function vectorizable_type_demotion
4296 Check if STMT performs a binary or unary operation that involves
4297 type demotion, and if it can be vectorized.
4298 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
4299 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
4300 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
4303 vectorizable_type_demotion (gimple stmt, gimple_stmt_iterator *gsi,
4309 tree vec_oprnd0 = NULL, vec_oprnd1 = NULL;
4310 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4311 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4312 enum tree_code code, code1 = ERROR_MARK;
4316 enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
4318 stmt_vec_info prev_stmt_info;
4325 tree intermediate_type = NULL_TREE, narrow_type, double_vec_dest;
4326 bool double_op = false;
4327 tree first_vector, second_vector;
4328 tree vec_oprnd2 = NULL_TREE, vec_oprnd3 = NULL_TREE, last_oprnd = NULL_TREE;
4330 if (!STMT_VINFO_RELEVANT_P (stmt_info))
4333 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
4336 /* Is STMT a vectorizable type-demotion operation? */
4337 if (!is_gimple_assign (stmt))
4340 if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
4343 code = gimple_assign_rhs_code (stmt);
4344 if (code != NOP_EXPR && code != CONVERT_EXPR)
4347 op0 = gimple_assign_rhs1 (stmt);
4348 vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op0));
4351 nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
4353 scalar_dest = gimple_assign_lhs (stmt);
4354 vectype_out = get_vectype_for_scalar_type (TREE_TYPE (scalar_dest));
4357 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
4358 if (nunits_in != nunits_out / 2
4359 && nunits_in != nunits_out/4)
4362 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_out;
4363 gcc_assert (ncopies >= 1);
4365 if (! ((INTEGRAL_TYPE_P (TREE_TYPE (scalar_dest))
4366 && INTEGRAL_TYPE_P (TREE_TYPE (op0)))
4367 || (SCALAR_FLOAT_TYPE_P (TREE_TYPE (scalar_dest))
4368 && SCALAR_FLOAT_TYPE_P (TREE_TYPE (op0))
4369 && (code == NOP_EXPR || code == CONVERT_EXPR))))
4372 /* Check the operands of the operation. */
4373 if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt[0]))
4375 if (vect_print_dump_info (REPORT_DETAILS))
4376 fprintf (vect_dump, "use not simple.");
4380 /* Supportable by target? */
4381 if (!supportable_narrowing_operation (code, stmt, vectype_in, &code1,
4382 &double_op, &intermediate_type))
4385 STMT_VINFO_VECTYPE (stmt_info) = vectype_in;
4387 if (!vec_stmt) /* transformation not required. */
4389 STMT_VINFO_TYPE (stmt_info) = type_demotion_vec_info_type;
4390 if (vect_print_dump_info (REPORT_DETAILS))
4391 fprintf (vect_dump, "=== vectorizable_demotion ===");
4392 vect_model_simple_cost (stmt_info, ncopies, dt, NULL);
4397 if (vect_print_dump_info (REPORT_DETAILS))
4398 fprintf (vect_dump, "transform type demotion operation. ncopies = %d.",
4402 /* In case of double demotion, we first generate a demotion operation to the
4403 intermediate type, and then from that type to the final one. */
4405 narrow_type = intermediate_type;
4407 narrow_type = vectype_out;
4408 vec_dest = vect_create_destination_var (scalar_dest, narrow_type);
4409 double_vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
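  /* Illustrative example (assumed types): demoting int to char through the
     intermediate type short, with V4SI inputs, first narrows four V4SI
     vectors pairwise into two V8HI vectors, and then narrows those two
     into a single V16QI vector; this is why the double-demotion path
     below consumes four vector operands per copy.  */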
4411 /* In case the vectorization factor (VF) is bigger than the number
4412 of elements that we can fit in a vectype (nunits), we have to generate
4413 more than one vector stmt - i.e - we need to "unroll" the
4414 vector stmt by a factor VF/nunits. */
4415 prev_stmt_info = NULL;
4416 for (j = 0; j < ncopies; j++)
4421 vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
4422 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd0);
4425 /* For double demotion we need four operands. */
4426 vec_oprnd2 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd1);
4427 vec_oprnd3 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd2);
4432 vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt[0], last_oprnd);
4433 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd0);
4436 /* For double demotion we need four operands. */
4437 vec_oprnd2 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd1);
4438 vec_oprnd3 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd2);
4442 /* Arguments are ready. Create the new vector stmts. */
4443 new_stmt = gimple_build_assign_with_ops (code1, vec_dest, vec_oprnd0,
4445 first_vector = make_ssa_name (vec_dest, new_stmt);
4446 gimple_assign_set_lhs (new_stmt, first_vector);
4447 vect_finish_stmt_generation (stmt, new_stmt, gsi);
4449 /* In the next iteration we will get a copy of this operand. */
4450 last_oprnd = vec_oprnd1;
4454 /* For a double demotion operation we first generate two demotion
4455 operations from the source type to the intermediate type, and
4456 then combine the results in one demotion to the destination
4458 new_stmt = gimple_build_assign_with_ops (code1, vec_dest, vec_oprnd2,
4460 second_vector = make_ssa_name (vec_dest, new_stmt);
4461 gimple_assign_set_lhs (new_stmt, second_vector);
4462 vect_finish_stmt_generation (stmt, new_stmt, gsi);
4464 new_stmt = gimple_build_assign_with_ops (code1, double_vec_dest,
4465 first_vector, second_vector);
4466 new_temp = make_ssa_name (double_vec_dest, new_stmt);
4467 gimple_assign_set_lhs (new_stmt, new_temp);
4468 vect_finish_stmt_generation (stmt, new_stmt, gsi);
4470 /* In the next iteration we will get a copy of this operand. */
4471 last_oprnd = vec_oprnd3;
4475 STMT_VINFO_VEC_STMT (stmt_info) = new_stmt;
4477 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
4479 prev_stmt_info = vinfo_for_stmt (new_stmt);
4482 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
4487 /* Function vectorizable_type_promotion
4489 Check if STMT performs a binary or unary operation that involves
4490 type promotion, and if it can be vectorized.
4491 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
4492 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
4493 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
4496 vectorizable_type_promotion (gimple stmt, gimple_stmt_iterator *gsi,
4501 tree op0, op1 = NULL;
4502 tree vec_oprnd0 = NULL, vec_oprnd1 = NULL;
4503 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4504 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4505 enum tree_code code, code1 = ERROR_MARK, code2 = ERROR_MARK;
4506 tree decl1 = NULL_TREE, decl2 = NULL_TREE;
4510 enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
4512 stmt_vec_info prev_stmt_info;
4519 tree intermediate_type = NULL_TREE, first_vector, second_vector;
4521 tree wide_type, double_vec_dest;
4523 if (!STMT_VINFO_RELEVANT_P (stmt_info))
4526 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
4529 /* Is STMT a vectorizable type-promotion operation? */
4530 if (!is_gimple_assign (stmt))
4533 if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
4536 code = gimple_assign_rhs_code (stmt);
4537 if (code != NOP_EXPR && code != CONVERT_EXPR
4538 && code != WIDEN_MULT_EXPR)
4541 op0 = gimple_assign_rhs1 (stmt);
4542 vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op0));
4545 nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
4547 scalar_dest = gimple_assign_lhs (stmt);
4548 vectype_out = get_vectype_for_scalar_type (TREE_TYPE (scalar_dest));
4551 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
4552 if (nunits_out != nunits_in / 2 && nunits_out != nunits_in/4)
4555 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in;
4556 gcc_assert (ncopies >= 1);
4558 if (! ((INTEGRAL_TYPE_P (TREE_TYPE (scalar_dest))
4559 && INTEGRAL_TYPE_P (TREE_TYPE (op0)))
4560 || (SCALAR_FLOAT_TYPE_P (TREE_TYPE (scalar_dest))
4561 && SCALAR_FLOAT_TYPE_P (TREE_TYPE (op0))
4562 && (code == CONVERT_EXPR || code == NOP_EXPR))))
4565 /* Check the operands of the operation. */
4566 if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt[0]))
4568 if (vect_print_dump_info (REPORT_DETAILS))
4569 fprintf (vect_dump, "use not simple.");
4573 op_type = TREE_CODE_LENGTH (code);
4574 if (op_type == binary_op)
4576 op1 = gimple_assign_rhs2 (stmt);
4577 if (!vect_is_simple_use (op1, loop_vinfo, &def_stmt, &def, &dt[1]))
4579 if (vect_print_dump_info (REPORT_DETAILS))
4580 fprintf (vect_dump, "use not simple.");
4585 /* Supportable by target? */
4586 if (!supportable_widening_operation (code, stmt, vectype_in,
4587 &decl1, &decl2, &code1, &code2,
4588 &double_op, &intermediate_type))
4591 /* Binary widening operation can only be supported directly by the
4593 gcc_assert (!(double_op && op_type == binary_op));
4595 STMT_VINFO_VECTYPE (stmt_info) = vectype_in;
4597 if (!vec_stmt) /* transformation not required. */
4599 STMT_VINFO_TYPE (stmt_info) = type_promotion_vec_info_type;
4600 if (vect_print_dump_info (REPORT_DETAILS))
4601 fprintf (vect_dump, "=== vectorizable_promotion ===");
4602 vect_model_simple_cost (stmt_info, 2*ncopies, dt, NULL);
4608 if (vect_print_dump_info (REPORT_DETAILS))
4609 fprintf (vect_dump, "transform type promotion operation. ncopies = %d.",
4614 wide_type = intermediate_type;
4616 wide_type = vectype_out;
4618 vec_dest = vect_create_destination_var (scalar_dest, wide_type);
4619 double_vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
4621 /* In case the vectorization factor (VF) is bigger than the number
4622 of elements that we can fit in a vectype (nunits), we have to generate
4623 more than one vector stmt - i.e - we need to "unroll" the
4624 vector stmt by a factor VF/nunits. */
4626 prev_stmt_info = NULL;
4627 for (j = 0; j < ncopies; j++)
4632 vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
4633 if (op_type == binary_op)
4634 vec_oprnd1 = vect_get_vec_def_for_operand (op1, stmt, NULL);
4638 vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd0);
4639 if (op_type == binary_op)
4640 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt[1], vec_oprnd1);
4643 /* Arguments are ready. Create the new vector stmt. We are creating
4644 two vector defs because the widened result does not fit in one vector.
4645 The vectorized stmt can be expressed as a call to a target builtin,
4646 or using a tree-code. In case of double promotion (from char to int,
4647 for example), the promotion is performed in two phases: first we
4648 generate a promotion operation from the source type to the intermediate
4649 type (short in case of char->int promotion), and then for each of the
4650 created vectors we generate a promotion statement from the intermediate
4651 type to the destination type. */
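      /* Illustrative example (assumed types): when promoting char to int
         via the intermediate type short, a V16QI input is first widened
         into two V8HI halves, and each half is then widened again into
         two V4SI vectors, i.e., four V4SI results per input vector.  */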
4652 /* Generate first half of the widened result: */
4653 new_stmt = vect_gen_widened_results_half (code1, wide_type, decl1,
4654 vec_oprnd0, vec_oprnd1, op_type, vec_dest, gsi, stmt);
4655 if (is_gimple_call (new_stmt))
4656 first_vector = gimple_call_lhs (new_stmt);
4658 first_vector = gimple_assign_lhs (new_stmt);
4663 STMT_VINFO_VEC_STMT (stmt_info) = new_stmt;
4665 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
4666 prev_stmt_info = vinfo_for_stmt (new_stmt);
4669 /* Generate second half of the widened result: */
4670 new_stmt = vect_gen_widened_results_half (code2, wide_type, decl2,
4671 vec_oprnd0, vec_oprnd1, op_type, vec_dest, gsi, stmt);
4672 if (is_gimple_call (new_stmt))
4673 second_vector = gimple_call_lhs (new_stmt);
4675 second_vector = gimple_assign_lhs (new_stmt);
4679 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
4680 prev_stmt_info = vinfo_for_stmt (new_stmt);
4684 /* FIRST_VECTOR and SECOND_VECTOR are the results of source type
4685 to intermediate type promotion. Now we generate promotions
4686 for both of them to the destination type (i.e., four
4688 new_stmt = vect_gen_widened_results_half (code1, vectype_out,
4689 decl1, first_vector, NULL_TREE, op_type,
4690 double_vec_dest, gsi, stmt);
4692 STMT_VINFO_VEC_STMT (stmt_info) = new_stmt;
4694 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
4695 prev_stmt_info = vinfo_for_stmt (new_stmt);
4697 new_stmt = vect_gen_widened_results_half (code2, vectype_out,
4698 decl2, first_vector, NULL_TREE, op_type,
4699 double_vec_dest, gsi, stmt);
4700 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
4701 prev_stmt_info = vinfo_for_stmt (new_stmt);
4703 new_stmt = vect_gen_widened_results_half (code1, vectype_out,
4704 decl1, second_vector, NULL_TREE, op_type,
4705 double_vec_dest, gsi, stmt);
4706 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
4707 prev_stmt_info = vinfo_for_stmt (new_stmt);
4709 new_stmt = vect_gen_widened_results_half (code2, vectype_out,
4710 decl2, second_vector, NULL_TREE, op_type,
4711 double_vec_dest, gsi, stmt);
4712 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
4713 prev_stmt_info = vinfo_for_stmt (new_stmt);
4717 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
4722 /* Function vect_strided_store_supported.
4724 Returns TRUE if INTERLEAVE_HIGH and INTERLEAVE_LOW operations are supported,
4725 and FALSE otherwise. */
4728 vect_strided_store_supported (tree vectype)
4730 optab interleave_high_optab, interleave_low_optab;
4733 mode = (int) TYPE_MODE (vectype);
4735 /* Check that the operation is supported. */
4736 interleave_high_optab = optab_for_tree_code (VEC_INTERLEAVE_HIGH_EXPR,
4737 vectype, optab_default);
4738 interleave_low_optab = optab_for_tree_code (VEC_INTERLEAVE_LOW_EXPR,
4739 vectype, optab_default);
4740 if (!interleave_high_optab || !interleave_low_optab)
4742 if (vect_print_dump_info (REPORT_DETAILS))
4743 fprintf (vect_dump, "no optab for interleave.");
4747 if (optab_handler (interleave_high_optab, mode)->insn_code
4749 || optab_handler (interleave_low_optab, mode)->insn_code
4750 == CODE_FOR_nothing)
4752 if (vect_print_dump_info (REPORT_DETAILS))
4753 fprintf (vect_dump, "interleave op not supported by target.");
4761 /* Function vect_permute_store_chain.
4763 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
4764 a power of 2, generate interleave_high/low stmts to reorder the data
4765 correctly for the stores. Return the final references for stores in
4768 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4769 The input is 4 vectors each containing 8 elements. We assign a number to each
4770 element; the input sequence is:
4772 1st vec: 0 1 2 3 4 5 6 7
4773 2nd vec: 8 9 10 11 12 13 14 15
4774 3rd vec: 16 17 18 19 20 21 22 23
4775 4th vec: 24 25 26 27 28 29 30 31
4777 The output sequence should be:
4779 1st vec: 0 8 16 24 1 9 17 25
4780 2nd vec: 2 10 18 26 3 11 19 27
4781 3rd vec: 4 12 20 28 5 13 21 29
4782 4th vec: 6 14 22 30 7 15 23 31
4784 i.e., we interleave the contents of the four vectors in their order.
4786 We use interleave_high/low instructions to create such output. The input of
4787 each interleave_high/low operation is two vectors:
4790 the even elements of the result vector are obtained left-to-right from the
4791 high/low elements of the first vector. The odd elements of the result are
4792 obtained left-to-right from the high/low elements of the second vector.
4793 The output of interleave_high will be: 0 4 1 5
4794 and of interleave_low: 2 6 3 7
4797 The permutation is done in log LENGTH stages. In each stage interleave_high
4798 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
4799 where the first argument is taken from the first half of DR_CHAIN and the
4800 second argument from its second half.
4803 I1: interleave_high (1st vec, 3rd vec)
4804 I2: interleave_low (1st vec, 3rd vec)
4805 I3: interleave_high (2nd vec, 4th vec)
4806 I4: interleave_low (2nd vec, 4th vec)
4808 The output for the first stage is:
4810 I1: 0 16 1 17 2 18 3 19
4811 I2: 4 20 5 21 6 22 7 23
4812 I3: 8 24 9 25 10 26 11 27
4813 I4: 12 28 13 29 14 30 15 31
4815 The output of the second stage, i.e. the final result is:
4817 I1: 0 8 16 24 1 9 17 25
4818 I2: 2 10 18 26 3 11 19 27
4819 I3: 4 12 20 28 5 13 21 29
4820 I4: 6 14 22 30 7 15 23 31. */
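/* A minimal scalar model (illustrative only, not used by the vectorizer)
   of the permutation documented above, for LENGTH vectors (LENGTH a power
   of 2, at most 8) of NELTS elements each (NELTS even, at most 8).  Each
   of the log2 (LENGTH) stages interleaves vector J with vector
   J + LENGTH/2, writing the "high" half of the result to slot 2*J and the
   "low" half to slot 2*J + 1, using the abstract high/low semantics
   described above (the real code swaps the tree codes on little-endian
   targets to get the same effect).  Running this on the 0..31 example
   reproduces the final sequence.  */
static void
interleave_store_model (int chain[][8], int length, int nelts)
{
  int tmp[8][8], i, j, k;

  for (i = 1; i < length; i <<= 1)
    {
      for (j = 0; j < length / 2; j++)
        for (k = 0; k < nelts / 2; k++)
          {
            /* interleave_high of (chain[j], chain[j + length/2]).  */
            tmp[2 * j][2 * k] = chain[j][k];
            tmp[2 * j][2 * k + 1] = chain[j + length / 2][k];
            /* interleave_low of the same pair.  */
            tmp[2 * j + 1][2 * k] = chain[j][k + nelts / 2];
            tmp[2 * j + 1][2 * k + 1] = chain[j + length / 2][k + nelts / 2];
          }
      for (j = 0; j < length; j++)
        for (k = 0; k < nelts; k++)
          chain[j][k] = tmp[j][k];
    }
}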
4823 vect_permute_store_chain (VEC(tree,heap) *dr_chain,
4824 unsigned int length,
4826 gimple_stmt_iterator *gsi,
4827 VEC(tree,heap) **result_chain)
4829 tree perm_dest, vect1, vect2, high, low;
4831 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
4835 enum tree_code high_code, low_code;
4837 scalar_dest = gimple_assign_lhs (stmt);
4839 /* Check that the operation is supported. */
4840 if (!vect_strided_store_supported (vectype))
4843 *result_chain = VEC_copy (tree, heap, dr_chain);
4845 for (i = 0; i < exact_log2 (length); i++)
4847 for (j = 0; j < length/2; j++)
4849 vect1 = VEC_index (tree, dr_chain, j);
4850 vect2 = VEC_index (tree, dr_chain, j+length/2);
4852 /* Create interleaving stmt:
4853 in the case of big endian:
4854 high = interleave_high (vect1, vect2)
4855 and in the case of little endian:
4856 high = interleave_low (vect1, vect2). */
4857 perm_dest = create_tmp_var (vectype, "vect_inter_high");
4858 DECL_GIMPLE_REG_P (perm_dest) = 1;
4859 add_referenced_var (perm_dest);
4860 if (BYTES_BIG_ENDIAN)
4862 high_code = VEC_INTERLEAVE_HIGH_EXPR;
4863 low_code = VEC_INTERLEAVE_LOW_EXPR;
4867 low_code = VEC_INTERLEAVE_HIGH_EXPR;
4868 high_code = VEC_INTERLEAVE_LOW_EXPR;
4870 perm_stmt = gimple_build_assign_with_ops (high_code, perm_dest,
4872 high = make_ssa_name (perm_dest, perm_stmt);
4873 gimple_assign_set_lhs (perm_stmt, high);
4874 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4875 VEC_replace (tree, *result_chain, 2*j, high);
4877 /* Create interleaving stmt:
4878 in the case of big endian:
4879 low = interleave_low (vect1, vect2)
4880 and in the case of little endian:
4881 low = interleave_high (vect1, vect2). */
4882 perm_dest = create_tmp_var (vectype, "vect_inter_low");
4883 DECL_GIMPLE_REG_P (perm_dest) = 1;
4884 add_referenced_var (perm_dest);
4885 perm_stmt = gimple_build_assign_with_ops (low_code, perm_dest,
4887 low = make_ssa_name (perm_dest, perm_stmt);
4888 gimple_assign_set_lhs (perm_stmt, low);
4889 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4890 VEC_replace (tree, *result_chain, 2*j+1, low);
4892 dr_chain = VEC_copy (tree, heap, *result_chain);
4898 /* Function vectorizable_store.
4900 Check if STMT defines a non-scalar data-ref (array/pointer/structure) that
4902 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
4903 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
4904 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
4907 vectorizable_store (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt,
4913 tree vec_oprnd = NULL_TREE;
4914 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4915 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info), *first_dr = NULL;
4916 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4917 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4918 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4919 enum machine_mode vec_mode;
4921 enum dr_alignment_support alignment_support_scheme;
4924 enum vect_def_type dt;
4925 stmt_vec_info prev_stmt_info = NULL;
4926 tree dataref_ptr = NULL_TREE;
4927 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
4928 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
4930 gimple next_stmt, first_stmt = NULL;
4931 bool strided_store = false;
4932 unsigned int group_size, i;
4933 VEC(tree,heap) *dr_chain = NULL, *oprnds = NULL, *result_chain = NULL;
4935 VEC(tree,heap) *vec_oprnds = NULL;
4936 bool slp = (slp_node != NULL);
4937 stmt_vec_info first_stmt_vinfo;
4938 unsigned int vec_num;
4940 /* FORNOW: SLP with multiple types is not supported. The SLP analysis verifies
4941 this, so we can safely override NCOPIES with 1 here. */
4945 gcc_assert (ncopies >= 1);
4947 /* FORNOW. This restriction should be relaxed. */
4948 if (nested_in_vect_loop_p (loop, stmt) && ncopies > 1)
4950 if (vect_print_dump_info (REPORT_DETAILS))
4951 fprintf (vect_dump, "multiple types in nested loop.");
4955 if (!STMT_VINFO_RELEVANT_P (stmt_info))
4958 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
4961 /* Is vectorizable store? */
4963 if (!is_gimple_assign (stmt))
4966 scalar_dest = gimple_assign_lhs (stmt);
4967 if (TREE_CODE (scalar_dest) != ARRAY_REF
4968 && TREE_CODE (scalar_dest) != INDIRECT_REF
4969 && !STMT_VINFO_STRIDED_ACCESS (stmt_info))
4972 gcc_assert (gimple_assign_single_p (stmt));
4973 op = gimple_assign_rhs1 (stmt);
4974 if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt))
4976 if (vect_print_dump_info (REPORT_DETAILS))
4977 fprintf (vect_dump, "use not simple.");
4981 /* If accesses through a pointer to vectype do not alias the original
4982 memory reference, we have a problem. */
4983 if (get_alias_set (vectype) != get_alias_set (TREE_TYPE (scalar_dest))
4984 && !alias_set_subset_of (get_alias_set (vectype),
4985 get_alias_set (TREE_TYPE (scalar_dest))))
4987 if (vect_print_dump_info (REPORT_DETAILS))
4988 fprintf (vect_dump, "vector type does not alias scalar type");
4992 if (!useless_type_conversion_p (TREE_TYPE (op), TREE_TYPE (scalar_dest)))
4994 if (vect_print_dump_info (REPORT_DETAILS))
4995 fprintf (vect_dump, "operands of different types");
4999 vec_mode = TYPE_MODE (vectype);
5000 /* FORNOW. In some cases we can vectorize even if the data-type is not
5001 supported (e.g. - array initialization with 0). */
5002 if (optab_handler (mov_optab, (int)vec_mode)->insn_code == CODE_FOR_nothing)
5005 if (!STMT_VINFO_DATA_REF (stmt_info))
5008 if (STMT_VINFO_STRIDED_ACCESS (stmt_info))
5010 strided_store = true;
5011 first_stmt = DR_GROUP_FIRST_DR (stmt_info);
5012 if (!vect_strided_store_supported (vectype)
5013 && !PURE_SLP_STMT (stmt_info) && !slp)
5016 if (first_stmt == stmt)
5018 /* STMT is the leader of the group. Check the operands of all the
5019 stmts of the group. */
5020 next_stmt = DR_GROUP_NEXT_DR (stmt_info);
5023 gcc_assert (gimple_assign_single_p (next_stmt));
5024 op = gimple_assign_rhs1 (next_stmt);
5025 if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt))
5027 if (vect_print_dump_info (REPORT_DETAILS))
5028 fprintf (vect_dump, "use not simple.");
5031 next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt));
5036 if (!vec_stmt) /* transformation not required. */
5038 STMT_VINFO_TYPE (stmt_info) = store_vec_info_type;
5039 vect_model_store_cost (stmt_info, ncopies, dt, NULL);
5047 first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
5048 group_size = DR_GROUP_SIZE (vinfo_for_stmt (first_stmt));
5050 DR_GROUP_STORE_COUNT (vinfo_for_stmt (first_stmt))++;
5053 gcc_assert (!nested_in_vect_loop_p (loop, stmt));
5055 /* We vectorize all the stmts of the interleaving group when we
5056 reach the last stmt in the group. */
5057 if (DR_GROUP_STORE_COUNT (vinfo_for_stmt (first_stmt))
5058 < DR_GROUP_SIZE (vinfo_for_stmt (first_stmt))
5066 strided_store = false;
5068 /* VEC_NUM is the number of vect stmts to be created for this group. */
5069 if (slp && SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node) < group_size)
5070 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
5072 vec_num = group_size;
5078 group_size = vec_num = 1;
5079 first_stmt_vinfo = stmt_info;
5082 if (vect_print_dump_info (REPORT_DETAILS))
5083 fprintf (vect_dump, "transform store. ncopies = %d",ncopies);
5085 dr_chain = VEC_alloc (tree, heap, group_size);
5086 oprnds = VEC_alloc (tree, heap, group_size);
5088 alignment_support_scheme = vect_supportable_dr_alignment (first_dr);
5089 gcc_assert (alignment_support_scheme);
5090 gcc_assert (alignment_support_scheme == dr_aligned); /* FORNOW */
5092 /* In case the vectorization factor (VF) is bigger than the number
5093 of elements that we can fit in a vectype (nunits), we have to generate
5094 more than one vector stmt - i.e - we need to "unroll" the
5095 vector stmt by a factor VF/nunits. For more details see documentation in
5096 vect_get_vec_def_for_copy_stmt. */
5098 /* In case of interleaving (non-unit strided access):
5105 We create vectorized stores starting from the base address (the access of the
5106 first stmt in the chain (S2 in the above example), when the last store stmt
5107 of the chain (S4) is reached:
5110 VS2: &base + vec_size*1 = vx0
5111 VS3: &base + vec_size*2 = vx1
5112 VS4: &base + vec_size*3 = vx3
5114 Then permutation statements are generated:
5116 VS5: vx5 = VEC_INTERLEAVE_HIGH_EXPR < vx0, vx3 >
5117 VS6: vx6 = VEC_INTERLEAVE_LOW_EXPR < vx0, vx3 >
5120 And they are put in STMT_VINFO_VEC_STMT of the corresponding scalar stmts
5121 (the order of the data-refs in the output of vect_permute_store_chain
5122 corresponds to the order of scalar stmts in the interleaving chain - see
5123 the documentation of vect_permute_store_chain()).
5125 In case of both multiple types and interleaving, the above vector stores and
5126 permutation stmts are created for every copy. The result vector stmts are
5127 put in STMT_VINFO_VEC_STMT for the first copy and in the corresponding
5128 STMT_VINFO_RELATED_STMT for the next copies.
5131 prev_stmt_info = NULL;
5132 for (j = 0; j < ncopies; j++)
5141 /* Get vectorized arguments for SLP_NODE. */
5142 vect_get_slp_defs (slp_node, &vec_oprnds, NULL);
5144 vec_oprnd = VEC_index (tree, vec_oprnds, 0);
5148 /* For interleaved stores we collect vectorized defs for all the
5149 stores in the group in DR_CHAIN and OPRNDS. DR_CHAIN is then
5150 used as an input to vect_permute_store_chain(), and OPRNDS as
5151 an input to vect_get_vec_def_for_stmt_copy() for the next copy.
5153 If the store is not strided, GROUP_SIZE is 1, and DR_CHAIN and
5154 OPRNDS are of size 1. */
5155 next_stmt = first_stmt;
5156 for (i = 0; i < group_size; i++)
5158 /* Since gaps are not supported for interleaved stores,
5159 GROUP_SIZE is the exact number of stmts in the chain.
5160 Therefore, NEXT_STMT can't be NULL_TREE. In case
5161 there is no interleaving, GROUP_SIZE is 1, and only one
5162 iteration of the loop will be executed. */
5163 gcc_assert (next_stmt);
5164 gcc_assert (gimple_assign_single_p (next_stmt));
5165 op = gimple_assign_rhs1 (next_stmt);
5167 vec_oprnd = vect_get_vec_def_for_operand (op, next_stmt,
5169 VEC_quick_push(tree, dr_chain, vec_oprnd);
5170 VEC_quick_push(tree, oprnds, vec_oprnd);
5171 next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt));
5175 dataref_ptr = vect_create_data_ref_ptr (first_stmt, NULL, NULL_TREE,
5176 &dummy, &ptr_incr, false,
5178 gcc_assert (!inv_p);
5182 /* FORNOW SLP doesn't work for multiple types. */
5185 /* For interleaved stores we created vectorized defs for all the
5186 defs stored in OPRNDS in the previous iteration (previous copy).
5187 DR_CHAIN is then used as an input to vect_permute_store_chain(),
5188 and OPRNDS as an input to vect_get_vec_def_for_stmt_copy() for the
5190 If the store is not strided, GROUP_SIZE is 1, and DR_CHAIN and
5191 OPRNDS are of size 1. */
5192 for (i = 0; i < group_size; i++)
5194 op = VEC_index (tree, oprnds, i);
5195 vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt);
5196 vec_oprnd = vect_get_vec_def_for_stmt_copy (dt, op);
5197 VEC_replace(tree, dr_chain, i, vec_oprnd);
5198 VEC_replace(tree, oprnds, i, vec_oprnd);
5201 bump_vector_ptr (dataref_ptr, ptr_incr, gsi, stmt, NULL_TREE);
5206 result_chain = VEC_alloc (tree, heap, group_size);
5208 if (!vect_permute_store_chain (dr_chain, group_size, stmt, gsi,
5213 next_stmt = first_stmt;
5214 for (i = 0; i < vec_num; i++)
5217 /* Bump the vector pointer. */
5218 dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, gsi, stmt,
5222 vec_oprnd = VEC_index (tree, vec_oprnds, i);
5223 else if (strided_store)
5224 /* For strided stores vectorized defs are interleaved in
5225 vect_permute_store_chain(). */
5226 vec_oprnd = VEC_index (tree, result_chain, i);
5228 data_ref = build_fold_indirect_ref (dataref_ptr);
5229 /* Arguments are ready. Create the new vector stmt. */
5230 new_stmt = gimple_build_assign (data_ref, vec_oprnd);
5231 vect_finish_stmt_generation (stmt, new_stmt, gsi);
5232 mark_symbols_for_renaming (new_stmt);
5235 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
5237 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
5239 prev_stmt_info = vinfo_for_stmt (new_stmt);
5240 next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt));
5246 VEC_free (tree, heap, dr_chain);
5247 VEC_free (tree, heap, oprnds);
5249 VEC_free (tree, heap, result_chain);
5255 /* Function vect_setup_realignment
5257 This function is called when vectorizing an unaligned load using
5258 the dr_explicit_realign[_optimized] scheme.
5259 This function generates the following code at the loop prolog:
5262 x msq_init = *(floor(p)); # prolog load
5263 realignment_token = call target_builtin;
5265 x msq = phi (msq_init, ---)
5267 The stmts marked with x are generated only for the case of
5268 dr_explicit_realign_optimized.
5270 The code above sets up a new (vector) pointer, pointing to the first
5271 location accessed by STMT, and a "floor-aligned" load using that pointer.
5272 It also generates code to compute the "realignment-token" (if the relevant
5273 target hook was defined), and creates a phi-node at the loop-header bb
5274 whose arguments are the result of the prolog-load (created by this
5275 function) and the result of a load that takes place in the loop (to be
5276 created by the caller to this function).
5278 For the case of dr_explicit_realign_optimized:
5279 The caller to this function uses the phi-result (msq) to create the
5280 realignment code inside the loop, and sets up the missing phi argument,
5283 msq = phi (msq_init, lsq)
5284 lsq = *(floor(p')); # load in loop
5285 result = realign_load (msq, lsq, realignment_token);
5287 For the case of dr_explicit_realign:
5289 msq = *(floor(p)); # load in loop
5291 lsq = *(floor(p')); # load in loop
5292 result = realign_load (msq, lsq, realignment_token);
5295 STMT - (scalar) load stmt to be vectorized. This load accesses
5296 a memory location that may be unaligned.
5297 BSI - place where new code is to be inserted.
5298 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
5302 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
5303 target hook, if defined.
5304 Return value - the result of the loop-header phi node. */
5307 vect_setup_realignment (gimple stmt, gimple_stmt_iterator *gsi,
5308 tree *realignment_token,
5309 enum dr_alignment_support alignment_support_scheme,
5311 struct loop **at_loop)
5313 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5314 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5315 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5316 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5318 tree scalar_dest = gimple_assign_lhs (stmt);
5325 tree msq_init = NULL_TREE;
5328 tree msq = NULL_TREE;
5329 gimple_seq stmts = NULL;
5331 bool compute_in_loop = false;
5332 bool nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
5333 struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
5334 struct loop *loop_for_initial_load;
5336 gcc_assert (alignment_support_scheme == dr_explicit_realign
5337 || alignment_support_scheme == dr_explicit_realign_optimized);
5339 /* We need to generate three things:
5340 1. the misalignment computation
5341 2. the extra vector load (for the optimized realignment scheme).
5342 3. the phi node for the two vectors from which the realignment is
5343 done (for the optimized realignment scheme).
5346 /* 1. Determine where to generate the misalignment computation.
5348 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
5349 calculation will be generated by this function, outside the loop (in the
5350 preheader). Otherwise, INIT_ADDR has already been computed for us by the
5351 caller, inside the loop.
5353 Background: If the misalignment remains fixed throughout the iterations of
5354 the loop, then both realignment schemes are applicable, and also the
5355 misalignment computation can be done outside LOOP. This is because we are
5356 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
5357 are a multiple of VS (the Vector Size), and therefore the misalignment in
5358 different vectorized LOOP iterations is always the same.
5359 The problem arises only if the memory access is in an inner-loop nested
5360 inside LOOP, which is now being vectorized using outer-loop vectorization.
5361 This is the only case when the misalignment of the memory access may not
5362 remain fixed throughout the iterations of the inner-loop (as explained in
5363 detail in vect_supportable_dr_alignment). In this case, not only is the
5364 optimized realignment scheme not applicable, but also the misalignment
5365 computation (and generation of the realignment token that is passed to
5366 REALIGN_LOAD) have to be done inside the loop.
5368 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
5369 or not, which in turn determines if the misalignment is computed inside
5370 the inner-loop, or outside LOOP. */
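  /* Illustrative example (hypothetical access): if VS is 16 bytes and the
     vectorized loop advances the access by 16 bytes per iteration, then
     (addr + 16 * i) % 16 is the same for every i, so the misalignment can
     be computed once, outside the loop.  No such guarantee exists for an
     inner-loop access whose start address changes with each outer-loop
     iteration.  */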
5372 if (init_addr != NULL_TREE)
5374 compute_in_loop = true;
5375 gcc_assert (alignment_support_scheme == dr_explicit_realign);
5379 /* 2. Determine where to generate the extra vector load.
5381 For the optimized realignment scheme, instead of generating two vector
5382 loads in each iteration, we generate a single extra vector load in the
5383 preheader of the loop, and in each iteration reuse the result of the
5384 vector load from the previous iteration. In case the memory access is in
5385 an inner-loop nested inside LOOP, which is now being vectorized using
5386 outer-loop vectorization, we need to determine whether this initial vector
5387 load should be generated at the preheader of the inner-loop, or can be
5388 generated at the preheader of LOOP. If the memory access has no evolution
5389 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
5390 to be generated inside LOOP (in the preheader of the inner-loop). */
5392 if (nested_in_vect_loop)
5394 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
5395 bool invariant_in_outerloop =
5396 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
5397 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
5400 loop_for_initial_load = loop;
5402 *at_loop = loop_for_initial_load;
5404 /* 3. For the case of the optimized realignment, create the first vector
5405 load at the loop preheader. */
5407 if (alignment_support_scheme == dr_explicit_realign_optimized)
5409 /* Create msq_init = *(floor(p1)) in the loop preheader */
5411 gcc_assert (!compute_in_loop);
5412 pe = loop_preheader_edge (loop_for_initial_load);
5413 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5414 ptr = vect_create_data_ref_ptr (stmt, loop_for_initial_load, NULL_TREE,
5415 &init_addr, &inc, true, &inv_p);
5416 data_ref = build1 (ALIGN_INDIRECT_REF, vectype, ptr);
5417 new_stmt = gimple_build_assign (vec_dest, data_ref);
5418 new_temp = make_ssa_name (vec_dest, new_stmt);
5419 gimple_assign_set_lhs (new_stmt, new_temp);
5420 mark_symbols_for_renaming (new_stmt);
5421 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5422 gcc_assert (!new_bb);
5423 msq_init = gimple_assign_lhs (new_stmt);
5426 /* 4. Create realignment token using a target builtin, if available.
5427 It is done either inside the containing loop, or before LOOP (as
5428 determined above). */
5430 if (targetm.vectorize.builtin_mask_for_load)
5434 /* Compute INIT_ADDR - the initial address accessed by this memref. */
5435 if (compute_in_loop)
5436 gcc_assert (init_addr); /* already computed by the caller. */
5439 /* Generate the INIT_ADDR computation outside LOOP. */
5440 init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
5442 pe = loop_preheader_edge (loop);
5443 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
5444 gcc_assert (!new_bb);
5447 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5448 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
5450 vect_create_destination_var (scalar_dest,
5451 gimple_call_return_type (new_stmt));
5452 new_temp = make_ssa_name (vec_dest, new_stmt);
5453 gimple_call_set_lhs (new_stmt, new_temp);
5455 if (compute_in_loop)
5456 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5459 /* Generate the misalignment computation outside LOOP. */
5460 pe = loop_preheader_edge (loop);
5461 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5462 gcc_assert (!new_bb);
5465 *realignment_token = gimple_call_lhs (new_stmt);
5467 /* The result of the CALL_EXPR to this builtin is determined from
5468 the value of the parameter and no global variables are touched,
5469 which makes the builtin a "const" function. Requiring the
5470 builtin to have the "const" attribute makes it unnecessary
5471 to call mark_call_clobbered. */
5472 gcc_assert (TREE_READONLY (builtin_decl));
5475 if (alignment_support_scheme == dr_explicit_realign)
5478 gcc_assert (!compute_in_loop);
5479 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5482 /* 5. Create msq = phi <msq_init, lsq> in loop */
5484 pe = loop_preheader_edge (containing_loop);
5485 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5486 msq = make_ssa_name (vec_dest, NULL);
5487 phi_stmt = create_phi_node (msq, containing_loop->header);
5488 SSA_NAME_DEF_STMT (msq) = phi_stmt;
5489 add_phi_arg (phi_stmt, msq_init, pe);
5495 /* Function vect_strided_load_supported.
5497 Returns TRUE if EXTRACT_EVEN and EXTRACT_ODD operations are supported,
5498 and FALSE otherwise. */
5501 vect_strided_load_supported (tree vectype)
5503 optab perm_even_optab, perm_odd_optab;
5506 mode = (int) TYPE_MODE (vectype);
5508 perm_even_optab = optab_for_tree_code (VEC_EXTRACT_EVEN_EXPR, vectype,
5510 if (!perm_even_optab)
5512 if (vect_print_dump_info (REPORT_DETAILS))
5513 fprintf (vect_dump, "no optab for perm_even.");
5517 if (optab_handler (perm_even_optab, mode)->insn_code == CODE_FOR_nothing)
5519 if (vect_print_dump_info (REPORT_DETAILS))
5520 fprintf (vect_dump, "perm_even op not supported by target.");
5524 perm_odd_optab = optab_for_tree_code (VEC_EXTRACT_ODD_EXPR, vectype,
5526 if (!perm_odd_optab)
5528 if (vect_print_dump_info (REPORT_DETAILS))
5529 fprintf (vect_dump, "no optab for perm_odd.");
5533 if (optab_handler (perm_odd_optab, mode)->insn_code == CODE_FOR_nothing)
5535 if (vect_print_dump_info (REPORT_DETAILS))
5536 fprintf (vect_dump, "perm_odd op not supported by target.");
5543 /* Function vect_permute_load_chain.
5545 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5546 a power of 2, generate extract_even/odd stmts to reorder the input data
5547 correctly. Return the final references for loads in RESULT_CHAIN.
5549 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5550 The input is 4 vectors each containing 8 elements. We assign a number to each
5551 element; the input sequence is:
5553 1st vec: 0 1 2 3 4 5 6 7
5554 2nd vec: 8 9 10 11 12 13 14 15
5555 3rd vec: 16 17 18 19 20 21 22 23
5556 4th vec: 24 25 26 27 28 29 30 31
5558 The output sequence should be:
5560 1st vec: 0 4 8 12 16 20 24 28
5561 2nd vec: 1 5 9 13 17 21 25 29
5562 3rd vec: 2 6 10 14 18 22 26 30
5563 4th vec: 3 7 11 15 19 23 27 31
5565 i.e., the first output vector should contain the first elements of each
5566 interleaving group, etc.
5568 We use extract_even/odd instructions to create such output. The input of each
5569 extract_even/odd operation is two vectors
5573 and the output is the vector of extracted even/odd elements. The output of
5574 extract_even will be: 0 2 4 6
5575 and of extract_odd: 1 3 5 7
5578 The permutation is done in log LENGTH stages. In each stage extract_even and
5579 extract_odd stmts are created for each pair of vectors in DR_CHAIN in their
5580 order. In our example,
5582 E1: extract_even (1st vec, 2nd vec)
5583 E2: extract_odd (1st vec, 2nd vec)
5584 E3: extract_even (3rd vec, 4th vec)
5585 E4: extract_odd (3rd vec, 4th vec)
5587 The output for the first stage will be:
5589 E1: 0 2 4 6 8 10 12 14
5590 E2: 1 3 5 7 9 11 13 15
5591 E3: 16 18 20 22 24 26 28 30
5592 E4: 17 19 21 23 25 27 29 31
5594 In order to proceed and create the correct sequence for the next stage (or
5595 for the correct output, if the second stage is the last one, as in our
5596 example), we first put the output of extract_even operation and then the
5597 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5598 The input for the second stage is:
5600 1st vec (E1): 0 2 4 6 8 10 12 14
5601 2nd vec (E3): 16 18 20 22 24 26 28 30
5602 3rd vec (E2): 1 3 5 7 9 11 13 15
5603 4th vec (E4): 17 19 21 23 25 27 29 31
5605 The output of the second stage:
5607 E1: 0 4 8 12 16 20 24 28
5608 E2: 2 6 10 14 18 22 26 30
5609 E3: 1 5 9 13 17 21 25 29
5610 E4: 3 7 11 15 19 23 27 31
5612 And RESULT_CHAIN after reordering:
5614 1st vec (E1): 0 4 8 12 16 20 24 28
5615 2nd vec (E3): 1 5 9 13 17 21 25 29
5616 3rd vec (E2): 2 6 10 14 18 22 26 30
5617 4th vec (E4): 3 7 11 15 19 23 27 31. */
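/* A minimal scalar model (illustrative only) of the extract_even/odd
   permutation documented above, under the same size assumptions as the
   interleave model earlier: each of the log2 (LENGTH) stages takes the
   pair (J, J+1), writing its even elements to slot J/2 and its odd
   elements to slot J/2 + LENGTH/2.  Running this on the 0..31 example
   reproduces RESULT_CHAIN above.  */
static void
extract_even_odd_model (int chain[][8], int length, int nelts)
{
  int tmp[8][8], i, j, k;

  for (i = 1; i < length; i <<= 1)
    {
      for (j = 0; j < length; j += 2)
        for (k = 0; k < nelts / 2; k++)
          {
            /* extract_even of (chain[j], chain[j + 1]).  */
            tmp[j / 2][k] = chain[j][2 * k];
            tmp[j / 2][k + nelts / 2] = chain[j + 1][2 * k];
            /* extract_odd of the same pair.  */
            tmp[j / 2 + length / 2][k] = chain[j][2 * k + 1];
            tmp[j / 2 + length / 2][k + nelts / 2] = chain[j + 1][2 * k + 1];
          }
      for (j = 0; j < length; j++)
        for (k = 0; k < nelts; k++)
          chain[j][k] = tmp[j][k];
    }
}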
5620 vect_permute_load_chain (VEC(tree,heap) *dr_chain,
5621 unsigned int length,
5623 gimple_stmt_iterator *gsi,
5624 VEC(tree,heap) **result_chain)
5626 tree perm_dest, data_ref, first_vect, second_vect;
5628 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5632 /* Check that the operation is supported. */
5633 if (!vect_strided_load_supported (vectype))
5636 *result_chain = VEC_copy (tree, heap, dr_chain);
5637 for (i = 0; i < exact_log2 (length); i++)
5639 for (j = 0; j < length; j +=2)
5641 first_vect = VEC_index (tree, dr_chain, j);
5642 second_vect = VEC_index (tree, dr_chain, j+1);
5644 /* data_ref = permute_even (first_data_ref, second_data_ref); */
5645 perm_dest = create_tmp_var (vectype, "vect_perm_even");
5646 DECL_GIMPLE_REG_P (perm_dest) = 1;
5647 add_referenced_var (perm_dest);
5649 perm_stmt = gimple_build_assign_with_ops (VEC_EXTRACT_EVEN_EXPR,
5650 perm_dest, first_vect,
5653 data_ref = make_ssa_name (perm_dest, perm_stmt);
5654 gimple_assign_set_lhs (perm_stmt, data_ref);
5655 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5656 mark_symbols_for_renaming (perm_stmt);
5658 VEC_replace (tree, *result_chain, j/2, data_ref);
5660 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
5661 perm_dest = create_tmp_var (vectype, "vect_perm_odd");
5662 DECL_GIMPLE_REG_P (perm_dest) = 1;
5663 add_referenced_var (perm_dest);
5665 perm_stmt = gimple_build_assign_with_ops (VEC_EXTRACT_ODD_EXPR,
5666 perm_dest, first_vect,
5668 data_ref = make_ssa_name (perm_dest, perm_stmt);
5669 gimple_assign_set_lhs (perm_stmt, data_ref);
5670 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5671 mark_symbols_for_renaming (perm_stmt);
5673 VEC_replace (tree, *result_chain, j/2+length/2, data_ref);
5675 dr_chain = VEC_copy (tree, heap, *result_chain);
5681 /* Function vect_transform_strided_load.
5683 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
5684 to perform their permutation and ascribe the resulting vectorized statements to
5685 the scalar statements.
5689 vect_transform_strided_load (gimple stmt, VEC(tree,heap) *dr_chain, int size,
5690 gimple_stmt_iterator *gsi)
5692 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5693 gimple first_stmt = DR_GROUP_FIRST_DR (stmt_info);
5694 gimple next_stmt, new_stmt;
5695 VEC(tree,heap) *result_chain = NULL;
5696 unsigned int i, gap_count;
5699 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
5700 RESULT_CHAIN is the output of vect_permute_load_chain, it contains the
5701 permuted vectors that are ready for vector computation. */
5702 result_chain = VEC_alloc (tree, heap, size);
5704 if (!vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain))
5707 /* Put a permuted data-ref in the VECTORIZED_STMT field.
5708 Since we scan the chain starting from its first node, their order
5709 corresponds to the order of data-refs in RESULT_CHAIN. */
5710 next_stmt = first_stmt;
5712 for (i = 0; VEC_iterate (tree, result_chain, i, tmp_data_ref); i++)
5717 /* Skip the gaps. Loads created for the gaps will be removed by the dead
5718 code elimination pass later. No need to check for the first stmt in
5719 the group, since it always exists.
5720 DR_GROUP_GAP is the number of steps in elements from the previous
5721 access (if there is no gap DR_GROUP_GAP is 1). We skip loads that
5722 correspond to the gaps. */
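/* Hypothetical example (not from the original sources): for a group of
   size 4 in which the loop reads only a[4*i] and a[4*i+2], the second
   access has DR_GROUP_GAP == 2, so one element of RESULT_CHAIN between
   the two corresponds to the unused a[4*i+1] and is not ascribed to
   any scalar stmt; dead code elimination removes that load later. */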
5724 if (next_stmt != first_stmt
5725 && gap_count < DR_GROUP_GAP (vinfo_for_stmt (next_stmt)))
5733 new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
5734 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
5735 copies, and we put the new vector statement in the first available RELATED_STMT field. */
5737 if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
5738 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
5742 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
5744 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
5747 prev_stmt = rel_stmt;
5748 rel_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
5750 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) = new_stmt;
5752 next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt));
5754 /* If NEXT_STMT accesses the same DR as the previous statement,
5755 put the same TMP_DATA_REF as its vectorized statement; otherwise
5756 get the next data-ref from RESULT_CHAIN. */
5757 if (!next_stmt || !DR_GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5762 VEC_free (tree, heap, result_chain);
5767 /* vectorizable_load.
5769 Check if STMT reads a non-scalar data-ref (array/pointer/structure) that
5771 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
5772 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
5773 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
5776 vectorizable_load (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt,
5780 tree vec_dest = NULL;
5781 tree data_ref = NULL;
5782 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5783 stmt_vec_info prev_stmt_info;
5784 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5785 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5786 struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
5787 bool nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
5788 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info), *first_dr;
5789 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5792 gimple new_stmt = NULL;
5794 enum dr_alignment_support alignment_support_scheme;
5795 tree dataref_ptr = NULL_TREE;
5797 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
5798 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
5799 int i, j, group_size;
5800 tree msq = NULL_TREE, lsq;
5801 tree offset = NULL_TREE;
5802 tree realignment_token = NULL_TREE;
5804 VEC(tree,heap) *dr_chain = NULL;
5805 bool strided_load = false;
5809 bool compute_in_loop = false;
5810 struct loop *at_loop;
5812 bool slp = (slp_node != NULL);
5813 enum tree_code code;
5815 /* FORNOW: SLP with multiple types is not supported. The SLP analysis verifies
5816 this, so we can safely override NCOPIES with 1 here. */
5820 gcc_assert (ncopies >= 1);
5822 /* FORNOW. This restriction should be relaxed. */
5823 if (nested_in_vect_loop && ncopies > 1)
5825 if (vect_print_dump_info (REPORT_DETAILS))
5826 fprintf (vect_dump, "multiple types in nested loop.");
5830 if (!STMT_VINFO_RELEVANT_P (stmt_info))
5833 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
5836 /* Is vectorizable load? */
5837 if (!is_gimple_assign (stmt))
5840 scalar_dest = gimple_assign_lhs (stmt);
5841 if (TREE_CODE (scalar_dest) != SSA_NAME)
5844 code = gimple_assign_rhs_code (stmt);
5845 if (code != ARRAY_REF
5846 && code != INDIRECT_REF
5847 && !STMT_VINFO_STRIDED_ACCESS (stmt_info))
5850 if (!STMT_VINFO_DATA_REF (stmt_info))
5853 scalar_type = TREE_TYPE (DR_REF (dr));
5854 mode = (int) TYPE_MODE (vectype);
5856 /* FORNOW. In some cases we can vectorize even if the data-type is not
5857 supported (e.g. - data copies). */
5858 if (optab_handler (mov_optab, mode)->insn_code == CODE_FOR_nothing)
5860 if (vect_print_dump_info (REPORT_DETAILS))
5861 fprintf (vect_dump, "Aligned load, but unsupported type.");
5865 /* If accesses through a pointer to vectype do not alias the original
5866 memory reference we have a problem. */
5867 if (get_alias_set (vectype) != get_alias_set (scalar_type)
5868 && !alias_set_subset_of (get_alias_set (vectype),
5869 get_alias_set (scalar_type)))
5871 if (vect_print_dump_info (REPORT_DETAILS))
5872 fprintf (vect_dump, "vector type does not alias scalar type");
5876 /* Check if the load is a part of an interleaving chain. */
5877 if (STMT_VINFO_STRIDED_ACCESS (stmt_info))
5879 strided_load = true;
5881 gcc_assert (! nested_in_vect_loop);
5883 /* Check if interleaving is supported. */
5884 if (!vect_strided_load_supported (vectype)
5885 && !PURE_SLP_STMT (stmt_info) && !slp)
5889 if (!vec_stmt) /* transformation not required. */
5891 STMT_VINFO_TYPE (stmt_info) = load_vec_info_type;
5892 vect_model_load_cost (stmt_info, ncopies, NULL);
5896 if (vect_print_dump_info (REPORT_DETAILS))
5897 fprintf (vect_dump, "transform load.");
5903 first_stmt = DR_GROUP_FIRST_DR (stmt_info);
5904 /* Check if the chain of loads is already vectorized. */
5905 if (STMT_VINFO_VEC_STMT (vinfo_for_stmt (first_stmt)))
5907 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
5910 first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
5911 group_size = DR_GROUP_SIZE (vinfo_for_stmt (first_stmt));
5912 dr_chain = VEC_alloc (tree, heap, group_size);
5914 /* VEC_NUM is the number of vect stmts to be created for this group. */
5917 strided_load = false;
5918 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
5921 vec_num = group_size;
5927 group_size = vec_num = 1;
5930 alignment_support_scheme = vect_supportable_dr_alignment (first_dr);
5931 gcc_assert (alignment_support_scheme);
5933 /* In case the vectorization factor (VF) is bigger than the number
5934 of elements that we can fit in a vectype (nunits), we have to generate
5935 more than one vector stmt - i.e., we need to "unroll" the
5936 vector stmt by a factor VF/nunits. In doing so, we record a pointer
5937 from one copy of the vector stmt to the next, in the field
5938 STMT_VINFO_RELATED_STMT. This is necessary in order to allow following
5939 stages to find the correct vector defs to be used when vectorizing
5940 stmts that use the defs of the current stmt. The example below illustrates
5941 the vectorization process when VF=16 and nunits=4 (i.e., we need to create
5942 4 vectorized stmts):
5944 before vectorization:
5945 RELATED_STMT VEC_STMT
5949 step 1: vectorize stmt S1:
5950 We first create the vector stmt VS1_0, and, as usual, record a
5951 pointer to it in the STMT_VINFO_VEC_STMT of the scalar stmt S1.
5952 Next, we create the vector stmt VS1_1, and record a pointer to
5953 it in the STMT_VINFO_RELATED_STMT of the vector stmt VS1_0.
5954 Similarly, for VS1_2 and VS1_3. This is the resulting chain of
5956 RELATED_STMT VEC_STMT
5957 VS1_0: vx0 = memref0 VS1_1 -
5958 VS1_1: vx1 = memref1 VS1_2 -
5959 VS1_2: vx2 = memref2 VS1_3 -
5960 VS1_3: vx3 = memref3 - -
5961 S1: x = load - VS1_0
5964 See the documentation of vect_get_vec_def_for_stmt_copy for how the
5965 information we recorded in RELATED_STMT field is used to vectorize
5968 /* In case of interleaving (non-unit strided access):
5975 Vectorized loads are created in the order of memory accesses
5976 starting from the access of the first stmt of the chain:
5979 VS2: vx1 = &base + vec_size*1
5980 VS3: vx3 = &base + vec_size*2
5981 VS4: vx4 = &base + vec_size*3
5983 Then permutation statements are generated:
5985 VS5: vx5 = VEC_EXTRACT_EVEN_EXPR < vx0, vx1 >
5986 VS6: vx6 = VEC_EXTRACT_ODD_EXPR < vx0, vx1 >
5989 And they are put in STMT_VINFO_VEC_STMT of the corresponding scalar stmts
5990 (the order of the data-refs in the output of vect_permute_load_chain
5991 corresponds to the order of scalar stmts in the interleaving chain - see
5992 the documentation of vect_permute_load_chain()).
5993 The generation of permutation stmts and recording them in
5994 STMT_VINFO_VEC_STMT is done in vect_transform_strided_load().
5996 In case of both multiple types and interleaving, the vector loads and
5997 permutation stmts above are created for every copy. The result vector stmts
5998 are put in STMT_VINFO_VEC_STMT for the first copy and in the corresponding
5999 STMT_VINFO_RELATED_STMT for the next copies. */
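/* Illustrative scalar source for the interleaving case above (a sketch,
   not from the original sources):

     for (i = 0; i < n; i++)
       {
         x = a[2 * i];
         y = a[2 * i + 1];
         ...
       }

   The two scalar loads form one interleaving chain of size 2; after the
   vector loads, a single extract_even/extract_odd pair recovers the
   vector of even elements (for x) and the vector of odd elements
   (for y). */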
6001 /* If the data reference is aligned (dr_aligned) or potentially unaligned
6002 on a target that supports unaligned accesses (dr_unaligned_supported)
6003 we generate the following code:
6007 p = p + indx * vectype_size;
6012 Otherwise, the data reference is potentially unaligned on a target that
6013 does not support unaligned accesses (dr_explicit_realign_optimized) -
6014 then generate the following code, in which the data in each iteration is
6015 obtained by two vector loads, one from the previous iteration, and one
6016 from the current iteration:
6018 msq_init = *(floor(p1))
6019 p2 = initial_addr + VS - 1;
6020 realignment_token = call target_builtin;
6023 p2 = p2 + indx * vectype_size
6025 vec_dest = realign_load (msq, lsq, realignment_token)
6030 /* If the misalignment remains the same throughout the execution of the
6031 loop, we can create the init_addr and permutation mask at the loop
6032 preheader. Otherwise, they need to be created inside the loop.
6033 This can only occur when vectorizing memory accesses in the inner-loop
6034 nested within an outer-loop that is being vectorized. */
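/* Hypothetical example (not from the original sources): if the outer
   loop over i is vectorized and the inner loop reads a[i][j] where a
   row is 12 bytes while the vector size is 16 bytes, the misalignment
   of the access changes from one outer iteration to the next
   (12 % 16 != 0), so the realignment data must be computed inside the
   loop (COMPUTE_IN_LOOP is set below). */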
6036 if (nested_in_vect_loop_p (loop, stmt)
6037 && (TREE_INT_CST_LOW (DR_STEP (dr))
6038 % GET_MODE_SIZE (TYPE_MODE (vectype)) != 0))
6040 gcc_assert (alignment_support_scheme != dr_explicit_realign_optimized);
6041 compute_in_loop = true;
6044 if ((alignment_support_scheme == dr_explicit_realign_optimized
6045 || alignment_support_scheme == dr_explicit_realign)
6046 && !compute_in_loop)
6048 msq = vect_setup_realignment (first_stmt, gsi, &realignment_token,
6049 alignment_support_scheme, NULL_TREE,
6051 if (alignment_support_scheme == dr_explicit_realign_optimized)
6053 phi = SSA_NAME_DEF_STMT (msq);
6054 offset = size_int (TYPE_VECTOR_SUBPARTS (vectype) - 1);
6060 prev_stmt_info = NULL;
6061 for (j = 0; j < ncopies; j++)
6063 /* 1. Create the vector pointer update chain. */
6065 dataref_ptr = vect_create_data_ref_ptr (first_stmt,
6067 &dummy, &ptr_incr, false,
6071 bump_vector_ptr (dataref_ptr, ptr_incr, gsi, stmt, NULL_TREE);
6073 for (i = 0; i < vec_num; i++)
6076 dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, gsi, stmt,
6079 /* 2. Create the vector-load in the loop. */
6080 switch (alignment_support_scheme)
6083 gcc_assert (aligned_access_p (first_dr));
6084 data_ref = build_fold_indirect_ref (dataref_ptr);
6086 case dr_unaligned_supported:
6088 int mis = DR_MISALIGNMENT (first_dr);
6089 tree tmis = (mis == -1 ? size_zero_node : size_int (mis));
6091 tmis = size_binop (MULT_EXPR, tmis, size_int (BITS_PER_UNIT));
6093 build2 (MISALIGNED_INDIRECT_REF, vectype, dataref_ptr, tmis);
6096 case dr_explicit_realign:
6099 tree vs_minus_1 = size_int (TYPE_VECTOR_SUBPARTS (vectype) - 1);
6101 if (compute_in_loop)
6102 msq = vect_setup_realignment (first_stmt, gsi,
6104 dr_explicit_realign,
6107 data_ref = build1 (ALIGN_INDIRECT_REF, vectype, dataref_ptr);
6108 vec_dest = vect_create_destination_var (scalar_dest, vectype);
6109 new_stmt = gimple_build_assign (vec_dest, data_ref);
6110 new_temp = make_ssa_name (vec_dest, new_stmt);
6111 gimple_assign_set_lhs (new_stmt, new_temp);
6112 vect_finish_stmt_generation (stmt, new_stmt, gsi);
6113 copy_virtual_operands (new_stmt, stmt);
6114 mark_symbols_for_renaming (new_stmt);
6117 bump = size_binop (MULT_EXPR, vs_minus_1,
6118 TYPE_SIZE_UNIT (scalar_type));
6119 ptr = bump_vector_ptr (dataref_ptr, NULL, gsi, stmt, bump);
6120 data_ref = build1 (ALIGN_INDIRECT_REF, vectype, ptr);
6123 case dr_explicit_realign_optimized:
6124 data_ref = build1 (ALIGN_INDIRECT_REF, vectype, dataref_ptr);
6129 vec_dest = vect_create_destination_var (scalar_dest, vectype);
6130 new_stmt = gimple_build_assign (vec_dest, data_ref);
6131 new_temp = make_ssa_name (vec_dest, new_stmt);
6132 gimple_assign_set_lhs (new_stmt, new_temp);
6133 vect_finish_stmt_generation (stmt, new_stmt, gsi);
6134 mark_symbols_for_renaming (new_stmt);
6136 /* 3. Handle explicit realignment if necessary/supported. Create in
6137 loop: vec_dest = realign_load (msq, lsq, realignment_token) */
6138 if (alignment_support_scheme == dr_explicit_realign_optimized
6139 || alignment_support_scheme == dr_explicit_realign)
6143 lsq = gimple_assign_lhs (new_stmt);
6144 if (!realignment_token)
6145 realignment_token = dataref_ptr;
6146 vec_dest = vect_create_destination_var (scalar_dest, vectype);
6147 tmp = build3 (REALIGN_LOAD_EXPR, vectype, msq, lsq,
6149 new_stmt = gimple_build_assign (vec_dest, tmp);
6150 new_temp = make_ssa_name (vec_dest, new_stmt);
6151 gimple_assign_set_lhs (new_stmt, new_temp);
6152 vect_finish_stmt_generation (stmt, new_stmt, gsi);
6154 if (alignment_support_scheme == dr_explicit_realign_optimized)
6157 if (i == vec_num - 1 && j == ncopies - 1)
6158 add_phi_arg (phi, lsq, loop_latch_edge (containing_loop));
6163 /* 4. Handle invariant-load. */
6166 gcc_assert (!strided_load);
6167 gcc_assert (nested_in_vect_loop_p (loop, stmt));
6172 tree vec_inv, bitpos, bitsize = TYPE_SIZE (scalar_type);
6174 /* CHECKME: bitpos depends on endianness? */
6175 bitpos = bitsize_zero_node;
6176 vec_inv = build3 (BIT_FIELD_REF, scalar_type, new_temp,
6179 vect_create_destination_var (scalar_dest, NULL_TREE);
6180 new_stmt = gimple_build_assign (vec_dest, vec_inv);
6181 new_temp = make_ssa_name (vec_dest, new_stmt);
6182 gimple_assign_set_lhs (new_stmt, new_temp);
6183 vect_finish_stmt_generation (stmt, new_stmt, gsi);
6185 for (k = nunits - 1; k >= 0; --k)
6186 t = tree_cons (NULL_TREE, new_temp, t);
6187 /* FIXME: use build_constructor directly. */
6188 vec_inv = build_constructor_from_list (vectype, t);
6189 new_temp = vect_init_vector (stmt, vec_inv, vectype, gsi);
6190 new_stmt = SSA_NAME_DEF_STMT (new_temp);
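/* Illustrative note (not from the original sources): for nunits == 4
   the constructor built above is the splat

     vec_inv = { inv, inv, inv, inv };

   i.e. the invariant scalar replicated into every lane of the vector. */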
6193 gcc_unreachable (); /* FORNOW. */
6196 /* Collect vector loads and later create their permutation in
6197 vect_transform_strided_load (). */
6199 VEC_quick_push (tree, dr_chain, new_temp);
6201 /* Store vector loads in the corresponding SLP_NODE. */
6203 VEC_quick_push (gimple, SLP_TREE_VEC_STMTS (slp_node), new_stmt);
6206 /* FORNOW: SLP with multiple types is unsupported. */
6212 if (!vect_transform_strided_load (stmt, dr_chain, group_size, gsi))
6214 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
6215 VEC_free (tree, heap, dr_chain);
6216 dr_chain = VEC_alloc (tree, heap, group_size);
6221 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
6223 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
6224 prev_stmt_info = vinfo_for_stmt (new_stmt);
6229 VEC_free (tree, heap, dr_chain);
6235 /* Function vectorizable_live_operation.
6237 STMT computes a value that is used outside the loop. Check if
6238 it can be supported. */
6241 vectorizable_live_operation (gimple stmt,
6242 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
6243 gimple *vec_stmt ATTRIBUTE_UNUSED)
6245 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6246 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6247 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6253 enum vect_def_type dt;
6254 enum tree_code code;
6255 enum gimple_rhs_class rhs_class;
6257 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
6259 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
6262 if (!is_gimple_assign (stmt))
6265 if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
6268 /* FORNOW. CHECKME. */
6269 if (nested_in_vect_loop_p (loop, stmt))
6272 code = gimple_assign_rhs_code (stmt);
6273 op_type = TREE_CODE_LENGTH (code);
6274 rhs_class = get_gimple_rhs_class (code);
6275 gcc_assert (rhs_class != GIMPLE_UNARY_RHS || op_type == unary_op);
6276 gcc_assert (rhs_class != GIMPLE_BINARY_RHS || op_type == binary_op);
6278 /* FORNOW: support only if all uses are invariant. This means
6279 that the scalar operations can remain in place, unvectorized.
6280 The original last scalar value that they compute will be used. */
6282 for (i = 0; i < op_type; i++)
6284 if (rhs_class == GIMPLE_SINGLE_RHS)
6285 op = TREE_OPERAND (gimple_op (stmt, 1), i);
6287 op = gimple_op (stmt, i + 1);
6288 if (op && !vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt))
6290 if (vect_print_dump_info (REPORT_DETAILS))
6291 fprintf (vect_dump, "use not simple.");
6295 if (dt != vect_invariant_def && dt != vect_constant_def)
6299 /* No transformation is required for the cases we currently support. */
6304 /* Function vect_is_simple_cond.
6307 LOOP - the loop that is being vectorized.
6308 COND - Condition that is checked for simple use.
6310 Returns whether a COND can be vectorized. Checks whether the
6311 condition operands are supportable using vect_is_simple_use. */
6314 vect_is_simple_cond (tree cond, loop_vec_info loop_vinfo)
6318 enum vect_def_type dt;
6320 if (!COMPARISON_CLASS_P (cond))
6323 lhs = TREE_OPERAND (cond, 0);
6324 rhs = TREE_OPERAND (cond, 1);
6326 if (TREE_CODE (lhs) == SSA_NAME)
6328 gimple lhs_def_stmt = SSA_NAME_DEF_STMT (lhs);
6329 if (!vect_is_simple_use (lhs, loop_vinfo, &lhs_def_stmt, &def, &dt))
6332 else if (TREE_CODE (lhs) != INTEGER_CST && TREE_CODE (lhs) != REAL_CST
6333 && TREE_CODE (lhs) != FIXED_CST)
6336 if (TREE_CODE (rhs) == SSA_NAME)
6338 gimple rhs_def_stmt = SSA_NAME_DEF_STMT (rhs);
6339 if (!vect_is_simple_use (rhs, loop_vinfo, &rhs_def_stmt, &def, &dt))
6342 else if (TREE_CODE (rhs) != INTEGER_CST && TREE_CODE (rhs) != REAL_CST
6343 && TREE_CODE (rhs) != FIXED_CST)
6349 /* vectorizable_condition.
6351 Check if STMT is a conditional modify expression that can be vectorized.
6352 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
6353 stmt using VEC_COND_EXPR to replace it, put it in VEC_STMT, and insert it
6356 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
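/* Illustrative sketch (not from the original sources): a scalar stmt

     x = (a < b) ? c : d;

   is replaced by the vector stmt

     vx = VEC_COND_EXPR <va < vb, vc, vd>;

   where va, vb, vc and vd are the vector defs of the scalar operands. */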
6359 vectorizable_condition (gimple stmt, gimple_stmt_iterator *gsi,
6362 tree scalar_dest = NULL_TREE;
6363 tree vec_dest = NULL_TREE;
6364 tree op = NULL_TREE;
6365 tree cond_expr, then_clause, else_clause;
6366 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6367 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6368 tree vec_cond_lhs, vec_cond_rhs, vec_then_clause, vec_else_clause;
6369 tree vec_compare, vec_cond_expr;
6371 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6372 enum machine_mode vec_mode;
6374 enum vect_def_type dt;
6375 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
6376 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
6377 enum tree_code code;
6379 gcc_assert (ncopies >= 1);
6381 return false; /* FORNOW */
6383 if (!STMT_VINFO_RELEVANT_P (stmt_info))
6386 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
6389 /* FORNOW: SLP not supported. */
6390 if (STMT_SLP_TYPE (stmt_info))
6393 /* FORNOW: not yet supported. */
6394 if (STMT_VINFO_LIVE_P (stmt_info))
6396 if (vect_print_dump_info (REPORT_DETAILS))
6397 fprintf (vect_dump, "value used after loop.");
6401 /* Is vectorizable conditional operation? */
6402 if (!is_gimple_assign (stmt))
6405 code = gimple_assign_rhs_code (stmt);
6407 if (code != COND_EXPR)
6410 gcc_assert (gimple_assign_single_p (stmt));
6411 op = gimple_assign_rhs1 (stmt);
6412 cond_expr = TREE_OPERAND (op, 0);
6413 then_clause = TREE_OPERAND (op, 1);
6414 else_clause = TREE_OPERAND (op, 2);
6416 if (!vect_is_simple_cond (cond_expr, loop_vinfo))
6419 /* We do not handle two different vector types for the condition and the values. */
6421 if (TREE_TYPE (TREE_OPERAND (cond_expr, 0)) != TREE_TYPE (vectype))
6424 if (TREE_CODE (then_clause) == SSA_NAME)
6426 gimple then_def_stmt = SSA_NAME_DEF_STMT (then_clause);
6427 if (!vect_is_simple_use (then_clause, loop_vinfo,
6428 &then_def_stmt, &def, &dt))
6431 else if (TREE_CODE (then_clause) != INTEGER_CST
6432 && TREE_CODE (then_clause) != REAL_CST
6433 && TREE_CODE (then_clause) != FIXED_CST)
6436 if (TREE_CODE (else_clause) == SSA_NAME)
6438 gimple else_def_stmt = SSA_NAME_DEF_STMT (else_clause);
6439 if (!vect_is_simple_use (else_clause, loop_vinfo,
6440 &else_def_stmt, &def, &dt))
6443 else if (TREE_CODE (else_clause) != INTEGER_CST
6444 && TREE_CODE (else_clause) != REAL_CST
6445 && TREE_CODE (else_clause) != FIXED_CST)
6449 vec_mode = TYPE_MODE (vectype);
6453 STMT_VINFO_TYPE (stmt_info) = condition_vec_info_type;
6454 return expand_vec_cond_expr_p (op, vec_mode);
6460 scalar_dest = gimple_assign_lhs (stmt);
6461 vec_dest = vect_create_destination_var (scalar_dest, vectype);
6463 /* Handle cond expr. */
6465 vect_get_vec_def_for_operand (TREE_OPERAND (cond_expr, 0), stmt, NULL);
6467 vect_get_vec_def_for_operand (TREE_OPERAND (cond_expr, 1), stmt, NULL);
6468 vec_then_clause = vect_get_vec_def_for_operand (then_clause, stmt, NULL);
6469 vec_else_clause = vect_get_vec_def_for_operand (else_clause, stmt, NULL);
6471 /* Arguments are ready. Create the new vector stmt. */
6472 vec_compare = build2 (TREE_CODE (cond_expr), vectype,
6473 vec_cond_lhs, vec_cond_rhs);
6474 vec_cond_expr = build3 (VEC_COND_EXPR, vectype,
6475 vec_compare, vec_then_clause, vec_else_clause);
6477 *vec_stmt = gimple_build_assign (vec_dest, vec_cond_expr);
6478 new_temp = make_ssa_name (vec_dest, *vec_stmt);
6479 gimple_assign_set_lhs (*vec_stmt, new_temp);
6480 vect_finish_stmt_generation (stmt, *vec_stmt, gsi);
6486 /* Function vect_transform_stmt.
6488 Create a vectorized stmt to replace STMT, and insert it at BSI. */
6491 vect_transform_stmt (gimple stmt, gimple_stmt_iterator *gsi,
6492 bool *strided_store, slp_tree slp_node)
6494 bool is_store = false;
6495 gimple vec_stmt = NULL;
6496 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6497 gimple orig_stmt_in_pattern;
6500 switch (STMT_VINFO_TYPE (stmt_info))
6502 case type_demotion_vec_info_type:
6503 gcc_assert (!slp_node);
6504 done = vectorizable_type_demotion (stmt, gsi, &vec_stmt);
6508 case type_promotion_vec_info_type:
6509 gcc_assert (!slp_node);
6510 done = vectorizable_type_promotion (stmt, gsi, &vec_stmt);
6514 case type_conversion_vec_info_type:
6515 done = vectorizable_conversion (stmt, gsi, &vec_stmt, slp_node);
6519 case induc_vec_info_type:
6520 gcc_assert (!slp_node);
6521 done = vectorizable_induction (stmt, gsi, &vec_stmt);
6525 case op_vec_info_type:
6526 done = vectorizable_operation (stmt, gsi, &vec_stmt, slp_node);
6530 case assignment_vec_info_type:
6531 done = vectorizable_assignment (stmt, gsi, &vec_stmt, slp_node);
6535 case load_vec_info_type:
6536 done = vectorizable_load (stmt, gsi, &vec_stmt, slp_node);
6540 case store_vec_info_type:
6541 done = vectorizable_store (stmt, gsi, &vec_stmt, slp_node);
6543 if (STMT_VINFO_STRIDED_ACCESS (stmt_info))
6545 /* In case of interleaving, the whole chain is vectorized when the
6546 last store in the chain is reached. Store stmts before the last
6547 one are skipped, and their vec_stmt_info shouldn't be freed
6549 *strided_store = true;
6550 if (STMT_VINFO_VEC_STMT (stmt_info))
6557 case condition_vec_info_type:
6558 gcc_assert (!slp_node);
6559 done = vectorizable_condition (stmt, gsi, &vec_stmt);
6563 case call_vec_info_type:
6564 gcc_assert (!slp_node);
6565 done = vectorizable_call (stmt, gsi, &vec_stmt);
6568 case reduc_vec_info_type:
6569 gcc_assert (!slp_node);
6570 done = vectorizable_reduction (stmt, gsi, &vec_stmt);
6575 if (!STMT_VINFO_LIVE_P (stmt_info))
6577 if (vect_print_dump_info (REPORT_DETAILS))
6578 fprintf (vect_dump, "stmt not supported.");
6583 if (STMT_VINFO_LIVE_P (stmt_info)
6584 && STMT_VINFO_TYPE (stmt_info) != reduc_vec_info_type)
6586 done = vectorizable_live_operation (stmt, gsi, &vec_stmt);
6592 STMT_VINFO_VEC_STMT (stmt_info) = vec_stmt;
6593 orig_stmt_in_pattern = STMT_VINFO_RELATED_STMT (stmt_info);
6594 if (orig_stmt_in_pattern)
6596 stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt_in_pattern);
6597 /* STMT was inserted by the vectorizer to replace a computation idiom.
6598 ORIG_STMT_IN_PATTERN is a stmt in the original sequence that
6599 computed this idiom. We need to record a pointer to VEC_STMT in
6600 the stmt_info of ORIG_STMT_IN_PATTERN. See more details in the
6601 documentation of vect_pattern_recog. */
6602 if (STMT_VINFO_IN_PATTERN_P (stmt_vinfo))
6604 gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
6605 STMT_VINFO_VEC_STMT (stmt_vinfo) = vec_stmt;
6614 /* This function builds, on the loop preheader, ni_name = the number of
6615 iterations the loop executes. */
6618 vect_build_loop_niters (loop_vec_info loop_vinfo)
6621 gimple_seq stmts = NULL;
6623 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6624 tree ni = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo));
6626 var = create_tmp_var (TREE_TYPE (ni), "niters");
6627 add_referenced_var (var);
6628 ni_name = force_gimple_operand (ni, &stmts, false, var);
6630 pe = loop_preheader_edge (loop);
6633 basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
6634 gcc_assert (!new_bb);
6641 /* This function generates the following statements:
6643 ni_name = number of iterations the loop executes
6644 ratio = ni_name / vf
6645 ratio_mult_vf_name = ratio * vf
6647 and places them at the loop preheader edge. */
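/* Worked example (a sketch, not from the original sources): with
   ni_name = 103 and vf = 4, the stmts below compute
   ratio = 103 >> 2 = 25 and ratio_mult_vf_name = 25 << 2 = 100; the
   vectorized loop then runs 25 times and the epilog loop handles the
   remaining 3 iterations. */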
6650 vect_generate_tmps_on_preheader (loop_vec_info loop_vinfo,
6652 tree *ratio_mult_vf_name_ptr,
6653 tree *ratio_name_ptr)
6662 tree ratio_mult_vf_name;
6663 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6664 tree ni = LOOP_VINFO_NITERS (loop_vinfo);
6665 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
6668 pe = loop_preheader_edge (loop);
6670 /* Generate a temporary variable that contains
6671 the number of iterations the loop executes. */
6673 ni_name = vect_build_loop_niters (loop_vinfo);
6674 log_vf = build_int_cst (TREE_TYPE (ni), exact_log2 (vf));
6676 /* Create: ratio = ni >> log2(vf) */
6678 ratio_name = fold_build2 (RSHIFT_EXPR, TREE_TYPE (ni_name), ni_name, log_vf);
6679 if (!is_gimple_val (ratio_name))
6681 var = create_tmp_var (TREE_TYPE (ni), "bnd");
6682 add_referenced_var (var);
6685 ratio_name = force_gimple_operand (ratio_name, &stmts, true, var);
6686 pe = loop_preheader_edge (loop);
6687 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
6688 gcc_assert (!new_bb);
6691 /* Create: ratio_mult_vf = ratio << log2 (vf). */
6693 ratio_mult_vf_name = fold_build2 (LSHIFT_EXPR, TREE_TYPE (ratio_name),
6694 ratio_name, log_vf);
6695 if (!is_gimple_val (ratio_mult_vf_name))
6697 var = create_tmp_var (TREE_TYPE (ni), "ratio_mult_vf");
6698 add_referenced_var (var);
6701 ratio_mult_vf_name = force_gimple_operand (ratio_mult_vf_name, &stmts,
6703 pe = loop_preheader_edge (loop);
6704 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
6705 gcc_assert (!new_bb);
6708 *ni_name_ptr = ni_name;
6709 *ratio_mult_vf_name_ptr = ratio_mult_vf_name;
6710 *ratio_name_ptr = ratio_name;
6716 /* Function vect_update_ivs_after_vectorizer.
6718 "Advance" the induction variables of LOOP to the value they should take
6719 after the execution of LOOP. This is currently necessary because the
6720 vectorizer does not handle induction variables that are used after the
6721 loop. Such a situation occurs when the last iterations of LOOP are peeled, because:
6723 1. We introduced new uses after LOOP for IVs that were not originally used
6724 after LOOP: the IVs of LOOP are now used by an epilog loop.
6725 2. LOOP is going to be vectorized; this means that it will iterate N/VF
6726 times, whereas the loop IVs should be bumped N times.
6729 - LOOP - a loop that is going to be vectorized. The last few iterations
6730 of LOOP were peeled.
6731 - NITERS - the number of iterations that LOOP executes (before it is
6732 vectorized), i.e., the number of times the ivs should be bumped.
6733 - UPDATE_E - a successor edge of LOOP->exit that is on the (only) path
6734 coming out from LOOP on which there are uses of the LOOP ivs
6735 (this is the path from LOOP->exit to epilog_loop->preheader).
6737 The new definitions of the ivs are placed in LOOP->exit.
6738 The phi args associated with the edge UPDATE_E in the bb
6739 UPDATE_E->dest are updated accordingly.
6741 Assumption 1: Like the rest of the vectorizer, this function assumes
6742 a single loop exit that has a single predecessor.
6744 Assumption 2: The phi nodes in the LOOP header and in update_bb are
6745 organized in the same order.
6747 Assumption 3: The access function of the ivs is simple enough (see
6748 vect_can_advance_ivs_p). This assumption will be relaxed in the future.
6750 Assumption 4: Exactly one of the successors of LOOP exit-bb is on a path
6751 coming out of LOOP on which the ivs of LOOP are used (this is the path
6752 that leads to the epilog loop; other paths skip the epilog loop). This
6753 path starts with the edge UPDATE_E, and its destination (denoted update_bb)
6754 needs to have its phis updated. */
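/* Hypothetical example (not from the original sources): for an iv with
   initial value INIT and step 2, and NITERS = n, the new definition
   placed in the exit bb is

     iv_after = INIT + n * 2;

   and the phi argument on UPDATE_E in update_bb is replaced by
   iv_after, so the epilog loop resumes where the scalar loop would
   have stopped. */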
6758 vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo, tree niters,
6761 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6762 basic_block exit_bb = single_exit (loop)->dest;
6764 gimple_stmt_iterator gsi, gsi1;
6765 basic_block update_bb = update_e->dest;
6767 /* gcc_assert (vect_can_advance_ivs_p (loop_vinfo)); */
6769 /* Make sure there exists a single-predecessor exit bb: */
6770 gcc_assert (single_pred_p (exit_bb));
6772 for (gsi = gsi_start_phis (loop->header), gsi1 = gsi_start_phis (update_bb);
6773 !gsi_end_p (gsi) && !gsi_end_p (gsi1);
6774 gsi_next (&gsi), gsi_next (&gsi1))
6776 tree access_fn = NULL;
6777 tree evolution_part;
6780 tree var, ni, ni_name;
6781 gimple_stmt_iterator last_gsi;
6783 phi = gsi_stmt (gsi);
6784 phi1 = gsi_stmt (gsi1);
6785 if (vect_print_dump_info (REPORT_DETAILS))
6787 fprintf (vect_dump, "vect_update_ivs_after_vectorizer: phi: ");
6788 print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
6791 /* Skip virtual phis. */
6792 if (!is_gimple_reg (SSA_NAME_VAR (PHI_RESULT (phi))))
6794 if (vect_print_dump_info (REPORT_DETAILS))
6795 fprintf (vect_dump, "virtual phi. skip.");
6799 /* Skip reduction phis. */
6800 if (STMT_VINFO_DEF_TYPE (vinfo_for_stmt (phi)) == vect_reduction_def)
6802 if (vect_print_dump_info (REPORT_DETAILS))
6803 fprintf (vect_dump, "reduc phi. skip.");
6807 access_fn = analyze_scalar_evolution (loop, PHI_RESULT (phi));
6808 gcc_assert (access_fn);
6810 unshare_expr (evolution_part_in_loop_num (access_fn, loop->num));
6811 gcc_assert (evolution_part != NULL_TREE);
6813 /* FORNOW: We do not support IVs whose evolution function is a polynomial
6814 of degree >= 2 or exponential. */
6815 gcc_assert (!tree_is_chrec (evolution_part));
6817 step_expr = evolution_part;
6818 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn,
6821 if (POINTER_TYPE_P (TREE_TYPE (init_expr)))
6822 ni = fold_build2 (POINTER_PLUS_EXPR, TREE_TYPE (init_expr),
6824 fold_convert (sizetype,
6825 fold_build2 (MULT_EXPR, TREE_TYPE (niters),
6826 niters, step_expr)));
6828 ni = fold_build2 (PLUS_EXPR, TREE_TYPE (init_expr),
6829 fold_build2 (MULT_EXPR, TREE_TYPE (init_expr),
6830 fold_convert (TREE_TYPE (init_expr),
6837 var = create_tmp_var (TREE_TYPE (init_expr), "tmp");
6838 add_referenced_var (var);
6840 last_gsi = gsi_last_bb (exit_bb);
6841 ni_name = force_gimple_operand_gsi (&last_gsi, ni, false, var,
6842 true, GSI_SAME_STMT);
6844 /* Fix phi expressions in the successor bb. */
6845 SET_PHI_ARG_DEF (phi1, update_e->dest_idx, ni_name);
6849 /* Return the more conservative threshold between the
6850 min_profitable_iters returned by the cost model and the
6851 user-specified threshold, if provided. */
6854 conservative_cost_threshold (loop_vec_info loop_vinfo,
6855 int min_profitable_iters)
6858 int min_scalar_loop_bound;
6860 min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
6861 * LOOP_VINFO_VECT_FACTOR (loop_vinfo)) - 1);
6863 /* Use the cost model only if it is more conservative than the user-specified
6865 th = (unsigned) min_scalar_loop_bound;
6866 if (min_profitable_iters
6867 && (!min_scalar_loop_bound
6868 || min_profitable_iters > min_scalar_loop_bound))
6869 th = (unsigned) min_profitable_iters;
6871 if (th && vect_print_dump_info (REPORT_COST))
6872 fprintf (vect_dump, "Vectorization may not be profitable.");
6877 /* Function vect_do_peeling_for_loop_bound
6879 Peel the last iterations of the loop represented by LOOP_VINFO.
6880 The peeled iterations form a new epilog loop. Given that the loop now
6881 iterates NITERS times, the new epilog loop iterates
6882 NITERS % VECTORIZATION_FACTOR times.
6884 The original loop will later be made to iterate
6885 NITERS / VECTORIZATION_FACTOR times (this value is placed into RATIO). */
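/* Worked example (a sketch, not from the original sources): for
   NITERS = 18 and VECTORIZATION_FACTOR = 8, RATIO = 18 / 8 = 2, so the
   vectorized loop iterates twice and the peeled epilog loop executes
   the remaining 18 % 8 = 2 iterations. */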
6888 vect_do_peeling_for_loop_bound (loop_vec_info loop_vinfo, tree *ratio)
6890 tree ni_name, ratio_mult_vf_name;
6891 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6892 struct loop *new_loop;
6894 basic_block preheader;
6896 bool check_profitability = false;
6897 unsigned int th = 0;
6898 int min_profitable_iters;
6900 if (vect_print_dump_info (REPORT_DETAILS))
6901 fprintf (vect_dump, "=== vect_do_peeling_for_loop_bound ===");
6903 initialize_original_copy_tables ();
6905 /* Generate the following variables on the preheader of the original loop:
6907 ni_name = number of iterations the original loop executes
6908 ratio = ni_name / vf
6909 ratio_mult_vf_name = ratio * vf */
6910 vect_generate_tmps_on_preheader (loop_vinfo, &ni_name,
6911 &ratio_mult_vf_name, ratio);
6913 loop_num = loop->num;
6915 /* If the cost model check was not done during versioning or
6916 peeling for alignment, check it here. */
6917 if (!VEC_length (gimple, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo))
6918 && !VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo))
6919 && !LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
6921 check_profitability = true;
6923 /* Get profitability threshold for vectorized loop. */
6924 min_profitable_iters = LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo);
6926 th = conservative_cost_threshold (loop_vinfo,
6927 min_profitable_iters);
6930 new_loop = slpeel_tree_peel_loop_to_edge (loop, single_exit (loop),
6931 ratio_mult_vf_name, ni_name, false,
6932 th, check_profitability);
6933 gcc_assert (new_loop);
6934 gcc_assert (loop_num == loop->num);
6935 #ifdef ENABLE_CHECKING
6936 slpeel_verify_cfg_after_peeling (loop, new_loop);
6939 /* A guard that controls whether the new_loop is to be executed or skipped
6940 is placed in LOOP->exit. LOOP->exit therefore has two successors - one
6941 is the preheader of NEW_LOOP, where the IVs from LOOP are used. The other
6942 is a bb after NEW_LOOP, where these IVs are not used. Find the edge that
6943 is on the path where the LOOP IVs are used and need to be updated. */
6945 preheader = loop_preheader_edge (new_loop)->src;
6946 if (EDGE_PRED (preheader, 0)->src == single_exit (loop)->dest)
6947 update_e = EDGE_PRED (preheader, 0);
6949 update_e = EDGE_PRED (preheader, 1);
6951 /* Update IVs of original loop as if they were advanced
6952 by ratio_mult_vf_name steps. */
6953 vect_update_ivs_after_vectorizer (loop_vinfo, ratio_mult_vf_name, update_e);
6955 /* After peeling we have to reset the scalar evolution analyzer. */
6958 free_original_copy_tables ();
6962 /* Function vect_gen_niters_for_prolog_loop
6964 Set the number of iterations for the loop represented by LOOP_VINFO
6965 to the minimum between LOOP_NITERS (the original iteration count of the loop)
6966 and the misalignment of DR - the data reference recorded in
6967 LOOP_VINFO_UNALIGNED_DR (LOOP_VINFO). As a result, after the execution of
6968 this loop, the data reference DR will refer to an aligned location.
6970 The following computation is generated:
6972 If the misalignment of DR is known at compile time:
6973 addr_mis = DR_MISALIGNMENT (dr);
6974 Else, compute address misalignment in bytes:
6975 addr_mis = addr & (vectype_size - 1)
6977 prolog_niters = min (LOOP_NITERS, ((VF - addr_mis/elem_size)&(VF-1))/step)
6979 (elem_size = element type size; an element is the scalar element whose type
6980 is the inner type of the vectype)
6982 When the step of the data-ref in the loop is not 1 (as in interleaved data
6983 and SLP), the number of iterations of the prolog must be divided by the step
6984 (which is equal to the size of the interleaved group).
6986 The above formulas assume that VF == number of elements in the vector. This
6987 may not hold when there are multiple types in the loop.
6988 In this case, for some data-references in the loop the VF does not represent
6989 the number of elements that fit in the vector. Therefore, instead of VF we
6990 use TYPE_VECTOR_SUBPARTS. */
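/* Worked example (a sketch, not from the original sources): for 4-byte
   elements, nelements = 4, step = 1 and a known misalignment of 8
   bytes, elem_misalign = 8 / 4 = 2 and the prolog executes
   ((4 - 2) & (4 - 1)) / 1 = 2 iterations, after which the data
   reference is aligned on the 16-byte vector boundary. */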
6993 vect_gen_niters_for_prolog_loop (loop_vec_info loop_vinfo, tree loop_niters)
6995 struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
6996 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6999 tree iters, iters_name;
7002 gimple dr_stmt = DR_STMT (dr);
7003 stmt_vec_info stmt_info = vinfo_for_stmt (dr_stmt);
7004 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7005 int vectype_align = TYPE_ALIGN (vectype) / BITS_PER_UNIT;
7006 tree niters_type = TREE_TYPE (loop_niters);
7008 int element_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
7009 int nelements = TYPE_VECTOR_SUBPARTS (vectype);
7011 if (STMT_VINFO_STRIDED_ACCESS (stmt_info))
7012 step = DR_GROUP_SIZE (vinfo_for_stmt (DR_GROUP_FIRST_DR (stmt_info)));
7014 pe = loop_preheader_edge (loop);
7016 if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
7018 int byte_misalign = LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo);
7019 int elem_misalign = byte_misalign / element_size;
7021 if (vect_print_dump_info (REPORT_DETAILS))
7022 fprintf (vect_dump, "known alignment = %d.", byte_misalign);
7024 iters = build_int_cst (niters_type,
7025 (((nelements - elem_misalign) & (nelements - 1)) / step));
7029 gimple_seq new_stmts = NULL;
7030 tree start_addr = vect_create_addr_base_for_vector_ref (dr_stmt,
7031 &new_stmts, NULL_TREE, loop);
7032 tree ptr_type = TREE_TYPE (start_addr);
7033 tree size = TYPE_SIZE (ptr_type);
7034 tree type = lang_hooks.types.type_for_size (tree_low_cst (size, 1), 1);
7035 tree vectype_size_minus_1 = build_int_cst (type, vectype_align - 1);
7036 tree elem_size_log =
7037 build_int_cst (type, exact_log2 (vectype_align / nelements));
7038 tree nelements_minus_1 = build_int_cst (type, nelements - 1);
7039 tree nelements_tree = build_int_cst (type, nelements);
7043 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmts);
7044 gcc_assert (!new_bb);
7046 /* Create: byte_misalign = addr & (vectype_size - 1) */
7048 fold_build2 (BIT_AND_EXPR, type, fold_convert (type, start_addr), vectype_size_minus_1);
7050 /* Create: elem_misalign = byte_misalign / element_size */
7052 fold_build2 (RSHIFT_EXPR, type, byte_misalign, elem_size_log);
7054 /* Create: (niters_type) (nelements - elem_misalign)&(nelements - 1) */
7055 iters = fold_build2 (MINUS_EXPR, type, nelements_tree, elem_misalign);
7056 iters = fold_build2 (BIT_AND_EXPR, type, iters, nelements_minus_1);
7057 iters = fold_convert (niters_type, iters);
7060 /* Create: prolog_loop_niters = min (iters, loop_niters) */
7061 /* If the loop bound is known at compile time we already verified that it is
7062 greater than vf; since the misalignment ('iters') is at most vf, there's
7063 no need to generate the MIN_EXPR in this case. */
7064 if (TREE_CODE (loop_niters) != INTEGER_CST)
7065 iters = fold_build2 (MIN_EXPR, niters_type, iters, loop_niters);
7067 if (vect_print_dump_info (REPORT_DETAILS))
7069 fprintf (vect_dump, "niters for prolog loop: ");
7070 print_generic_expr (vect_dump, iters, TDF_SLIM);
7073 var = create_tmp_var (niters_type, "prolog_loop_niters");
7074 add_referenced_var (var);
7076 iters_name = force_gimple_operand (iters, &stmts, false, var);
7078 /* Insert stmt on loop preheader edge. */
7081 basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7082 gcc_assert (!new_bb);
7089 /* Function vect_update_init_of_dr
7091 NITERS iterations were peeled from LOOP. DR represents a data reference
7092 in LOOP. This function updates the information recorded in DR to
7093 account for the fact that the first NITERS iterations had already been
7094 executed. Specifically, it updates the OFFSET field of DR. */
7097 vect_update_init_of_dr (struct data_reference *dr, tree niters)
7099 tree offset = DR_OFFSET (dr);
7101 niters = fold_build2 (MULT_EXPR, TREE_TYPE (niters), niters, DR_STEP (dr));
7102 offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset), offset, niters);
7103 DR_OFFSET (dr) = offset;
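/* Hypothetical example (not from the original sources): for a data-ref
   with DR_STEP = 4 (bytes) after peeling NITERS = n_prolog iterations,
   the code above adds n_prolog * 4 to DR_OFFSET, so the data-ref
   describes the first access of the loop that remains. */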
7107 /* Function vect_update_inits_of_drs
7109 NITERS iterations were peeled from the loop represented by LOOP_VINFO.
7110 This function updates the information recorded for the data references in
7111 the loop to account for the fact that the first NITERS iterations had
7112 already been executed. Specifically, it updates the initial_condition of
7113 the access_function of all the data_references in the loop. */
7116 vect_update_inits_of_drs (loop_vec_info loop_vinfo, tree niters)
7119 VEC (data_reference_p, heap) *datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
7120 struct data_reference *dr;
7122 if (vect_print_dump_info (REPORT_DETAILS))
7123 fprintf (vect_dump, "=== vect_update_inits_of_dr ===");
7125 for (i = 0; VEC_iterate (data_reference_p, datarefs, i, dr); i++)
7126 vect_update_init_of_dr (dr, niters);
7130 /* Function vect_do_peeling_for_alignment
7132 Peel the first 'niters' iterations of the loop represented by LOOP_VINFO.
7133 'niters' is set to the misalignment of one of the data references in the
7134 loop, thereby forcing it to refer to an aligned location at the beginning
7135 of the execution of this loop. The data reference for which we are
7136 peeling is recorded in LOOP_VINFO_UNALIGNED_DR. */
7139 vect_do_peeling_for_alignment (loop_vec_info loop_vinfo)
7141 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7142 tree niters_of_prolog_loop, ni_name;
7144 struct loop *new_loop;
7145 bool check_profitability = false;
7146 unsigned int th = 0;
7147 int min_profitable_iters;
7149 if (vect_print_dump_info (REPORT_DETAILS))
7150 fprintf (vect_dump, "=== vect_do_peeling_for_alignment ===");
7152 initialize_original_copy_tables ();
7154 ni_name = vect_build_loop_niters (loop_vinfo);
7155 niters_of_prolog_loop = vect_gen_niters_for_prolog_loop (loop_vinfo, ni_name);
7158 /* If the cost model check was not done during versioning, check it here. */
7159 if (!VEC_length (gimple, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo))
7160 && !VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
7162 check_profitability = true;
7164 /* Get profitability threshold for vectorized loop. */
7165 min_profitable_iters = LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo);
7167 th = conservative_cost_threshold (loop_vinfo,
7168 min_profitable_iters);
7171 /* Peel the prolog loop and iterate it niters_of_prolog_loop times. */
7173 slpeel_tree_peel_loop_to_edge (loop, loop_preheader_edge (loop),
7174 niters_of_prolog_loop, ni_name, true,
7175 th, check_profitability);
7177 gcc_assert (new_loop);
7178 #ifdef ENABLE_CHECKING
7179 slpeel_verify_cfg_after_peeling (new_loop, loop);
7182 /* Update number of times loop executes. */
7183 n_iters = LOOP_VINFO_NITERS (loop_vinfo);
7184 LOOP_VINFO_NITERS (loop_vinfo) = fold_build2 (MINUS_EXPR,
7185 TREE_TYPE (n_iters), n_iters, niters_of_prolog_loop);
7187 /* Update the init conditions of the access functions of all data refs. */
7188 vect_update_inits_of_drs (loop_vinfo, niters_of_prolog_loop);
7190 /* After peeling we have to reset the scalar evolution analyzer. */
7193 free_original_copy_tables ();
7197 /* Function vect_create_cond_for_align_checks.
7199 Create a conditional expression that represents the alignment checks for
7200 all of data references (array element references) whose alignment must be
7204 COND_EXPR - input conditional expression. New conditions will be chained
7205 with logical AND operation.
7206 LOOP_VINFO - two fields of the loop information are used.
7207 LOOP_VINFO_PTR_MASK is the mask used to check the alignment.
7208 LOOP_VINFO_MAY_MISALIGN_STMTS contains the refs to be checked.
7211 COND_EXPR_STMT_LIST - statements needed to construct the conditional
7213 The returned value is the conditional expression to be used in the if
7214 statement that controls which version of the loop gets executed at runtime.
7216 The algorithm makes two assumptions:
7217 1) The number of bytes "n" in a vector is a power of 2.
7218 2) An address "a" is aligned if a%n is zero and that this
7219 test can be done as a&(n-1) == 0. For example, for 16
7220 byte vectors the test is a&0xf == 0. */
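/* Illustrative sketch (not from the original sources) of the check
   generated below for two data-refs and 16-byte vectors (mask == 0xf);
   the SSA names follow the addr2int/orptrs/andmask naming used in the
   code:

     addr2int0 = (ptrsize_int) addr_a;
     addr2int1 = (ptrsize_int) addr_b;
     orptrs1 = addr2int0 | addr2int1;
     andmask = orptrs1 & 0xf;
     cond = (andmask == 0);

   The vectorized loop version runs only if COND holds, i.e. only if
   both addresses are 16-byte aligned. */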
7223 vect_create_cond_for_align_checks (loop_vec_info loop_vinfo,
7225 gimple_seq *cond_expr_stmt_list)
7227 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7228 VEC(gimple,heap) *may_misalign_stmts
7229 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
7231 int mask = LOOP_VINFO_PTR_MASK (loop_vinfo);
7235 tree int_ptrsize_type;
7237 tree or_tmp_name = NULL_TREE;
7238 tree and_tmp, and_tmp_name;
7241 tree part_cond_expr;
7243 /* Check that mask is one less than a power of 2, i.e., mask is
7244 all zeros followed by all ones. */
7245 gcc_assert ((mask != 0) && ((mask & (mask+1)) == 0));
7247 /* CHECKME: what is the best integer or unsigned type to use to hold a
7248 cast from a pointer value? */
7249 psize = TYPE_SIZE (ptr_type_node);
7251 = lang_hooks.types.type_for_size (tree_low_cst (psize, 1), 0);
7253 /* Create expression (mask & (dr_1 || ... || dr_n)) where dr_i is the address
7254 of the first vector of the i'th data reference. */
7256 for (i = 0; VEC_iterate (gimple, may_misalign_stmts, i, ref_stmt); i++)
7258 gimple_seq new_stmt_list = NULL;
7260 tree addr_tmp, addr_tmp_name;
7261 tree or_tmp, new_or_tmp_name;
7262 gimple addr_stmt, or_stmt;
7264 /* create: addr_tmp = (int)(address_of_first_vector) */
7266 vect_create_addr_base_for_vector_ref (ref_stmt, &new_stmt_list,
7268 if (new_stmt_list != NULL)
7269 gimple_seq_add_seq (cond_expr_stmt_list, new_stmt_list);
7271 sprintf (tmp_name, "%s%d", "addr2int", i);
7272 addr_tmp = create_tmp_var (int_ptrsize_type, tmp_name);
7273 add_referenced_var (addr_tmp);
7274 addr_tmp_name = make_ssa_name (addr_tmp, NULL);
7275 addr_stmt = gimple_build_assign (addr_tmp_name, addr_base);
7276 SSA_NAME_DEF_STMT (addr_tmp_name) = addr_stmt;
7277 gimple_seq_add_stmt (cond_expr_stmt_list, addr_stmt);
7279 /* The addresses are ORed together. */
7281 if (or_tmp_name != NULL_TREE)
7283 /* create: or_tmp = or_tmp | addr_tmp */
7284 sprintf (tmp_name, "%s%d", "orptrs", i);
7285 or_tmp = create_tmp_var (int_ptrsize_type, tmp_name);
7286 add_referenced_var (or_tmp);
7287 new_or_tmp_name = make_ssa_name (or_tmp, NULL);
7288 or_stmt = gimple_build_assign_with_ops (BIT_IOR_EXPR,
7290 or_tmp_name, addr_tmp_name);
7291 SSA_NAME_DEF_STMT (new_or_tmp_name) = or_stmt;
7292 gimple_seq_add_stmt (cond_expr_stmt_list, or_stmt);
7293 or_tmp_name = new_or_tmp_name;
7296 or_tmp_name = addr_tmp_name;
7300 mask_cst = build_int_cst (int_ptrsize_type, mask);
7302 /* create: and_tmp = or_tmp & mask */
7303 and_tmp = create_tmp_var (int_ptrsize_type, "andmask" );
7304 add_referenced_var (and_tmp);
7305 and_tmp_name = make_ssa_name (and_tmp, NULL);
7307 and_stmt = gimple_build_assign_with_ops (BIT_AND_EXPR, and_tmp_name,
7308 or_tmp_name, mask_cst);
7309 SSA_NAME_DEF_STMT (and_tmp_name) = and_stmt;
7310 gimple_seq_add_stmt (cond_expr_stmt_list, and_stmt);
7312 /* Make and_tmp the left operand of the conditional test against zero.
7313 If and_tmp has a nonzero bit then some address is unaligned. */
7314 ptrsize_zero = build_int_cst (int_ptrsize_type, 0);
7315 part_cond_expr = fold_build2 (EQ_EXPR, boolean_type_node,
7316 and_tmp_name, ptrsize_zero);
7318 *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
7319 *cond_expr, part_cond_expr);
7321 *cond_expr = part_cond_expr;
7324 /* Function vect_vfa_segment_size.
7326 Create an expression that computes the size of the segment
7327 that will be accessed for a data reference. The function takes into
7328 account that realignment loads may access one more vector.
7331 DR: The data reference.
7332 VECT_FACTOR: vectorization factor.
7334 Return an expression whose value is the size of the segment that will be
7338 vect_vfa_segment_size (struct data_reference *dr, tree vect_factor)
7340 tree segment_length = fold_build2 (MULT_EXPR, integer_type_node,
7341 DR_STEP (dr), vect_factor);
7343 if (vect_supportable_dr_alignment (dr) == dr_explicit_realign_optimized)
7345 tree vector_size = TYPE_SIZE_UNIT
7346 (STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr))));
7348 segment_length = fold_build2 (PLUS_EXPR, integer_type_node,
7349 segment_length, vector_size);
7351 return fold_convert (sizetype, segment_length);
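/* Worked example (a sketch, not from the original sources): for
   DR_STEP = 4 bytes and VECT_FACTOR = 4, the segment accessed per
   vector iteration is 16 bytes; if the access uses
   dr_explicit_realign_optimized, one extra vector (another 16 bytes
   for a 16-byte vectype) is added, giving 32 bytes. */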
7354 /* Function vect_create_cond_for_alias_checks.
7356 Create a conditional expression that represents the run-time checks for
7357 overlapping of address ranges represented by a list of data
7358 reference relations passed as input.
7361 COND_EXPR - input conditional expression. New conditions will be chained
7362 with logical AND operation.
7363 LOOP_VINFO - field LOOP_VINFO_MAY_ALIAS_DDRS contains the list of ddrs
7367 COND_EXPR - conditional expression.
7368 COND_EXPR_STMT_LIST - statements needed to construct the conditional
7372 The returned value is the conditional expression to be used in the if
7373 statement that controls which version of the loop gets executed at runtime.
7377 vect_create_cond_for_alias_checks (loop_vec_info loop_vinfo,
7379 gimple_seq * cond_expr_stmt_list)
7381 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7382 VEC (ddr_p, heap) * may_alias_ddrs =
7383 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
7385 build_int_cst (integer_type_node, LOOP_VINFO_VECT_FACTOR (loop_vinfo));
7389 tree part_cond_expr;
7391 /* Create expression
7392 ((store_ptr_0 + store_segment_length_0) < load_ptr_0)
7393 || (load_ptr_0 + load_segment_length_0) < store_ptr_0))
7397 ((store_ptr_n + store_segment_length_n) < load_ptr_n)
7398 || (load_ptr_n + load_segment_length_n) < store_ptr_n)) */
7400 if (VEC_empty (ddr_p, may_alias_ddrs))
7403 for (i = 0; VEC_iterate (ddr_p, may_alias_ddrs, i, ddr); i++)
7405 struct data_reference *dr_a, *dr_b;
7406 gimple dr_group_first_a, dr_group_first_b;
7407 tree addr_base_a, addr_base_b;
7408 tree segment_length_a, segment_length_b;
7409 gimple stmt_a, stmt_b;
7412 stmt_a = DR_STMT (DDR_A (ddr));
7413 dr_group_first_a = DR_GROUP_FIRST_DR (vinfo_for_stmt (stmt_a));
7414 if (dr_group_first_a)
7416 stmt_a = dr_group_first_a;
7417 dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
7421 stmt_b = DR_STMT (DDR_B (ddr));
7422 dr_group_first_b = DR_GROUP_FIRST_DR (vinfo_for_stmt (stmt_b));
7423 if (dr_group_first_b)
7425 stmt_b = dr_group_first_b;
7426 dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
7430 vect_create_addr_base_for_vector_ref (stmt_a, cond_expr_stmt_list,
7433 vect_create_addr_base_for_vector_ref (stmt_b, cond_expr_stmt_list,
7436 segment_length_a = vect_vfa_segment_size (dr_a, vect_factor);
7437 segment_length_b = vect_vfa_segment_size (dr_b, vect_factor);
7439 if (vect_print_dump_info (REPORT_DR_DETAILS))
7442 "create runtime check for data references ");
7443 print_generic_expr (vect_dump, DR_REF (dr_a), TDF_SLIM);
7444 fprintf (vect_dump, " and ");
7445 print_generic_expr (vect_dump, DR_REF (dr_b), TDF_SLIM);
7450 fold_build2 (TRUTH_OR_EXPR, boolean_type_node,
7451 fold_build2 (LT_EXPR, boolean_type_node,
7452 fold_build2 (POINTER_PLUS_EXPR, TREE_TYPE (addr_base_a),
7456 fold_build2 (LT_EXPR, boolean_type_node,
7457 fold_build2 (POINTER_PLUS_EXPR, TREE_TYPE (addr_base_b),
7463 *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
7464 *cond_expr, part_cond_expr);
7466 *cond_expr = part_cond_expr;
7468 if (vect_print_dump_info (REPORT_VECTORIZED_LOOPS))
7469 fprintf (vect_dump, "created %u versioning for alias checks.\n",
7470 VEC_length (ddr_p, may_alias_ddrs));
7474 /* Function vect_loop_versioning.
7476 If the loop has data references that may or may not be aligned and/or
7477 has data reference relations whose independence was not proven, then
7478 two versions of the loop need to be generated, one which is vectorized
7479 and one which isn't. A test is then generated to control which of the
7480 loops is executed. The test checks for the alignment of all of the
7481 data references that may or may not be aligned. An additional
7482 sequence of runtime tests is generated for each pair of DDRs whose
7483 independence was not proven. The vectorized version of the loop is
7484 executed only if both the alias and alignment tests pass.
7486 The test generated to check which version of the loop is executed
7487 is also modified to check for profitability, as indicated by the
7488 cost model initially. */
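/* Illustrative sketch (not from the original sources) of the guard
   built below, with TH the conservative cost threshold:

     if (scalar_loop_iters > TH
         && alignment_checks_pass
         && alias_checks_pass)
       ... vectorized loop ...
     else
       ... scalar loop ...

   where alignment_checks_pass and alias_checks_pass are hypothetical
   names for the conditions produced by vect_create_cond_for_align_checks
   and vect_create_cond_for_alias_checks. */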
7491 vect_loop_versioning (loop_vec_info loop_vinfo)
7493 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7495 tree cond_expr = NULL_TREE;
7496 gimple_seq cond_expr_stmt_list = NULL;
7497 basic_block condition_bb;
7498 gimple_stmt_iterator gsi, cond_exp_gsi;
7499 basic_block merge_bb;
7500 basic_block new_exit_bb;
7502 gimple orig_phi, new_phi;
7504 unsigned prob = 4 * REG_BR_PROB_BASE / 5;
7505 gimple_seq gimplify_stmt_list = NULL;
7506 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
7507 int min_profitable_iters = 0;
7510 /* Get profitability threshold for vectorized loop. */
7511 min_profitable_iters = LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo);
7513 th = conservative_cost_threshold (loop_vinfo,
7514 min_profitable_iters);
7517 build2 (GT_EXPR, boolean_type_node, scalar_loop_iters,
7518 build_int_cst (TREE_TYPE (scalar_loop_iters), th));
7520 cond_expr = force_gimple_operand (cond_expr, &cond_expr_stmt_list,
7523 if (VEC_length (gimple, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo)))
7524 vect_create_cond_for_align_checks (loop_vinfo, &cond_expr,
7525 &cond_expr_stmt_list);
7527 if (VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
7528 vect_create_cond_for_alias_checks (loop_vinfo, &cond_expr,
7529 &cond_expr_stmt_list);
7532 fold_build2 (NE_EXPR, boolean_type_node, cond_expr, integer_zero_node);
7534 force_gimple_operand (cond_expr, &gimplify_stmt_list, true, NULL_TREE);
7535 gimple_seq_add_seq (&cond_expr_stmt_list, gimplify_stmt_list);
7537 initialize_original_copy_tables ();
7538 nloop = loop_version (loop, cond_expr, &condition_bb,
7539 prob, prob, REG_BR_PROB_BASE - prob, true);
7540 free_original_copy_tables();
7542 /* Loop versioning violates an assumption we try to maintain during
7543 vectorization - that the loop exit block has a single predecessor.
7544 After versioning, the exit block of both loop versions is the same
7545 basic block (i.e. it has two predecessors). Just in order to simplify
7546 following transformations in the vectorizer, we fix this situation
7547 here by adding a new (empty) block on the exit-edge of the loop,
7548 with the proper loop-exit phis to maintain loop-closed-form. */
7550 merge_bb = single_exit (loop)->dest;
7551 gcc_assert (EDGE_COUNT (merge_bb->preds) == 2);
7552 new_exit_bb = split_edge (single_exit (loop));
7553 new_exit_e = single_exit (loop);
7554 e = EDGE_SUCC (new_exit_bb, 0);
7556 for (gsi = gsi_start_phis (merge_bb); !gsi_end_p (gsi); gsi_next (&gsi))
7558 orig_phi = gsi_stmt (gsi);
7559 new_phi = create_phi_node (SSA_NAME_VAR (PHI_RESULT (orig_phi)),
7561 arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, e);
7562 add_phi_arg (new_phi, arg, new_exit_e);
7563 SET_PHI_ARG_DEF (orig_phi, e->dest_idx, PHI_RESULT (new_phi));
7566 /* End loop-exit-fixes after versioning. */
7568 update_ssa (TODO_update_ssa);
7569 if (cond_expr_stmt_list)
7571 cond_exp_gsi = gsi_last_bb (condition_bb);
7572 gsi_insert_seq_before (&cond_exp_gsi, cond_expr_stmt_list, GSI_SAME_STMT);
7576 /* Remove a group of stores (for SLP or interleaving), free their stmt_vec_info. */
7580 vect_remove_stores (gimple first_stmt)
7582 gimple next = first_stmt;
7584 gimple_stmt_iterator next_si;
7588 /* Free the attached stmt_vec_info and remove the stmt. */
7589 next_si = gsi_for_stmt (next);
7590 gsi_remove (&next_si, true);
7591 tmp = DR_GROUP_NEXT_DR (vinfo_for_stmt (next));
7592 free_stmt_vec_info (next);
/* Vectorize SLP instance tree in postorder.  */
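
/* For example, for the interleaved store group (a conceptual sketch, not
   a literal testcase)

       a[2*i]   = x + y;
       a[2*i+1] = x - y;

   both stores form the scalar stmts of one SLP node, and the nodes
   computing their operands are vectorized first (postorder), so their
   vector defs are already available when the node itself is
   transformed.  */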
static bool
vect_schedule_slp_instance (slp_tree node, unsigned int vec_stmts_size)
{
  gimple stmt;
  bool strided_store, is_store;
  gimple_stmt_iterator si;
  stmt_vec_info stmt_info;

  if (!node)
    return false;
  vect_schedule_slp_instance (SLP_TREE_LEFT (node), vec_stmts_size);
  vect_schedule_slp_instance (SLP_TREE_RIGHT (node), vec_stmts_size);

  stmt = VEC_index (gimple, SLP_TREE_SCALAR_STMTS (node), 0);
  stmt_info = vinfo_for_stmt (stmt);
  SLP_TREE_VEC_STMTS (node) = VEC_alloc (gimple, heap, vec_stmts_size);
  SLP_TREE_NUMBER_OF_VEC_STMTS (node) = vec_stmts_size;
  if (vect_print_dump_info (REPORT_DETAILS))
    {
      fprintf (vect_dump, "------>vectorizing SLP node starting from: ");
      print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
    }
  si = gsi_for_stmt (stmt);
  is_store = vect_transform_stmt (stmt, &si, &strided_store, node);
  if (is_store)
    {
      if (DR_GROUP_FIRST_DR (stmt_info))
        /* If IS_STORE is TRUE, the vectorization of the
           interleaving chain was completed - free all the stores in
           the chain.  */
        vect_remove_stores (DR_GROUP_FIRST_DR (stmt_info));
      else
        /* FORNOW: SLP originates only from strided stores.  */
        gcc_unreachable ();

      return true;
    }

  /* FORNOW: SLP originates only from strided stores.  */
  return false;
}
/* Schedule all the SLP instances of LOOP_VINFO.  Return TRUE if a store
   was scheduled.  */

static bool
vect_schedule_slp (loop_vec_info loop_vinfo, unsigned int nunits)
{
  VEC (slp_instance, heap) *slp_instances =
    LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
  slp_instance instance;
  unsigned int vec_stmts_size;
  unsigned int group_size, i;
  unsigned int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
  bool is_store = false;
  for (i = 0; VEC_iterate (slp_instance, slp_instances, i, instance); i++)
    {
      group_size = SLP_INSTANCE_GROUP_SIZE (instance);
      /* For each SLP instance calculate the number of vector stmts to be
         created for the scalar stmts in each node of the SLP tree.  The
         number of vector elements in one vector iteration is the number
         of scalar elements in one scalar iteration (GROUP_SIZE)
         multiplied by VF divided by the vector size.  */
      vec_stmts_size = vectorization_factor * group_size / nunits;
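
      /* For example (hypothetical numbers): with GROUP_SIZE == 2, a
         vectorization factor of 8 and 4-element vectors (NUNITS == 4),
         vec_stmts_size == 8 * 2 / 4 == 4 vector stmts per node.  */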
      /* Schedule the tree of INSTANCE.  */
      is_store = vect_schedule_slp_instance (SLP_INSTANCE_TREE (instance),
                                             vec_stmts_size);
      if (vect_print_dump_info (REPORT_VECTORIZED_LOOPS)
          || vect_print_dump_info (REPORT_UNVECTORIZED_LOOPS))
        fprintf (vect_dump, "vectorizing stmts using SLP.");
    }

  return is_store;
}
/* Function vect_transform_loop.

   The analysis phase has determined that the loop is vectorizable.
   Vectorize the loop - create vectorized stmts to replace the scalar
   stmts in the loop, and update the loop exit condition.  */
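
/* For instance, with a vectorization factor of 4, the transformation
   conceptually turns

       for (i = 0; i < n; i++)
         a[i] = b[i] + c[i];

   into

       for (i = 0; i < n; i += 4)
         a[i:i+3] = b[i:i+3] + c[i:i+3];

   (a schematic illustration - the actual result is GIMPLE with vector
   types, combined with the peeling and versioning performed below).  */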
void
vect_transform_loop (loop_vec_info loop_vinfo)
{
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
  int nbbs = loop->num_nodes;
  gimple_stmt_iterator si;
  int i;
  tree ratio = NULL;
  int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
  bool strided_store;
  bool slp_scheduled = false;
  unsigned int nunits;
  if (vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump, "=== vec_transform_loop ===");

  if (VEC_length (gimple, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo))
      || VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
    vect_loop_versioning (loop_vinfo);
  /* CHECKME: we wouldn't need this if we called update_ssa once
     for all loops.  */
  bitmap_zero (vect_memsyms_to_rename);
  /* Peel the loop if there are data refs with unknown alignment.
     Peeling can make at most one such data ref aligned.  */

  if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
    vect_do_peeling_for_alignment (loop_vinfo);
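
  /* For example, if the first access to a 4-byte-element array starts
     8 bytes past a 16-byte boundary, peeling (16 - 8) / 4 == 2 scalar
     iterations makes the remaining accesses aligned (a schematic
     illustration):

       for (i = 0; i < 2; i++)       <-- scalar prolog loop
         a[i] = ...;
       for ( ; i < n; i++)           <-- now aligned; vectorized below
         a[i] = ...;
  */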
  /* If the loop has a symbolic number of iterations 'n' (i.e. it's not a
     compile-time constant), or it is a constant that is not divisible by
     the vectorization factor, then an epilog loop needs to be created.
     We therefore duplicate the loop: the original loop will be vectorized,
     and will compute the first (n/VF) iterations.  The second copy of the
     loop will remain scalar and will compute the remaining (n%VF)
     iterations.  (VF is the vectorization factor.)  */
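
  /* Schematically, with VF == 4 and a symbolic trip count n:

       for (i = 0; i < (n / 4) * 4; i += 4)    <-- vectorized loop
         <vector body>;
       for ( ; i < n; i++)                     <-- scalar epilog loop
         <scalar body>;
  */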
  if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
      || (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
          && LOOP_VINFO_INT_NITERS (loop_vinfo) % vectorization_factor != 0))
    vect_do_peeling_for_loop_bound (loop_vinfo, &ratio);
  else
    ratio = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
                LOOP_VINFO_INT_NITERS (loop_vinfo) / vectorization_factor);
  /* 1) Make sure the loop header has exactly two entries
     2) Make sure we have a preheader basic block.  */

  gcc_assert (EDGE_COUNT (loop->header->preds) == 2);

  split_edge (loop_preheader_edge (loop));
  /* FORNOW: the vectorizer supports only loops whose body consists of one
     basic block (header + empty latch).  When the vectorizer supports more
     involved loop forms, the order in which the BBs are traversed will
     need to be reconsidered.  */
  for (i = 0; i < nbbs; i++)
    {
      basic_block bb = bbs[i];
      stmt_vec_info stmt_info;
      gimple phi;
      for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
        {
          phi = gsi_stmt (si);
          if (vect_print_dump_info (REPORT_DETAILS))
            {
              fprintf (vect_dump, "------>vectorizing phi: ");
              print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
            }
          stmt_info = vinfo_for_stmt (phi);
          if (!stmt_info)
            continue;
          if (!STMT_VINFO_RELEVANT_P (stmt_info)
              && !STMT_VINFO_LIVE_P (stmt_info))
            continue;

          if ((TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
               != (unsigned HOST_WIDE_INT) vectorization_factor)
              && vect_print_dump_info (REPORT_DETAILS))
            fprintf (vect_dump, "multiple-types.");
          if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
            {
              if (vect_print_dump_info (REPORT_DETAILS))
                fprintf (vect_dump, "transform phi.");
              vect_transform_stmt (phi, NULL, NULL, NULL);
            }
        }
      for (si = gsi_start_bb (bb); !gsi_end_p (si);)
        {
          gimple stmt = gsi_stmt (si);
          bool is_store;

          if (vect_print_dump_info (REPORT_DETAILS))
            {
              fprintf (vect_dump, "------>vectorizing statement: ");
              print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
            }
          stmt_info = vinfo_for_stmt (stmt);

          /* Vector stmts created in the outer-loop during vectorization of
             stmts in an inner-loop may not have a stmt_info, and do not
             need to be vectorized.  */
          if (!stmt_info)
            {
              gsi_next (&si);
              continue;
            }

          if (!STMT_VINFO_RELEVANT_P (stmt_info)
              && !STMT_VINFO_LIVE_P (stmt_info))
            {
              gsi_next (&si);
              continue;
            }

          gcc_assert (STMT_VINFO_VECTYPE (stmt_info));
          nunits =
            (unsigned int) TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
          if (!STMT_SLP_TYPE (stmt_info)
              && nunits != (unsigned int) vectorization_factor
              && vect_print_dump_info (REPORT_DETAILS))
            /* For SLP, VF is set according to the unrolling factor, not to
               the vector size; hence for SLP this print is not valid.  */
            fprintf (vect_dump, "multiple-types.");
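
          /* An illustration with hypothetical numbers: an SLP group of 2
             stores packed into 4-element vectors has unrolling factor
             4 / 2 == 2, so VF may be 2 while NUNITS is 4, and the mismatch
             above would be reported although nothing is wrong.  */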
          /* SLP.  Schedule all the SLP instances when the first SLP stmt
             is reached.  */
          if (STMT_SLP_TYPE (stmt_info))
            {
              if (!slp_scheduled)
                {
                  slp_scheduled = true;

                  if (vect_print_dump_info (REPORT_DETAILS))
                    fprintf (vect_dump, "=== scheduling SLP instances ===");

                  is_store = vect_schedule_slp (loop_vinfo, nunits);

                  /* IS_STORE is true if STMT is a store.  Stores cannot
                     be of hybrid SLP type.  They are removed in
                     vect_schedule_slp_instance and their vinfo is
                     destroyed.  */
                  if (is_store)
                    {
                      gsi_next (&si);
                      continue;
                    }
                }

              /* Hybrid SLP stmts must be vectorized in addition to SLP.  */
              if (PURE_SLP_STMT (stmt_info))
                {
                  gsi_next (&si);
                  continue;
                }
            }
          /* -------- vectorize statement ------------ */
          if (vect_print_dump_info (REPORT_DETAILS))
            fprintf (vect_dump, "transform statement.");

          strided_store = false;
          is_store = vect_transform_stmt (stmt, &si, &strided_store, NULL);
          if (is_store)
            {
              if (STMT_VINFO_STRIDED_ACCESS (stmt_info))
                {
                  /* Interleaving.  If IS_STORE is TRUE, the vectorization
                     of the interleaving chain was completed - free all the
                     stores in the chain.  */
                  vect_remove_stores (DR_GROUP_FIRST_DR (stmt_info));
                  gsi_remove (&si, true);
                  continue;
                }
              else
                {
                  /* Free the attached stmt_vec_info and remove the stmt.  */
                  free_stmt_vec_info (stmt);
                  gsi_remove (&si, true);
                  continue;
                }
            }
          gsi_next (&si);
        }                       /* stmts in BB */
    }                           /* BBs in loop */
  slpeel_make_loop_iterate_ntimes (loop, ratio);

  mark_set_for_renaming (vect_memsyms_to_rename);

  /* The memory tags and pointers in vectorized statements need to
     have their SSA forms updated.  FIXME: why can't this be delayed
     until all the loops have been transformed?  */
  update_ssa (TODO_update_ssa);
  if (vect_print_dump_info (REPORT_VECTORIZED_LOOPS))
    fprintf (vect_dump, "LOOP VECTORIZED.");
  if (loop->inner && vect_print_dump_info (REPORT_VECTORIZED_LOOPS))
    fprintf (vect_dump, "OUTER LOOP VECTORIZED.");
}