/* Transformation Utilities for Loop Vectorization.
   Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008 Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */
#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "tm.h"
#include "ggc.h"
#include "tree.h"
#include "target.h"
#include "rtl.h"
#include "basic-block.h"
#include "diagnostic.h"
#include "tree-flow.h"
#include "tree-dump.h"
#include "timevar.h"
#include "cfgloop.h"
#include "expr.h"
#include "optabs.h"
#include "params.h"
#include "recog.h"
#include "tree-data-ref.h"
#include "tree-chrec.h"
#include "tree-scalar-evolution.h"
#include "tree-vectorizer.h"
#include "langhooks.h"
#include "tree-pass.h"

/* Utility functions for the code transformation.  */
static bool vect_transform_stmt (gimple, gimple_stmt_iterator *, bool *,
                                 slp_tree, slp_instance);
static tree vect_create_destination_var (tree, tree);
static tree vect_create_data_ref_ptr
  (gimple, struct loop *, tree, tree *, gimple *, bool, bool *, tree);
static tree vect_create_addr_base_for_vector_ref
  (gimple, gimple_seq *, tree, struct loop *);
static tree vect_get_new_vect_var (tree, enum vect_var_kind, const char *);
static tree vect_get_vec_def_for_operand (tree, gimple, tree *);
static tree vect_init_vector (gimple, tree, tree, gimple_stmt_iterator *);
static void vect_finish_stmt_generation
  (gimple stmt, gimple vec_stmt, gimple_stmt_iterator *);
static bool vect_is_simple_cond (tree, loop_vec_info);
static void vect_create_epilog_for_reduction
  (tree, gimple, int, enum tree_code, gimple);
static tree get_initial_def_for_reduction (gimple, tree, tree *);

/* Utility function dealing with loop peeling (not peeling itself).  */
static void vect_generate_tmps_on_preheader
  (loop_vec_info, tree *, tree *, tree *);
static tree vect_build_loop_niters (loop_vec_info);
static void vect_update_ivs_after_vectorizer (loop_vec_info, tree, edge);
static tree vect_gen_niters_for_prolog_loop (loop_vec_info, tree);
static void vect_update_init_of_dr (struct data_reference *, tree niters);
static void vect_update_inits_of_drs (loop_vec_info, tree);
static int vect_min_worthwhile_factor (enum tree_code);

static int
cost_for_stmt (gimple stmt)
{
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);

  switch (STMT_VINFO_TYPE (stmt_info))
    {
    case load_vec_info_type:
      return TARG_SCALAR_LOAD_COST;
    case store_vec_info_type:
      return TARG_SCALAR_STORE_COST;
    case op_vec_info_type:
    case condition_vec_info_type:
    case assignment_vec_info_type:
    case reduc_vec_info_type:
    case induc_vec_info_type:
    case type_promotion_vec_info_type:
    case type_demotion_vec_info_type:
    case type_conversion_vec_info_type:
    case call_vec_info_type:
      return TARG_SCALAR_STMT_COST;
    case undef_vec_info_type:
    default:
      gcc_unreachable ();
    }
}

/* Function vect_estimate_min_profitable_iters

   Return the number of iterations required for the vector version of the
   loop to be profitable relative to the cost of the scalar version of the
   loop.

   TODO: Take profile info into account before making vectorization
   decisions, if available.  */
static int
vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo)
{
  int i;
  int min_profitable_iters;
  int peel_iters_prologue;
  int peel_iters_epilogue;
  int vec_inside_cost = 0;
  int vec_outside_cost = 0;
  int scalar_single_iter_cost = 0;
  int scalar_outside_cost = 0;
  bool runtime_test = false;
  int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
  int nbbs = loop->num_nodes;
  int byte_misalign = LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo);
  int peel_guard_costs = 0;
  int innerloop_iters = 0, factor;
  VEC (slp_instance, heap) *slp_instances;
  slp_instance instance;

  /* Cost model disabled.  */
  if (!flag_vect_cost_model)
    {
      if (vect_print_dump_info (REPORT_COST))
        fprintf (vect_dump, "cost model disabled.");
      return 0;
    }

  /* If the number of iterations is unknown, or the
     peeling-for-misalignment amount is unknown, we will have to generate
     a runtime test to test the loop count against the threshold.  */
  if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
      || (byte_misalign < 0))
    runtime_test = true;

  /* Requires loop versioning tests to handle misalignment.  */
  if (VEC_length (gimple, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo)))
    {
      /*  FIXME: Make cost depend on complexity of individual check.  */
      vec_outside_cost +=
        VEC_length (gimple, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));
      if (vect_print_dump_info (REPORT_COST))
        fprintf (vect_dump, "cost model: Adding cost of checks for loop "
                 "versioning to treat misalignment.\n");
    }

  /* Requires loop versioning with alias checks.  */
  if (VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
    {
      /*  FIXME: Make cost depend on complexity of individual check.  */
      vec_outside_cost +=
        VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo));
      if (vect_print_dump_info (REPORT_COST))
        fprintf (vect_dump, "cost model: Adding cost of checks for loop "
                 "versioning aliasing.\n");
    }

  if (VEC_length (gimple, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo))
      || VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
    vec_outside_cost += TARG_COND_TAKEN_BRANCH_COST;

  /* Count statements in scalar loop.  Using this as scalar cost for a single
     iteration for now.

     TODO: Add outer loop support.

     TODO: Consider assigning different costs to different scalar
     statements.  */

  /* FORNOW.  */
  if (loop->inner)
    innerloop_iters = 50; /* FIXME */

  for (i = 0; i < nbbs; i++)
    {
      gimple_stmt_iterator si;
      basic_block bb = bbs[i];

      if (bb->loop_father == loop->inner)
        factor = innerloop_iters;
      else
        factor = 1;

      for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
        {
          gimple stmt = gsi_stmt (si);
          stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
          /* Skip stmts that are not vectorized inside the loop.  */
          if (!STMT_VINFO_RELEVANT_P (stmt_info)
              && (!STMT_VINFO_LIVE_P (stmt_info)
                  || STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def))
            continue;
          scalar_single_iter_cost += cost_for_stmt (stmt) * factor;
          vec_inside_cost += STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) * factor;
          /* FIXME: for stmts in the inner-loop in outer-loop vectorization,
             some of the "outside" costs are generated inside the outer-loop.  */
          vec_outside_cost += STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info);
        }
    }

  /* Add additional cost for the peeled instructions in prologue and epilogue
     loop.

     FORNOW: If we don't know the value of peel_iters for prologue or epilogue
     at compile-time - we assume it's vf/2 (the worst would be vf-1).

     TODO: Build an expression that represents peel_iters for prologue and
     epilogue to be used in a run-time test.  */

  if (byte_misalign < 0)
    {
      peel_iters_prologue = vf/2;
      if (vect_print_dump_info (REPORT_COST))
        fprintf (vect_dump, "cost model: "
                 "prologue peel iters set to vf/2.");

      /* If peeling for alignment is unknown, loop bound of main loop becomes
         unknown.  */
      peel_iters_epilogue = vf/2;
      if (vect_print_dump_info (REPORT_COST))
        fprintf (vect_dump, "cost model: "
                 "epilogue peel iters set to vf/2 because "
                 "peeling for alignment is unknown.");

      /* If peeled iterations are unknown, count a taken branch and a not taken
         branch per peeled loop.  Even if scalar loop iterations are known,
         vector iterations are not known since peeled prologue iterations are
         not known.  Hence guards remain the same.  */
      peel_guard_costs += 2 * (TARG_COND_TAKEN_BRANCH_COST
                               + TARG_COND_NOT_TAKEN_BRANCH_COST);
    }
  else
    {
      if (byte_misalign)
        {
          struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
          int element_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
          tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr)));
          int nelements = TYPE_VECTOR_SUBPARTS (vectype);

          peel_iters_prologue = nelements - (byte_misalign / element_size);
        }
      else
        peel_iters_prologue = 0;

      if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
        {
          peel_iters_epilogue = vf/2;
          if (vect_print_dump_info (REPORT_COST))
            fprintf (vect_dump, "cost model: "
                     "epilogue peel iters set to vf/2 because "
                     "loop iterations are unknown.");

          /* If peeled iterations are known but number of scalar loop
             iterations are unknown, count a taken branch per peeled loop.  */
          peel_guard_costs += 2 * TARG_COND_TAKEN_BRANCH_COST;
        }
      else
        {
          int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
          peel_iters_prologue = niters < peel_iters_prologue ?
                                niters : peel_iters_prologue;
          peel_iters_epilogue = (niters - peel_iters_prologue) % vf;
        }
    }

  vec_outside_cost += (peel_iters_prologue * scalar_single_iter_cost)
                      + (peel_iters_epilogue * scalar_single_iter_cost)
                      + peel_guard_costs;

  /* FORNOW: The scalar outside cost is incremented in one of the
     following ways:

     1. The vectorizer checks for alignment and aliasing and generates
     a condition that allows dynamic vectorization.  A cost model
     check is ANDed with the versioning condition.  Hence scalar code
     path now has the added cost of the versioning check.

       if (cost > th & versioning_check)
         jmp to vector code

     Hence run-time scalar is incremented by not-taken branch cost.

     2. The vectorizer then checks if a prologue is required.  If the
     cost model check was not done before during versioning, it has to
     be done before the prologue check.

       if (cost <= th)
         prologue = scalar_iters
       if (prologue == 0)
         jmp to vector code
       else
         execute prologue
       if (prologue == num_iters)
         go to exit

     Hence the run-time scalar cost is incremented by a taken branch,
     plus a not-taken branch, plus a taken branch cost.

     3. The vectorizer then checks if an epilogue is required.  If the
     cost model check was not done before during prologue check, it
     has to be done with the epilogue check.

       if (prologue == 0)
         jmp to vector code
       else
         execute prologue
       if (prologue == num_iters)
         go to exit
       vector code:
         if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
           jmp to epilogue

     Hence the run-time scalar cost should be incremented by 2 taken
     branches.

     TODO: The back end may reorder the BBS's differently and reverse
     conditions/branch directions.  Change the estimates below to
     something more reasonable.  */
  if (runtime_test)
    {
      /* Cost model check occurs at versioning.  */
      if (VEC_length (gimple, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo))
          || VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
        scalar_outside_cost += TARG_COND_NOT_TAKEN_BRANCH_COST;
      else
        {
          /* Cost model occurs at prologue generation.  */
          if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
            scalar_outside_cost += 2 * TARG_COND_TAKEN_BRANCH_COST
              + TARG_COND_NOT_TAKEN_BRANCH_COST;
          /* Cost model check occurs at epilogue generation.  */
          else
            scalar_outside_cost += 2 * TARG_COND_TAKEN_BRANCH_COST;
        }
    }

  /* Add SLP costs.  */
  slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
  for (i = 0; VEC_iterate (slp_instance, slp_instances, i, instance); i++)
    {
      vec_outside_cost += SLP_INSTANCE_OUTSIDE_OF_LOOP_COST (instance);
      vec_inside_cost += SLP_INSTANCE_INSIDE_OF_LOOP_COST (instance);
    }

  /* Calculate number of iterations required to make the vector version
     profitable, relative to the loop bodies only.  The following condition
     must hold true:
     SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
     where
     SIC = scalar iteration cost, VIC = vector iteration cost,
     VOC = vector outside cost, VF = vectorization factor,
     PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
     SOC = scalar outside cost for run time cost model check.  */
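  /* For illustration, with hypothetical costs SIC == 4, VIC == 6, VOC == 14,
     SOC == 6, VF == 4 and PL_ITERS == EP_ITERS == 2, the computation below
     yields ((14 - 6) * 4 - 6 * 2 - 6 * 2) / (4 * 4 - 6) == 8 / 10 == 0, which
     the correction step then increments to 1; the result is later raised to
     at least VF before forming the runtime threshold.  */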
  if ((scalar_single_iter_cost * vf) > vec_inside_cost)
    {
      if (vec_outside_cost <= 0)
        min_profitable_iters = 1;
      else
        {
          min_profitable_iters = ((vec_outside_cost - scalar_outside_cost) * vf
                                  - vec_inside_cost * peel_iters_prologue
                                  - vec_inside_cost * peel_iters_epilogue)
                                 / ((scalar_single_iter_cost * vf)
                                    - vec_inside_cost);

          if ((scalar_single_iter_cost * vf * min_profitable_iters)
              <= ((vec_inside_cost * min_profitable_iters)
                  + ((vec_outside_cost - scalar_outside_cost) * vf)))
            min_profitable_iters++;
        }
    }
  /* vector version will never be profitable.  */
  else
    {
      if (vect_print_dump_info (REPORT_COST))
        fprintf (vect_dump, "cost model: vector iteration cost = %d "
                 "is divisible by scalar iteration cost = %d by a factor "
                 "greater than or equal to the vectorization factor = %d .",
                 vec_inside_cost, scalar_single_iter_cost, vf);
      return -1;
    }

  if (vect_print_dump_info (REPORT_COST))
    {
      fprintf (vect_dump, "Cost model analysis: \n");
      fprintf (vect_dump, "  Vector inside of loop cost: %d\n",
               vec_inside_cost);
      fprintf (vect_dump, "  Vector outside of loop cost: %d\n",
               vec_outside_cost);
      fprintf (vect_dump, "  Scalar iteration cost: %d\n",
               scalar_single_iter_cost);
      fprintf (vect_dump, "  Scalar outside cost: %d\n", scalar_outside_cost);
      fprintf (vect_dump, "  prologue iterations: %d\n",
               peel_iters_prologue);
      fprintf (vect_dump, "  epilogue iterations: %d\n",
               peel_iters_epilogue);
      fprintf (vect_dump, "  Calculated minimum iters for profitability: %d\n",
               min_profitable_iters);
    }

  min_profitable_iters =
        min_profitable_iters < vf ? vf : min_profitable_iters;

  /* Because the condition we create is:
     if (niters <= min_profitable_iters)
       then skip the vectorized loop.  */
  min_profitable_iters--;
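  /* E.g., with VF == 4 and a computed minimum of 2, the value is first raised
     to 4 and then decremented to 3, so the runtime guard becomes
     "if (niters <= 3) skip the vectorized loop".  */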

  if (vect_print_dump_info (REPORT_COST))
    fprintf (vect_dump, "  Profitability threshold = %d\n",
             min_profitable_iters);

  return min_profitable_iters;
}


/* TODO: Close dependency between vect_model_*_cost and vectorizable_*
   functions.  Design better to avoid maintenance issues.  */

/* Function vect_model_reduction_cost.

   Models cost for a reduction operation, including the vector ops
   generated within the strip-mine loop, the initial definition before
   the loop, and the epilogue code that must be generated.  */

static bool
vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
                           int ncopies)
{
  int outer_cost = 0;
  enum tree_code code;
  optab optab;
  tree vectype;
  gimple stmt, orig_stmt;
  tree reduction_op;
  enum machine_mode mode;
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);

  /* Cost of reduction op inside loop.  */
  STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) += ncopies * TARG_VEC_STMT_COST;

  stmt = STMT_VINFO_STMT (stmt_info);

  switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
    {
    case GIMPLE_SINGLE_RHS:
      gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt)) == ternary_op);
      reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), 2);
      break;
    case GIMPLE_UNARY_RHS:
      reduction_op = gimple_assign_rhs1 (stmt);
      break;
    case GIMPLE_BINARY_RHS:
      reduction_op = gimple_assign_rhs2 (stmt);
      break;
    default:
      gcc_unreachable ();
    }

  vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
  if (!vectype)
    {
      if (vect_print_dump_info (REPORT_COST))
        {
          fprintf (vect_dump, "unsupported data-type ");
          print_generic_expr (vect_dump, TREE_TYPE (reduction_op), TDF_SLIM);
        }
      return false;
    }

  mode = TYPE_MODE (vectype);
  orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);

  if (!orig_stmt)
    orig_stmt = STMT_VINFO_STMT (stmt_info);

  code = gimple_assign_rhs_code (orig_stmt);

  /* Add in cost for initial definition.  */
  outer_cost += TARG_SCALAR_TO_VEC_COST;

  /* Determine cost of epilogue code.

     We have a reduction operator that will reduce the vector in one statement.
     Also requires scalar extract.  */

  if (!nested_in_vect_loop_p (loop, orig_stmt))
    {
      if (reduc_code < NUM_TREE_CODES)
        outer_cost += TARG_VEC_STMT_COST + TARG_VEC_TO_SCALAR_COST;
      else
        {
          int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
          tree bitsize =
            TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
          int element_bitsize = tree_low_cst (bitsize, 1);
          int nelements = vec_size_in_bits / element_bitsize;

          optab = optab_for_tree_code (code, vectype, optab_default);

          /* We have a whole vector shift available.  */
          if (VECTOR_MODE_P (mode)
              && optab_handler (optab, mode)->insn_code != CODE_FOR_nothing
              && optab_handler (vec_shr_optab, mode)->insn_code != CODE_FOR_nothing)
            /* Final reduction via vector shifts and the reduction operator.  Also
               requires scalar extract.  */
            outer_cost += ((exact_log2(nelements) * 2) * TARG_VEC_STMT_COST
                           + TARG_VEC_TO_SCALAR_COST);
          else
            /* Use extracts and reduction op for final reduction.  For N elements,
               we have N extracts and N-1 reduction ops.  */
            outer_cost += ((nelements + nelements - 1) * TARG_VEC_STMT_COST);
        }
    }
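  /* For example, for a V4SI reduction (nelements == 4) and unit TARG_* costs,
     the whole-vector-shift epilogue above costs exact_log2 (4) * 2 + 1 == 5,
     whereas the extract-based fallback costs 4 + (4 - 1) == 7.  */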

  STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info) = outer_cost;

  if (vect_print_dump_info (REPORT_COST))
    fprintf (vect_dump, "vect_model_reduction_cost: inside_cost = %d, "
             "outside_cost = %d .", STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info),
             STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info));

  return true;
}


/* Function vect_model_induction_cost.

   Models cost for induction operations.  */

static void
vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
{
  /* loop cost for vec_loop.  */
  STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) = ncopies * TARG_VEC_STMT_COST;
  /* prologue cost for vec_init and vec_step.  */
  STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info) = 2 * TARG_SCALAR_TO_VEC_COST;

  if (vect_print_dump_info (REPORT_COST))
    fprintf (vect_dump, "vect_model_induction_cost: inside_cost = %d, "
             "outside_cost = %d .", STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info),
             STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info));
}


/* Function vect_model_simple_cost.

   Models cost for simple operations, i.e. those that only emit ncopies of a
   single op.  Right now, this does not account for multiple insns that could
   be generated for the single vector op.  We will handle that shortly.  */
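/* For example, a statement vectorized with NCOPIES == 2 whose second operand
   is loop-invariant gets, assuming unit TARG_* costs, inside_cost == 2 and
   outside_cost == 1 from the code below.  */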

void
vect_model_simple_cost (stmt_vec_info stmt_info, int ncopies,
                        enum vect_def_type *dt, slp_tree slp_node)
{
  int i;
  int inside_cost = 0, outside_cost = 0;

  /* The SLP costs were already calculated during SLP tree build.  */
  if (PURE_SLP_STMT (stmt_info))
    return;

  inside_cost = ncopies * TARG_VEC_STMT_COST;

  /* FORNOW: Assuming maximum 2 args per stmts.  */
  for (i = 0; i < 2; i++)
    {
      if (dt[i] == vect_constant_def || dt[i] == vect_invariant_def)
        outside_cost += TARG_SCALAR_TO_VEC_COST;
    }

  if (vect_print_dump_info (REPORT_COST))
    fprintf (vect_dump, "vect_model_simple_cost: inside_cost = %d, "
             "outside_cost = %d .", inside_cost, outside_cost);

  /* Set the costs either in STMT_INFO or SLP_NODE (if exists).  */
  stmt_vinfo_set_inside_of_loop_cost (stmt_info, slp_node, inside_cost);
  stmt_vinfo_set_outside_of_loop_cost (stmt_info, slp_node, outside_cost);
}


/* Function vect_cost_strided_group_size

   For strided load or store, return the group_size only if it is the first
   load or store of a group, else return 1.  This ensures that group size is
   only returned once per group.  */

static int
vect_cost_strided_group_size (stmt_vec_info stmt_info)
{
  gimple first_stmt = DR_GROUP_FIRST_DR (stmt_info);

  if (first_stmt == STMT_VINFO_STMT (stmt_info))
    return DR_GROUP_SIZE (stmt_info);

  return 1;
}


/* Function vect_model_store_cost

   Models cost for stores.  In the case of strided accesses, one access
   has the overhead of the strided access attributed to it.  */

void
vect_model_store_cost (stmt_vec_info stmt_info, int ncopies,
                       enum vect_def_type dt, slp_tree slp_node)
{
  int group_size;
  int inside_cost = 0, outside_cost = 0;

  /* The SLP costs were already calculated during SLP tree build.  */
  if (PURE_SLP_STMT (stmt_info))
    return;

  if (dt == vect_constant_def || dt == vect_invariant_def)
    outside_cost = TARG_SCALAR_TO_VEC_COST;

  /* Strided access?  */
  if (DR_GROUP_FIRST_DR (stmt_info) && !slp_node)
    group_size = vect_cost_strided_group_size (stmt_info);
  /* Not a strided access.  */
  else
    group_size = 1;

  /* Is this an access in a group of stores, which provide strided access?
     If so, add in the cost of the permutes.  */
  if (group_size > 1)
    {
      /* Uses a high and low interleave operation for each needed permute.  */
      inside_cost = ncopies * exact_log2(group_size) * group_size
                    * TARG_VEC_STMT_COST;

      if (vect_print_dump_info (REPORT_COST))
        fprintf (vect_dump, "vect_model_store_cost: strided group_size = %d .",
                 group_size);
    }
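  /* For example, an interleaved store group with GROUP_SIZE == 4 and
     NCOPIES == 1 is charged log2(4) * 4 == 8 high/low interleave statements
     (assuming TARG_VEC_STMT_COST == 1); the stores themselves then add
     NCOPIES * TARG_VEC_STORE_COST below.  */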

  /* Costs of the stores.  */
  inside_cost += ncopies * TARG_VEC_STORE_COST;

  if (vect_print_dump_info (REPORT_COST))
    fprintf (vect_dump, "vect_model_store_cost: inside_cost = %d, "
             "outside_cost = %d .", inside_cost, outside_cost);

  /* Set the costs either in STMT_INFO or SLP_NODE (if exists).  */
  stmt_vinfo_set_inside_of_loop_cost (stmt_info, slp_node, inside_cost);
  stmt_vinfo_set_outside_of_loop_cost (stmt_info, slp_node, outside_cost);
}


/* Function vect_model_load_cost

   Models cost for loads.  In the case of strided accesses, the last access
   has the overhead of the strided access attributed to it.  Since unaligned
   accesses are supported for loads, we also account for the costs of the
   access scheme chosen.  */

void
vect_model_load_cost (stmt_vec_info stmt_info, int ncopies, slp_tree slp_node)
{
  int group_size;
  int alignment_support_scheme;
  gimple first_stmt;
  struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info), *first_dr;
  int inside_cost = 0, outside_cost = 0;

  /* The SLP costs were already calculated during SLP tree build.  */
  if (PURE_SLP_STMT (stmt_info))
    return;

  /* Strided accesses?  */
  first_stmt = DR_GROUP_FIRST_DR (stmt_info);
  if (first_stmt && !slp_node)
    {
      group_size = vect_cost_strided_group_size (stmt_info);
      first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
    }
  /* Not a strided access.  */
  else
    {
      group_size = 1;
      first_dr = dr;
    }

  alignment_support_scheme = vect_supportable_dr_alignment (first_dr);

  /* Is this an access in a group of loads providing strided access?
     If so, add in the cost of the permutes.  */
  if (group_size > 1)
    {
      /* Uses even and odd extract operations for each needed permute.  */
      inside_cost = ncopies * exact_log2(group_size) * group_size
                    * TARG_VEC_STMT_COST;

      if (vect_print_dump_info (REPORT_COST))
        fprintf (vect_dump, "vect_model_load_cost: strided group_size = %d .",
                 group_size);
    }
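  /* For example, an interleaved load group with GROUP_SIZE == 2 and
     NCOPIES == 2 is charged 2 * log2(2) * 2 == 4 even/odd extract statements
     (assuming TARG_VEC_STMT_COST == 1), in addition to the per-copy load cost
     selected by the switch below.  */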

  /* The loads themselves.  */
  switch (alignment_support_scheme)
    {
    case dr_aligned:
      {
        inside_cost += ncopies * TARG_VEC_LOAD_COST;

        if (vect_print_dump_info (REPORT_COST))
          fprintf (vect_dump, "vect_model_load_cost: aligned.");

        break;
      }
    case dr_unaligned_supported:
      {
        /* Here, we assign an additional cost for the unaligned load.  */
        inside_cost += ncopies * TARG_VEC_UNALIGNED_LOAD_COST;

        if (vect_print_dump_info (REPORT_COST))
          fprintf (vect_dump, "vect_model_load_cost: unaligned supported by "
                   "hardware.");

        break;
      }
    case dr_explicit_realign:
      {
        inside_cost += ncopies * (2*TARG_VEC_LOAD_COST + TARG_VEC_STMT_COST);

        /* FIXME: If the misalignment remains fixed across the iterations of
           the containing loop, the following cost should be added to the
           outside costs.  */
        if (targetm.vectorize.builtin_mask_for_load)
          inside_cost += TARG_VEC_STMT_COST;

        break;
      }
    case dr_explicit_realign_optimized:
      {
        if (vect_print_dump_info (REPORT_COST))
          fprintf (vect_dump, "vect_model_load_cost: unaligned software "
                   "pipelined.");

        /* Unaligned software pipeline has a load of an address, an initial
           load, and possibly a mask operation to "prime" the loop.  However,
           if this is an access in a group of loads, which provide strided
           access, then the above cost should only be considered for one
           access in the group.  Inside the loop, there is a load op
           and a realignment op.  */

        if ((!DR_GROUP_FIRST_DR (stmt_info)) || group_size > 1 || slp_node)
          {
            outside_cost = 2*TARG_VEC_STMT_COST;
            if (targetm.vectorize.builtin_mask_for_load)
              outside_cost += TARG_VEC_STMT_COST;
          }

        inside_cost += ncopies * (TARG_VEC_LOAD_COST + TARG_VEC_STMT_COST);

        break;
      }

    default:
      gcc_unreachable ();
    }

  if (vect_print_dump_info (REPORT_COST))
    fprintf (vect_dump, "vect_model_load_cost: inside_cost = %d, "
             "outside_cost = %d .", inside_cost, outside_cost);

  /* Set the costs either in STMT_INFO or SLP_NODE (if exists).  */
  stmt_vinfo_set_inside_of_loop_cost (stmt_info, slp_node, inside_cost);
  stmt_vinfo_set_outside_of_loop_cost (stmt_info, slp_node, outside_cost);
}


/* Function vect_get_new_vect_var.

   Returns a name for a new variable.  The current naming scheme appends the
   prefix "vect_" or "vect_p" (depending on the value of VAR_KIND) to
   the name of vectorizer generated variables, and appends that to NAME if
   provided.  */

static tree
vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
{
  const char *prefix;
  tree new_vect_var;

  switch (var_kind)
  {
  case vect_simple_var:
    prefix = "vect_";
    break;
  case vect_scalar_var:
    prefix = "stmp_";
    break;
  case vect_pointer_var:
    prefix = "vect_p";
    break;
  default:
    gcc_unreachable ();
  }

  if (name)
    {
      char* tmp = concat (prefix, name, NULL);
      new_vect_var = create_tmp_var (type, tmp);
      free (tmp);
    }
  else
    new_vect_var = create_tmp_var (type, prefix);

  /* Mark vector typed variable as a gimple register variable.  */
  if (TREE_CODE (type) == VECTOR_TYPE)
    DECL_GIMPLE_REG_P (new_vect_var) = true;

  return new_vect_var;
}


/* Function vect_create_addr_base_for_vector_ref.

   Create an expression that computes the address of the first memory location
   that will be accessed for a data reference.

   Input:
   STMT: The statement containing the data reference.
   NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
   OFFSET: Optional.  If supplied, it is added to the initial address.
   LOOP:   Specify relative to which loop-nest should the address be computed.
           For example, when the dataref is in an inner-loop nested in an
           outer-loop that is now being vectorized, LOOP can be either the
           outer-loop, or the inner-loop.  The first memory location accessed
           by the following dataref ('in' points to short):

               for (i=0; i<N; i++)
                  for (j=0; j<M; j++)
                    s += in[i+j]

           is as follows:
           if LOOP=i_loop:     &in             (relative to i_loop)
           if LOOP=j_loop:     &in+i*2B        (relative to j_loop)

   Output:
   1. Return an SSA_NAME whose value is the address of the memory location of
      the first vector of the data reference.
   2. If new_stmt_list is not NULL_TREE after return then the caller must insert
      these statement(s) which define the returned SSA_NAME.

   FORNOW: We are only handling array accesses with step 1.  */
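/* Roughly, for a dataref with base address B, DR_OFFSET O, DR_INIT I, and a
   supplied OFFSET (counted in elements of STEP bytes), the code below emits
   into NEW_STMT_LIST a sequence along the lines of:

        batmp = B;
        base_off = O + I;
        base_off = base_off + OFFSET * STEP;
        addr = (vectype *) batmp p+ base_off;

   and returns the SSA name holding 'addr'.  ("batmp", "base_off" and
   "offset" are the temporaries created below; "p+" stands for
   POINTER_PLUS_EXPR.)  */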

static tree
vect_create_addr_base_for_vector_ref (gimple stmt,
                                      gimple_seq *new_stmt_list,
                                      tree offset,
                                      struct loop *loop)
{
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
  struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
  tree data_ref_base = unshare_expr (DR_BASE_ADDRESS (dr));
  tree base_name;
  tree data_ref_base_var;
  tree vec_stmt;
  tree addr_base, addr_expr;
  tree dest;
  gimple_seq seq = NULL;
  tree base_offset = unshare_expr (DR_OFFSET (dr));
  tree init = unshare_expr (DR_INIT (dr));
  tree vect_ptr_type, addr_expr2;
  tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));

  gcc_assert (loop);
  if (loop != containing_loop)
    {
      loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
      struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);

      gcc_assert (nested_in_vect_loop_p (loop, stmt));

      data_ref_base = unshare_expr (STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
      base_offset = unshare_expr (STMT_VINFO_DR_OFFSET (stmt_info));
      init = unshare_expr (STMT_VINFO_DR_INIT (stmt_info));
    }

  /* Create data_ref_base */
  base_name = build_fold_indirect_ref (data_ref_base);
  data_ref_base_var = create_tmp_var (TREE_TYPE (data_ref_base), "batmp");
  add_referenced_var (data_ref_base_var);
  data_ref_base = force_gimple_operand (data_ref_base, &seq, true,
                                        data_ref_base_var);
  gimple_seq_add_seq (new_stmt_list, seq);

  /* Create base_offset */
  base_offset = size_binop (PLUS_EXPR,
                            fold_convert (sizetype, base_offset),
                            fold_convert (sizetype, init));
  dest = create_tmp_var (sizetype, "base_off");
  add_referenced_var (dest);
  base_offset = force_gimple_operand (base_offset, &seq, true, dest);
  gimple_seq_add_seq (new_stmt_list, seq);

  if (offset)
    {
      tree tmp = create_tmp_var (sizetype, "offset");

      add_referenced_var (tmp);
      offset = fold_build2 (MULT_EXPR, sizetype,
                            fold_convert (sizetype, offset), step);
      base_offset = fold_build2 (PLUS_EXPR, sizetype,
                                 base_offset, offset);
      base_offset = force_gimple_operand (base_offset, &seq, false, tmp);
      gimple_seq_add_seq (new_stmt_list, seq);
    }

  /* base + base_offset */
  addr_base = fold_build2 (POINTER_PLUS_EXPR, TREE_TYPE (data_ref_base),
                           data_ref_base, base_offset);

  vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));

  /* addr_expr = addr_base */
  addr_expr = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var,
                                     get_name (base_name));
  add_referenced_var (addr_expr);
  vec_stmt = fold_convert (vect_ptr_type, addr_base);
  addr_expr2 = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var,
                                      get_name (base_name));
  add_referenced_var (addr_expr2);
  vec_stmt = force_gimple_operand (vec_stmt, &seq, false, addr_expr2);
  gimple_seq_add_seq (new_stmt_list, seq);

  if (vect_print_dump_info (REPORT_DETAILS))
    {
      fprintf (vect_dump, "created ");
      print_generic_expr (vect_dump, vec_stmt, TDF_SLIM);
    }

  return vec_stmt;
}


/* Function vect_create_data_ref_ptr.

   Create a new pointer to vector type (vp), that points to the first location
   accessed in the loop by STMT, along with the def-use update chain to
   appropriately advance the pointer through the loop iterations.  Also set
   aliasing information for the pointer.  This vector pointer is used by the
   callers to this function to create a memory reference expression for vector
   load/store access.

   Input:
   1. STMT: a stmt that references memory.  Expected to be of the form
         GIMPLE_ASSIGN <name, data-ref> or
         GIMPLE_ASSIGN <data-ref, name>.
   2. AT_LOOP: the loop where the vector memref is to be created.
   3. OFFSET (optional): an offset to be added to the initial address accessed
        by the data-ref in STMT.
   4. ONLY_INIT: indicate if vp is to be updated in the loop, or remain
        pointing to the initial address.
   5. TYPE: if not NULL indicates the required type of the data-ref.

   Output:
   1. Declare a new ptr to vector_type, and have it point to the base of the
      data reference (initial address accessed by the data reference).
      For example, for vector of type V8HI, the following code is generated:

      v8hi *vp;
      vp = (v8hi *)initial_address;

      if OFFSET is not supplied:
         initial_address = &a[init];
      if OFFSET is supplied:
         initial_address = &a[init + OFFSET];

      Return the initial_address in INITIAL_ADDRESS.

   2. If ONLY_INIT is true, just return the initial pointer.  Otherwise, also
      update the pointer in each iteration of the loop.

      Return the increment stmt that updates the pointer in PTR_INCR.

   3. Set INV_P to true if the access pattern of the data reference in the
      vectorized loop is invariant.  Set it to false otherwise.

   4. Return the pointer.  */

static tree
vect_create_data_ref_ptr (gimple stmt, struct loop *at_loop,
                          tree offset, tree *initial_address, gimple *ptr_incr,
                          bool only_init, bool *inv_p, tree type)
{
  tree base_name;
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  bool nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
  struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
  tree vect_ptr_type;
  tree vect_ptr;
  tree tag;
  tree new_temp;
  gimple vec_stmt;
  gimple_seq new_stmt_list = NULL;
  edge pe;
  basic_block new_bb;
  tree vect_ptr_init;
  struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
  tree vptr;
  gimple_stmt_iterator incr_gsi;
  bool insert_after;
  tree indx_before_incr, indx_after_incr;
  gimple incr;
  tree step;

  /* Check the step (evolution) of the load in LOOP, and record
     whether it's invariant.  */
  if (nested_in_vect_loop)
    step = STMT_VINFO_DR_STEP (stmt_info);
  else
    step = DR_STEP (STMT_VINFO_DATA_REF (stmt_info));

  if (tree_int_cst_compare (step, size_zero_node) == 0)
    *inv_p = true;
  else
    *inv_p = false;

  /* Create an expression for the first address accessed by this load
     in LOOP.  */
  base_name = build_fold_indirect_ref (unshare_expr (DR_BASE_ADDRESS (dr)));

  if (vect_print_dump_info (REPORT_DETAILS))
    {
      tree data_ref_base = base_name;
      fprintf (vect_dump, "create vector-pointer variable to type: ");
      print_generic_expr (vect_dump, vectype, TDF_SLIM);
      if (TREE_CODE (data_ref_base) == VAR_DECL)
        fprintf (vect_dump, "  vectorizing a one dimensional array ref: ");
      else if (TREE_CODE (data_ref_base) == ARRAY_REF)
        fprintf (vect_dump, "  vectorizing a multidimensional array ref: ");
      else if (TREE_CODE (data_ref_base) == COMPONENT_REF)
        fprintf (vect_dump, "  vectorizing a record based array ref: ");
      else if (TREE_CODE (data_ref_base) == SSA_NAME)
        fprintf (vect_dump, "  vectorizing a pointer ref: ");
      print_generic_expr (vect_dump, base_name, TDF_SLIM);
    }

  /** (1) Create the new vector-pointer variable:  **/
  if (type)
    vect_ptr_type = build_pointer_type (type);
  else
    vect_ptr_type = build_pointer_type (vectype);
  vect_ptr = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var,
                                    get_name (base_name));
  add_referenced_var (vect_ptr);

  /** (2) Add aliasing information to the new vector-pointer:
          (The points-to info (DR_PTR_INFO) may be defined later.)  **/

  tag = DR_SYMBOL_TAG (dr);
  gcc_assert (tag);

  /* If tag is a variable (and NOT_A_TAG) then a new symbol memory
     tag must be created with tag added to its may alias list.  */
  if (!MTAG_P (tag))
    new_type_alias (vect_ptr, tag, DR_REF (dr));
  else
    set_symbol_mem_tag (vect_ptr, tag);

  /** Note: If the dataref is in an inner-loop nested in LOOP, and we are
      vectorizing LOOP (i.e. outer-loop vectorization), we need to create two
      def-use update cycles for the pointer: One relative to the outer-loop
      (LOOP), which is what steps (3) and (4) below do.  The other is relative
      to the inner-loop (which is the inner-most loop containing the dataref),
      and this is done by step (5) below.

      When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
      inner-most loop, and so steps (3),(4) work the same, and step (5) is
      redundant.  Steps (3),(4) create the following:

        vp0 = &base_addr;
        LOOP:   vp1 = phi(vp0,vp2)
                ...
                ...
                vp2 = vp1 + step
                goto LOOP

      If there is an inner-loop nested in loop, then step (5) will also be
      applied, and an additional update in the inner-loop will be created:

        vp0 = &base_addr;
        LOOP:   vp1 = phi(vp0,vp2)
                ...
        inner:     vp3 = phi(vp1,vp4)
                   vp4 = vp3 + inner_step
                   if () goto inner
                ...
                vp2 = vp1 + step
                if () goto LOOP   */

  /** (3) Calculate the initial address the vector-pointer, and set
          the vector-pointer to point to it before the loop:  **/

  /* Create: (&(base[init_val+offset]) in the loop preheader.  */

  new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
                                                   offset, loop);
  pe = loop_preheader_edge (loop);
  if (new_stmt_list)
    {
      new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
      gcc_assert (!new_bb);
    }

  *initial_address = new_temp;

  /* Create: p = (vectype *) initial_base  */
  vec_stmt = gimple_build_assign (vect_ptr,
                                  fold_convert (vect_ptr_type, new_temp));
  vect_ptr_init = make_ssa_name (vect_ptr, vec_stmt);
  gimple_assign_set_lhs (vec_stmt, vect_ptr_init);
  new_bb = gsi_insert_on_edge_immediate (pe, vec_stmt);
  gcc_assert (!new_bb);

  /** (4) Handle the updating of the vector-pointer inside the loop.
          This is needed when ONLY_INIT is false, and also when AT_LOOP
          is the inner-loop nested in LOOP (during outer-loop vectorization).
   **/

  if (only_init && at_loop == loop) /* No update in loop is required.  */
    {
      /* Copy the points-to information if it exists.  */
      if (DR_PTR_INFO (dr))
        duplicate_ssa_name_ptr_info (vect_ptr_init, DR_PTR_INFO (dr));
      vptr = vect_ptr_init;
    }
  else
    {
      /* The step of the vector pointer is the Vector Size.  */
      tree step = TYPE_SIZE_UNIT (vectype);
      /* One exception to the above is when the scalar step of the load in
         LOOP is zero.  In this case the step here is also zero.  */
      if (*inv_p)
        step = size_zero_node;

      standard_iv_increment_position (loop, &incr_gsi, &insert_after);

      create_iv (vect_ptr_init,
                 fold_convert (vect_ptr_type, step),
                 NULL_TREE, loop, &incr_gsi, insert_after,
                 &indx_before_incr, &indx_after_incr);
      incr = gsi_stmt (incr_gsi);
      set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));

      /* Copy the points-to information if it exists.  */
      if (DR_PTR_INFO (dr))
        {
          duplicate_ssa_name_ptr_info (indx_before_incr, DR_PTR_INFO (dr));
          duplicate_ssa_name_ptr_info (indx_after_incr, DR_PTR_INFO (dr));
        }
      merge_alias_info (vect_ptr_init, indx_before_incr);
      merge_alias_info (vect_ptr_init, indx_after_incr);
      if (ptr_incr)
        *ptr_incr = incr;

      vptr = indx_before_incr;
    }

  if (!nested_in_vect_loop || only_init)
    return vptr;

  /** (5) Handle the updating of the vector-pointer inside the inner-loop
          nested in LOOP, if exists: **/

  gcc_assert (nested_in_vect_loop);
  if (!only_init)
    {
      standard_iv_increment_position (containing_loop, &incr_gsi,
                                      &insert_after);
      create_iv (vptr, fold_convert (vect_ptr_type, DR_STEP (dr)), NULL_TREE,
                 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
                 &indx_after_incr);
      incr = gsi_stmt (incr_gsi);
      set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));

      /* Copy the points-to information if it exists.  */
      if (DR_PTR_INFO (dr))
        {
          duplicate_ssa_name_ptr_info (indx_before_incr, DR_PTR_INFO (dr));
          duplicate_ssa_name_ptr_info (indx_after_incr, DR_PTR_INFO (dr));
        }
      merge_alias_info (vect_ptr_init, indx_before_incr);
      merge_alias_info (vect_ptr_init, indx_after_incr);
      if (ptr_incr)
        *ptr_incr = incr;

      return indx_before_incr;
    }
  else
    gcc_unreachable ();
}


/* Function bump_vector_ptr

   Increment a pointer (to a vector type) by vector-size.  If requested,
   i.e. if PTR-INCR is given, then also connect the new increment stmt
   to the existing def-use update-chain of the pointer, by modifying
   the PTR_INCR as illustrated below:

   The pointer def-use update-chain before this function:
                        DATAREF_PTR = phi (p_0, p_2)
                        ....
        PTR_INCR:       p_2 = DATAREF_PTR + step

   The pointer def-use update-chain after this function:
                        DATAREF_PTR = phi (p_0, p_2)
                        ....
                        NEW_DATAREF_PTR = DATAREF_PTR + BUMP
                        ....
        PTR_INCR:       p_2 = NEW_DATAREF_PTR + step

   Input:
   DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
                 in the loop.
   PTR_INCR - optional.  The stmt that updates the pointer in each iteration of
              the loop.  The increment amount across iterations is expected
              to be vector_size.
   BSI - location where the new update stmt is to be placed.
   STMT - the original scalar memory-access stmt that is being vectorized.
   BUMP - optional.  The offset by which to bump the pointer.  If not given,
          the offset is assumed to be vector_size.

   Output: Return NEW_DATAREF_PTR as illustrated above.  */

tree
bump_vector_ptr (tree dataref_ptr, gimple ptr_incr, gimple_stmt_iterator *gsi,
                 gimple stmt, tree bump)
{
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
  tree ptr_var = SSA_NAME_VAR (dataref_ptr);
  tree update = TYPE_SIZE_UNIT (vectype);
  gimple incr_stmt;
  ssa_op_iter iter;
  use_operand_p use_p;
  tree new_dataref_ptr;

  if (bump)
    update = bump;

  incr_stmt = gimple_build_assign_with_ops (POINTER_PLUS_EXPR, ptr_var,
                                            dataref_ptr, update);
  new_dataref_ptr = make_ssa_name (ptr_var, incr_stmt);
  gimple_assign_set_lhs (incr_stmt, new_dataref_ptr);
  vect_finish_stmt_generation (stmt, incr_stmt, gsi);

  /* Copy the points-to information if it exists.  */
  if (DR_PTR_INFO (dr))
    duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
  merge_alias_info (new_dataref_ptr, dataref_ptr);

  if (!ptr_incr)
    return new_dataref_ptr;

  /* Update the vector-pointer's cross-iteration increment.  */
  FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
    {
      tree use = USE_FROM_PTR (use_p);

      if (use == dataref_ptr)
        SET_USE (use_p, new_dataref_ptr);
      else
        gcc_assert (tree_int_cst_compare (use, update) == 0);
    }

  return new_dataref_ptr;
}


/* Function vect_create_destination_var.

   Create a new temporary of type VECTYPE.  */

static tree
vect_create_destination_var (tree scalar_dest, tree vectype)
{
  tree vec_dest;
  const char *new_name;
  tree type;
  enum vect_var_kind kind;

  kind = vectype ? vect_simple_var : vect_scalar_var;
  type = vectype ? vectype : TREE_TYPE (scalar_dest);

  gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);

  new_name = get_name (scalar_dest);
  if (!new_name)
    new_name = "var_";
  vec_dest = vect_get_new_vect_var (type, kind, new_name);
  add_referenced_var (vec_dest);

  return vec_dest;
}


/* Function vect_init_vector.

   Insert a new stmt (INIT_STMT) that initializes a new vector variable with
   the vector elements of VECTOR_VAR.  Place the initialization at BSI if it
   is not NULL.  Otherwise, place the initialization at the loop preheader.
   Return the DEF of INIT_STMT.
   It will be used in the vectorization of STMT.  */

static tree
vect_init_vector (gimple stmt, tree vector_var, tree vector_type,
                  gimple_stmt_iterator *gsi)
{
  stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
  tree new_var;
  gimple init_stmt;
  tree vec_oprnd;
  edge pe;
  tree new_temp;
  basic_block new_bb;

  new_var = vect_get_new_vect_var (vector_type, vect_simple_var, "cst_");
  add_referenced_var (new_var);
  init_stmt = gimple_build_assign (new_var, vector_var);
  new_temp = make_ssa_name (new_var, init_stmt);
  gimple_assign_set_lhs (init_stmt, new_temp);

  if (gsi)
    vect_finish_stmt_generation (stmt, init_stmt, gsi);
  else
    {
      loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
      struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);

      if (nested_in_vect_loop_p (loop, stmt))
        loop = loop->inner;
      pe = loop_preheader_edge (loop);
      new_bb = gsi_insert_on_edge_immediate (pe, init_stmt);
      gcc_assert (!new_bb);
    }

  if (vect_print_dump_info (REPORT_DETAILS))
    {
      fprintf (vect_dump, "created new init_stmt: ");
      print_gimple_stmt (vect_dump, init_stmt, 0, TDF_SLIM);
    }

  vec_oprnd = gimple_assign_lhs (init_stmt);
  return vec_oprnd;
}


/* For constant and loop invariant defs of SLP_NODE this function returns
   (vector) defs (VEC_OPRNDS) that will be used in the vectorized stmts.
   OP_NUM determines if we gather defs for operand 0 or operand 1 of the scalar
   stmts.  NUMBER_OF_VECTORS is the number of vector defs to create.  */

static void
vect_get_constant_vectors (slp_tree slp_node, VEC(tree,heap) **vec_oprnds,
                           unsigned int op_num, unsigned int number_of_vectors)
{
  VEC (gimple, heap) *stmts = SLP_TREE_SCALAR_STMTS (slp_node);
  gimple stmt = VEC_index (gimple, stmts, 0);
  stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
  tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
  int nunits = TYPE_VECTOR_SUBPARTS (vectype);
  tree vec_cst;
  tree t = NULL_TREE;
  int j, number_of_places_left_in_vector;
  tree vector_type;
  tree op, vop;
  int group_size = VEC_length (gimple, stmts);
  unsigned int vec_num, i;
  int number_of_copies = 1;
  bool is_store = false;
  VEC (tree, heap) *voprnds = VEC_alloc (tree, heap, number_of_vectors);
  bool constant_p;

  if (STMT_VINFO_DATA_REF (stmt_vinfo))
    is_store = true;

  /* NUMBER_OF_COPIES is the number of times we need to use the same values in
     created vectors.  It is greater than 1 if unrolling is performed.

     For example, we have two scalar operands, s1 and s2 (e.g., group of
     strided accesses of size two), while NUNITS is four (i.e., four scalars
     of this type can be packed in a vector).  The output vector will contain
     two copies of each scalar operand: {s1, s2, s1, s2}.  (NUMBER_OF_COPIES
     will be 2).

     If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
     containing the operands.

     For example, NUNITS is four as before, and the group size is 8
     (s1, s2, ..., s8).  We will create two vectors {s1, s2, s3, s4} and
     {s5, s6, s7, s8}.  */
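  /* E.g., with NUNITS == 4: a group of size 2 gives
     least_common_multiple (4, 2) / 2 == 2 copies per vector ({s1, s2, s1, s2}),
     while a group of size 8 gives least_common_multiple (4, 8) / 8 == 1, the
     group being spread over two vectors.  */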
  number_of_copies = least_common_multiple (nunits, group_size) / group_size;

  number_of_places_left_in_vector = nunits;
  constant_p = true;
  for (j = 0; j < number_of_copies; j++)
    {
      for (i = group_size - 1; VEC_iterate (gimple, stmts, i, stmt); i--)
        {
          if (is_store)
            op = gimple_assign_rhs1 (stmt);
          else
            op = gimple_op (stmt, op_num + 1);
          if (!CONSTANT_CLASS_P (op))
            constant_p = false;

          /* Create 'vect_ = {op0,op1,...,opn}'.  */
          t = tree_cons (NULL_TREE, op, t);

          number_of_places_left_in_vector--;

          if (number_of_places_left_in_vector == 0)
            {
              number_of_places_left_in_vector = nunits;

              vector_type = get_vectype_for_scalar_type (TREE_TYPE (op));
              gcc_assert (vector_type);
              if (constant_p)
                vec_cst = build_vector (vector_type, t);
              else
                vec_cst = build_constructor_from_list (vector_type, t);
              constant_p = true;
              VEC_quick_push (tree, voprnds,
                              vect_init_vector (stmt, vec_cst, vector_type,
                                                NULL));
              t = NULL_TREE;
            }
        }
    }

  /* Since the vectors are created in the reverse order, we should invert
     them.  */
  vec_num = VEC_length (tree, voprnds);
  for (j = vec_num - 1; j >= 0; j--)
    {
      vop = VEC_index (tree, voprnds, j);
      VEC_quick_push (tree, *vec_oprnds, vop);
    }

  VEC_free (tree, heap, voprnds);

  /* In case that VF is greater than the unrolling factor needed for the SLP
     group of stmts, NUMBER_OF_VECTORS to be created is greater than
     NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
     to replicate the vectors.  */
  while (number_of_vectors > VEC_length (tree, *vec_oprnds))
    {
      for (i = 0; VEC_iterate (tree, *vec_oprnds, i, vop) && i < vec_num; i++)
        VEC_quick_push (tree, *vec_oprnds, vop);
    }
}


/* Get vectorized definitions from SLP_NODE that contains corresponding
   vectorized def-stmts.  */

static void
vect_get_slp_vect_defs (slp_tree slp_node, VEC (tree,heap) **vec_oprnds)
{
  tree vec_oprnd;
  gimple vec_def_stmt;
  unsigned int i;

  gcc_assert (SLP_TREE_VEC_STMTS (slp_node));

  for (i = 0;
       VEC_iterate (gimple, SLP_TREE_VEC_STMTS (slp_node), i, vec_def_stmt);
       i++)
    {
      gcc_assert (vec_def_stmt);
      vec_oprnd = gimple_get_lhs (vec_def_stmt);
      VEC_quick_push (tree, *vec_oprnds, vec_oprnd);
    }
}


/* Get vectorized definitions for SLP_NODE.
   If the scalar definitions are loop invariants or constants, collect them and
   call vect_get_constant_vectors() to create vector stmts.
   Otherwise, the def-stmts must be already vectorized and the vectorized stmts
   must be stored in the LEFT/RIGHT node of SLP_NODE, and we call
   vect_get_slp_vect_defs() to retrieve them.
   If VEC_OPRNDS1 is NULL, don't get vector defs for the second operand (from
   the right node).  This is used when the second operand must remain scalar.  */

static void
vect_get_slp_defs (slp_tree slp_node, VEC (tree,heap) **vec_oprnds0,
                   VEC (tree,heap) **vec_oprnds1)
{
  gimple first_stmt;
  enum tree_code code;
  int number_of_vects;
  HOST_WIDE_INT lhs_size_unit, rhs_size_unit;

  first_stmt = VEC_index (gimple, SLP_TREE_SCALAR_STMTS (slp_node), 0);
  /* The number of vector defs is determined by the number of vector statements
     in the node from which we get those statements.  */
  if (SLP_TREE_LEFT (slp_node))
    number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_LEFT (slp_node));
  else
    {
      number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
      /* Number of vector stmts was calculated according to LHS in
         vect_schedule_slp_instance(), fix it by replacing LHS with RHS, if
         necessary.  See vect_get_smallest_scalar_type() for details.  */
      vect_get_smallest_scalar_type (first_stmt, &lhs_size_unit,
                                     &rhs_size_unit);
      if (rhs_size_unit != lhs_size_unit)
        {
          number_of_vects *= rhs_size_unit;
          number_of_vects /= lhs_size_unit;
        }
    }

  /* Allocate memory for vectorized defs.  */
  *vec_oprnds0 = VEC_alloc (tree, heap, number_of_vects);

  /* SLP_NODE corresponds either to a group of stores or to a group of
     unary/binary operations.  We don't call this function for loads.  */
  if (SLP_TREE_LEFT (slp_node))
    /* The defs are already vectorized.  */
    vect_get_slp_vect_defs (SLP_TREE_LEFT (slp_node), vec_oprnds0);
  else
    /* Build vectors from scalar defs.  */
    vect_get_constant_vectors (slp_node, vec_oprnds0, 0, number_of_vects);

  if (STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt)))
    /* Since we don't call this function with loads, this is a group of
       stores.  */
    return;

  code = gimple_assign_rhs_code (first_stmt);
  if (get_gimple_rhs_class (code) != GIMPLE_BINARY_RHS || !vec_oprnds1)
    return;

  /* The number of vector defs is determined by the number of vector statements
     in the node from which we get those statements.  */
  if (SLP_TREE_RIGHT (slp_node))
    number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_RIGHT (slp_node));
  else
    number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);

  *vec_oprnds1 = VEC_alloc (tree, heap, number_of_vects);

  if (SLP_TREE_RIGHT (slp_node))
    /* The defs are already vectorized.  */
    vect_get_slp_vect_defs (SLP_TREE_RIGHT (slp_node), vec_oprnds1);
  else
    /* Build vectors from scalar defs.  */
    vect_get_constant_vectors (slp_node, vec_oprnds1, 1, number_of_vects);
}


/* Function get_initial_def_for_induction

   Input:
   STMT - a stmt that performs an induction operation in the loop.
   IV_PHI - the initial value of the induction variable

   Output:
   Return a vector variable, initialized with the first VF values of
   the induction variable.  E.g., for an iv with IV_PHI='X' and
   evolution S, for a vector of 4 units, we want to return:
   [X, X + S, X + 2*S, X + 3*S].  */

static tree
get_initial_def_for_induction (gimple iv_phi)
{
  stmt_vec_info stmt_vinfo = vinfo_for_stmt (iv_phi);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  tree scalar_type = TREE_TYPE (gimple_phi_result (iv_phi));
  tree vectype;
  int nunits;
  edge pe = loop_preheader_edge (loop);
  struct loop *iv_loop;
  basic_block new_bb;
  tree vec, vec_init, vec_step, t;
  tree access_fn;
  tree new_var;
  tree new_name;
  gimple init_stmt, induction_phi, new_stmt;
  tree induc_def, vec_def, vec_dest;
  tree init_expr, step_expr;
  int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
  int i;
  bool ok;
  int ncopies;
  tree expr;
  stmt_vec_info phi_info = vinfo_for_stmt (iv_phi);
  bool nested_in_vect_loop = false;
  gimple_seq stmts = NULL;
  imm_use_iterator imm_iter;
  use_operand_p use_p;
  gimple exit_phi;
  edge latch_e;
  tree loop_arg;
  gimple_stmt_iterator si;
  basic_block bb = gimple_bb (iv_phi);

  vectype = get_vectype_for_scalar_type (scalar_type);
  gcc_assert (vectype);
  nunits = TYPE_VECTOR_SUBPARTS (vectype);
  ncopies = vf / nunits;

  gcc_assert (phi_info);
  gcc_assert (ncopies >= 1);

  /* Find the first insertion point in the BB.  */
  si = gsi_after_labels (bb);

  if (INTEGRAL_TYPE_P (scalar_type) || POINTER_TYPE_P (scalar_type))
    step_expr = build_int_cst (scalar_type, 0);
  else
    step_expr = build_real (scalar_type, dconst0);

  /* Is phi in an inner-loop, while vectorizing an enclosing outer-loop?  */
  if (nested_in_vect_loop_p (loop, iv_phi))
    {
      nested_in_vect_loop = true;
      iv_loop = loop->inner;
    }
  else
    iv_loop = loop;
  gcc_assert (iv_loop == (gimple_bb (iv_phi))->loop_father);

  latch_e = loop_latch_edge (iv_loop);
  loop_arg = PHI_ARG_DEF_FROM_EDGE (iv_phi, latch_e);

  access_fn = analyze_scalar_evolution (iv_loop, PHI_RESULT (iv_phi));
  gcc_assert (access_fn);
  ok = vect_is_simple_iv_evolution (iv_loop->num, access_fn,
                                    &init_expr, &step_expr);
  gcc_assert (ok);
  pe = loop_preheader_edge (iv_loop);

  /* Create the vector that holds the initial_value of the induction.  */
  if (nested_in_vect_loop)
    {
      /* iv_loop is nested in the loop to be vectorized.  init_expr had already
         been created during vectorization of previous stmts; We obtain it from
         the STMT_VINFO_VEC_STMT of the defining stmt.  */
      tree iv_def = PHI_ARG_DEF_FROM_EDGE (iv_phi, loop_preheader_edge (iv_loop));
      vec_init = vect_get_vec_def_for_operand (iv_def, iv_phi, NULL);
    }
  else
    {
      /* iv_loop is the loop to be vectorized.  Create:
         vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr)  */
      new_var = vect_get_new_vect_var (scalar_type, vect_scalar_var, "var_");
      add_referenced_var (new_var);

      new_name = force_gimple_operand (init_expr, &stmts, false, new_var);
      if (stmts)
        {
          new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
          gcc_assert (!new_bb);
        }

      t = NULL_TREE;
      t = tree_cons (NULL_TREE, init_expr, t);
      for (i = 1; i < nunits; i++)
        {
          /* Create: new_name_i = new_name + step_expr  */
          enum tree_code code = POINTER_TYPE_P (scalar_type)
                                ? POINTER_PLUS_EXPR : PLUS_EXPR;
          init_stmt = gimple_build_assign_with_ops (code, new_var,
                                                    new_name, step_expr);
          new_name = make_ssa_name (new_var, init_stmt);
          gimple_assign_set_lhs (init_stmt, new_name);

          new_bb = gsi_insert_on_edge_immediate (pe, init_stmt);
          gcc_assert (!new_bb);

          if (vect_print_dump_info (REPORT_DETAILS))
            {
              fprintf (vect_dump, "created new init_stmt: ");
              print_gimple_stmt (vect_dump, init_stmt, 0, TDF_SLIM);
            }
          t = tree_cons (NULL_TREE, new_name, t);
        }
      /* Create a vector from [new_name_0, new_name_1, ..., new_name_nunits-1]  */
      vec = build_constructor_from_list (vectype, nreverse (t));
      vec_init = vect_init_vector (iv_phi, vec, vectype, NULL);
    }


  /* Create the vector that holds the step of the induction.  */
  if (nested_in_vect_loop)
    /* iv_loop is nested in the loop to be vectorized.  Generate:
       vec_step = [S, S, S, S]  */
    new_name = step_expr;
  else
    {
      /* iv_loop is the loop to be vectorized.  Generate:
         vec_step = [VF*S, VF*S, VF*S, VF*S]  */
      expr = build_int_cst (scalar_type, vf);
      new_name = fold_build2 (MULT_EXPR, scalar_type, expr, step_expr);
    }

  t = NULL_TREE;
  for (i = 0; i < nunits; i++)
    t = tree_cons (NULL_TREE, unshare_expr (new_name), t);
  gcc_assert (CONSTANT_CLASS_P (new_name));
  vec = build_vector (vectype, t);
  vec_step = vect_init_vector (iv_phi, vec, vectype, NULL);


  /* Create the following def-use cycle:
     loop prolog:
         vec_init = ...
         vec_step = ...
     loop:
         vec_iv = PHI <vec_init, vec_loop>
         ...
         STMT
         ...
         vec_loop = vec_iv + vec_step;  */

  /* Create the induction-phi that defines the induction-operand.  */
  vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
  add_referenced_var (vec_dest);
  induction_phi = create_phi_node (vec_dest, iv_loop->header);
  set_vinfo_for_stmt (induction_phi,
                      new_stmt_vec_info (induction_phi, loop_vinfo));
  induc_def = PHI_RESULT (induction_phi);

  /* Create the iv update inside the loop  */
  new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, vec_dest,
                                           induc_def, vec_step);
  vec_def = make_ssa_name (vec_dest, new_stmt);
  gimple_assign_set_lhs (new_stmt, vec_def);
  gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
  set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo));

  /* Set the arguments of the phi node:  */
  add_phi_arg (induction_phi, vec_init, pe);
  add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop));


  /* In case that vectorization factor (VF) is bigger than the number
     of elements that we can fit in a vectype (nunits), we have to generate
     more than one vector stmt - i.e - we need to "unroll" the
     vector stmt by a factor VF/nunits.  For more details see documentation
     in vectorizable_operation.  */
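  /* E.g., with VF == 8 and NUNITS == 4 we get NCOPIES == 2: the loop below
     then emits one extra copy, vec_1 = vec_iv + [4*S, 4*S, 4*S, 4*S], so that
     vec_iv and vec_1 together cover the 8 induction values of one vector
     iteration.  */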

  if (ncopies > 1)
    {
      stmt_vec_info prev_stmt_vinfo;
      /* FORNOW.  This restriction should be relaxed.  */
      gcc_assert (!nested_in_vect_loop);

      /* Create the vector that holds the step of the induction.  */
      expr = build_int_cst (scalar_type, nunits);
      new_name = fold_build2 (MULT_EXPR, scalar_type, expr, step_expr);
      t = NULL_TREE;
      for (i = 0; i < nunits; i++)
        t = tree_cons (NULL_TREE, unshare_expr (new_name), t);
      gcc_assert (CONSTANT_CLASS_P (new_name));
      vec = build_vector (vectype, t);
      vec_step = vect_init_vector (iv_phi, vec, vectype, NULL);

      vec_def = induc_def;
      prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
      for (i = 1; i < ncopies; i++)
        {
          /* vec_i = vec_prev + vec_step  */
          new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, vec_dest,
                                                   vec_def, vec_step);
          vec_def = make_ssa_name (vec_dest, new_stmt);
          gimple_assign_set_lhs (new_stmt, vec_def);

          gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
          set_vinfo_for_stmt (new_stmt,
                              new_stmt_vec_info (new_stmt, loop_vinfo));
          STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
          prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
        }
    }

  if (nested_in_vect_loop)
    {
      /* Find the loop-closed exit-phi of the induction, and record
         the final vector of induction results:  */
      exit_phi = NULL;
      FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
        {
          if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (USE_STMT (use_p))))
            {
              exit_phi = USE_STMT (use_p);
              break;
            }
        }
      if (exit_phi)
        {
          stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
          /* FORNOW.  Currently not supporting the case that an inner-loop induction
             is not used in the outer-loop (i.e. only outside the outer-loop).  */
          gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
                      && !STMT_VINFO_LIVE_P (stmt_vinfo));

          STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
          if (vect_print_dump_info (REPORT_DETAILS))
            {
              fprintf (vect_dump, "vector of inductions after inner-loop:");
              print_gimple_stmt (vect_dump, new_stmt, 0, TDF_SLIM);
            }
        }
    }


  if (vect_print_dump_info (REPORT_DETAILS))
    {
      fprintf (vect_dump, "transform induction: created def-use cycle: ");
      print_gimple_stmt (vect_dump, induction_phi, 0, TDF_SLIM);
      fprintf (vect_dump, "\n");
      print_gimple_stmt (vect_dump, SSA_NAME_DEF_STMT (vec_def), 0, TDF_SLIM);
    }

  STMT_VINFO_VEC_STMT (phi_info) = induction_phi;
  return induc_def;
}


/* Function vect_get_vec_def_for_operand.

   OP is an operand in STMT.  This function returns a (vector) def that will be
   used in the vectorized stmt for STMT.

   In the case that OP is an SSA_NAME which is defined in the loop, then
   STMT_VINFO_VEC_STMT of the defining stmt holds the relevant def.

   In case OP is an invariant or constant, a new stmt that creates a vector def
   needs to be introduced.  */
1881 vect_get_vec_def_for_operand (tree op, gimple stmt, tree *scalar_def)
1886 stmt_vec_info def_stmt_info = NULL;
1887 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
1888 tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
1889 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
1890 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
1896 enum vect_def_type dt;
1900 if (vect_print_dump_info (REPORT_DETAILS))
1902 fprintf (vect_dump, "vect_get_vec_def_for_operand: ");
1903 print_generic_expr (vect_dump, op, TDF_SLIM);
1906 is_simple_use = vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt);
1907 gcc_assert (is_simple_use);
1908 if (vect_print_dump_info (REPORT_DETAILS))
1912 fprintf (vect_dump, "def = ");
1913 print_generic_expr (vect_dump, def, TDF_SLIM);
1917 fprintf (vect_dump, " def_stmt = ");
1918 print_gimple_stmt (vect_dump, def_stmt, 0, TDF_SLIM);
1921 switch (dt)
1922 {
1924 /* Case 1: operand is a constant. */
1925 case vect_constant_def:
1930 /* Create 'vect_cst_ = {cst,cst,...,cst}' */
1931 if (vect_print_dump_info (REPORT_DETAILS))
1932 fprintf (vect_dump, "Create vector_cst. nunits = %d", nunits);
1934 for (i = nunits - 1; i >= 0; --i)
1936 t = tree_cons (NULL_TREE, op, t);
1938 vector_type = get_vectype_for_scalar_type (TREE_TYPE (op));
1939 gcc_assert (vector_type);
1940 vec_cst = build_vector (vector_type, t);
1942 return vect_init_vector (stmt, vec_cst, vector_type, NULL);
1945 /* Case 2: operand is defined outside the loop - loop invariant. */
1946 case vect_invariant_def:
1951 /* Create 'vec_inv = {inv,inv,..,inv}' */
1952 if (vect_print_dump_info (REPORT_DETAILS))
1953 fprintf (vect_dump, "Create vector_inv.");
1955 for (i = nunits - 1; i >= 0; --i)
1957 t = tree_cons (NULL_TREE, def, t);
1960 /* FIXME: use build_constructor directly. */
1961 vector_type = get_vectype_for_scalar_type (TREE_TYPE (def));
1962 gcc_assert (vector_type);
1963 vec_inv = build_constructor_from_list (vector_type, t);
1964 return vect_init_vector (stmt, vec_inv, vector_type, NULL);
1967 /* Case 3: operand is defined inside the loop. */
1968 case vect_loop_def:
1970 if (scalar_def)
1971 *scalar_def = NULL; /* FIXME tuples: def_stmt */
1973 /* Get the def from the vectorized stmt. */
1974 def_stmt_info = vinfo_for_stmt (def_stmt);
1975 vec_stmt = STMT_VINFO_VEC_STMT (def_stmt_info);
1976 gcc_assert (vec_stmt);
1977 if (gimple_code (vec_stmt) == GIMPLE_PHI)
1978 vec_oprnd = PHI_RESULT (vec_stmt);
1979 else if (is_gimple_call (vec_stmt))
1980 vec_oprnd = gimple_call_lhs (vec_stmt);
1981 else
1982 vec_oprnd = gimple_assign_lhs (vec_stmt);
1983 return vec_oprnd;
1986 /* Case 4: operand is defined by a loop header phi - reduction */
1987 case vect_reduction_def:
1991 gcc_assert (gimple_code (def_stmt) == GIMPLE_PHI);
1992 loop = (gimple_bb (def_stmt))->loop_father;
1994 /* Get the def before the loop */
1995 op = PHI_ARG_DEF_FROM_EDGE (def_stmt, loop_preheader_edge (loop));
1996 return get_initial_def_for_reduction (stmt, op, scalar_def);
1999 /* Case 5: operand is defined by loop-header phi - induction. */
2000 case vect_induction_def:
2002 gcc_assert (gimple_code (def_stmt) == GIMPLE_PHI);
2004 /* Get the def from the vectorized stmt. */
2005 def_stmt_info = vinfo_for_stmt (def_stmt);
2006 vec_stmt = STMT_VINFO_VEC_STMT (def_stmt_info);
2007 gcc_assert (vec_stmt && gimple_code (vec_stmt) == GIMPLE_PHI);
2008 vec_oprnd = PHI_RESULT (vec_stmt);
2009 return vec_oprnd;
2011 default:
2012 gcc_unreachable ();
2013 }
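/* Illustration only (not part of GCC): cases 1 and 2 above both splat
   one scalar into every element of a vector ({cst,cst,...,cst} or
   {inv,inv,...,inv}).  A standalone sketch of that splat; the array and
   NUNITS stand in for the vectype and TYPE_VECTOR_SUBPARTS.  */
#if 0
#define NUNITS 4

static void
splat (int scalar, int vec[NUNITS])
{
  int i;
  /* Same element in every lane, walked back-to-front like the
     tree_cons loops above.  */
  for (i = NUNITS - 1; i >= 0; --i)
    vec[i] = scalar;
}
#endif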
2018 /* Function vect_get_vec_def_for_stmt_copy
2020 Return a vector-def for an operand. This function is used when the
2021 vectorized stmt to be created (by the caller to this function) is a "copy"
2022 created in case the vectorized result cannot fit in one vector, and several
2023 copies of the vector-stmt are required. In this case the vector-def is
2024 retrieved from the vector stmt recorded in the STMT_VINFO_RELATED_STMT field
2025 of the stmt that defines VEC_OPRND.
2026 DT is the type of the vector def VEC_OPRND.
2029 In case the vectorization factor (VF) is bigger than the number
2030 of elements that can fit in a vectype (nunits), we have to generate
2031 more than one vector stmt to vectorize the scalar stmt. This situation
2032 arises when there are multiple data-types operated upon in the loop; the
2033 smallest data-type determines the VF, and as a result, when vectorizing
2034 stmts operating on wider types we need to create 'VF/nunits' "copies" of the
2035 vector stmt (each computing a vector of 'nunits' results, and together
2036 computing 'VF' results in each iteration). This function is called when
2037 vectorizing such a stmt (e.g. vectorizing S2 in the illustration below, in
2038 which VF=16 and nunits=4, so the number of copies required is 4):
2040 scalar stmt: vectorized into: STMT_VINFO_RELATED_STMT
2042 S1: x = load VS1.0: vx.0 = memref0 VS1.1
2043 VS1.1: vx.1 = memref1 VS1.2
2044 VS1.2: vx.2 = memref2 VS1.3
2045 VS1.3: vx.3 = memref3
2047 S2: z = x + ... VSnew.0: vz0 = vx.0 + ... VSnew.1
2048 VSnew.1: vz1 = vx.1 + ... VSnew.2
2049 VSnew.2: vz2 = vx.2 + ... VSnew.3
2050 VSnew.3: vz3 = vx.3 + ...
2052 The vectorization of S1 is explained in vectorizable_load.
2053 The vectorization of S2:
2054 To create the first vector-stmt out of the 4 copies - VSnew.0 -
2055 the function 'vect_get_vec_def_for_operand' is called to
2056 get the relevant vector-def for each operand of S2. For operand x it
2057 returns the vector-def 'vx.0'.
2059 To create the remaining copies of the vector-stmt (VSnew.j), this
2060 function is called to get the relevant vector-def for each operand. It is
2061 obtained from the respective VS1.j stmt, which is recorded in the
2062 STMT_VINFO_RELATED_STMT field of the stmt that defines VEC_OPRND.
2064 For example, to obtain the vector-def 'vx.1' in order to create the
2065 vector stmt 'VSnew.1', this function is called with VEC_OPRND='vx.0'.
2066 Given 'vx.0' we obtain the stmt that defines it ('VS1.0'); from the
2067 STMT_VINFO_RELATED_STMT field of 'VS1.0' we obtain the next copy - 'VS1.1',
2068 and return its def ('vx.1').
2069 Overall, to create the above sequence this function will be called 3 times:
2070 vx.1 = vect_get_vec_def_for_stmt_copy (dt, vx.0);
2071 vx.2 = vect_get_vec_def_for_stmt_copy (dt, vx.1);
2072 vx.3 = vect_get_vec_def_for_stmt_copy (dt, vx.2); */
2075 vect_get_vec_def_for_stmt_copy (enum vect_def_type dt, tree vec_oprnd)
2077 gimple vec_stmt_for_operand;
2078 stmt_vec_info def_stmt_info;
2080 /* Do nothing; can reuse same def. */
2081 if (dt == vect_invariant_def || dt == vect_constant_def)
2082 return vec_oprnd;
2084 vec_stmt_for_operand = SSA_NAME_DEF_STMT (vec_oprnd);
2085 def_stmt_info = vinfo_for_stmt (vec_stmt_for_operand);
2086 gcc_assert (def_stmt_info);
2087 vec_stmt_for_operand = STMT_VINFO_RELATED_STMT (def_stmt_info);
2088 gcc_assert (vec_stmt_for_operand);
2089 vec_oprnd = gimple_get_lhs (vec_stmt_for_operand);
2090 if (gimple_code (vec_stmt_for_operand) == GIMPLE_PHI)
2091 vec_oprnd = PHI_RESULT (vec_stmt_for_operand);
2092 else
2093 vec_oprnd = gimple_get_lhs (vec_stmt_for_operand);
2095 return vec_oprnd;
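/* Illustration only (not part of GCC): the lookup above is one step
   along a singly-linked chain of stmt copies.  A standalone sketch,
   where the hypothetical struct copy stands in for a stmt and its
   STMT_VINFO_RELATED_STMT link, and the int lhs for its vector def.  */
#if 0
#include <assert.h>

struct copy
{
  int lhs;               /* the copy's vector def (e.g. vx.j)  */
  struct copy *related;  /* next copy, as in STMT_VINFO_RELATED_STMT  */
};

/* Given the stmt that defines vx.j, return vx.(j+1), as the function
   above does for VEC_OPRND.  */
static int
def_of_next_copy (struct copy *def_stmt)
{
  assert (def_stmt->related != NULL);
  return def_stmt->related->lhs;
}
#endif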
2098 /* Get vectorized definitions for the operands to create a copy of an original
2099 stmt. See vect_get_vec_def_for_stmt_copy() for details. */
2102 vect_get_vec_defs_for_stmt_copy (enum vect_def_type *dt,
2103 VEC(tree,heap) **vec_oprnds0,
2104 VEC(tree,heap) **vec_oprnds1)
2106 tree vec_oprnd = VEC_pop (tree, *vec_oprnds0);
2108 vec_oprnd = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd);
2109 VEC_quick_push (tree, *vec_oprnds0, vec_oprnd);
2111 if (vec_oprnds1 && *vec_oprnds1)
2113 vec_oprnd = VEC_pop (tree, *vec_oprnds1);
2114 vec_oprnd = vect_get_vec_def_for_stmt_copy (dt[1], vec_oprnd);
2115 VEC_quick_push (tree, *vec_oprnds1, vec_oprnd);
2120 /* Get vectorized definitions for OP0 and OP1, or SLP_NODE if it is not NULL. */
2123 vect_get_vec_defs (tree op0, tree op1, gimple stmt,
2124 VEC(tree,heap) **vec_oprnds0, VEC(tree,heap) **vec_oprnds1,
2125 slp_tree slp_node)
2127 if (slp_node)
2128 vect_get_slp_defs (slp_node, vec_oprnds0, vec_oprnds1);
2129 else
2130 {
2133 *vec_oprnds0 = VEC_alloc (tree, heap, 1);
2134 vec_oprnd = vect_get_vec_def_for_operand (op0, stmt, NULL);
2135 VEC_quick_push (tree, *vec_oprnds0, vec_oprnd);
2137 if (op1)
2138 {
2139 *vec_oprnds1 = VEC_alloc (tree, heap, 1);
2140 vec_oprnd = vect_get_vec_def_for_operand (op1, stmt, NULL);
2141 VEC_quick_push (tree, *vec_oprnds1, vec_oprnd);
2142 }
2143 }
2147 /* Function vect_finish_stmt_generation.
2149 Insert a new stmt. */
2152 vect_finish_stmt_generation (gimple stmt, gimple vec_stmt,
2153 gimple_stmt_iterator *gsi)
2155 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2156 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2158 gcc_assert (gimple_code (stmt) != GIMPLE_LABEL);
2160 gsi_insert_before (gsi, vec_stmt, GSI_SAME_STMT);
2162 set_vinfo_for_stmt (vec_stmt, new_stmt_vec_info (vec_stmt, loop_vinfo));
2164 if (vect_print_dump_info (REPORT_DETAILS))
2166 fprintf (vect_dump, "add new stmt: ");
2167 print_gimple_stmt (vect_dump, vec_stmt, 0, TDF_SLIM);
2170 gimple_set_location (vec_stmt, gimple_location (gsi_stmt (*gsi)));
2174 /* Function get_initial_def_for_reduction
2177 STMT - a stmt that performs a reduction operation in the loop.
2178 INIT_VAL - the initial value of the reduction variable
2181 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
2182 of the reduction (used for adjusting the epilog - see below).
2183 Return a vector variable, initialized according to the operation that STMT
2184 performs. This vector will be used as the initial value of the
2185 vector of partial results.
2187 Option1 (adjust in epilog): Initialize the vector as follows:
2190 min/max: [init_val,init_val,..,init_val,init_val]
2191 bit and/or: [init_val,init_val,..,init_val,init_val]
2192 and when necessary (e.g. add/mult case) let the caller know
2193 that it needs to adjust the result by init_val.
2195 Option2: Initialize the vector as follows:
2196 add: [0,0,...,0,init_val]
2197 mult: [1,1,...,1,init_val]
2198 min/max: [init_val,init_val,...,init_val]
2199 bit and/or: [init_val,init_val,...,init_val]
2200 and no adjustments are needed.
2202 For example, for the following code:
2208 STMT is 's = s + a[i]', and the reduction variable is 's'.
2209 For a vector of 4 units, we want to return either [0,0,0,init_val],
2210 or [0,0,0,0] and let the caller know that it needs to adjust
2211 the result at the end by 'init_val'.
2213 FORNOW, we are using the 'adjust in epilog' scheme, because this way the
2214 initialization vector is simpler (same element in all entries).
2215 A cost model should help decide between these two schemes. */
2218 get_initial_def_for_reduction (gimple stmt, tree init_val, tree *adjustment_def)
2220 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
2221 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
2222 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2223 tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
2224 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
2225 enum tree_code code = gimple_assign_rhs_code (stmt);
2226 tree type = TREE_TYPE (init_val);
2233 bool nested_in_vect_loop = false;
2235 gcc_assert (POINTER_TYPE_P (type) || INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type));
2236 if (nested_in_vect_loop_p (loop, stmt))
2237 nested_in_vect_loop = true;
2238 else
2239 gcc_assert (loop == (gimple_bb (stmt))->loop_father);
2241 vecdef = vect_get_vec_def_for_operand (init_val, stmt, NULL);
2243 switch (code)
2244 {
2245 case WIDEN_SUM_EXPR:
2248 if (nested_in_vect_loop)
2249 *adjustment_def = vecdef;
2250 else
2251 *adjustment_def = init_val;
2252 /* Create a vector of zeros for init_def. */
2253 if (SCALAR_FLOAT_TYPE_P (type))
2254 def_for_init = build_real (type, dconst0);
2255 else
2256 def_for_init = build_int_cst (type, 0);
2257 for (i = nunits - 1; i >= 0; --i)
2258 t = tree_cons (NULL_TREE, def_for_init, t);
2259 vector_type = get_vectype_for_scalar_type (TREE_TYPE (def_for_init));
2260 gcc_assert (vector_type);
2261 init_def = build_vector (vector_type, t);
2266 *adjustment_def = NULL_TREE;
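/* Illustration only (not part of GCC): the two initialization schemes
   described above, for a 4-element add reduction with init_val = 5.
   Option 1 starts from all zeros and adjusts by init_val in the
   epilog; option 2 folds init_val into one element up front.  Both
   produce the same final result.  Standalone sketch:  */
#if 0
#include <stdio.h>

#define NUNITS 4

int
main (void)
{
  int a[8] = { 1, 2, 3, 4, 5, 6, 7, 8 };
  int init_val = 5, i, lane, s1 = 0, s2 = 0;
  int v1[NUNITS] = { 0, 0, 0, 0 };   /* option 1: [0,0,...,0]  */
  int v2[NUNITS] = { 0, 0, 0, 0 };   /* option 2: [0,...,0,init_val]  */

  v2[NUNITS - 1] = init_val;
  for (i = 0; i < 8; i += NUNITS)
    for (lane = 0; lane < NUNITS; lane++)
      {
        v1[lane] += a[i + lane];
        v2[lane] += a[i + lane];
      }
  for (lane = 0; lane < NUNITS; lane++)
    {
      s1 += v1[lane];
      s2 += v2[lane];
    }
  s1 += init_val;                    /* option 1: adjust in the epilog  */
  printf ("%d %d\n", s1, s2);        /* both print 41  */
  return 0;
}
#endif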
2278 /* Function vect_create_epilog_for_reduction
2280 Create code at the loop-epilog to finalize the result of a reduction
2283 VECT_DEF is a vector of partial results.
2284 REDUC_CODE is the tree-code for the epilog reduction.
2285 NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
2286 number of elements that we can fit in a vectype (nunits). In this case
2287 we have to generate more than one vector stmt - i.e - we need to "unroll"
2288 the vector stmt by a factor VF/nunits. For more details see documentation
2289 in vectorizable_operation.
2290 STMT is the scalar reduction stmt that is being vectorized.
2291 REDUCTION_PHI is the phi-node that carries the reduction computation.
2294 1. Creates the reduction def-use cycle: sets the arguments for
2296 The loop-entry argument is the vectorized initial-value of the reduction.
2297 The loop-latch argument is VECT_DEF - the vector of partial sums.
2298 2. "Reduces" the vector of partial results VECT_DEF into a single result,
2299 by applying the operation specified by REDUC_CODE if available, or by
2300 other means (whole-vector shifts or a scalar loop).
2301 The function also creates a new phi node at the loop exit to preserve
2302 loop-closed form, as illustrated below.
2304 The flow at the entry to this function:
2307 vec_def = phi <null, null> # REDUCTION_PHI
2308 VECT_DEF = vector_stmt # vectorized form of STMT
2309 s_loop = scalar_stmt # (scalar) STMT
2311 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
2315 The above is transformed by this function into:
2318 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
2319 VECT_DEF = vector_stmt # vectorized form of STMT
2320 s_loop = scalar_stmt # (scalar) STMT
2322 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
2323 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
2324 v_out2 = reduce <v_out1>
2325 s_out3 = extract_field <v_out2, 0>
2326 s_out4 = adjust_result <s_out3>
2332 vect_create_epilog_for_reduction (tree vect_def, gimple stmt,
2333 int ncopies,
2334 enum tree_code reduc_code,
2335 gimple reduction_phi)
2337 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2338 stmt_vec_info prev_phi_info;
2340 enum machine_mode mode;
2341 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2342 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2343 basic_block exit_bb;
2346 gimple new_phi = NULL, phi;
2347 gimple_stmt_iterator exit_gsi;
2349 tree new_temp = NULL_TREE;
2351 gimple epilog_stmt = NULL;
2352 tree new_scalar_dest, new_dest;
2354 tree bitsize, bitpos, bytesize;
2355 enum tree_code code = gimple_assign_rhs_code (stmt);
2356 tree adjustment_def;
2357 tree vec_initial_def, def;
2359 imm_use_iterator imm_iter;
2360 use_operand_p use_p;
2361 bool extract_scalar_result = false;
2362 tree reduction_op, expr;
2365 bool nested_in_vect_loop = false;
2366 VEC(gimple,heap) *phis = NULL;
2367 enum vect_def_type dt = vect_unknown_def_type;
2370 if (nested_in_vect_loop_p (loop, stmt))
2373 nested_in_vect_loop = true;
2376 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
2378 case GIMPLE_SINGLE_RHS:
2379 gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt)) == ternary_op);
2380 reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), 2);
2381 break;
2382 case GIMPLE_UNARY_RHS:
2383 reduction_op = gimple_assign_rhs1 (stmt);
2384 break;
2385 case GIMPLE_BINARY_RHS:
2386 reduction_op = gimple_assign_rhs2 (stmt);
2387 break;
2388 default:
2389 gcc_unreachable ();
2390 }
2392 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
2393 gcc_assert (vectype);
2394 mode = TYPE_MODE (vectype);
2396 /*** 1. Create the reduction def-use cycle ***/
2398 /* For the case of reduction, vect_get_vec_def_for_operand returns
2399 the scalar def before the loop, that defines the initial value
2400 of the reduction variable. */
2401 vec_initial_def = vect_get_vec_def_for_operand (reduction_op, stmt,
2402 &adjustment_def);
2404 phi = reduction_phi;
2405 def = vect_def;
2406 for (j = 0; j < ncopies; j++)
2408 /* 1.1 set the loop-entry arg of the reduction-phi: */
2409 add_phi_arg (phi, vec_initial_def, loop_preheader_edge (loop));
2411 /* 1.2 set the loop-latch arg for the reduction-phi: */
2412 if (j > 0)
2413 def = vect_get_vec_def_for_stmt_copy (dt, def);
2414 add_phi_arg (phi, def, loop_latch_edge (loop));
2416 if (vect_print_dump_info (REPORT_DETAILS))
2418 fprintf (vect_dump, "transform reduction: created def-use cycle: ");
2419 print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
2420 fprintf (vect_dump, "\n");
2421 print_gimple_stmt (vect_dump, SSA_NAME_DEF_STMT (def), 0, TDF_SLIM);
2424 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
2427 /*** 2. Create epilog code
2428 The reduction epilog code operates across the elements of the vector
2429 of partial results computed by the vectorized loop.
2430 The reduction epilog code consists of:
2431 step 1: compute the scalar result in a vector (v_out2)
2432 step 2: extract the scalar result (s_out3) from the vector (v_out2)
2433 step 3: adjust the scalar result (s_out3) if needed.
2435 Step 1 can be accomplished using one of the following three schemes:
2436 (scheme 1) using reduc_code, if available.
2437 (scheme 2) using whole-vector shifts, if available.
2438 (scheme 3) using a scalar loop. In this case steps 1+2 above are
2439 combined.
2441 The overall epilog code looks like this:
2443 s_out0 = phi <s_loop> # original EXIT_PHI
2444 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
2445 v_out2 = reduce <v_out1> # step 1
2446 s_out3 = extract_field <v_out2, 0> # step 2
2447 s_out4 = adjust_result <s_out3> # step 3
2449 (step 3 is optional, and steps 1 and 2 may be combined).
2450 Lastly, the uses of s_out0 are replaced by s_out4.
2454 /* 2.1 Create new loop-exit-phi to preserve loop-closed form:
2455 v_out1 = phi <v_loop> */
2457 exit_bb = single_exit (loop)->dest;
2458 def = vect_def;
2459 prev_phi_info = NULL;
2460 for (j = 0; j < ncopies; j++)
2462 phi = create_phi_node (SSA_NAME_VAR (vect_def), exit_bb);
2463 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo));
2464 if (j == 0)
2465 new_phi = phi;
2466 else
2467 {
2468 def = vect_get_vec_def_for_stmt_copy (dt, def);
2469 STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
2470 }
2471 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
2472 prev_phi_info = vinfo_for_stmt (phi);
2474 exit_gsi = gsi_after_labels (exit_bb);
2476 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
2477 (i.e. when reduc_code is not available) and in the final adjustment
2478 code (if needed). Also get the original scalar reduction variable as
2479 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
2480 represents a reduction pattern), the tree-code and scalar-def are
2481 taken from the original stmt that the pattern-stmt (STMT) replaces.
2482 Otherwise (it is a regular reduction) - the tree-code and scalar-def
2483 are taken from STMT. */
2485 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
2486 if (!orig_stmt)
2487 {
2488 /* Regular reduction */
2489 orig_stmt = stmt;
2490 }
2491 else
2492 {
2493 /* Reduction pattern */
2494 stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
2495 gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
2496 gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
2497 }
2498 code = gimple_assign_rhs_code (orig_stmt);
2499 scalar_dest = gimple_assign_lhs (orig_stmt);
2500 scalar_type = TREE_TYPE (scalar_dest);
2501 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
2502 bitsize = TYPE_SIZE (scalar_type);
2503 bytesize = TYPE_SIZE_UNIT (scalar_type);
2506 /* In case this is a reduction in an inner-loop while vectorizing an outer
2507 loop - we don't need to extract a single scalar result at the end of the
2508 inner-loop. The final vector of partial results will be used in the
2509 vectorized outer-loop, or reduced to a scalar result at the end of the
2511 if (nested_in_vect_loop)
2512 goto vect_finalize_reduction;
2515 gcc_assert (ncopies == 1);
2517 /* 2.3 Create the reduction code, using one of the three schemes described
2520 if (reduc_code < NUM_TREE_CODES)
2524 /*** Case 1: Create:
2525 v_out2 = reduc_expr <v_out1> */
2527 if (vect_print_dump_info (REPORT_DETAILS))
2528 fprintf (vect_dump, "Reduce using direct vector reduction.");
2530 vec_dest = vect_create_destination_var (scalar_dest, vectype);
2531 tmp = build1 (reduc_code, vectype, PHI_RESULT (new_phi));
2532 epilog_stmt = gimple_build_assign (vec_dest, tmp);
2533 new_temp = make_ssa_name (vec_dest, epilog_stmt);
2534 gimple_assign_set_lhs (epilog_stmt, new_temp);
2535 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
2537 extract_scalar_result = true;
2541 enum tree_code shift_code = 0;
2542 bool have_whole_vector_shift = true;
2544 int element_bitsize = tree_low_cst (bitsize, 1);
2545 int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
2548 if (optab_handler (vec_shr_optab, mode)->insn_code != CODE_FOR_nothing)
2549 shift_code = VEC_RSHIFT_EXPR;
2550 else
2551 have_whole_vector_shift = false;
2553 /* Regardless of whether we have a whole vector shift, if we're
2554 emulating the operation via tree-vect-generic, we don't want
2555 to use it. Only the first round of the reduction is likely
2556 to still be profitable via emulation. */
2557 /* ??? It might be better to emit a reduction tree code here, so that
2558 tree-vect-generic can expand the first round via bit tricks. */
2559 if (!VECTOR_MODE_P (mode))
2560 have_whole_vector_shift = false;
2563 optab optab = optab_for_tree_code (code, vectype, optab_default);
2564 if (optab_handler (optab, mode)->insn_code == CODE_FOR_nothing)
2565 have_whole_vector_shift = false;
2568 if (have_whole_vector_shift)
2570 /*** Case 2: Create:
2571 for (offset = VS/2; offset >= element_size; offset/=2)
2573 Create: va' = vec_shift <va, offset>
2574 Create: va = vop <va, va'>
2577 if (vect_print_dump_info (REPORT_DETAILS))
2578 fprintf (vect_dump, "Reduce using vector shifts");
2580 vec_dest = vect_create_destination_var (scalar_dest, vectype);
2581 new_temp = PHI_RESULT (new_phi);
2583 for (bit_offset = vec_size_in_bits/2;
2584 bit_offset >= element_bitsize;
2585 bit_offset /= 2)
2587 tree bitpos = size_int (bit_offset);
2588 epilog_stmt = gimple_build_assign_with_ops (shift_code, vec_dest,
2589 new_temp, bitpos);
2590 new_name = make_ssa_name (vec_dest, epilog_stmt);
2591 gimple_assign_set_lhs (epilog_stmt, new_name);
2592 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
2594 epilog_stmt = gimple_build_assign_with_ops (code, vec_dest,
2595 new_name, new_temp);
2596 new_temp = make_ssa_name (vec_dest, epilog_stmt);
2597 gimple_assign_set_lhs (epilog_stmt, new_temp);
2598 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
2601 extract_scalar_result = true;
2607 /*** Case 3: Create:
2608 s = extract_field <v_out2, 0>
2609 for (offset = element_size;
2610 offset < vector_size;
2611 offset += element_size;)
2613 Create: s' = extract_field <v_out2, offset>
2614 Create: s = op <s, s'>
2617 if (vect_print_dump_info (REPORT_DETAILS))
2618 fprintf (vect_dump, "Reduce using scalar code. ");
2620 vec_temp = PHI_RESULT (new_phi);
2621 vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
2622 rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
2623 bitsize_zero_node);
2624 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
2625 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
2626 gimple_assign_set_lhs (epilog_stmt, new_temp);
2627 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
2629 for (bit_offset = element_bitsize;
2630 bit_offset < vec_size_in_bits;
2631 bit_offset += element_bitsize)
2633 tree bitpos = bitsize_int (bit_offset);
2634 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
2635 bitpos);
2637 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
2638 new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
2639 gimple_assign_set_lhs (epilog_stmt, new_name);
2640 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
2642 epilog_stmt = gimple_build_assign_with_ops (code,
2643 new_scalar_dest,
2644 new_name, new_temp);
2645 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
2646 gimple_assign_set_lhs (epilog_stmt, new_temp);
2647 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
2650 extract_scalar_result = false;
2654 /* 2.4 Extract the final scalar result. Create:
2655 s_out3 = extract_field <v_out2, bitpos> */
2657 if (extract_scalar_result)
2661 gcc_assert (!nested_in_vect_loop);
2662 if (vect_print_dump_info (REPORT_DETAILS))
2663 fprintf (vect_dump, "extract scalar result");
2665 if (BYTES_BIG_ENDIAN)
2666 bitpos = size_binop (MULT_EXPR,
2667 bitsize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1),
2668 TYPE_SIZE (scalar_type));
2669 else
2670 bitpos = bitsize_zero_node;
2672 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp, bitsize, bitpos);
2673 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
2674 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
2675 gimple_assign_set_lhs (epilog_stmt, new_temp);
2676 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
2679 vect_finalize_reduction:
2681 /* 2.5 Adjust the final result by the initial value of the reduction
2682 variable. (When such adjustment is not needed, then
2683 'adjustment_def' is zero). For example, if code is PLUS we create:
2684 new_temp = loop_exit_def + adjustment_def */
2688 if (nested_in_vect_loop)
2690 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
2691 expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
2692 new_dest = vect_create_destination_var (scalar_dest, vectype);
2694 else
2696 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
2697 expr = build2 (code, scalar_type, new_temp, adjustment_def);
2698 new_dest = vect_create_destination_var (scalar_dest, scalar_type);
2700 epilog_stmt = gimple_build_assign (new_dest, expr);
2701 new_temp = make_ssa_name (new_dest, epilog_stmt);
2702 gimple_assign_set_lhs (epilog_stmt, new_temp);
2703 SSA_NAME_DEF_STMT (new_temp) = epilog_stmt;
2704 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
2708 /* 2.6 Handle the loop-exit phi */
2710 /* Replace uses of s_out0 with uses of s_out3:
2711 Find the loop-closed-use at the loop exit of the original scalar result.
2712 (The reduction result is expected to have two immediate uses - one at the
2713 latch block, and one at the loop exit). */
2714 phis = VEC_alloc (gimple, heap, 10);
2715 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
2717 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
2719 exit_phi = USE_STMT (use_p);
2720 VEC_quick_push (gimple, phis, exit_phi);
2723 /* We expect to have found an exit_phi because of loop-closed-ssa form. */
2724 gcc_assert (!VEC_empty (gimple, phis));
2726 for (i = 0; VEC_iterate (gimple, phis, i, exit_phi); i++)
2728 if (nested_in_vect_loop)
2730 stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
2732 /* FORNOW. We do not yet support the case in which an inner-loop
2733 reduction is used only outside the outer-loop (i.e. not in the
2734 outer-loop itself). */
2735 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
2736 && !STMT_VINFO_LIVE_P (stmt_vinfo));
2738 epilog_stmt = adjustment_def ? epilog_stmt : new_phi;
2739 STMT_VINFO_VEC_STMT (stmt_vinfo) = epilog_stmt;
2740 set_vinfo_for_stmt (epilog_stmt,
2741 new_stmt_vec_info (epilog_stmt, loop_vinfo));
2743 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
2744 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
2748 /* Replace the uses: */
2749 orig_name = PHI_RESULT (exit_phi);
2750 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
2751 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
2752 SET_USE (use_p, new_temp);
2754 VEC_free (gimple, heap, phis);
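/* Illustration only (not part of GCC): scheme 2 above (reduce using
   whole-vector shifts) as a standalone sketch.  Each step combines the
   "vector" with a copy of itself shifted by half the remaining width,
   so lane 0 holds the final sum after log2(NUNITS) combine steps
   instead of the NUNITS-1 steps a scalar loop (scheme 3) would need.  */
#if 0
#define NUNITS 8

static int
reduce_plus_by_shifts (const int vec[NUNITS])
{
  int va[NUNITS], i, offset;

  for (i = 0; i < NUNITS; i++)
    va[i] = vec[i];

  /* for (offset = VS/2; offset >= element_size; offset /= 2)
       va' = vec_shift <va, offset>;  va = vop <va, va'>  */
  for (offset = NUNITS / 2; offset >= 1; offset /= 2)
    for (i = 0; i < offset; i++)
      va[i] += va[i + offset];

  return va[0];   /* step 2: extract lane 0 as the scalar result  */
}
#endif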
2758 /* Function vectorizable_reduction.
2760 Check if STMT performs a reduction operation that can be vectorized.
2761 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
2762 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
2763 Return FALSE if not a vectorizable STMT, TRUE otherwise.
2765 This function also handles reduction idioms (patterns) that have been
2766 recognized in advance during vect_pattern_recog. In this case, STMT may be
2768 X = pattern_expr (arg0, arg1, ..., X)
2769 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
2770 sequence that had been detected and replaced by the pattern-stmt (STMT).
2772 In some cases of reduction patterns, the type of the reduction variable X is
2773 different than the type of the other arguments of STMT.
2774 In such cases, the vectype that is used when transforming STMT into a vector
2775 stmt is different than the vectype that is used to determine the
2776 vectorization factor, because it consists of a different number of elements
2777 than the actual number of elements that are being operated upon in parallel.
2779 For example, consider an accumulation of shorts into an int accumulator.
2780 On some targets it's possible to vectorize this pattern operating on 8
2781 shorts at a time (hence, the vectype for purposes of determining the
2782 vectorization factor should be V8HI); on the other hand, the vectype that
2783 is used to create the vector form is actually V4SI (the type of the result).
2785 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
2786 indicates what is the actual level of parallelism (V8HI in the example), so
2787 that the right vectorization factor would be derived. This vectype
2788 corresponds to the type of arguments to the reduction stmt, and should *NOT*
2789 be used to create the vectorized stmt. The right vectype for the vectorized
2790 stmt is obtained from the type of the result X:
2791 get_vectype_for_scalar_type (TREE_TYPE (X))
2793 This means that, contrary to "regular" reductions (or "regular" stmts in
2794 general), the following equation:
2795 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
2796 does *NOT* necessarily hold for reduction patterns. */
2799 vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
2800 gimple *vec_stmt)
2804 tree loop_vec_def0 = NULL_TREE, loop_vec_def1 = NULL_TREE;
2805 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2806 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2807 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2808 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2809 enum tree_code code, orig_code, epilog_reduc_code = 0;
2810 enum machine_mode vec_mode;
2812 optab optab, reduc_optab;
2813 tree new_temp = NULL_TREE;
2816 enum vect_def_type dt;
2817 gimple new_phi = NULL;
2821 stmt_vec_info orig_stmt_info;
2822 tree expr = NULL_TREE;
2824 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
2825 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
2827 stmt_vec_info prev_stmt_info, prev_phi_info;
2828 gimple first_phi = NULL;
2829 bool single_defuse_cycle = false;
2831 gimple new_stmt = NULL;
2835 if (nested_in_vect_loop_p (loop, stmt))
2836 return false;
2838 gcc_assert (ncopies >= 1);
2840 /* FORNOW: SLP not supported. */
2841 if (STMT_SLP_TYPE (stmt_info))
2842 return false;
2844 /* 1. Is vectorizable reduction? */
2846 /* Not supportable if the reduction variable is used in the loop. */
2847 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer)
2848 return false;
2850 /* Reductions that are not used even in an enclosing outer-loop,
2851 are expected to be "live" (used out of the loop). */
2852 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_loop
2853 && !STMT_VINFO_LIVE_P (stmt_info))
2854 return false;
2856 /* Make sure it was already recognized as a reduction computation. */
2857 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def)
2858 return false;
2860 /* 2. Has this been recognized as a reduction pattern?
2862 Check if STMT represents a pattern that has been recognized
2863 in earlier analysis stages. For stmts that represent a pattern,
2864 the STMT_VINFO_RELATED_STMT field records the last stmt in
2865 the original sequence that constitutes the pattern. */
2867 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
2870 orig_stmt_info = vinfo_for_stmt (orig_stmt);
2871 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt);
2872 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
2873 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
2876 /* 3. Check the operands of the operation. The first operands are defined
2877 inside the loop body. The last operand is the reduction variable,
2878 which is defined by the loop-header-phi. */
2880 gcc_assert (is_gimple_assign (stmt));
2883 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
2885 case GIMPLE_SINGLE_RHS:
2886 op_type = TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt));
2887 if (op_type == ternary_op)
2888 {
2889 tree rhs = gimple_assign_rhs1 (stmt);
2890 ops[0] = TREE_OPERAND (rhs, 0);
2891 ops[1] = TREE_OPERAND (rhs, 1);
2892 ops[2] = TREE_OPERAND (rhs, 2);
2893 code = TREE_CODE (rhs);
2894 }
2895 else
2896 return false;
2898 break;
2899 case GIMPLE_BINARY_RHS:
2900 code = gimple_assign_rhs_code (stmt);
2901 op_type = TREE_CODE_LENGTH (code);
2902 gcc_assert (op_type == binary_op);
2903 ops[0] = gimple_assign_rhs1 (stmt);
2904 ops[1] = gimple_assign_rhs2 (stmt);
2905 break;
2907 case GIMPLE_UNARY_RHS:
2908 return false;
2909 default:
2910 gcc_unreachable ();
2911 }
2914 scalar_dest = gimple_assign_lhs (stmt);
2915 scalar_type = TREE_TYPE (scalar_dest);
2916 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
2917 && !SCALAR_FLOAT_TYPE_P (scalar_type))
2918 return false;
2920 /* All uses but the last are expected to be defined in the loop.
2921 The last use is the reduction variable. */
2922 for (i = 0; i < op_type-1; i++)
2924 is_simple_use = vect_is_simple_use (ops[i], loop_vinfo, &def_stmt,
2926 gcc_assert (is_simple_use);
2927 if (dt != vect_loop_def
2928 && dt != vect_invariant_def
2929 && dt != vect_constant_def
2930 && dt != vect_induction_def)
2931 return false;
2934 is_simple_use = vect_is_simple_use (ops[i], loop_vinfo, &def_stmt, &def, &dt);
2935 gcc_assert (is_simple_use);
2936 gcc_assert (dt == vect_reduction_def);
2937 gcc_assert (gimple_code (def_stmt) == GIMPLE_PHI);
2938 if (orig_stmt)
2939 gcc_assert (orig_stmt == vect_is_simple_reduction (loop_vinfo, def_stmt));
2940 else
2941 gcc_assert (stmt == vect_is_simple_reduction (loop_vinfo, def_stmt));
2943 if (STMT_VINFO_LIVE_P (vinfo_for_stmt (def_stmt)))
2944 return false;
2946 /* 4. Supportable by target? */
2948 /* 4.1. check support for the operation in the loop */
2949 optab = optab_for_tree_code (code, vectype, optab_default);
2952 if (vect_print_dump_info (REPORT_DETAILS))
2953 fprintf (vect_dump, "no optab.");
2956 vec_mode = TYPE_MODE (vectype);
2957 if (optab_handler (optab, vec_mode)->insn_code == CODE_FOR_nothing)
2959 if (vect_print_dump_info (REPORT_DETAILS))
2960 fprintf (vect_dump, "op not supported by target.");
2961 if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
2962 || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
2963 < vect_min_worthwhile_factor (code))
2964 return false;
2965 if (vect_print_dump_info (REPORT_DETAILS))
2966 fprintf (vect_dump, "proceeding using word mode.");
2969 /* Worthwhile without SIMD support? */
2970 if (!VECTOR_MODE_P (TYPE_MODE (vectype))
2971 && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
2972 < vect_min_worthwhile_factor (code))
2973 return false;
2974 if (vect_print_dump_info (REPORT_DETAILS))
2975 fprintf (vect_dump, "not worthwhile without SIMD support.");
2979 /* 4.2. Check support for the epilog operation.
2981 If STMT represents a reduction pattern, then the type of the
2982 reduction variable may be different than the type of the rest
2983 of the arguments. For example, consider the case of accumulation
2984 of shorts into an int accumulator; The original code:
2985 S1: int_a = (int) short_a;
2986 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
2989 STMT: int_acc = widen_sum <short_a, int_acc>
2992 1. The tree-code that is used to create the vector operation in the
2993 epilog code (that reduces the partial results) is not the
2994 tree-code of STMT, but is rather the tree-code of the original
2995 stmt from the pattern that STMT is replacing. I.e, in the example
2996 above we want to use 'widen_sum' in the loop, but 'plus' in the
2998 2. The type (mode) we use to check available target support
2999 for the vector operation to be created in the *epilog*, is
3000 determined by the type of the reduction variable (in the example
3001 above we'd check this: plus_optab[vect_int_mode]).
3002 However the type (mode) we use to check available target support
3003 for the vector operation to be created *inside the loop*, is
3004 determined by the type of the other arguments to STMT (in the
3005 example we'd check this: widen_sum_optab[vect_short_mode]).
3007 This is contrary to "regular" reductions, in which the types of all
3008 the arguments are the same as the type of the reduction variable.
3009 For "regular" reductions we can therefore use the same vector type
3010 (and also the same tree-code) when generating the epilog code and
3011 when generating the code inside the loop. */
3015 /* This is a reduction pattern: get the vectype from the type of the
3016 reduction variable, and get the tree-code from orig_stmt. */
3017 orig_code = gimple_assign_rhs_code (orig_stmt);
3018 vectype = get_vectype_for_scalar_type (TREE_TYPE (def));
3019 if (!vectype)
3020 {
3021 if (vect_print_dump_info (REPORT_DETAILS))
3022 {
3023 fprintf (vect_dump, "unsupported data-type ");
3024 print_generic_expr (vect_dump, TREE_TYPE (def), TDF_SLIM);
3025 }
3026 return false;
3027 }
3029 vec_mode = TYPE_MODE (vectype);
3033 /* Regular reduction: the same vectype and tree-code that are used for
3034 the vector code inside the loop can also be used for the epilog code. */
3038 if (!reduction_code_for_scalar_code (orig_code, &epilog_reduc_code))
3039 return false;
3040 reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype, optab_default);
3041 if (!reduc_optab)
3042 {
3043 if (vect_print_dump_info (REPORT_DETAILS))
3044 fprintf (vect_dump, "no optab for reduction.");
3045 epilog_reduc_code = NUM_TREE_CODES;
3046 }
3047 if (optab_handler (reduc_optab, vec_mode)->insn_code == CODE_FOR_nothing)
3048 {
3049 if (vect_print_dump_info (REPORT_DETAILS))
3050 fprintf (vect_dump, "reduc op not supported by target.");
3051 epilog_reduc_code = NUM_TREE_CODES;
3052 }
3054 if (!vec_stmt) /* transformation not required. */
3055 {
3056 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
3057 if (!vect_model_reduction_cost (stmt_info, epilog_reduc_code, ncopies))
3058 return false;
3059 return true;
3060 }
3064 if (vect_print_dump_info (REPORT_DETAILS))
3065 fprintf (vect_dump, "transform reduction.");
3067 /* Create the destination vector */
3068 vec_dest = vect_create_destination_var (scalar_dest, vectype);
3070 /* In case the vectorization factor (VF) is bigger than the number
3071 of elements that we can fit in a vectype (nunits), we have to generate
3072 more than one vector stmt - i.e - we need to "unroll" the
3073 vector stmt by a factor VF/nunits. For more details see documentation
3074 in vectorizable_operation. */
3076 /* If the reduction is used in an outer loop we need to generate
3077 VF intermediate results, like so (e.g. for ncopies=2):
3082 (i.e. we generate VF results in 2 registers).
3083 In this case we have a separate def-use cycle for each copy, and therefore
3084 for each copy we get the vector def for the reduction variable from the
3085 respective phi node created for this copy.
3087 Otherwise (the reduction is unused in the loop nest), we can combine
3088 together intermediate results, like so (e.g. for ncopies=2):
3092 (i.e. we generate VF/2 results in a single register).
3093 In this case for each copy we get the vector def for the reduction variable
3094 from the vectorized reduction operation generated in the previous iteration.
3097 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_loop)
3098 {
3099 single_defuse_cycle = true;
3100 epilog_copies = 1;
3101 }
3102 else
3103 epilog_copies = ncopies;
3105 prev_stmt_info = NULL;
3106 prev_phi_info = NULL;
3107 for (j = 0; j < ncopies; j++)
3109 if (j == 0 || !single_defuse_cycle)
3111 /* Create the reduction-phi that defines the reduction-operand. */
3112 new_phi = create_phi_node (vec_dest, loop->header);
3113 set_vinfo_for_stmt (new_phi, new_stmt_vec_info (new_phi, loop_vinfo));
3116 /* Handle uses. */
3117 if (j == 0)
3118 {
3119 loop_vec_def0 = vect_get_vec_def_for_operand (ops[0], stmt, NULL);
3120 if (op_type == ternary_op)
3122 loop_vec_def1 = vect_get_vec_def_for_operand (ops[1], stmt, NULL);
3125 /* Get the vector def for the reduction variable from the phi node */
3126 reduc_def = PHI_RESULT (new_phi);
3127 first_phi = new_phi;
3128 }
3129 else
3130 {
3131 enum vect_def_type dt = vect_unknown_def_type; /* Dummy */
3132 loop_vec_def0 = vect_get_vec_def_for_stmt_copy (dt, loop_vec_def0);
3133 if (op_type == ternary_op)
3134 loop_vec_def1 = vect_get_vec_def_for_stmt_copy (dt, loop_vec_def1);
3136 if (single_defuse_cycle)
3137 reduc_def = gimple_assign_lhs (new_stmt);
3138 else
3139 reduc_def = PHI_RESULT (new_phi);
3141 STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
3144 /* Arguments are ready. create the new vector stmt. */
3145 if (op_type == binary_op)
3146 expr = build2 (code, vectype, loop_vec_def0, reduc_def);
3147 else
3148 expr = build3 (code, vectype, loop_vec_def0, loop_vec_def1,
3149 reduc_def);
3150 new_stmt = gimple_build_assign (vec_dest, expr);
3151 new_temp = make_ssa_name (vec_dest, new_stmt);
3152 gimple_assign_set_lhs (new_stmt, new_temp);
3153 vect_finish_stmt_generation (stmt, new_stmt, gsi);
3155 if (j == 0)
3156 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
3157 else
3158 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3159 prev_stmt_info = vinfo_for_stmt (new_stmt);
3160 prev_phi_info = vinfo_for_stmt (new_phi);
3163 /* Finalize the reduction-phi (set its arguments) and create the
3164 epilog reduction code. */
3165 if (!single_defuse_cycle)
3166 new_temp = gimple_assign_lhs (*vec_stmt);
3167 vect_create_epilog_for_reduction (new_temp, stmt, epilog_copies,
3168 epilog_reduc_code, first_phi);
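/* Illustration only (not part of GCC): the two unrolling strategies
   described above for ncopies = 2, written out in scalar form.  With
   separate def-use cycles each copy feeds its own accumulator (phi)
   and the partials are combined in the epilog; with a single def-use
   cycle the second copy consumes the def produced by the first in the
   same iteration.  Both compute the same sum.  Standalone sketch:  */
#if 0
#include <stdio.h>

int
main (void)
{
  int a[8] = { 1, 2, 3, 4, 5, 6, 7, 8 };
  int acc0 = 0, acc1 = 0, acc = 0, t, i;

  /* Separate def-use cycles (reduction also used in the outer loop).  */
  for (i = 0; i < 8; i += 2)
    {
      acc0 += a[i];        /* copy 0 feeds its own phi  */
      acc1 += a[i + 1];    /* copy 1 feeds its own phi  */
    }

  /* Single def-use cycle (reduction unused in the loop nest).  */
  for (i = 0; i < 8; i += 2)
    {
      t = acc + a[i];      /* copy 0  */
      acc = t + a[i + 1];  /* copy 1 uses copy 0's def  */
    }

  printf ("%d %d\n", acc0 + acc1, acc);   /* both print 36  */
  return 0;
}
#endif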
3172 /* Checks if CALL can be vectorized in type VECTYPE. Returns
3173 a function declaration if the target has a vectorized version
3174 of the function, or NULL_TREE if the function cannot be vectorized. */
3177 vectorizable_function (gimple call, tree vectype_out, tree vectype_in)
3179 tree fndecl = gimple_call_fndecl (call);
3180 enum built_in_function code;
3182 /* We only handle functions that do not read or clobber memory -- i.e.
3183 const or novops ones. */
3184 if (!(gimple_call_flags (call) & (ECF_CONST | ECF_NOVOPS)))
3188 || TREE_CODE (fndecl) != FUNCTION_DECL
3189 || !DECL_BUILT_IN (fndecl))
3192 code = DECL_FUNCTION_CODE (fndecl);
3193 return targetm.vectorize.builtin_vectorized_function (code, vectype_out,
3197 /* Function vectorizable_call.
3199 Check if STMT performs a function call that can be vectorized.
3200 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
3201 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
3202 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
3205 vectorizable_call (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt)
3210 tree vec_oprnd0 = NULL_TREE, vec_oprnd1 = NULL_TREE;
3211 stmt_vec_info stmt_info = vinfo_for_stmt (stmt), prev_stmt_info;
3212 tree vectype_out, vectype_in;
3215 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3216 tree fndecl, new_temp, def, rhs_type, lhs_type;
3218 enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
3221 VEC(tree, heap) *vargs = NULL;
3222 enum { NARROW, NONE, WIDEN } modifier;
3225 if (!STMT_VINFO_RELEVANT_P (stmt_info))
3226 return false;
3228 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
3229 return false;
3231 /* FORNOW: SLP not supported. */
3232 if (STMT_SLP_TYPE (stmt_info))
3233 return false;
3235 /* Is STMT a vectorizable call? */
3236 if (!is_gimple_call (stmt))
3237 return false;
3239 if (TREE_CODE (gimple_call_lhs (stmt)) != SSA_NAME)
3240 return false;
3242 /* Process function arguments. */
3243 rhs_type = NULL_TREE;
3244 nargs = gimple_call_num_args (stmt);
3246 /* Bail out if the function has more than two arguments; we do
3247 not have interesting builtin functions to vectorize with more
3248 than two arguments. Zero arguments is not useful either. */
3249 if (nargs == 0 || nargs > 2)
3252 for (i = 0; i < nargs; i++)
3254 op = gimple_call_arg (stmt, i);
3256 /* We can only handle calls with arguments of the same type. */
3257 if (rhs_type
3258 && rhs_type != TREE_TYPE (op))
3260 if (vect_print_dump_info (REPORT_DETAILS))
3261 fprintf (vect_dump, "argument types differ.");
3264 rhs_type = TREE_TYPE (op);
3266 if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt[i]))
3268 if (vect_print_dump_info (REPORT_DETAILS))
3269 fprintf (vect_dump, "use not simple.");
3274 vectype_in = get_vectype_for_scalar_type (rhs_type);
3275 if (!vectype_in)
3276 return false;
3277 nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
3279 lhs_type = TREE_TYPE (gimple_call_lhs (stmt));
3280 vectype_out = get_vectype_for_scalar_type (lhs_type);
3281 if (!vectype_out)
3282 return false;
3283 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
3286 if (nunits_in == nunits_out / 2)
3287 modifier = NARROW;
3288 else if (nunits_out == nunits_in)
3289 modifier = NONE;
3290 else if (nunits_out == nunits_in / 2)
3291 modifier = WIDEN;
3292 else
3293 return false;
3295 /* For now, we only vectorize functions if a target specific builtin
3296 is available. TODO -- in some cases, it might be profitable to
3297 insert the calls for pieces of the vector, in order to be able
3298 to vectorize other operations in the loop. */
3299 fndecl = vectorizable_function (stmt, vectype_out, vectype_in);
3300 if (fndecl == NULL_TREE)
3302 if (vect_print_dump_info (REPORT_DETAILS))
3303 fprintf (vect_dump, "function is not vectorizable.");
3308 gcc_assert (ZERO_SSA_OPERANDS (stmt, SSA_OP_ALL_VIRTUALS));
3310 if (modifier == NARROW)
3311 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_out;
3312 else
3313 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in;
3315 /* Sanity check: make sure that at least one copy of the vectorized stmt
3316 needs to be generated. */
3317 gcc_assert (ncopies >= 1);
3319 if (!vec_stmt) /* transformation not required. */
3321 STMT_VINFO_TYPE (stmt_info) = call_vec_info_type;
3322 if (vect_print_dump_info (REPORT_DETAILS))
3323 fprintf (vect_dump, "=== vectorizable_call ===");
3324 vect_model_simple_cost (stmt_info, ncopies, dt, NULL);
3325 return true;
3326 }
3330 if (vect_print_dump_info (REPORT_DETAILS))
3331 fprintf (vect_dump, "transform operation.");
3334 scalar_dest = gimple_call_lhs (stmt);
3335 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
3337 prev_stmt_info = NULL;
3341 for (j = 0; j < ncopies; ++j)
3343 /* Build argument list for the vectorized call. */
3344 if (j == 0)
3345 vargs = VEC_alloc (tree, heap, nargs);
3346 else
3347 VEC_truncate (tree, vargs, 0);
3349 for (i = 0; i < nargs; i++)
3351 op = gimple_call_arg (stmt, i);
3352 if (j == 0)
3353 vec_oprnd0
3354 = vect_get_vec_def_for_operand (op, stmt, NULL);
3355 else
3356 vec_oprnd0
3357 = vect_get_vec_def_for_stmt_copy (dt[nargs], vec_oprnd0);
3359 VEC_quick_push (tree, vargs, vec_oprnd0);
3362 new_stmt = gimple_build_call_vec (fndecl, vargs);
3363 new_temp = make_ssa_name (vec_dest, new_stmt);
3364 gimple_call_set_lhs (new_stmt, new_temp);
3366 vect_finish_stmt_generation (stmt, new_stmt, gsi);
3368 if (j == 0)
3369 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
3370 else
3371 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3373 prev_stmt_info = vinfo_for_stmt (new_stmt);
3379 for (j = 0; j < ncopies; ++j)
3381 /* Build argument list for the vectorized call. */
3382 if (j == 0)
3383 vargs = VEC_alloc (tree, heap, nargs * 2);
3384 else
3385 VEC_truncate (tree, vargs, 0);
3387 for (i = 0; i < nargs; i++)
3389 op = gimple_call_arg (stmt, i);
3390 if (j == 0)
3391 {
3392 vec_oprnd0
3393 = vect_get_vec_def_for_operand (op, stmt, NULL);
3394 vec_oprnd1
3395 = vect_get_vec_def_for_stmt_copy (dt[nargs], vec_oprnd0);
3396 }
3397 else
3398 {
3399 vec_oprnd0
3400 = vect_get_vec_def_for_stmt_copy (dt[nargs], vec_oprnd1);
3401 vec_oprnd1
3402 = vect_get_vec_def_for_stmt_copy (dt[nargs], vec_oprnd0);
3403 }
3405 VEC_quick_push (tree, vargs, vec_oprnd0);
3406 VEC_quick_push (tree, vargs, vec_oprnd1);
3409 new_stmt = gimple_build_call_vec (fndecl, vargs);
3410 new_temp = make_ssa_name (vec_dest, new_stmt);
3411 gimple_call_set_lhs (new_stmt, new_temp);
3413 vect_finish_stmt_generation (stmt, new_stmt, gsi);
3415 if (j == 0)
3416 STMT_VINFO_VEC_STMT (stmt_info) = new_stmt;
3417 else
3418 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3420 prev_stmt_info = vinfo_for_stmt (new_stmt);
3423 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
3428 /* No current target implements this case. */
3432 VEC_free (tree, heap, vargs);
3434 /* The call in STMT might prevent it from being removed in dce.
3435 We however cannot remove it here, due to the way the ssa name
3436 it defines is mapped to the new definition. So just replace
3437 rhs of the statement with something harmless. */
3439 type = TREE_TYPE (scalar_dest);
3440 new_stmt = gimple_build_assign (gimple_call_lhs (stmt),
3441 fold_convert (type, integer_zero_node));
3442 set_vinfo_for_stmt (new_stmt, stmt_info);
3443 set_vinfo_for_stmt (stmt, NULL);
3444 STMT_VINFO_STMT (stmt_info) = new_stmt;
3445 gsi_replace (gsi, new_stmt, false);
3446 SSA_NAME_DEF_STMT (gimple_assign_lhs (new_stmt)) = new_stmt;
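/* Illustration only (not part of GCC): the NARROW/NONE/WIDEN decision
   used above (and again in vectorizable_conversion), extracted as a
   standalone helper over the two lane counts.  */
#if 0
enum modifier_kind { NARROW, NONE, WIDEN };

/* Returns the modifier for the given lane counts, or -1 when the
   ratio is not handled.  */
static int
classify_modifier (int nunits_in, int nunits_out)
{
  if (nunits_in == nunits_out / 2)
    return NARROW;   /* twice as many output lanes: elements shrink  */
  else if (nunits_out == nunits_in)
    return NONE;
  else if (nunits_out == nunits_in / 2)
    return WIDEN;    /* half as many output lanes: elements widen  */
  else
    return -1;
}
#endif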
3452 /* Function vect_gen_widened_results_half
3454 Create a vector stmt whose code, type, number of arguments, and result
3455 variable are CODE, OP_TYPE, and VEC_DEST, and its arguments are
3456 VEC_OPRND0 and VEC_OPRND1. The new vector stmt is to be inserted at BSI.
3457 In the case that CODE is a CALL_EXPR, this means that a call to DECL
3458 needs to be created (DECL is a function-decl of a target-builtin).
3459 STMT is the original scalar stmt that we are vectorizing. */
3462 vect_gen_widened_results_half (enum tree_code code,
3463 tree decl,
3464 tree vec_oprnd0, tree vec_oprnd1, int op_type,
3465 tree vec_dest, gimple_stmt_iterator *gsi,
3466 gimple stmt)
3473 /* Generate half of the widened result: */
3474 if (code == CALL_EXPR)
3476 /* Target specific support */
3477 if (op_type == binary_op)
3478 new_stmt = gimple_build_call (decl, 2, vec_oprnd0, vec_oprnd1);
3479 else
3480 new_stmt = gimple_build_call (decl, 1, vec_oprnd0);
3481 new_temp = make_ssa_name (vec_dest, new_stmt);
3482 gimple_call_set_lhs (new_stmt, new_temp);
3486 /* Generic support */
3487 gcc_assert (op_type == TREE_CODE_LENGTH (code));
3488 if (op_type != binary_op)
3489 vec_oprnd1 = NULL;
3490 new_stmt = gimple_build_assign_with_ops (code, vec_dest, vec_oprnd0,
3491 vec_oprnd1);
3492 new_temp = make_ssa_name (vec_dest, new_stmt);
3493 gimple_assign_set_lhs (new_stmt, new_temp);
3495 vect_finish_stmt_generation (stmt, new_stmt, gsi);
3497 if (code == CALL_EXPR)
3499 FOR_EACH_SSA_TREE_OPERAND (sym, new_stmt, iter, SSA_OP_ALL_VIRTUALS)
3501 if (TREE_CODE (sym) == SSA_NAME)
3502 sym = SSA_NAME_VAR (sym);
3503 mark_sym_for_renaming (sym);
3511 /* Check if STMT performs a conversion operation, that can be vectorized.
3512 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
3513 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
3514 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
3517 vectorizable_conversion (gimple stmt, gimple_stmt_iterator *gsi,
3518 gimple *vec_stmt, slp_tree slp_node)
3523 tree vec_oprnd0 = NULL_TREE, vec_oprnd1 = NULL_TREE;
3524 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3525 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3526 enum tree_code code, code1 = ERROR_MARK, code2 = ERROR_MARK;
3527 tree decl1 = NULL_TREE, decl2 = NULL_TREE;
3531 enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
3532 gimple new_stmt = NULL;
3533 stmt_vec_info prev_stmt_info;
3536 tree vectype_out, vectype_in;
3539 tree rhs_type, lhs_type;
3541 enum { NARROW, NONE, WIDEN } modifier;
3543 VEC(tree,heap) *vec_oprnds0 = NULL;
3546 VEC(tree,heap) *dummy = NULL;
3549 /* Is STMT a vectorizable conversion? */
3551 if (!STMT_VINFO_RELEVANT_P (stmt_info))
3552 return false;
3554 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
3555 return false;
3557 if (!is_gimple_assign (stmt))
3558 return false;
3560 if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
3561 return false;
3563 code = gimple_assign_rhs_code (stmt);
3564 if (code != FIX_TRUNC_EXPR && code != FLOAT_EXPR)
3565 return false;
3567 /* Check types of lhs and rhs. */
3568 op0 = gimple_assign_rhs1 (stmt);
3569 rhs_type = TREE_TYPE (op0);
3570 vectype_in = get_vectype_for_scalar_type (rhs_type);
3571 if (!vectype_in)
3572 return false;
3573 nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
3575 scalar_dest = gimple_assign_lhs (stmt);
3576 lhs_type = TREE_TYPE (scalar_dest);
3577 vectype_out = get_vectype_for_scalar_type (lhs_type);
3578 if (!vectype_out)
3579 return false;
3580 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
3583 if (nunits_in == nunits_out / 2)
3584 modifier = NARROW;
3585 else if (nunits_out == nunits_in)
3586 modifier = NONE;
3587 else if (nunits_out == nunits_in / 2)
3588 modifier = WIDEN;
3589 else
3590 return false;
3592 if (modifier == NONE)
3593 gcc_assert (STMT_VINFO_VECTYPE (stmt_info) == vectype_out);
3595 /* Bail out if the types are both integral or non-integral. */
3596 if ((INTEGRAL_TYPE_P (rhs_type) && INTEGRAL_TYPE_P (lhs_type))
3597 || (!INTEGRAL_TYPE_P (rhs_type) && !INTEGRAL_TYPE_P (lhs_type)))
3598 return false;
3600 integral_type = INTEGRAL_TYPE_P (rhs_type) ? vectype_in : vectype_out;
3602 if (modifier == NARROW)
3603 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_out;
3604 else
3605 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in;
3607 /* FORNOW: SLP with multiple types is not supported. The SLP analysis verifies
3608 this, so we can safely override NCOPIES with 1 here. */
3609 if (slp_node)
3610 ncopies = 1;
3612 /* Sanity check: make sure that at least one copy of the vectorized stmt
3613 needs to be generated. */
3614 gcc_assert (ncopies >= 1);
3616 /* Check the operands of the operation. */
3617 if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt[0]))
3619 if (vect_print_dump_info (REPORT_DETAILS))
3620 fprintf (vect_dump, "use not simple.");
3624 /* Supportable by target? */
3625 if ((modifier == NONE
3626 && !targetm.vectorize.builtin_conversion (code, integral_type))
3627 || (modifier == WIDEN
3628 && !supportable_widening_operation (code, stmt, vectype_in,
3629 &decl1, &decl2,
3630 &code1, &code2,
3631 &dummy_int, &dummy))
3632 || (modifier == NARROW
3633 && !supportable_narrowing_operation (code, stmt, vectype_in,
3634 &code1, &dummy_int, &dummy)))
3636 if (vect_print_dump_info (REPORT_DETAILS))
3637 fprintf (vect_dump, "conversion not supported by target.");
3641 if (modifier != NONE)
3643 STMT_VINFO_VECTYPE (stmt_info) = vectype_in;
3644 /* FORNOW: SLP not supported. */
3645 if (STMT_SLP_TYPE (stmt_info))
3646 return false;
3649 if (!vec_stmt) /* transformation not required. */
3650 {
3651 STMT_VINFO_TYPE (stmt_info) = type_conversion_vec_info_type;
3652 return true;
3653 }
3656 if (vect_print_dump_info (REPORT_DETAILS))
3657 fprintf (vect_dump, "transform conversion.");
3660 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
3662 if (modifier == NONE && !slp_node)
3663 vec_oprnds0 = VEC_alloc (tree, heap, 1);
3665 prev_stmt_info = NULL;
3669 for (j = 0; j < ncopies; j++)
3674 if (j == 0)
3675 vect_get_vec_defs (op0, NULL, stmt, &vec_oprnds0, NULL, slp_node);
3676 else
3677 vect_get_vec_defs_for_stmt_copy (dt, &vec_oprnds0, NULL);
3679 builtin_decl =
3680 targetm.vectorize.builtin_conversion (code, integral_type);
3681 for (i = 0; VEC_iterate (tree, vec_oprnds0, i, vop0); i++)
3683 /* Arguments are ready. create the new vector stmt. */
3684 new_stmt = gimple_build_call (builtin_decl, 1, vop0);
3685 new_temp = make_ssa_name (vec_dest, new_stmt);
3686 gimple_call_set_lhs (new_stmt, new_temp);
3687 vect_finish_stmt_generation (stmt, new_stmt, gsi);
3688 FOR_EACH_SSA_TREE_OPERAND (sym, new_stmt, iter,
3689 SSA_OP_ALL_VIRTUALS)
3691 if (TREE_CODE (sym) == SSA_NAME)
3692 sym = SSA_NAME_VAR (sym);
3693 mark_sym_for_renaming (sym);
3695 if (slp_node)
3696 VEC_quick_push (gimple, SLP_TREE_VEC_STMTS (slp_node), new_stmt);
3699 if (j == 0)
3700 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
3701 else
3702 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3703 prev_stmt_info = vinfo_for_stmt (new_stmt);
3708 /* In case the vectorization factor (VF) is bigger than the number
3709 of elements that we can fit in a vectype (nunits), we have to
3710 generate more than one vector stmt - i.e - we need to "unroll"
3711 the vector stmt by a factor VF/nunits. */
3712 for (j = 0; j < ncopies; j++)
3714 if (j == 0)
3715 vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
3716 else
3717 vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd0);
3719 STMT_VINFO_VECTYPE (stmt_info) = vectype_in;
3721 /* Generate first half of the widened result: */
3722 new_stmt
3723 = vect_gen_widened_results_half (code1, decl1,
3724 vec_oprnd0, vec_oprnd1,
3725 unary_op, vec_dest, gsi, stmt);
3726 if (j == 0)
3727 STMT_VINFO_VEC_STMT (stmt_info) = new_stmt;
3728 else
3729 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3730 prev_stmt_info = vinfo_for_stmt (new_stmt);
3732 /* Generate second half of the widened result: */
3733 new_stmt
3734 = vect_gen_widened_results_half (code2, decl2,
3735 vec_oprnd0, vec_oprnd1,
3736 unary_op, vec_dest, gsi, stmt);
3737 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3738 prev_stmt_info = vinfo_for_stmt (new_stmt);
3743 /* In case the vectorization factor (VF) is bigger than the number
3744 of elements that we can fit in a vectype (nunits), we have to
3745 generate more than one vector stmt - i.e - we need to "unroll"
3746 the vector stmt by a factor VF/nunits. */
3747 for (j = 0; j < ncopies; j++)
3750 if (j == 0)
3751 {
3752 vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
3753 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd0);
3754 }
3755 else
3756 {
3757 vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd1);
3758 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd0);
3759 }
3761 /* Arguments are ready. Create the new vector stmt. */
3762 expr = build2 (code1, vectype_out, vec_oprnd0, vec_oprnd1);
3763 new_stmt = gimple_build_assign_with_ops (code1, vec_dest, vec_oprnd0,
3764 vec_oprnd1);
3765 new_temp = make_ssa_name (vec_dest, new_stmt);
3766 gimple_assign_set_lhs (new_stmt, new_temp);
3767 vect_finish_stmt_generation (stmt, new_stmt, gsi);
3769 if (j == 0)
3770 STMT_VINFO_VEC_STMT (stmt_info) = new_stmt;
3771 else
3772 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3774 prev_stmt_info = vinfo_for_stmt (new_stmt);
3777 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
3781 VEC_free (tree, heap, vec_oprnds0);
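/* Illustration only (not part of GCC): the WIDEN path above emits the
   widened result in two halves, one vector stmt per half (cf.
   vect_gen_widened_results_half).  A standalone sketch converting
   NUNITS_IN shorts into two "vectors" of NUNITS_IN/2 ints:  */
#if 0
#define NUNITS_IN 8

static void
widen_in_two_halves (const short in[NUNITS_IN],
                     int lo[NUNITS_IN / 2], int hi[NUNITS_IN / 2])
{
  int i;
  for (i = 0; i < NUNITS_IN / 2; i++)
    lo[i] = (int) in[i];                   /* first half of the result  */
  for (i = 0; i < NUNITS_IN / 2; i++)
    hi[i] = (int) in[NUNITS_IN / 2 + i];   /* second half of the result  */
}
#endif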
3787 /* Function vectorizable_assignment.
3789 Check if STMT performs an assignment (copy) that can be vectorized.
3790 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
3791 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
3792 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
3795 vectorizable_assignment (gimple stmt, gimple_stmt_iterator *gsi,
3796 gimple *vec_stmt, slp_tree slp_node)
3801 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3802 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3803 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3807 enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
3808 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
3811 VEC(tree,heap) *vec_oprnds = NULL;
3814 /* Multiple types in SLP are handled by creating the appropriate number of
3815 vectorized stmts for each SLP node. Hence, NCOPIES is always 1 in case of SLP. */
3820 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
3822 gcc_assert (ncopies >= 1);
3824 return false; /* FORNOW */
3826 if (!STMT_VINFO_RELEVANT_P (stmt_info))
3829 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
3832 /* Is vectorizable assignment? */
3833 if (!is_gimple_assign (stmt))
3836 scalar_dest = gimple_assign_lhs (stmt);
3837 if (TREE_CODE (scalar_dest) != SSA_NAME)
3840 if (gimple_assign_single_p (stmt)
3841 || gimple_assign_rhs_code (stmt) == PAREN_EXPR)
3842 op = gimple_assign_rhs1 (stmt);
3846 if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt[0]))
3848 if (vect_print_dump_info (REPORT_DETAILS))
3849 fprintf (vect_dump, "use not simple.");
3853 if (!vec_stmt) /* transformation not required. */
3855 STMT_VINFO_TYPE (stmt_info) = assignment_vec_info_type;
3856 if (vect_print_dump_info (REPORT_DETAILS))
3857 fprintf (vect_dump, "=== vectorizable_assignment ===");
3858 vect_model_simple_cost (stmt_info, ncopies, dt, NULL);
3863 if (vect_print_dump_info (REPORT_DETAILS))
3864 fprintf (vect_dump, "transform assignment.");
3867 vec_dest = vect_create_destination_var (scalar_dest, vectype);
3870 vect_get_vec_defs (op, NULL, stmt, &vec_oprnds, NULL, slp_node);
3872 /* Arguments are ready. Create the new vector stmt. */
3873 for (i = 0; VEC_iterate (tree, vec_oprnds, i, vop); i++)
3875 *vec_stmt = gimple_build_assign (vec_dest, vop);
3876 new_temp = make_ssa_name (vec_dest, *vec_stmt);
3877 gimple_assign_set_lhs (*vec_stmt, new_temp);
3878 vect_finish_stmt_generation (stmt, *vec_stmt, gsi);
3879 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt;
3882 VEC_quick_push (gimple, SLP_TREE_VEC_STMTS (slp_node), *vec_stmt);
3885 VEC_free (tree, heap, vec_oprnds);
3890 /* Function vect_min_worthwhile_factor.
3892 For a loop where we could vectorize the operation indicated by CODE,
3893 return the minimum vectorization factor that makes it worthwhile
3894 to use generic vectors. */
3895 static int
3896 vect_min_worthwhile_factor (enum tree_code code)
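{
  /* A minimal sketch of the heuristic, assuming the usual costs of
     emulating vector ops on scalar (word-mode) hardware: cheap bitwise
     operations already pay off at factor 2, additive arithmetic at
     factor 4, and any other code is assumed never to be worthwhile
     without real SIMD support.  */
  switch (code)
    {
    case PLUS_EXPR:
    case MINUS_EXPR:
    case NEGATE_EXPR:
      return 4;

    case BIT_AND_EXPR:
    case BIT_IOR_EXPR:
    case BIT_XOR_EXPR:
    case BIT_NOT_EXPR:
      return 2;

    default:
      return INT_MAX;
    }
}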
3917 /* Function vectorizable_induction
3919 Check if PHI performs an induction computation that can be vectorized.
3920 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
3921 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
3922 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
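/* For illustration, assuming VF = 4 and a unit-step IV, the scalar
   induction:
     i_1 = PHI <0, i_2>
     i_2 = i_1 + 1
   is vectorized as:
     vi_1 = PHI <{0,1,2,3}, vi_2>
     vi_2 = vi_1 + {4,4,4,4};  */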
3925 vectorizable_induction (gimple phi, gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
3928 stmt_vec_info stmt_info = vinfo_for_stmt (phi);
3929 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3930 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3931 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3932 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
3933 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
3936 gcc_assert (ncopies >= 1);
3937 /* FORNOW. This restriction should be relaxed. */
3938 if (nested_in_vect_loop_p (loop, phi) && ncopies > 1)
3940 if (vect_print_dump_info (REPORT_DETAILS))
3941 fprintf (vect_dump, "multiple types in nested loop.");
3945 if (!STMT_VINFO_RELEVANT_P (stmt_info))
3948 /* FORNOW: SLP not supported. */
3949 if (STMT_SLP_TYPE (stmt_info))
3952 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def);
3954 if (gimple_code (phi) != GIMPLE_PHI)
3957 if (!vec_stmt) /* transformation not required. */
3959 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
3960 if (vect_print_dump_info (REPORT_DETAILS))
3961 fprintf (vect_dump, "=== vectorizable_induction ===");
3962 vect_model_induction_cost (stmt_info, ncopies);
3968 if (vect_print_dump_info (REPORT_DETAILS))
3969 fprintf (vect_dump, "transform induction phi.");
3971 vec_def = get_initial_def_for_induction (phi);
3972 *vec_stmt = SSA_NAME_DEF_STMT (vec_def);
3977 /* Function vectorizable_operation.
3979 Check if STMT performs a binary or unary operation that can be vectorized.
3980 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
3981 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
3982 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
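/* For illustration: the scalar stmt  z = x + y  becomes  vz = vx + vy,
   using the target's vector add; the checks below verify that such an
   optab exists, or that emulating the operation in word mode is
   worthwhile.  */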
3985 vectorizable_operation (gimple stmt, gimple_stmt_iterator *gsi,
3986 gimple *vec_stmt, slp_tree slp_node)
3990 tree op0, op1 = NULL;
3991 tree vec_oprnd1 = NULL_TREE;
3992 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3993 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3994 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3995 enum tree_code code;
3996 enum machine_mode vec_mode;
4001 enum machine_mode optab_op2_mode;
4004 enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
4005 gimple new_stmt = NULL;
4006 stmt_vec_info prev_stmt_info;
4007 int nunits_in = TYPE_VECTOR_SUBPARTS (vectype);
4012 VEC(tree,heap) *vec_oprnds0 = NULL, *vec_oprnds1 = NULL;
4015 bool shift_p = false;
4016 bool scalar_shift_arg = false;
4018 /* Multiple types in SLP are handled by creating the appropriate number of
4019 vectorized stmts for each SLP node. Hence, NCOPIES is always 1 in case of SLP. */
4024 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in;
4026 gcc_assert (ncopies >= 1);
4028 if (!STMT_VINFO_RELEVANT_P (stmt_info))
4031 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
4034 /* Is STMT a vectorizable binary/unary operation? */
4035 if (!is_gimple_assign (stmt))
4038 if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
4041 scalar_dest = gimple_assign_lhs (stmt);
4042 vectype_out = get_vectype_for_scalar_type (TREE_TYPE (scalar_dest));
4045 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
4046 if (nunits_out != nunits_in)
4049 code = gimple_assign_rhs_code (stmt);
4051 /* For pointer addition, we should use the normal plus for
4052 the vector addition. */
4053 if (code == POINTER_PLUS_EXPR)
4056 /* Support only unary or binary operations. */
4057 op_type = TREE_CODE_LENGTH (code);
4058 if (op_type != unary_op && op_type != binary_op)
4060 if (vect_print_dump_info (REPORT_DETAILS))
4061 fprintf (vect_dump, "num. args = %d (not unary/binary op).", op_type);
4065 op0 = gimple_assign_rhs1 (stmt);
4066 if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt[0]))
4068 if (vect_print_dump_info (REPORT_DETAILS))
4069 fprintf (vect_dump, "use not simple.");
4073 if (op_type == binary_op)
4075 op1 = gimple_assign_rhs2 (stmt);
4076 if (!vect_is_simple_use (op1, loop_vinfo, &def_stmt, &def, &dt[1]))
4078 if (vect_print_dump_info (REPORT_DETAILS))
4079 fprintf (vect_dump, "use not simple.");
4084 /* If this is a shift/rotate, determine whether the shift amount is a vector,
4085 or scalar. If the shift/rotate amount is a vector, use the vector/vector shift optabs. */
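/* For illustration:  a[i] = b[i] << 3  has a loop-invariant shift amount,
   so the vector/scalar form  vb << 3  (optab_scalar) applies if the
   target provides it;  a[i] = b[i] << c[i]  shifts each element by a
   different amount and needs the vector/vector form  vb << vc
   (optab_vector).  */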
4087 if (code == LSHIFT_EXPR || code == RSHIFT_EXPR || code == LROTATE_EXPR
4088 || code == RROTATE_EXPR)
4092 /* vector shifted by vector */
4093 if (dt[1] == vect_loop_def)
4095 optab = optab_for_tree_code (code, vectype, optab_vector);
4096 if (vect_print_dump_info (REPORT_DETAILS))
4097 fprintf (vect_dump, "vector/vector shift/rotate found.");
4100 /* See if the machine has a vector shifted by scalar insn and if not
4101 then see if it has a vector shifted by vector insn. */
4102 else if (dt[1] == vect_constant_def || dt[1] == vect_invariant_def)
4104 optab = optab_for_tree_code (code, vectype, optab_scalar);
4106 && (optab_handler (optab, TYPE_MODE (vectype))->insn_code
4107 != CODE_FOR_nothing))
4109 scalar_shift_arg = true;
4110 if (vect_print_dump_info (REPORT_DETAILS))
4111 fprintf (vect_dump, "vector/scalar shift/rotate found.");
4115 optab = optab_for_tree_code (code, vectype, optab_vector);
4116 if (vect_print_dump_info (REPORT_DETAILS)
4118 && (optab_handler (optab, TYPE_MODE (vectype))->insn_code
4119 != CODE_FOR_nothing))
4120 fprintf (vect_dump, "vector/vector shift/rotate found.");
4126 if (vect_print_dump_info (REPORT_DETAILS))
4127 fprintf (vect_dump, "operand mode requires invariant argument.");
4132 optab = optab_for_tree_code (code, vectype, optab_default);
4134 /* Supportable by target? */
4137 if (vect_print_dump_info (REPORT_DETAILS))
4138 fprintf (vect_dump, "no optab.");
4141 vec_mode = TYPE_MODE (vectype);
4142 icode = (int) optab_handler (optab, vec_mode)->insn_code;
4143 if (icode == CODE_FOR_nothing)
4145 if (vect_print_dump_info (REPORT_DETAILS))
4146 fprintf (vect_dump, "op not supported by target.");
4147 /* Check only during analysis. */
4148 if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
4149 || (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4150 < vect_min_worthwhile_factor (code)
4153 if (vect_print_dump_info (REPORT_DETAILS))
4154 fprintf (vect_dump, "proceeding using word mode.");
4157 /* Worthwhile without SIMD support? Check only during analysis. */
4158 if (!VECTOR_MODE_P (TYPE_MODE (vectype))
4159 && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4160 < vect_min_worthwhile_factor (code)
4163 if (vect_print_dump_info (REPORT_DETAILS))
4164 fprintf (vect_dump, "not worthwhile without SIMD support.");
4168 if (!vec_stmt) /* transformation not required. */
4170 STMT_VINFO_TYPE (stmt_info) = op_vec_info_type;
4171 if (vect_print_dump_info (REPORT_DETAILS))
4172 fprintf (vect_dump, "=== vectorizable_operation ===");
4173 vect_model_simple_cost (stmt_info, ncopies, dt, NULL);
4179 if (vect_print_dump_info (REPORT_DETAILS))
4180 fprintf (vect_dump, "transform binary/unary operation.");
4183 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4185 /* Allocate VECs for vector operands. In case of SLP, vector operands are
4186 created in the previous stages of the recursion, so no allocation is
4187 needed, except for the case of shift with scalar shift argument. In that
4188 case we store the scalar operand in VEC_OPRNDS1 for every vector stmt to
4189 be created to vectorize the SLP group, i.e., SLP_NODE->VEC_STMTS_SIZE.
4190 In case of loop-based vectorization we allocate VECs of size 1. We
4191 allocate VEC_OPRNDS1 only in case of binary operation. */
4194 vec_oprnds0 = VEC_alloc (tree, heap, 1);
4195 if (op_type == binary_op)
4196 vec_oprnds1 = VEC_alloc (tree, heap, 1);
4198 else if (scalar_shift_arg)
4199 vec_oprnds1 = VEC_alloc (tree, heap, slp_node->vec_stmts_size);
4201 /* In case the vectorization factor (VF) is bigger than the number
4202 of elements that we can fit in a vectype (nunits), we have to generate
4203 more than one vector stmt - i.e - we need to "unroll" the
4204 vector stmt by a factor VF/nunits. In doing so, we record a pointer
4205 from one copy of the vector stmt to the next, in the field
4206 STMT_VINFO_RELATED_STMT. This is necessary in order to allow following
4207 stages to find the correct vector defs to be used when vectorizing
4208 stmts that use the defs of the current stmt. The example below illustrates
4209 the vectorization process when VF=16 and nunits=4 (i.e - we need to create
4210 4 vectorized stmts):
4212 before vectorization:
4213 RELATED_STMT VEC_STMT
4214 S1: x = memref - -
4215 S2: z = x + 1 - -
4217 step 1: vectorize stmt S1 (done in vectorizable_load. See more details there):
4219 RELATED_STMT VEC_STMT
4220 VS1_0: vx0 = memref0 VS1_1 -
4221 VS1_1: vx1 = memref1 VS1_2 -
4222 VS1_2: vx2 = memref2 VS1_3 -
4223 VS1_3: vx3 = memref3 - -
4224 S1: x = load - VS1_0
4225 S2: z = x + 1 - -
4227 step 2: vectorize stmt S2 (done here):
4228 To vectorize stmt S2 we first need to find the relevant vector
4229 def for the first operand 'x'. This is, as usual, obtained from
4230 the vector stmt recorded in the STMT_VINFO_VEC_STMT of the stmt
4231 that defines 'x' (S1). This way we find the stmt VS1_0, and the
4232 relevant vector def 'vx0'. Having found 'vx0' we can generate
4233 the vector stmt VS2_0, and as usual, record it in the
4234 STMT_VINFO_VEC_STMT of stmt S2.
4235 When creating the second copy (VS2_1), we obtain the relevant vector
4236 def from the vector stmt recorded in the STMT_VINFO_RELATED_STMT of
4237 stmt VS1_0. This way we find the stmt VS1_1 and the relevant
4238 vector def 'vx1'. Using 'vx1' we create stmt VS2_1 and record a
4239 pointer to it in the STMT_VINFO_RELATED_STMT of the vector stmt VS2_0.
4240 Similarly when creating stmts VS2_2 and VS2_3. This is the resulting
4241 chain of stmts and pointers:
4242 RELATED_STMT VEC_STMT
4243 VS1_0: vx0 = memref0 VS1_1 -
4244 VS1_1: vx1 = memref1 VS1_2 -
4245 VS1_2: vx2 = memref2 VS1_3 -
4246 VS1_3: vx3 = memref3 - -
4247 S1: x = load - VS1_0
4248 VS2_0: vz0 = vx0 + v1 VS2_1 -
4249 VS2_1: vz1 = vx1 + v1 VS2_2 -
4250 VS2_2: vz2 = vx2 + v1 VS2_3 -
4251 VS2_3: vz3 = vx3 + v1 - -
4252 S2: z = x + 1 - VS2_0 */
4254 prev_stmt_info = NULL;
4255 for (j = 0; j < ncopies; j++)
4260 if (op_type == binary_op && scalar_shift_arg)
4262 /* Vector shl and shr insn patterns can be defined with scalar
4263 operand 2 (shift operand). In this case, use constant or loop
4264 invariant op1 directly, without extending it to vector mode first. */
4266 optab_op2_mode = insn_data[icode].operand[2].mode;
4267 if (!VECTOR_MODE_P (optab_op2_mode))
4269 if (vect_print_dump_info (REPORT_DETAILS))
4270 fprintf (vect_dump, "operand 1 using scalar mode.");
4272 VEC_quick_push (tree, vec_oprnds1, vec_oprnd1);
4275 /* Store vec_oprnd1 for every vector stmt to be created
4276 for SLP_NODE. We check during the analysis that all the
4277 shift arguments are the same.
4278 TODO: Allow different constants for different vector
4279 stmts generated for an SLP instance. */
4280 for (k = 0; k < slp_node->vec_stmts_size - 1; k++)
4281 VEC_quick_push (tree, vec_oprnds1, vec_oprnd1);
4286 /* vec_oprnd1 is available if operand 1 should be of a scalar-type
4287 (a special case for certain kinds of vector shifts); otherwise,
4288 operand 1 should be of a vector type (the usual case). */
4289 if (op_type == binary_op && !vec_oprnd1)
4290 vect_get_vec_defs (op0, op1, stmt, &vec_oprnds0, &vec_oprnds1,
4293 vect_get_vec_defs (op0, NULL_TREE, stmt, &vec_oprnds0, NULL,
4297 vect_get_vec_defs_for_stmt_copy (dt, &vec_oprnds0, &vec_oprnds1);
4299 /* Arguments are ready. Create the new vector stmt. */
4300 for (i = 0; VEC_iterate (tree, vec_oprnds0, i, vop0); i++)
4302 vop1 = ((op_type == binary_op)
4303 ? VEC_index (tree, vec_oprnds1, i) : NULL);
4304 new_stmt = gimple_build_assign_with_ops (code, vec_dest, vop0, vop1);
4305 new_temp = make_ssa_name (vec_dest, new_stmt);
4306 gimple_assign_set_lhs (new_stmt, new_temp);
4307 vect_finish_stmt_generation (stmt, new_stmt, gsi);
4309 VEC_quick_push (gimple, SLP_TREE_VEC_STMTS (slp_node), new_stmt);
4316 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
4318 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
4319 prev_stmt_info = vinfo_for_stmt (new_stmt);
4322 VEC_free (tree, heap, vec_oprnds0);
4324 VEC_free (tree, heap, vec_oprnds1);
4330 /* Get vectorized definitions for loop-based vectorization. For the first
4331 operand we call vect_get_vec_def_for_operand() (with OPRND containing
4332 scalar operand), and for the rest we get a copy with
4333 vect_get_vec_def_for_stmt_copy() using the previous vector definition
4334 (stored in OPRND). See vect_get_vec_def_for_stmt_copy() for details.
4335 The vectors are collected into VEC_OPRNDS. */
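/* For illustration, with MULTI_STEP_CVT == 1 this collects four defs:
     vx0 = vect_get_vec_def_for_operand (oprnd)    <-- from the scalar
     vx1 = vect_get_vec_def_for_stmt_copy (vx0)
     vx2 = vect_get_vec_def_for_stmt_copy (vx1)
     vx3 = vect_get_vec_def_for_stmt_copy (vx2)
   i.e., each recursion level doubles the number of collected defs.  */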
4338 vect_get_loop_based_defs (tree *oprnd, gimple stmt, enum vect_def_type dt,
4339 VEC (tree, heap) **vec_oprnds, int multi_step_cvt)
4343 /* Get first vector operand. */
4344 /* All the vector operands except the very first one (that is the scalar oprnd) are stmt copies. */
4346 if (TREE_CODE (TREE_TYPE (*oprnd)) != VECTOR_TYPE)
4347 vec_oprnd = vect_get_vec_def_for_operand (*oprnd, stmt, NULL);
4349 vec_oprnd = vect_get_vec_def_for_stmt_copy (dt, *oprnd);
4351 VEC_quick_push (tree, *vec_oprnds, vec_oprnd);
4353 /* Get second vector operand. */
4354 vec_oprnd = vect_get_vec_def_for_stmt_copy (dt, vec_oprnd);
4355 VEC_quick_push (tree, *vec_oprnds, vec_oprnd);
4359 /* For conversion in multiple steps, continue to get operands recursively. */
4362 vect_get_loop_based_defs (oprnd, stmt, dt, vec_oprnds, multi_step_cvt - 1);
4366 /* Create vectorized demotion statements for vector operands from VEC_OPRNDS.
4367 For multi-step conversions store the resulting vectors and call the function recursively. */
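/* For illustration, assuming a V4SI -> V8HI demotion step: each pair
   (vop0, vop1) of int vectors in VEC_OPRNDS is packed into a single
   short vector, so every step halves the number of vectors.  */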
4371 vect_create_vectorized_demotion_stmts (VEC (tree, heap) **vec_oprnds,
4372 int multi_step_cvt, gimple stmt,
4373 VEC (tree, heap) *vec_dsts,
4374 gimple_stmt_iterator *gsi,
4375 slp_tree slp_node, enum tree_code code,
4376 stmt_vec_info *prev_stmt_info)
4379 tree vop0, vop1, new_tmp, vec_dest;
4381 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4383 vec_dest = VEC_pop (tree, vec_dsts);
4385 for (i = 0; i < VEC_length (tree, *vec_oprnds); i += 2)
4387 /* Create demotion operation. */
4388 vop0 = VEC_index (tree, *vec_oprnds, i);
4389 vop1 = VEC_index (tree, *vec_oprnds, i + 1);
4390 new_stmt = gimple_build_assign_with_ops (code, vec_dest, vop0, vop1);
4391 new_tmp = make_ssa_name (vec_dest, new_stmt);
4392 gimple_assign_set_lhs (new_stmt, new_tmp);
4393 vect_finish_stmt_generation (stmt, new_stmt, gsi);
4396 /* Store the resulting vector for next recursive call. */
4397 VEC_replace (tree, *vec_oprnds, i/2, new_tmp);
4400 /* This is the last step of the conversion sequence. Store the
4401 vectors in SLP_NODE or in vector info of the scalar statement
4402 (or in STMT_VINFO_RELATED_STMT chain). */
4404 VEC_quick_push (gimple, SLP_TREE_VEC_STMTS (slp_node), new_stmt);
4407 if (!*prev_stmt_info)
4408 STMT_VINFO_VEC_STMT (stmt_info) = new_stmt;
4410 STMT_VINFO_RELATED_STMT (*prev_stmt_info) = new_stmt;
4412 *prev_stmt_info = vinfo_for_stmt (new_stmt);
4417 /* For multi-step demotion operations we first generate demotion operations
4418 from the source type to the intermediate types, and then combine the
4419 results (stored in VEC_OPRNDS) in a demotion operation to the destination type. */
4423 /* At each level of recursion we have half of the operands we had at the previous level. */
4425 VEC_truncate (tree, *vec_oprnds, (i+1)/2);
4426 vect_create_vectorized_demotion_stmts (vec_oprnds, multi_step_cvt - 1,
4427 stmt, vec_dsts, gsi, slp_node,
4428 code, prev_stmt_info);
4433 /* Function vectorizable_type_demotion
4435 Check if STMT performs a binary or unary operation that involves
4436 type demotion, and if it can be vectorized.
4437 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
4438 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
4439 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
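/* For illustration, assuming V4SI/V8HI:  short_s = (short) int_x  takes
   two V4SI defs of 'int_x' per copy and narrows them into one V8HI
   result, e.g.  vs = VEC_PACK_TRUNC_EXPR <vx0, vx1>.  */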
4442 vectorizable_type_demotion (gimple stmt, gimple_stmt_iterator *gsi,
4443 gimple *vec_stmt, slp_tree slp_node)
4448 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4449 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4450 enum tree_code code, code1 = ERROR_MARK;
4453 enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
4454 stmt_vec_info prev_stmt_info;
4461 int multi_step_cvt = 0;
4462 VEC (tree, heap) *vec_oprnds0 = NULL;
4463 VEC (tree, heap) *vec_dsts = NULL, *interm_types = NULL, *tmp_vec_dsts = NULL;
4464 tree last_oprnd, intermediate_type;
4466 if (!STMT_VINFO_RELEVANT_P (stmt_info))
4469 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
4472 /* Is STMT a vectorizable type-demotion operation? */
4473 if (!is_gimple_assign (stmt))
4476 if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
4479 code = gimple_assign_rhs_code (stmt);
4480 if (!CONVERT_EXPR_CODE_P (code))
4483 op0 = gimple_assign_rhs1 (stmt);
4484 vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op0));
4487 nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
4489 scalar_dest = gimple_assign_lhs (stmt);
4490 vectype_out = get_vectype_for_scalar_type (TREE_TYPE (scalar_dest));
4493 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
4494 if (nunits_in >= nunits_out)
4497 /* Multiple types in SLP are handled by creating the appropriate number of
4498 vectorized stmts for each SLP node. Hence, NCOPIES is always 1 in case of SLP. */
4503 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_out;
4505 gcc_assert (ncopies >= 1);
4507 if (! ((INTEGRAL_TYPE_P (TREE_TYPE (scalar_dest))
4508 && INTEGRAL_TYPE_P (TREE_TYPE (op0)))
4509 || (SCALAR_FLOAT_TYPE_P (TREE_TYPE (scalar_dest))
4510 && SCALAR_FLOAT_TYPE_P (TREE_TYPE (op0))
4511 && CONVERT_EXPR_CODE_P (code))))
4514 /* Check the operands of the operation. */
4515 if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt[0]))
4517 if (vect_print_dump_info (REPORT_DETAILS))
4518 fprintf (vect_dump, "use not simple.");
4522 /* Supportable by target? */
4523 if (!supportable_narrowing_operation (code, stmt, vectype_in, &code1,
4524 &multi_step_cvt, &interm_types))
4527 STMT_VINFO_VECTYPE (stmt_info) = vectype_in;
4529 if (!vec_stmt) /* transformation not required. */
4531 STMT_VINFO_TYPE (stmt_info) = type_demotion_vec_info_type;
4532 if (vect_print_dump_info (REPORT_DETAILS))
4533 fprintf (vect_dump, "=== vectorizable_demotion ===");
4534 vect_model_simple_cost (stmt_info, ncopies, dt, NULL);
4539 if (vect_print_dump_info (REPORT_DETAILS))
4540 fprintf (vect_dump, "transform type demotion operation. ncopies = %d.",
4543 /* In case of multi-step demotion, we first generate demotion operations to
4544 the intermediate types, and then from those types to the final one.
4545 We create vector destinations for the intermediate type (TYPES) received
4546 from supportable_narrowing_operation, and store them in the correct order
4547 for future use in vect_create_vectorized_demotion_stmts(). */
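/* For illustration, demoting int to char (assuming one intermediate
   short type): VEC_DSTS is filled as {char_dest, short_dest}, so the
   first VEC_pop yields the short destination for the int->short step,
   and the next one the char destination for the final short->char
   step.  */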
4549 vec_dsts = VEC_alloc (tree, heap, multi_step_cvt + 1);
4551 vec_dsts = VEC_alloc (tree, heap, 1);
4553 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
4554 VEC_quick_push (tree, vec_dsts, vec_dest);
4558 for (i = VEC_length (tree, interm_types) - 1;
4559 VEC_iterate (tree, interm_types, i, intermediate_type); i--)
4561 vec_dest = vect_create_destination_var (scalar_dest,
4563 VEC_quick_push (tree, vec_dsts, vec_dest);
4567 /* In case the vectorization factor (VF) is bigger than the number
4568 of elements that we can fit in a vectype (nunits), we have to generate
4569 more than one vector stmt - i.e - we need to "unroll" the
4570 vector stmt by a factor VF/nunits. */
4572 prev_stmt_info = NULL;
4573 for (j = 0; j < ncopies; j++)
4577 vect_get_slp_defs (slp_node, &vec_oprnds0, NULL);
4580 VEC_free (tree, heap, vec_oprnds0);
4581 vec_oprnds0 = VEC_alloc (tree, heap,
4582 (multi_step_cvt ? vect_pow2 (multi_step_cvt) * 2 : 2));
4583 vect_get_loop_based_defs (&last_oprnd, stmt, dt[0], &vec_oprnds0,
4584 vect_pow2 (multi_step_cvt) - 1);
4587 /* Arguments are ready. Create the new vector stmts. */
4588 tmp_vec_dsts = VEC_copy (tree, heap, vec_dsts);
4589 vect_create_vectorized_demotion_stmts (&vec_oprnds0,
4590 multi_step_cvt, stmt, tmp_vec_dsts,
4591 gsi, slp_node, code1,
4595 VEC_free (tree, heap, vec_oprnds0);
4596 VEC_free (tree, heap, vec_dsts);
4597 VEC_free (tree, heap, tmp_vec_dsts);
4598 VEC_free (tree, heap, interm_types);
4600 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
4605 /* Create vectorized promotion statements for vector operands from VEC_OPRNDS0
4606 and VEC_OPRNDS1 (for binary operations). For multi-step conversions store
4607 the resulting vectors and call the function recursively. */
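/* For illustration, assuming a V8HI -> V4SI promotion step: each short
   vector VOP0 yields two int vectors via the CODE1/CODE2 pair (e.g.
   VEC_UNPACK_LO_EXPR/VEC_UNPACK_HI_EXPR), so every step doubles the
   number of vectors.  */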
4610 vect_create_vectorized_promotion_stmts (VEC (tree, heap) **vec_oprnds0,
4611 VEC (tree, heap) **vec_oprnds1,
4612 int multi_step_cvt, gimple stmt,
4613 VEC (tree, heap) *vec_dsts,
4614 gimple_stmt_iterator *gsi,
4615 slp_tree slp_node, enum tree_code code1,
4616 enum tree_code code2, tree decl1,
4617 tree decl2, int op_type,
4618 stmt_vec_info *prev_stmt_info)
4621 tree vop0, vop1, new_tmp1, new_tmp2, vec_dest;
4622 gimple new_stmt1, new_stmt2;
4623 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4624 VEC (tree, heap) *vec_tmp;
4626 vec_dest = VEC_pop (tree, vec_dsts);
4627 vec_tmp = VEC_alloc (tree, heap, VEC_length (tree, *vec_oprnds0) * 2);
4629 for (i = 0; VEC_iterate (tree, *vec_oprnds0, i, vop0); i++)
4631 if (op_type == binary_op)
4632 vop1 = VEC_index (tree, *vec_oprnds1, i);
4636 /* Generate the two halves of the promotion operation. */
4637 new_stmt1 = vect_gen_widened_results_half (code1, decl1, vop0, vop1,
4638 op_type, vec_dest, gsi, stmt);
4639 new_stmt2 = vect_gen_widened_results_half (code2, decl2, vop0, vop1,
4640 op_type, vec_dest, gsi, stmt);
4641 if (is_gimple_call (new_stmt1))
4643 new_tmp1 = gimple_call_lhs (new_stmt1);
4644 new_tmp2 = gimple_call_lhs (new_stmt2);
4648 new_tmp1 = gimple_assign_lhs (new_stmt1);
4649 new_tmp2 = gimple_assign_lhs (new_stmt2);
4654 /* Store the results for the recursive call. */
4655 VEC_quick_push (tree, vec_tmp, new_tmp1);
4656 VEC_quick_push (tree, vec_tmp, new_tmp2);
4660 /* Last step of the promotion sequence - store the results. */
4663 VEC_quick_push (gimple, SLP_TREE_VEC_STMTS (slp_node), new_stmt1);
4664 VEC_quick_push (gimple, SLP_TREE_VEC_STMTS (slp_node), new_stmt2);
4668 if (!*prev_stmt_info)
4669 STMT_VINFO_VEC_STMT (stmt_info) = new_stmt1;
4671 STMT_VINFO_RELATED_STMT (*prev_stmt_info) = new_stmt1;
4673 *prev_stmt_info = vinfo_for_stmt (new_stmt1);
4674 STMT_VINFO_RELATED_STMT (*prev_stmt_info) = new_stmt2;
4675 *prev_stmt_info = vinfo_for_stmt (new_stmt2);
4682 /* For a multi-step promotion operation we call the function recursively
4683 for every stage. We start from the input type,
4684 create promotion operations to the intermediate types, and then
4685 create promotions to the output type. */
4686 *vec_oprnds0 = VEC_copy (tree, heap, vec_tmp);
4687 VEC_free (tree, heap, vec_tmp);
4688 vect_create_vectorized_promotion_stmts (vec_oprnds0, vec_oprnds1,
4689 multi_step_cvt - 1, stmt,
4690 vec_dsts, gsi, slp_node, code1,
4691 code2, decl1, decl2, op_type,
4697 /* Function vectorizable_type_promotion
4699 Check if STMT performs a binary or unary operation that involves
4700 type promotion, and if it can be vectorized.
4701 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
4702 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
4703 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
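/* For illustration, assuming V8HI/V4SI:  int_x = (int) short_s  widens
   one V8HI def of 'short_s' into two V4SI results per copy - the "low"
   and "high" halves produced by vect_gen_widened_results_half.  */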
4706 vectorizable_type_promotion (gimple stmt, gimple_stmt_iterator *gsi,
4707 gimple *vec_stmt, slp_tree slp_node)
4711 tree op0, op1 = NULL;
4712 tree vec_oprnd0=NULL, vec_oprnd1=NULL;
4713 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4714 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4715 enum tree_code code, code1 = ERROR_MARK, code2 = ERROR_MARK;
4716 tree decl1 = NULL_TREE, decl2 = NULL_TREE;
4720 enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
4721 stmt_vec_info prev_stmt_info;
4728 tree intermediate_type = NULL_TREE;
4729 int multi_step_cvt = 0;
4730 VEC (tree, heap) *vec_oprnds0 = NULL, *vec_oprnds1 = NULL;
4731 VEC (tree, heap) *vec_dsts = NULL, *interm_types = NULL, *tmp_vec_dsts = NULL;
4733 if (!STMT_VINFO_RELEVANT_P (stmt_info))
4736 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
4739 /* Is STMT a vectorizable type-promotion operation? */
4740 if (!is_gimple_assign (stmt))
4743 if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
4746 code = gimple_assign_rhs_code (stmt);
4747 if (!CONVERT_EXPR_CODE_P (code)
4748 && code != WIDEN_MULT_EXPR)
4751 op0 = gimple_assign_rhs1 (stmt);
4752 vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op0));
4755 nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
4757 scalar_dest = gimple_assign_lhs (stmt);
4758 vectype_out = get_vectype_for_scalar_type (TREE_TYPE (scalar_dest));
4761 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
4762 if (nunits_in <= nunits_out)
4765 /* Multiple types in SLP are handled by creating the appropriate number of
4766 vectorized stmts for each SLP node. Hence, NCOPIES is always 1 in case of SLP. */
4771 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in;
4773 gcc_assert (ncopies >= 1);
4775 if (! ((INTEGRAL_TYPE_P (TREE_TYPE (scalar_dest))
4776 && INTEGRAL_TYPE_P (TREE_TYPE (op0)))
4777 || (SCALAR_FLOAT_TYPE_P (TREE_TYPE (scalar_dest))
4778 && SCALAR_FLOAT_TYPE_P (TREE_TYPE (op0))
4779 && CONVERT_EXPR_CODE_P (code))))
4782 /* Check the operands of the operation. */
4783 if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt[0]))
4785 if (vect_print_dump_info (REPORT_DETAILS))
4786 fprintf (vect_dump, "use not simple.");
4790 op_type = TREE_CODE_LENGTH (code);
4791 if (op_type == binary_op)
4793 op1 = gimple_assign_rhs2 (stmt);
4794 if (!vect_is_simple_use (op1, loop_vinfo, &def_stmt, &def, &dt[1]))
4796 if (vect_print_dump_info (REPORT_DETAILS))
4797 fprintf (vect_dump, "use not simple.");
4802 /* Supportable by target? */
4803 if (!supportable_widening_operation (code, stmt, vectype_in,
4804 &decl1, &decl2, &code1, &code2,
4805 &multi_step_cvt, &interm_types))
4808 /* Binary widening operation can only be supported directly by the architecture. */
4810 gcc_assert (!(multi_step_cvt && op_type == binary_op));
4812 STMT_VINFO_VECTYPE (stmt_info) = vectype_in;
4814 if (!vec_stmt) /* transformation not required. */
4816 STMT_VINFO_TYPE (stmt_info) = type_promotion_vec_info_type;
4817 if (vect_print_dump_info (REPORT_DETAILS))
4818 fprintf (vect_dump, "=== vectorizable_promotion ===");
4819 vect_model_simple_cost (stmt_info, 2*ncopies, dt, NULL);
4825 if (vect_print_dump_info (REPORT_DETAILS))
4826 fprintf (vect_dump, "transform type promotion operation. ncopies = %d.",
4830 /* In case of multi-step promotion, we first generate promotion operations
4831 to the intermediate types, and then from those types to the final one.
4832 We store the vector destinations in VEC_DSTS in the correct order for
4833 recursive creation of promotion operations in
4834 vect_create_vectorized_promotion_stmts(). Vector destinations are created
4835 according to TYPES received from supportable_widening_operation(). */
4837 vec_dsts = VEC_alloc (tree, heap, multi_step_cvt + 1);
4839 vec_dsts = VEC_alloc (tree, heap, 1);
4841 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
4842 VEC_quick_push (tree, vec_dsts, vec_dest);
4846 for (i = VEC_length (tree, interm_types) - 1;
4847 VEC_iterate (tree, interm_types, i, intermediate_type); i--)
4849 vec_dest = vect_create_destination_var (scalar_dest,
4851 VEC_quick_push (tree, vec_dsts, vec_dest);
4857 vec_oprnds0 = VEC_alloc (tree, heap,
4858 (multi_step_cvt ? vect_pow2 (multi_step_cvt) : 1));
4859 if (op_type == binary_op)
4860 vec_oprnds1 = VEC_alloc (tree, heap, 1);
4863 /* In case the vectorization factor (VF) is bigger than the number
4864 of elements that we can fit in a vectype (nunits), we have to generate
4865 more than one vector stmt - i.e - we need to "unroll" the
4866 vector stmt by a factor VF/nunits. */
4868 prev_stmt_info = NULL;
4869 for (j = 0; j < ncopies; j++)
4875 vect_get_slp_defs (slp_node, &vec_oprnds0, &vec_oprnds1);
4878 vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
4879 VEC_quick_push (tree, vec_oprnds0, vec_oprnd0);
4880 if (op_type == binary_op)
4882 vec_oprnd1 = vect_get_vec_def_for_operand (op1, stmt, NULL);
4883 VEC_quick_push (tree, vec_oprnds1, vec_oprnd1);
4889 vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd0);
4890 VEC_replace (tree, vec_oprnds0, 0, vec_oprnd0);
4891 if (op_type == binary_op)
4893 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt[1], vec_oprnd1);
4894 VEC_replace (tree, vec_oprnds1, 0, vec_oprnd1);
4898 /* Arguments are ready. Create the new vector stmts. */
4899 tmp_vec_dsts = VEC_copy (tree, heap, vec_dsts);
4900 vect_create_vectorized_promotion_stmts (&vec_oprnds0, &vec_oprnds1,
4901 multi_step_cvt, stmt,
4903 gsi, slp_node, code1, code2,
4904 decl1, decl2, op_type,
4908 VEC_free (tree, heap, vec_dsts);
4909 VEC_free (tree, heap, tmp_vec_dsts);
4910 VEC_free (tree, heap, interm_types);
4911 VEC_free (tree, heap, vec_oprnds0);
4912 VEC_free (tree, heap, vec_oprnds1);
4914 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
4919 /* Function vect_strided_store_supported.
4921 Returns TRUE if INTERLEAVE_HIGH and INTERLEAVE_LOW operations are supported,
4922 and FALSE otherwise. */
4925 vect_strided_store_supported (tree vectype)
4927 optab interleave_high_optab, interleave_low_optab;
4930 mode = (int) TYPE_MODE (vectype);
4932 /* Check that the operation is supported. */
4933 interleave_high_optab = optab_for_tree_code (VEC_INTERLEAVE_HIGH_EXPR,
4934 vectype, optab_default);
4935 interleave_low_optab = optab_for_tree_code (VEC_INTERLEAVE_LOW_EXPR,
4936 vectype, optab_default);
4937 if (!interleave_high_optab || !interleave_low_optab)
4939 if (vect_print_dump_info (REPORT_DETAILS))
4940 fprintf (vect_dump, "no optab for interleave.");
4944 if (optab_handler (interleave_high_optab, mode)->insn_code
4946 || optab_handler (interleave_low_optab, mode)->insn_code
4947 == CODE_FOR_nothing)
4949 if (vect_print_dump_info (REPORT_DETAILS))
4950 fprintf (vect_dump, "interleave op not supported by target.");
4958 /* Function vect_permute_store_chain.
4960 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
4961 a power of 2, generate interleave_high/low stmts to reorder the data
4962 correctly for the stores. Return the final references for stores in RESULT_CHAIN.
4965 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4966 The input is 4 vectors, each containing 8 elements. We assign a number to each
4967 element; the input sequence is:
4969 1st vec: 0 1 2 3 4 5 6 7
4970 2nd vec: 8 9 10 11 12 13 14 15
4971 3rd vec: 16 17 18 19 20 21 22 23
4972 4th vec: 24 25 26 27 28 29 30 31
4974 The output sequence should be:
4976 1st vec: 0 8 16 24 1 9 17 25
4977 2nd vec: 2 10 18 26 3 11 19 27
4978 3rd vec: 4 12 20 28 5 13 21 29
4979 4th vec: 6 14 22 30 7 15 23 31
4981 i.e., we interleave the contents of the four vectors in their order.
4983 We use interleave_high/low instructions to create such output. The input of
4984 each interleave_high/low operation is two vectors:
4985 1st vec: 0 1 2 3
4986 2nd vec: 4 5 6 7
4987 the even elements of the result vector are obtained left-to-right from the
4988 high/low elements of the first vector. The odd elements of the result are
4989 obtained left-to-right from the high/low elements of the second vector.
4990 The output of interleave_high will be: 0 4 1 5
4991 and of interleave_low: 2 6 3 7
4994 The permutation is done in log LENGTH stages. In each stage interleave_high
4995 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
4996 where the first argument is taken from the first half of DR_CHAIN and the
4997 second argument from its second half. In our example,
5000 I1: interleave_high (1st vec, 3rd vec)
5001 I2: interleave_low (1st vec, 3rd vec)
5002 I3: interleave_high (2nd vec, 4th vec)
5003 I4: interleave_low (2nd vec, 4th vec)
5005 The output for the first stage is:
5007 I1: 0 16 1 17 2 18 3 19
5008 I2: 4 20 5 21 6 22 7 23
5009 I3: 8 24 9 25 10 26 11 27
5010 I4: 12 28 13 29 14 30 15 31
5012 The output of the second stage, i.e. the final result is:
5014 I1: 0 8 16 24 1 9 17 25
5015 I2: 2 10 18 26 3 11 19 27
5016 I3: 4 12 20 28 5 13 21 29
5017 I4: 6 14 22 30 7 15 23 31. */
5020 vect_permute_store_chain (VEC(tree,heap) *dr_chain,
5021 unsigned int length,
5022 gimple stmt,
5023 gimple_stmt_iterator *gsi,
5024 VEC(tree,heap) **result_chain)
5026 tree perm_dest, vect1, vect2, high, low;
5028 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5032 enum tree_code high_code, low_code;
5034 scalar_dest = gimple_assign_lhs (stmt);
5036 /* Check that the operation is supported. */
5037 if (!vect_strided_store_supported (vectype))
5040 *result_chain = VEC_copy (tree, heap, dr_chain);
5042 for (i = 0; i < exact_log2 (length); i++)
5044 for (j = 0; j < length/2; j++)
5046 vect1 = VEC_index (tree, dr_chain, j);
5047 vect2 = VEC_index (tree, dr_chain, j+length/2);
5049 /* Create interleaving stmt:
5050 in the case of big endian:
5051 high = interleave_high (vect1, vect2)
5052 and in the case of little endian:
5053 high = interleave_low (vect1, vect2). */
5054 perm_dest = create_tmp_var (vectype, "vect_inter_high");
5055 DECL_GIMPLE_REG_P (perm_dest) = 1;
5056 add_referenced_var (perm_dest);
5057 if (BYTES_BIG_ENDIAN)
5059 high_code = VEC_INTERLEAVE_HIGH_EXPR;
5060 low_code = VEC_INTERLEAVE_LOW_EXPR;
5064 low_code = VEC_INTERLEAVE_HIGH_EXPR;
5065 high_code = VEC_INTERLEAVE_LOW_EXPR;
5067 perm_stmt = gimple_build_assign_with_ops (high_code, perm_dest,
5069 high = make_ssa_name (perm_dest, perm_stmt);
5070 gimple_assign_set_lhs (perm_stmt, high);
5071 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5072 VEC_replace (tree, *result_chain, 2*j, high);
5074 /* Create interleaving stmt:
5075 in the case of big endian:
5076 low = interleave_low (vect1, vect2)
5077 and in the case of little endian:
5078 low = interleave_high (vect1, vect2). */
5079 perm_dest = create_tmp_var (vectype, "vect_inter_low");
5080 DECL_GIMPLE_REG_P (perm_dest) = 1;
5081 add_referenced_var (perm_dest);
5082 perm_stmt = gimple_build_assign_with_ops (low_code, perm_dest,
5084 low = make_ssa_name (perm_dest, perm_stmt);
5085 gimple_assign_set_lhs (perm_stmt, low);
5086 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5087 VEC_replace (tree, *result_chain, 2*j+1, low);
5089 dr_chain = VEC_copy (tree, heap, *result_chain);
5095 /* Function vectorizable_store.
5097 Check if STMT defines a non-scalar data-ref (array/pointer/structure) that can be vectorized.
5099 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
5100 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
5101 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
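/* For illustration, the scalar store  a[i] = x  becomes (per copy)
     *dataref_ptr = vx;
   where DATAREF_PTR is a pointer to vectype created by
   vect_create_data_ref_ptr and bumped by the vector size between
   copies.  */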
5104 vectorizable_store (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt,
5110 tree vec_oprnd = NULL_TREE;
5111 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5112 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info), *first_dr = NULL;
5113 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5114 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5115 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5116 enum machine_mode vec_mode;
5118 enum dr_alignment_support alignment_support_scheme;
5121 enum vect_def_type dt;
5122 stmt_vec_info prev_stmt_info = NULL;
5123 tree dataref_ptr = NULL_TREE;
5124 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
5127 gimple next_stmt, first_stmt = NULL;
5128 bool strided_store = false;
5129 unsigned int group_size, i;
5130 VEC(tree,heap) *dr_chain = NULL, *oprnds = NULL, *result_chain = NULL;
5132 VEC(tree,heap) *vec_oprnds = NULL;
5133 bool slp = (slp_node != NULL);
5134 stmt_vec_info first_stmt_vinfo;
5135 unsigned int vec_num;
5137 /* Multiple types in SLP are handled by creating the appropriate number of
5138 vectorized stmts for each SLP node. Hence, NCOPIES is always 1 in
5143 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
5145 gcc_assert (ncopies >= 1);
5147 /* FORNOW. This restriction should be relaxed. */
5148 if (nested_in_vect_loop_p (loop, stmt) && ncopies > 1)
5150 if (vect_print_dump_info (REPORT_DETAILS))
5151 fprintf (vect_dump, "multiple types in nested loop.");
5155 if (!STMT_VINFO_RELEVANT_P (stmt_info))
5158 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
5161 /* Is vectorizable store? */
5163 if (!is_gimple_assign (stmt))
5166 scalar_dest = gimple_assign_lhs (stmt);
5167 if (TREE_CODE (scalar_dest) != ARRAY_REF
5168 && TREE_CODE (scalar_dest) != INDIRECT_REF
5169 && !STMT_VINFO_STRIDED_ACCESS (stmt_info))
5172 gcc_assert (gimple_assign_single_p (stmt));
5173 op = gimple_assign_rhs1 (stmt);
5174 if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt))
5176 if (vect_print_dump_info (REPORT_DETAILS))
5177 fprintf (vect_dump, "use not simple.");
5181 /* If accesses through a pointer to vectype do not alias the original
5182 memory reference we have a problem. This should never be the case. */
5183 if (get_alias_set (vectype) != get_alias_set (scalar_dest)
5184 && !alias_set_subset_of (get_alias_set (vectype),
5185 get_alias_set (scalar_dest)))
5187 if (vect_print_dump_info (REPORT_DETAILS))
5188 fprintf (vect_dump, "??? vector type does not alias scalar type");
5192 /* The scalar rhs type needs to be trivially convertible to the vector
5193 component type. This should always be the case. */
5194 if (!useless_type_conversion_p (TREE_TYPE (vectype), TREE_TYPE (op)))
5196 if (vect_print_dump_info (REPORT_DETAILS))
5197 fprintf (vect_dump, "??? operands of different types");
5201 vec_mode = TYPE_MODE (vectype);
5202 /* FORNOW. In some cases we can vectorize even if the data-type is not supported
5203 (e.g. - array initialization with 0). */
5204 if (optab_handler (mov_optab, (int)vec_mode)->insn_code == CODE_FOR_nothing)
5207 if (!STMT_VINFO_DATA_REF (stmt_info))
5210 if (STMT_VINFO_STRIDED_ACCESS (stmt_info))
5212 strided_store = true;
5213 first_stmt = DR_GROUP_FIRST_DR (stmt_info);
5214 if (!vect_strided_store_supported (vectype)
5215 && !PURE_SLP_STMT (stmt_info) && !slp)
5218 if (first_stmt == stmt)
5220 /* STMT is the leader of the group. Check the operands of all the
5221 stmts of the group. */
5222 next_stmt = DR_GROUP_NEXT_DR (stmt_info);
5225 gcc_assert (gimple_assign_single_p (next_stmt));
5226 op = gimple_assign_rhs1 (next_stmt);
5227 if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt))
5229 if (vect_print_dump_info (REPORT_DETAILS))
5230 fprintf (vect_dump, "use not simple.");
5233 next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt));
5238 if (!vec_stmt) /* transformation not required. */
5240 STMT_VINFO_TYPE (stmt_info) = store_vec_info_type;
5241 vect_model_store_cost (stmt_info, ncopies, dt, NULL);
5249 first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
5250 group_size = DR_GROUP_SIZE (vinfo_for_stmt (first_stmt));
5252 DR_GROUP_STORE_COUNT (vinfo_for_stmt (first_stmt))++;
5255 gcc_assert (!nested_in_vect_loop_p (loop, stmt));
5257 /* We vectorize all the stmts of the interleaving group when we
5258 reach the last stmt in the group. */
5259 if (DR_GROUP_STORE_COUNT (vinfo_for_stmt (first_stmt))
5260 < DR_GROUP_SIZE (vinfo_for_stmt (first_stmt))
5268 strided_store = false;
5270 /* VEC_NUM is the number of vect stmts to be created for this group. */
5272 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
5274 vec_num = group_size;
5280 group_size = vec_num = 1;
5281 first_stmt_vinfo = stmt_info;
5284 if (vect_print_dump_info (REPORT_DETAILS))
5285 fprintf (vect_dump, "transform store. ncopies = %d",ncopies);
5287 dr_chain = VEC_alloc (tree, heap, group_size);
5288 oprnds = VEC_alloc (tree, heap, group_size);
5290 alignment_support_scheme = vect_supportable_dr_alignment (first_dr);
5291 gcc_assert (alignment_support_scheme);
5292 gcc_assert (alignment_support_scheme == dr_aligned); /* FORNOW */
5294 /* In case the vectorization factor (VF) is bigger than the number
5295 of elements that we can fit in a vectype (nunits), we have to generate
5296 more than one vector stmt - i.e - we need to "unroll" the
5297 vector stmt by a factor VF/nunits. For more details see documentation in
5298 vect_get_vec_def_for_copy_stmt. */
5300 /* In case of interleaving (non-unit strided access):
5302 S1: &base + 2 = x2
5303 S2: &base = x0
5304 S3: &base + 1 = x1
5305 S4: &base + 3 = x3
5307 We create vectorized stores starting from the base address (the access of
5308 the first stmt in the chain - S2 in the above example) when the last store
5309 stmt of the chain (S4) is reached:
5312 VS2: &base + vec_size*1 = vx0
5313 VS3: &base + vec_size*2 = vx1
5314 VS4: &base + vec_size*3 = vx3
5316 Then permutation statements are generated:
5318 VS5: vx5 = VEC_INTERLEAVE_HIGH_EXPR < vx0, vx3 >
5319 VS6: vx6 = VEC_INTERLEAVE_LOW_EXPR < vx0, vx3 >
5322 And they are put in STMT_VINFO_VEC_STMT of the corresponding scalar stmts
5323 (the order of the data-refs in the output of vect_permute_store_chain
5324 corresponds to the order of scalar stmts in the interleaving chain - see
5325 the documentation of vect_permute_store_chain()).
5327 In case of both multiple types and interleaving, above vector stores and
5328 permutation stmts are created for every copy. The result vector stmts are
5329 put in STMT_VINFO_VEC_STMT for the first copy and in the corresponding
5330 STMT_VINFO_RELATED_STMT for the next copies.
5333 prev_stmt_info = NULL;
5334 for (j = 0; j < ncopies; j++)
5343 /* Get vectorized arguments for SLP_NODE. */
5344 vect_get_slp_defs (slp_node, &vec_oprnds, NULL);
5346 vec_oprnd = VEC_index (tree, vec_oprnds, 0);
5350 /* For interleaved stores we collect vectorized defs for all the
5351 stores in the group in DR_CHAIN and OPRNDS. DR_CHAIN is then
5352 used as an input to vect_permute_store_chain(), and OPRNDS as
5353 an input to vect_get_vec_def_for_stmt_copy() for the next copy.
5355 If the store is not strided, GROUP_SIZE is 1, and DR_CHAIN and
5356 OPRNDS are of size 1. */
5357 next_stmt = first_stmt;
5358 for (i = 0; i < group_size; i++)
5360 /* Since gaps are not supported for interleaved stores,
5361 GROUP_SIZE is the exact number of stmts in the chain.
5362 Therefore, NEXT_STMT can't be NULL_TREE. In case
5363 there is no interleaving, GROUP_SIZE is 1, and only one
5364 iteration of the loop will be executed. */
5365 gcc_assert (next_stmt
5366 && gimple_assign_single_p (next_stmt));
5367 op = gimple_assign_rhs1 (next_stmt);
5369 vec_oprnd = vect_get_vec_def_for_operand (op, next_stmt,
5371 VEC_quick_push(tree, dr_chain, vec_oprnd);
5372 VEC_quick_push(tree, oprnds, vec_oprnd);
5373 next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt));
5377 /* We should have caught mismatched types earlier. */
5378 gcc_assert (useless_type_conversion_p (vectype,
5379 TREE_TYPE (vec_oprnd)));
5380 dataref_ptr = vect_create_data_ref_ptr (first_stmt, NULL, NULL_TREE,
5381 &dummy, &ptr_incr, false,
5383 gcc_assert (!inv_p);
5387 /* For interleaved stores we created vectorized defs for all the
5388 defs stored in OPRNDS in the previous iteration (previous copy).
5389 DR_CHAIN is then used as an input to vect_permute_store_chain(),
5390 and OPRNDS as an input to vect_get_vec_def_for_stmt_copy() for the
5392 If the store is not strided, GROUP_SIZE is 1, and DR_CHAIN and
5393 OPRNDS are of size 1. */
5394 for (i = 0; i < group_size; i++)
5396 op = VEC_index (tree, oprnds, i);
5397 vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt);
5398 vec_oprnd = vect_get_vec_def_for_stmt_copy (dt, op);
5399 VEC_replace(tree, dr_chain, i, vec_oprnd);
5400 VEC_replace(tree, oprnds, i, vec_oprnd);
5403 bump_vector_ptr (dataref_ptr, ptr_incr, gsi, stmt, NULL_TREE);
5408 result_chain = VEC_alloc (tree, heap, group_size);
5410 if (!vect_permute_store_chain (dr_chain, group_size, stmt, gsi,
5415 next_stmt = first_stmt;
5416 for (i = 0; i < vec_num; i++)
5419 /* Bump the vector pointer. */
5420 dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, gsi, stmt,
5424 vec_oprnd = VEC_index (tree, vec_oprnds, i);
5425 else if (strided_store)
5426 /* For strided stores vectorized defs are interleaved in
5427 vect_permute_store_chain(). */
5428 vec_oprnd = VEC_index (tree, result_chain, i);
5430 data_ref = build_fold_indirect_ref (dataref_ptr);
5431 /* Arguments are ready. Create the new vector stmt. */
5432 new_stmt = gimple_build_assign (data_ref, vec_oprnd);
5433 vect_finish_stmt_generation (stmt, new_stmt, gsi);
5434 mark_symbols_for_renaming (new_stmt);
5440 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
5442 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
5444 prev_stmt_info = vinfo_for_stmt (new_stmt);
5445 next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt));
5451 VEC_free (tree, heap, dr_chain);
5452 VEC_free (tree, heap, oprnds);
5454 VEC_free (tree, heap, result_chain);
5460 /* Function vect_setup_realignment
5462 This function is called when vectorizing an unaligned load using
5463 the dr_explicit_realign[_optimized] scheme.
5464 This function generates the following code at the loop prolog:
5467 x msq_init = *(floor(p)); # prolog load
5468 realignment_token = call target_builtin;
5470 x msq = phi (msq_init, ---)
5472 The stmts marked with x are generated only for the case of
5473 dr_explicit_realign_optimized.
5475 The code above sets up a new (vector) pointer, pointing to the first
5476 location accessed by STMT, and a "floor-aligned" load using that pointer.
5477 It also generates code to compute the "realignment-token" (if the relevant
5478 target hook was defined), and creates a phi-node at the loop-header bb
5479 whose arguments are the result of the prolog-load (created by this
5480 function) and the result of a load that takes place in the loop (to be
5481 created by the caller to this function).
5483 For the case of dr_explicit_realign_optimized:
5484 The caller to this function uses the phi-result (msq) to create the
5485 realignment code inside the loop, and sets up the missing phi argument,
5488 msq = phi (msq_init, lsq)
5489 lsq = *(floor(p')); # load in loop
5490 result = realign_load (msq, lsq, realignment_token);
5492 For the case of dr_explicit_realign:
5494 msq = *(floor(p)); # load in loop
5496 lsq = *(floor(p')); # load in loop
5497 result = realign_load (msq, lsq, realignment_token);
5500 STMT - (scalar) load stmt to be vectorized. This load accesses
5501 a memory location that may be unaligned.
5502 BSI - place where new code is to be inserted.
5503 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
5507 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
5508 target hook, if defined.
5509 Return value - the result of the loop-header phi node. */
5512 vect_setup_realignment (gimple stmt, gimple_stmt_iterator *gsi,
5513 tree *realignment_token,
5514 enum dr_alignment_support alignment_support_scheme,
5515 tree init_addr,
5516 struct loop **at_loop)
5518 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5519 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5520 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5521 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5523 tree scalar_dest = gimple_assign_lhs (stmt);
5530 tree msq_init = NULL_TREE;
5533 tree msq = NULL_TREE;
5534 gimple_seq stmts = NULL;
5536 bool compute_in_loop = false;
5537 bool nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
5538 struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
5539 struct loop *loop_for_initial_load;
5541 gcc_assert (alignment_support_scheme == dr_explicit_realign
5542 || alignment_support_scheme == dr_explicit_realign_optimized);
5544 /* We need to generate three things:
5545 1. the misalignment computation
5546 2. the extra vector load (for the optimized realignment scheme).
5547 3. the phi node for the two vectors from which the realignment is
5548 done (for the optimized realignment scheme).
5551 /* 1. Determine where to generate the misalignment computation.
5553 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
5554 calculation will be generated by this function, outside the loop (in the
5555 preheader). Otherwise, INIT_ADDR had already been computed for us by the
5556 caller, inside the loop.
5558 Background: If the misalignment remains fixed throughout the iterations of
5559 the loop, then both realignment schemes are applicable, and also the
5560 misalignment computation can be done outside LOOP. This is because we are
5561 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
5562 are a multiple of VS (the Vector Size), and therefore the misalignment in
5563 different vectorized LOOP iterations is always the same.
5564 The problem arises only if the memory access is in an inner-loop nested
5565 inside LOOP, which is now being vectorized using outer-loop vectorization.
5566 This is the only case when the misalignment of the memory access may not
5567 remain fixed throughout the iterations of the inner-loop (as explained in
5568 detail in vect_supportable_dr_alignment). In this case, not only is the
5569 optimized realignment scheme not applicable, but also the misalignment
5570 computation (and generation of the realignment token that is passed to
5571 REALIGN_LOAD) have to be done inside the loop.
5573 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
5574 or not, which in turn determines if the misalignment is computed inside
5575 the inner-loop, or outside LOOP. */
5577 if (init_addr != NULL_TREE)
5579 compute_in_loop = true;
5580 gcc_assert (alignment_support_scheme == dr_explicit_realign);
5584 /* 2. Determine where to generate the extra vector load.
5586 For the optimized realignment scheme, instead of generating two vector
5587 loads in each iteration, we generate a single extra vector load in the
5588 preheader of the loop, and in each iteration reuse the result of the
5589 vector load from the previous iteration. In case the memory access is in
5590 an inner-loop nested inside LOOP, which is now being vectorized using
5591 outer-loop vectorization, we need to determine whether this initial vector
5592 load should be generated at the preheader of the inner-loop, or can be
5593 generated at the preheader of LOOP. If the memory access has no evolution
5594 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
5595 to be generated inside LOOP (in the preheader of the inner-loop). */
5597 if (nested_in_vect_loop)
5599 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
5600 bool invariant_in_outerloop =
5601 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
5602 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
5605 loop_for_initial_load = loop;
5607 *at_loop = loop_for_initial_load;
5609 /* 3. For the case of the optimized realignment, create the first vector
5610 load at the loop preheader. */
5612 if (alignment_support_scheme == dr_explicit_realign_optimized)
5614 /* Create msq_init = *(floor(p1)) in the loop preheader */
5616 gcc_assert (!compute_in_loop);
5617 pe = loop_preheader_edge (loop_for_initial_load);
5618 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5619 ptr = vect_create_data_ref_ptr (stmt, loop_for_initial_load, NULL_TREE,
5620 &init_addr, &inc, true, &inv_p, NULL_TREE);
5621 data_ref = build1 (ALIGN_INDIRECT_REF, vectype, ptr);
5622 new_stmt = gimple_build_assign (vec_dest, data_ref);
5623 new_temp = make_ssa_name (vec_dest, new_stmt);
5624 gimple_assign_set_lhs (new_stmt, new_temp);
5625 mark_symbols_for_renaming (new_stmt);
5626 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5627 gcc_assert (!new_bb);
5628 msq_init = gimple_assign_lhs (new_stmt);
5631 /* 4. Create realignment token using a target builtin, if available.
5632 It is done either inside the containing loop, or before LOOP (as
5633 determined above). */
5635 if (targetm.vectorize.builtin_mask_for_load)
5639 /* Compute INIT_ADDR - the initial address accessed by this memref. */
5640 if (compute_in_loop)
5641 gcc_assert (init_addr); /* already computed by the caller. */
5644 /* Generate the INIT_ADDR computation outside LOOP. */
5645 init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
5647 pe = loop_preheader_edge (loop);
5648 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
5649 gcc_assert (!new_bb);
5652 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5653 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
5655 vect_create_destination_var (scalar_dest,
5656 gimple_call_return_type (new_stmt));
5657 new_temp = make_ssa_name (vec_dest, new_stmt);
5658 gimple_call_set_lhs (new_stmt, new_temp);
5660 if (compute_in_loop)
5661 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5664 /* Generate the misalignment computation outside LOOP. */
5665 pe = loop_preheader_edge (loop);
5666 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5667 gcc_assert (!new_bb);
5670 *realignment_token = gimple_call_lhs (new_stmt);
5672 /* The result of the CALL_EXPR to this builtin is determined from
5673 the value of the parameter and no global variables are touched
5674 which makes the builtin a "const" function. Requiring the
5675 builtin to have the "const" attribute makes it unnecessary
5676 to call mark_call_clobbered. */
5677 gcc_assert (TREE_READONLY (builtin_decl));
5680 if (alignment_support_scheme == dr_explicit_realign)
5681 return msq;
5683 gcc_assert (!compute_in_loop);
5684 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5687 /* 5. Create msq = phi <msq_init, lsq> in loop */
5689 pe = loop_preheader_edge (containing_loop);
5690 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5691 msq = make_ssa_name (vec_dest, NULL);
5692 phi_stmt = create_phi_node (msq, containing_loop->header);
5693 SSA_NAME_DEF_STMT (msq) = phi_stmt;
5694 add_phi_arg (phi_stmt, msq_init, pe);
5700 /* Function vect_strided_load_supported.
5702 Returns TRUE if EXTRACT_EVEN and EXTRACT_ODD operations are supported,
5703 and FALSE otherwise. */
5706 vect_strided_load_supported (tree vectype)
5708 optab perm_even_optab, perm_odd_optab;
5711 mode = (int) TYPE_MODE (vectype);
  perm_even_optab = optab_for_tree_code (VEC_EXTRACT_EVEN_EXPR, vectype,
                                         optab_default);
  if (!perm_even_optab)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "no optab for perm_even.");
      return false;
    }
  if (optab_handler (perm_even_optab, mode)->insn_code == CODE_FOR_nothing)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "perm_even op not supported by target.");
      return false;
    }
  perm_odd_optab = optab_for_tree_code (VEC_EXTRACT_ODD_EXPR, vectype,
                                        optab_default);
  if (!perm_odd_optab)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "no optab for perm_odd.");
      return false;
    }
  if (optab_handler (perm_odd_optab, mode)->insn_code == CODE_FOR_nothing)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "perm_odd op not supported by target.");
      return false;
    }

  return true;
}
5748 /* Function vect_permute_load_chain.
5750 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5751 a power of 2, generate extract_even/odd stmts to reorder the input data
5752 correctly. Return the final references for loads in RESULT_CHAIN.
5754 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5755 The input is 4 vectors each containing 8 elements. We assign a number to each
5756 element, the input sequence is:
5758 1st vec: 0 1 2 3 4 5 6 7
5759 2nd vec: 8 9 10 11 12 13 14 15
5760 3rd vec: 16 17 18 19 20 21 22 23
5761 4th vec: 24 25 26 27 28 29 30 31
5763 The output sequence should be:
5765 1st vec: 0 4 8 12 16 20 24 28
5766 2nd vec: 1 5 9 13 17 21 25 29
5767 3rd vec: 2 6 10 14 18 22 26 30
5768 4th vec: 3 7 11 15 19 23 27 31
5770 i.e., the first output vector should contain the first elements of each
5771 interleaving group, etc.
5773 We use extract_even/odd instructions to create such output. The input of each
   extract_even/odd operation is two vectors

   1st vec    2nd vec
   0 1 2 3    4 5 6 7

   and the output is the vector of extracted even/odd elements. The output of
   extract_even will be:   0 2 4 6
   and of extract_odd:     1 3 5 7
5783 The permutation is done in log LENGTH stages. In each stage extract_even and
5784 extract_odd stmts are created for each pair of vectors in DR_CHAIN in their
5785 order. In our example,
5787 E1: extract_even (1st vec, 2nd vec)
5788 E2: extract_odd (1st vec, 2nd vec)
5789 E3: extract_even (3rd vec, 4th vec)
5790 E4: extract_odd (3rd vec, 4th vec)
5792 The output for the first stage will be:
5794 E1: 0 2 4 6 8 10 12 14
5795 E2: 1 3 5 7 9 11 13 15
5796 E3: 16 18 20 22 24 26 28 30
5797 E4: 17 19 21 23 25 27 29 31
5799 In order to proceed and create the correct sequence for the next stage (or
5800 for the correct output, if the second stage is the last one, as in our
5801 example), we first put the output of extract_even operation and then the
5802 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5803 The input for the second stage is:
5805 1st vec (E1): 0 2 4 6 8 10 12 14
5806 2nd vec (E3): 16 18 20 22 24 26 28 30
5807 3rd vec (E2): 1 3 5 7 9 11 13 15
5808 4th vec (E4): 17 19 21 23 25 27 29 31
5810 The output of the second stage:
5812 E1: 0 4 8 12 16 20 24 28
5813 E2: 2 6 10 14 18 22 26 30
5814 E3: 1 5 9 13 17 21 25 29
5815 E4: 3 7 11 15 19 23 27 31
5817 And RESULT_CHAIN after reordering:
5819 1st vec (E1): 0 4 8 12 16 20 24 28
5820 2nd vec (E3): 1 5 9 13 17 21 25 29
5821 3rd vec (E2): 2 6 10 14 18 22 26 30
5822 4th vec (E4): 3 7 11 15 19 23 27 31. */
static bool
vect_permute_load_chain (VEC(tree,heap) *dr_chain,
                         unsigned int length,
                         gimple stmt,
                         gimple_stmt_iterator *gsi,
                         VEC(tree,heap) **result_chain)
{
  tree perm_dest, data_ref, first_vect, second_vect;
  gimple perm_stmt;
  tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
  int i;
  unsigned int j;
  /* Check that the operation is supported.  */
  if (!vect_strided_load_supported (vectype))
    return false;

  *result_chain = VEC_copy (tree, heap, dr_chain);
  for (i = 0; i < exact_log2 (length); i++)
    {
      for (j = 0; j < length; j += 2)
        {
5846 first_vect = VEC_index (tree, dr_chain, j);
5847 second_vect = VEC_index (tree, dr_chain, j+1);
5849 /* data_ref = permute_even (first_data_ref, second_data_ref); */
5850 perm_dest = create_tmp_var (vectype, "vect_perm_even");
5851 DECL_GIMPLE_REG_P (perm_dest) = 1;
5852 add_referenced_var (perm_dest);
          perm_stmt = gimple_build_assign_with_ops (VEC_EXTRACT_EVEN_EXPR,
                                                    perm_dest, first_vect,
                                                    second_vect);
          data_ref = make_ssa_name (perm_dest, perm_stmt);
5859 gimple_assign_set_lhs (perm_stmt, data_ref);
5860 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5861 mark_symbols_for_renaming (perm_stmt);
5863 VEC_replace (tree, *result_chain, j/2, data_ref);
5865 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
5866 perm_dest = create_tmp_var (vectype, "vect_perm_odd");
5867 DECL_GIMPLE_REG_P (perm_dest) = 1;
5868 add_referenced_var (perm_dest);
          perm_stmt = gimple_build_assign_with_ops (VEC_EXTRACT_ODD_EXPR,
                                                    perm_dest, first_vect,
                                                    second_vect);
          data_ref = make_ssa_name (perm_dest, perm_stmt);
5874 gimple_assign_set_lhs (perm_stmt, data_ref);
5875 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5876 mark_symbols_for_renaming (perm_stmt);
          VEC_replace (tree, *result_chain, j/2+length/2, data_ref);
        }
      dr_chain = VEC_copy (tree, heap, *result_chain);
    }

  return true;
}
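/* Illustrative sketch, not part of the vectorizer: the same even/odd
   separation on plain integer arrays.  CHAIN holds LENGTH pointers to
   "vectors" of NELTS ints each (LENGTH a power of two) and RESULT is
   scratch storage of the same shape.  Each of the log2 (LENGTH) stages
   mirrors the loop above: the evens of each pair go to the first half
   of the output order and the odds to the second half.  The name
   demo_permute_load_chain is an assumption made for this example.  */

static void
demo_permute_load_chain (int **chain, int **result, int length, int nelts)
{
  int stage, j, k;

  for (stage = 0; (1 << stage) < length; stage++)
    {
      for (j = 0; j < length; j += 2)
        {
          /* extract_even: elements 0,2,4,... of <chain[j], chain[j+1]>.  */
          for (k = 0; k < nelts; k++)
            result[j / 2][k] = (2 * k < nelts
                                ? chain[j][2 * k]
                                : chain[j + 1][2 * k - nelts]);
          /* extract_odd: elements 1,3,5,... of the same concatenation.  */
          for (k = 0; k < nelts; k++)
            result[j / 2 + length / 2][k] = (2 * k + 1 < nelts
                                             ? chain[j][2 * k + 1]
                                             : chain[j + 1][2 * k + 1 - nelts]);
        }
      /* Feed this stage's output back in as the next stage's input.  */
      for (j = 0; j < length; j++)
        for (k = 0; k < nelts; k++)
          chain[j][k] = result[j][k];
    }
}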
5886 /* Function vect_transform_strided_load.
5888 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
5889 to perform their permutation and ascribe the result vectorized statements to
   the scalar statements.
*/

static bool
vect_transform_strided_load (gimple stmt, VEC(tree,heap) *dr_chain, int size,
                             gimple_stmt_iterator *gsi)
{
5897 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5898 gimple first_stmt = DR_GROUP_FIRST_DR (stmt_info);
5899 gimple next_stmt, new_stmt;
5900 VEC(tree,heap) *result_chain = NULL;
  unsigned int i, gap_count;
  tree tmp_data_ref;
5904 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
5905 RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
5906 vectors, that are ready for vector computation. */
5907 result_chain = VEC_alloc (tree, heap, size);
  if (!vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain))
    return false;
5912 /* Put a permuted data-ref in the VECTORIZED_STMT field.
     Since we scan the chain starting from its first node, their order
     corresponds to the order of data-refs in RESULT_CHAIN.  */
  next_stmt = first_stmt;
  gap_count = 1;
  for (i = 0; VEC_iterate (tree, result_chain, i, tmp_data_ref); i++)
    {
      if (!next_stmt)
        break;
5922 /* Skip the gaps. Loads created for the gaps will be removed by dead
5923 code elimination pass later. No need to check for the first stmt in
5924 the group, since it always exists.
5925 DR_GROUP_GAP is the number of steps in elements from the previous
5926 access (if there is no gap DR_GROUP_GAP is 1). We skip loads that
       correspond to the gaps.
      */
      if (next_stmt != first_stmt
          && gap_count < DR_GROUP_GAP (vinfo_for_stmt (next_stmt)))
        {
          gap_count++;
          continue;
        }

      while (next_stmt)
        {
5938 new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
5939 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
             copies, and we put the new vector statement in the first available
             RELATED_STMT field.  */
          if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
            STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
          else if (!DR_GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
            {
              gimple prev_stmt =
                STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
              gimple rel_stmt =
                STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
              while (rel_stmt)
                {
                  prev_stmt = rel_stmt;
                  rel_stmt =
                    STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
                }
              STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) =
                new_stmt;
            }

          next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt));
          gap_count = 1;
5966 /* If NEXT_STMT accesses the same DR as the previous statement,
5967 put the same TMP_DATA_REF as its vectorized statement; otherwise
5968 get the next data-ref from RESULT_CHAIN. */
          if (!next_stmt || !DR_GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
            break;
        }
    }

  VEC_free (tree, heap, result_chain);
  return true;
}
5979 /* Create NCOPIES permutation statements using the mask MASK_BYTES (by
5980 building a vector of type MASK_TYPE from it) and two input vectors placed in
5981 DR_CHAIN at FIRST_VEC_INDX and SECOND_VEC_INDX for the first copy and
5982 shifting by STRIDE elements of DR_CHAIN for every copy.
   (STRIDE is the number of vectorized stmts for NODE divided by the number of
   copies).
5985 VECT_STMTS_COUNTER specifies the index in the vectorized stmts of NODE, where
5986 the created stmts must be inserted. */
static void
vect_create_mask_and_perm (gimple stmt, gimple next_scalar_stmt,
5990 int *mask_array, int mask_nunits,
5991 tree mask_element_type, tree mask_type,
5992 int first_vec_indx, int second_vec_indx,
5993 gimple_stmt_iterator *gsi, slp_tree node,
5994 tree builtin_decl, tree vectype,
5995 VEC(tree,heap) *dr_chain,
                           int ncopies, int vect_stmts_counter)
{
  tree t = NULL_TREE, mask_vec, mask, perm_dest;
5999 gimple perm_stmt = NULL;
6000 stmt_vec_info next_stmt_info;
6001 int i, group_size, stride, dr_chain_size;
6002 tree first_vec, second_vec, data_ref;
  tree sym;
  ssa_op_iter iter;
  VEC (tree, heap) *params = NULL;
6007 /* Create a vector mask. */
6008 for (i = mask_nunits - 1; i >= 0; --i)
    t = tree_cons (NULL_TREE, build_int_cst (mask_element_type, mask_array[i]),
                   t);
  mask_vec = build_vector (mask_type, t);
6013 mask = vect_init_vector (stmt, mask_vec, mask_type, NULL);
6015 group_size = VEC_length (gimple, SLP_TREE_SCALAR_STMTS (node));
6016 stride = SLP_TREE_NUMBER_OF_VEC_STMTS (node) / ncopies;
6017 dr_chain_size = VEC_length (tree, dr_chain);
  /* Initialize the vect stmts of NODE to properly insert the generated
     stmts later.  */
6021 for (i = VEC_length (gimple, SLP_TREE_VEC_STMTS (node));
6022 i < (int) SLP_TREE_NUMBER_OF_VEC_STMTS (node); i++)
6023 VEC_quick_push (gimple, SLP_TREE_VEC_STMTS (node), NULL);
6025 perm_dest = vect_create_destination_var (gimple_assign_lhs (stmt), vectype);
  for (i = 0; i < ncopies; i++)
    {
      first_vec = VEC_index (tree, dr_chain, first_vec_indx);
      second_vec = VEC_index (tree, dr_chain, second_vec_indx);
6031 /* Build argument list for the vectorized call. */
6032 VEC_free (tree, heap, params);
6033 params = VEC_alloc (tree, heap, 3);
6034 VEC_quick_push (tree, params, first_vec);
6035 VEC_quick_push (tree, params, second_vec);
6036 VEC_quick_push (tree, params, mask);
6038 /* Generate the permute statement. */
6039 perm_stmt = gimple_build_call_vec (builtin_decl, params);
6040 data_ref = make_ssa_name (perm_dest, perm_stmt);
6041 gimple_call_set_lhs (perm_stmt, data_ref);
6042 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
      FOR_EACH_SSA_TREE_OPERAND (sym, perm_stmt, iter, SSA_OP_ALL_VIRTUALS)
        {
          if (TREE_CODE (sym) == SSA_NAME)
            sym = SSA_NAME_VAR (sym);
          mark_sym_for_renaming (sym);
        }
6050 /* Store the vector statement in NODE. */
6051 VEC_replace (gimple, SLP_TREE_VEC_STMTS (node),
6052 stride * i + vect_stmts_counter, perm_stmt);
6054 first_vec_indx += stride;
      second_vec_indx += stride;
    }
6058 /* Mark the scalar stmt as vectorized. */
6059 next_stmt_info = vinfo_for_stmt (next_scalar_stmt);
  STMT_VINFO_VEC_STMT (next_stmt_info) = perm_stmt;
}
6064 /* Given FIRST_MASK_ELEMENT - the mask element in element representation,
6065 return in CURRENT_MASK_ELEMENT its equivalent in target specific
6066 representation. Check that the mask is valid and return FALSE if not.
6067 Return TRUE in NEED_NEXT_VECTOR if the permutation requires to move to
6068 the next vector, i.e., the current first vector is not needed. */
static bool
vect_get_mask_element (gimple stmt, int first_mask_element, int m,
6072 int mask_nunits, bool only_one_vec, int index,
6073 int *mask, int *current_mask_element,
                       bool *need_next_vector)
{
  int i;
  static int number_of_mask_fixes = 1;
6078 static bool mask_fixed = false;
6079 static bool needs_first_vector = false;
6081 /* Convert to target specific representation. */
6082 *current_mask_element = first_mask_element + m;
6083 /* Adjust the value in case it's a mask for second and third vectors. */
6084 *current_mask_element -= mask_nunits * (number_of_mask_fixes - 1);
6086 if (*current_mask_element < mask_nunits)
6087 needs_first_vector = true;
6089 /* We have only one input vector to permute but the mask accesses values in
6090 the next vector as well. */
  if (only_one_vec && *current_mask_element >= mask_nunits)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        {
          fprintf (vect_dump, "permutation requires at least two vectors ");
          print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
        }
      return false;
    }
  /* The mask requires the next vector.  */
  if (*current_mask_element >= mask_nunits * 2)
    {
      if (needs_first_vector || mask_fixed)
        {
          /* We either need the first vector too or have already moved to the
             next vector. In both cases, this permutation needs three
             vectors.  */
          if (vect_print_dump_info (REPORT_DETAILS))
            {
              fprintf (vect_dump, "permutation requires at "
                                  "least three vectors ");
              print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
            }
          return false;
        }

      /* We move to the next vector, dropping the first one and working with
         the second and the third - we need to adjust the values of the mask
         accordingly.  */
      *current_mask_element -= mask_nunits * number_of_mask_fixes;

      for (i = 0; i < index; i++)
        mask[i] -= mask_nunits * number_of_mask_fixes;

      (number_of_mask_fixes)++;
      mask_fixed = true;
    }
6132 *need_next_vector = mask_fixed;
6134 /* This was the last element of this mask. Start a new one. */
  if (index == mask_nunits - 1)
    {
      number_of_mask_fixes = 1;
      mask_fixed = false;
      needs_first_vector = false;
    }

  return true;
}
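/* Illustrative sketch of the rebasing above, on a plain int array: MASK
   holds LEN flat element indices, and valid two-operand permute indices
   lie in [0, 2*NUNITS).  When an index reaches a third vector, the
   window of input vectors slides forward and every index emitted so far
   is rebased.  This is a simplified model that slides the window one
   vector at a time; demo_normalize_mask is a hypothetical name used
   only for this example.  */

static void
demo_normalize_mask (int *mask, int len, int nunits)
{
  int rebase = 0, i, j;

  for (i = 0; i < len; i++)
    {
      mask[i] -= rebase;
      if (mask[i] >= 2 * nunits)
        {
          /* Drop the first vector of the current pair and rebase what
             was already emitted, e.g. {6,9,9,9} -> {2,5,5,5} for
             NUNITS == 4, as in the example below.  */
          rebase += nunits;
          mask[i] -= nunits;
          for (j = 0; j < i; j++)
            mask[j] -= nunits;
        }
    }
}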
6146 /* Generate vector permute statements from a list of loads in DR_CHAIN.
6147 If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
6148 permute statements for SLP_NODE_INSTANCE. */
static bool
vect_transform_slp_perm_load (gimple stmt, VEC (tree, heap) *dr_chain,
6151 gimple_stmt_iterator *gsi, int vf,
                              slp_instance slp_node_instance, bool analyze_only)
{
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6155 tree mask_element_type = NULL_TREE, mask_type;
6156 int i, j, k, m, scale, mask_nunits, nunits, vec_index = 0, scalar_index;
  slp_tree node;
  tree vectype = STMT_VINFO_VECTYPE (stmt_info), builtin_decl;
6159 gimple next_scalar_stmt;
6160 int group_size = SLP_INSTANCE_GROUP_SIZE (slp_node_instance);
6161 int first_mask_element;
6162 int index, unroll_factor, *mask, current_mask_element, ncopies;
6163 bool only_one_vec = false, need_next_vector = false;
6164 int first_vec_index, second_vec_index, orig_vec_stmts_num, vect_stmts_counter;
  if (!targetm.vectorize.builtin_vec_perm)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        {
          fprintf (vect_dump, "no builtin for vect permute for ");
          print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
        }
      return false;
    }
6177 builtin_decl = targetm.vectorize.builtin_vec_perm (vectype,
6178 &mask_element_type);
  if (!builtin_decl || !mask_element_type)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        {
          fprintf (vect_dump, "no builtin for vect permute for ");
          print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
        }
      return false;
    }
6190 mask_type = get_vectype_for_scalar_type (mask_element_type);
6191 mask_nunits = TYPE_VECTOR_SUBPARTS (mask_type);
6192 mask = (int *) xmalloc (sizeof (int) * mask_nunits);
6193 nunits = TYPE_VECTOR_SUBPARTS (vectype);
6194 scale = mask_nunits / nunits;
6195 unroll_factor = SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance);
6197 /* The number of vector stmts to generate based only on SLP_NODE_INSTANCE
6198 unrolling factor. */
6199 orig_vec_stmts_num = group_size *
6200 SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance) / nunits;
6201 if (orig_vec_stmts_num == 1)
6202 only_one_vec = true;
6204 /* Number of copies is determined by the final vectorization factor
6205 relatively to SLP_NODE_INSTANCE unrolling factor. */
6206 ncopies = vf / SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance);
6208 /* Generate permutation masks for every NODE. Number of masks for each NODE
6209 is equal to GROUP_SIZE.
6210 E.g., we have a group of three nodes with three loads from the same
6211 location in each node, and the vector size is 4. I.e., we have a
6212 a0b0c0a1b1c1... sequence and we need to create the following vectors:
6213 for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
6214 for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
   The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9} (in target
   specific type, e.g., in bytes for Altivec).
6219 The last mask is illegal since we assume two operands for permute
6220 operation, and the mask element values can't be outside that range. Hence,
6221 the last mask must be converted into {2,5,5,5}.
6222 For the first two permutations we need the first and the second input
6223 vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
   we need the second and the third vectors: {b1,c1,a2,b2} and
   {c2,a3,b3,c3}.  */
  for (i = 0;
       VEC_iterate (slp_tree, SLP_INSTANCE_LOADS (slp_node_instance), i, node);
       i++)
    {
      scalar_index = 0;
      index = 0;
      vect_stmts_counter = 0;
      vec_index = 0;
      first_vec_index = vec_index++;
      if (only_one_vec)
        second_vec_index = first_vec_index;
      else
        second_vec_index = vec_index++;
      for (j = 0; j < unroll_factor; j++)
        {
          for (k = 0; k < group_size; k++)
            {
              first_mask_element = (i + j * group_size) * scale;
              for (m = 0; m < scale; m++)
                {
                  if (!vect_get_mask_element (stmt, first_mask_element, m,
                                              mask_nunits, only_one_vec, index,
                                              mask, &current_mask_element,
                                              &need_next_vector))
                    {
                      free (mask);
                      return false;
                    }

                  mask[index++] = current_mask_element;
                }
              if (index == mask_nunits)
                {
                  index = 0;
                  if (!analyze_only)
                    {
                      if (need_next_vector)
                        {
                          first_vec_index = second_vec_index;
                          second_vec_index = vec_index;
                        }
                      next_scalar_stmt = VEC_index (gimple,
                          SLP_TREE_SCALAR_STMTS (node), scalar_index++);

                      vect_create_mask_and_perm (stmt, next_scalar_stmt,
                               mask, mask_nunits, mask_element_type, mask_type,
                               first_vec_index, second_vec_index, gsi, node,
                               builtin_decl, vectype, dr_chain, ncopies,
                               vect_stmts_counter++);
                    }
                }
            }
        }
    }

  free (mask);
  return true;
}
6286 /* vectorizable_load.
   Check if STMT reads a non scalar data-ref (array/pointer/structure) that
   can be vectorized.
   If VEC_STMT is also passed, vectorize the STMT: create a vectorized
6291 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
6292 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
bool
vectorizable_load (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt,
                   slp_tree slp_node, slp_instance slp_node_instance)
{
  tree scalar_dest;
  tree vec_dest = NULL;
  tree data_ref = NULL;
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  stmt_vec_info prev_stmt_info;
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
  bool nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
  struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info), *first_dr;
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
  tree new_temp;
  int mode;
  gimple new_stmt = NULL;
  tree dummy;
  enum dr_alignment_support alignment_support_scheme;
  tree dataref_ptr = NULL_TREE;
  gimple ptr_incr;
  int nunits = TYPE_VECTOR_SUBPARTS (vectype);
  int ncopies;
  int i, j, group_size;
  tree msq = NULL_TREE, lsq;
  tree offset = NULL_TREE;
  tree realignment_token = NULL_TREE;
  gimple phi = NULL;
  VEC(tree,heap) *dr_chain = NULL;
  bool strided_load = false;
  gimple first_stmt;
  tree scalar_type;
  bool inv_p;
  bool compute_in_loop = false;
  struct loop *at_loop;
  int vec_num;
  bool slp = (slp_node != NULL);
  bool slp_perm = false;
  enum tree_code code;
  /* Multiple types in SLP are handled by creating the appropriate number of
     vectorized stmts for each SLP node. Hence, NCOPIES is always 1 in
     case of SLP.  */
  if (slp)
    ncopies = 1;
  else
    ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;

  gcc_assert (ncopies >= 1);
  /* FORNOW. This restriction should be relaxed.  */
  if (nested_in_vect_loop && ncopies > 1)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "multiple types in nested loop.");
      return false;
    }

  if (slp && SLP_INSTANCE_LOAD_PERMUTATION (slp_node_instance))
    slp_perm = true;
  if (!STMT_VINFO_RELEVANT_P (stmt_info))
    return false;

  if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
    return false;
  /* Is vectorizable load? */
  if (!is_gimple_assign (stmt))
    return false;

  scalar_dest = gimple_assign_lhs (stmt);
  if (TREE_CODE (scalar_dest) != SSA_NAME)
    return false;

  code = gimple_assign_rhs_code (stmt);
  if (code != ARRAY_REF
      && code != INDIRECT_REF
      && !STMT_VINFO_STRIDED_ACCESS (stmt_info))
    return false;

  if (!STMT_VINFO_DATA_REF (stmt_info))
    return false;
6379 scalar_type = TREE_TYPE (DR_REF (dr));
6380 mode = (int) TYPE_MODE (vectype);
6382 /* FORNOW. In some cases can vectorize even if data-type not supported
6383 (e.g. - data copies). */
  if (optab_handler (mov_optab, mode)->insn_code == CODE_FOR_nothing)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "Aligned load, but unsupported type.");
      return false;
    }
6391 /* If accesses through a pointer to vectype do not alias the original
6392 memory reference we have a problem. This should never happen. */
  if (get_alias_set (vectype) != get_alias_set (gimple_assign_rhs1 (stmt))
      && !alias_set_subset_of (get_alias_set (vectype),
                               get_alias_set (gimple_assign_rhs1 (stmt))))
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "??? vector type does not alias scalar type");
      return false;
    }
6402 /* The vector component type needs to be trivially convertible to the
6403 scalar lhs. This should always be the case. */
  if (!useless_type_conversion_p (TREE_TYPE (scalar_dest), TREE_TYPE (vectype)))
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "??? operands of different types");
      return false;
    }
6411 /* Check if the load is a part of an interleaving chain. */
  if (STMT_VINFO_STRIDED_ACCESS (stmt_info))
    {
      strided_load = true;
      /* FORNOW */
      gcc_assert (! nested_in_vect_loop);

      /* Check if interleaving is supported.  */
      if (!vect_strided_load_supported (vectype)
          && !PURE_SLP_STMT (stmt_info) && !slp)
        return false;
    }
  if (!vec_stmt) /* transformation not required.  */
    {
      STMT_VINFO_TYPE (stmt_info) = load_vec_info_type;
      vect_model_load_cost (stmt_info, ncopies, NULL);
      return true;
    }
  if (vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump, "transform load.");

  /** Transform.  **/

  if (strided_load)
    {
      first_stmt = DR_GROUP_FIRST_DR (stmt_info);
      /* Check if the chain of loads is already vectorized.  */
      if (STMT_VINFO_VEC_STMT (vinfo_for_stmt (first_stmt)))
        {
          *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
          return true;
        }
6445 first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
6446 group_size = DR_GROUP_SIZE (vinfo_for_stmt (first_stmt));
      /* VEC_NUM is the number of vect stmts to be created for this group.  */
      if (slp)
        {
          strided_load = false;
          vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
        }
      else
        vec_num = group_size;

      dr_chain = VEC_alloc (tree, heap, vec_num);
    }
  else
    {
      first_stmt = stmt;
      first_dr = dr;
      group_size = vec_num = 1;
    }
6466 alignment_support_scheme = vect_supportable_dr_alignment (first_dr);
6467 gcc_assert (alignment_support_scheme);
6469 /* In case the vectorization factor (VF) is bigger than the number
6470 of elements that we can fit in a vectype (nunits), we have to generate
6471 more than one vector stmt - i.e - we need to "unroll" the
6472 vector stmt by a factor VF/nunits. In doing so, we record a pointer
6473 from one copy of the vector stmt to the next, in the field
6474 STMT_VINFO_RELATED_STMT. This is necessary in order to allow following
6475 stages to find the correct vector defs to be used when vectorizing
6476 stmts that use the defs of the current stmt. The example below illustrates
6477 the vectorization process when VF=16 and nunits=4 (i.e - we need to create
6478 4 vectorized stmts):
     before vectorization:
                                RELATED_STMT    VEC_STMT
        S1:     x = memref      -               -
        S2:     z = x + 1       -               -

     step 1: vectorize stmt S1:
6486 We first create the vector stmt VS1_0, and, as usual, record a
6487 pointer to it in the STMT_VINFO_VEC_STMT of the scalar stmt S1.
6488 Next, we create the vector stmt VS1_1, and record a pointer to
6489 it in the STMT_VINFO_RELATED_STMT of the vector stmt VS1_0.
        Similarly, for VS1_2 and VS1_3. This is the resulting chain of
        stmts and pointers:
                                RELATED_STMT    VEC_STMT
6493 VS1_0: vx0 = memref0 VS1_1 -
6494 VS1_1: vx1 = memref1 VS1_2 -
6495 VS1_2: vx2 = memref2 VS1_3 -
6496 VS1_3: vx3 = memref3 - -
        S1:     x = load        -               VS1_0
        S2:     z = x + 1       -               -

     See in documentation in vect_get_vec_def_for_stmt_copy for how the
     information we recorded in RELATED_STMT field is used to vectorize
     stmt S2.  */

  /* In case of interleaving (non-unit strided access):
6511 Vectorized loads are created in the order of memory accesses
     starting from the access of the first stmt of the chain:

     VS1: vx0 = &base
     VS2: vx1 = &base + vec_size*1
6516 VS3: vx3 = &base + vec_size*2
6517 VS4: vx4 = &base + vec_size*3
6519 Then permutation statements are generated:
6521 VS5: vx5 = VEC_EXTRACT_EVEN_EXPR < vx0, vx1 >
6522 VS6: vx6 = VEC_EXTRACT_ODD_EXPR < vx0, vx1 >
6525 And they are put in STMT_VINFO_VEC_STMT of the corresponding scalar stmts
6526 (the order of the data-refs in the output of vect_permute_load_chain
6527 corresponds to the order of scalar stmts in the interleaving chain - see
6528 the documentation of vect_permute_load_chain()).
6529 The generation of permutation stmts and recording them in
6530 STMT_VINFO_VEC_STMT is done in vect_transform_strided_load().
6532 In case of both multiple types and interleaving, the vector loads and
6533 permutation stmts above are created for every copy. The result vector stmts
6534 are put in STMT_VINFO_VEC_STMT for the first copy and in the corresponding
6535 STMT_VINFO_RELATED_STMT for the next copies. */
  /* If the data reference is aligned (dr_aligned) or potentially unaligned
     on a target that supports unaligned accesses (dr_unaligned_supported)
     we generate the following code:

       p = initial_addr;
       indx = 0;
       loop {
         p = p + indx * vectype_size;
         vec_dest = *(p);
         indx = indx + 1;
       }

     Otherwise, the data reference is potentially unaligned on a target that
     does not support unaligned accesses (dr_explicit_realign_optimized) -
     then generate the following code, in which the data in each iteration is
     obtained by two vector loads, one from the previous iteration, and one
     from the current iteration:

         msq_init = *(floor(p1))
         p2 = initial_addr + VS - 1;
         realignment_token = call target_builtin;
         indx = 0;
         loop {
           p2 = p2 + indx * vectype_size
           lsq = *(floor(p2))
           vec_dest = realign_load (msq, lsq, realignment_token)
           indx = indx + 1;
           msq = lsq;
         }   */
6566 /* If the misalignment remains the same throughout the execution of the
6567 loop, we can create the init_addr and permutation mask at the loop
6568 preheader. Otherwise, it needs to be created inside the loop.
6569 This can only occur when vectorizing memory accesses in the inner-loop
6570 nested within an outer-loop that is being vectorized. */
  if (nested_in_vect_loop_p (loop, stmt)
      && (TREE_INT_CST_LOW (DR_STEP (dr))
          % GET_MODE_SIZE (TYPE_MODE (vectype)) != 0))
    {
      gcc_assert (alignment_support_scheme != dr_explicit_realign_optimized);
      compute_in_loop = true;
    }
  if ((alignment_support_scheme == dr_explicit_realign_optimized
       || alignment_support_scheme == dr_explicit_realign)
      && !compute_in_loop)
    {
      msq = vect_setup_realignment (first_stmt, gsi, &realignment_token,
                                    alignment_support_scheme, NULL_TREE,
                                    &at_loop);
      if (alignment_support_scheme == dr_explicit_realign_optimized)
        {
          phi = SSA_NAME_DEF_STMT (msq);
          offset = size_int (TYPE_VECTOR_SUBPARTS (vectype) - 1);
        }
    }
  else
    at_loop = loop;
6596 prev_stmt_info = NULL;
  for (j = 0; j < ncopies; j++)
    {
      /* 1. Create the vector pointer update chain.  */
      if (j == 0)
        dataref_ptr = vect_create_data_ref_ptr (first_stmt, at_loop, offset,
                                                &dummy, &ptr_incr, false,
                                                &inv_p, NULL_TREE);
      else
        dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, gsi, stmt,
                                       NULL_TREE);
      for (i = 0; i < vec_num; i++)
        {
          if (i > 0)
            dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, gsi, stmt,
                                           NULL_TREE);

          /* 2. Create the vector-load in the loop.  */
          switch (alignment_support_scheme)
            {
            case dr_aligned:
              gcc_assert (aligned_access_p (first_dr));
              data_ref = build_fold_indirect_ref (dataref_ptr);
              break;
            case dr_unaligned_supported:
              {
                int mis = DR_MISALIGNMENT (first_dr);
                tree tmis = (mis == -1 ? size_zero_node : size_int (mis));

                tmis = size_binop (MULT_EXPR, tmis, size_int (BITS_PER_UNIT));
                data_ref =
                  build2 (MISALIGNED_INDIRECT_REF, vectype, dataref_ptr, tmis);
                break;
              }
            case dr_explicit_realign:
              {
                tree ptr, bump;
                tree vs_minus_1 = size_int (TYPE_VECTOR_SUBPARTS (vectype) - 1);

                if (compute_in_loop)
                  msq = vect_setup_realignment (first_stmt, gsi,
                                                &realignment_token,
                                                dr_explicit_realign,
                                                dataref_ptr, NULL);

                data_ref = build1 (ALIGN_INDIRECT_REF, vectype, dataref_ptr);
                vec_dest = vect_create_destination_var (scalar_dest, vectype);
                new_stmt = gimple_build_assign (vec_dest, data_ref);
                new_temp = make_ssa_name (vec_dest, new_stmt);
                gimple_assign_set_lhs (new_stmt, new_temp);
                vect_finish_stmt_generation (stmt, new_stmt, gsi);
                copy_virtual_operands (new_stmt, stmt);
                mark_symbols_for_renaming (new_stmt);
                msq = new_temp;

                bump = size_binop (MULT_EXPR, vs_minus_1,
                                   TYPE_SIZE_UNIT (scalar_type));
                ptr = bump_vector_ptr (dataref_ptr, NULL, gsi, stmt, bump);
                data_ref = build1 (ALIGN_INDIRECT_REF, vectype, ptr);
                break;
              }
            case dr_explicit_realign_optimized:
              data_ref = build1 (ALIGN_INDIRECT_REF, vectype, dataref_ptr);
              break;
            default:
              gcc_unreachable ();
            }
6665 vec_dest = vect_create_destination_var (scalar_dest, vectype);
6666 new_stmt = gimple_build_assign (vec_dest, data_ref);
6667 new_temp = make_ssa_name (vec_dest, new_stmt);
6668 gimple_assign_set_lhs (new_stmt, new_temp);
6669 vect_finish_stmt_generation (stmt, new_stmt, gsi);
6670 mark_symbols_for_renaming (new_stmt);
6672 /* 3. Handle explicit realignment if necessary/supported. Create in
6673 loop: vec_dest = realign_load (msq, lsq, realignment_token) */
          if (alignment_support_scheme == dr_explicit_realign_optimized
              || alignment_support_scheme == dr_explicit_realign)
            {
              tree tmp;

              lsq = gimple_assign_lhs (new_stmt);
              if (!realignment_token)
                realignment_token = dataref_ptr;
              vec_dest = vect_create_destination_var (scalar_dest, vectype);
              tmp = build3 (REALIGN_LOAD_EXPR, vectype, msq, lsq,
                            realignment_token);
              new_stmt = gimple_build_assign (vec_dest, tmp);
6686 new_temp = make_ssa_name (vec_dest, new_stmt);
6687 gimple_assign_set_lhs (new_stmt, new_temp);
6688 vect_finish_stmt_generation (stmt, new_stmt, gsi);
              if (alignment_support_scheme == dr_explicit_realign_optimized)
                {
                  if (i == vec_num - 1 && j == ncopies - 1)
                    add_phi_arg (phi, lsq, loop_latch_edge (containing_loop));
                  msq = lsq;
                }
            }
          /* 4. Handle invariant-load.  */
          if (inv_p)
            {
              gcc_assert (!strided_load);
              gcc_assert (nested_in_vect_loop_p (loop, stmt));
              if (j == 0)
                {
                  int k;
                  tree t = NULL_TREE;
                  tree vec_inv, bitpos, bitsize = TYPE_SIZE (scalar_type);
                  /* CHECKME: bitpos depends on endianness?  */
                  bitpos = bitsize_zero_node;
                  vec_inv = build3 (BIT_FIELD_REF, scalar_type, new_temp,
                                    bitsize, bitpos);
                  vec_dest =
                    vect_create_destination_var (scalar_dest, NULL_TREE);
6716 new_stmt = gimple_build_assign (vec_dest, vec_inv);
6717 new_temp = make_ssa_name (vec_dest, new_stmt);
6718 gimple_assign_set_lhs (new_stmt, new_temp);
6719 vect_finish_stmt_generation (stmt, new_stmt, gsi);
6721 for (k = nunits - 1; k >= 0; --k)
6722 t = tree_cons (NULL_TREE, new_temp, t);
6723 /* FIXME: use build_constructor directly. */
6724 vec_inv = build_constructor_from_list (vectype, t);
6725 new_temp = vect_init_vector (stmt, vec_inv, vectype, gsi);
                  new_stmt = SSA_NAME_DEF_STMT (new_temp);
                }
              else
                gcc_unreachable (); /* FORNOW. */
            }
6732 /* Collect vector loads and later create their permutation in
6733 vect_transform_strided_load (). */
6734 if (strided_load || slp_perm)
6735 VEC_quick_push (tree, dr_chain, new_temp);
6737 /* Store vector loads in the corresponding SLP_NODE. */
6738 if (slp && !slp_perm)
6739 VEC_quick_push (gimple, SLP_TREE_VEC_STMTS (slp_node), new_stmt);
        }

      if (slp && !slp_perm)
        continue;

      if (slp_perm)
        {
          if (!vect_transform_slp_perm_load (stmt, dr_chain, gsi,
                                        LOOP_VINFO_VECT_FACTOR (loop_vinfo),
                                        slp_node_instance, false))
            {
              VEC_free (tree, heap, dr_chain);
              return false;
            }
        }
      else if (strided_load)
        {
          if (!vect_transform_strided_load (stmt, dr_chain, group_size, gsi))
            return false;

          *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
          VEC_free (tree, heap, dr_chain);
          dr_chain = VEC_alloc (tree, heap, group_size);
        }
      else
        {
          if (j == 0)
            STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
          else
            STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
          prev_stmt_info = vinfo_for_stmt (new_stmt);
        }
    }

  if (dr_chain)
    VEC_free (tree, heap, dr_chain);

  return true;
}
6784 /* Function vectorizable_live_operation.
6786 STMT computes a value that is used outside the loop. Check if
6787 it can be supported. */
bool
vectorizable_live_operation (gimple stmt,
                             gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
                             gimple *vec_stmt ATTRIBUTE_UNUSED)
{
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  int i;
  int op_type;
  tree op;
  tree def;
  gimple def_stmt;
  enum vect_def_type dt;
6803 enum tree_code code;
6804 enum gimple_rhs_class rhs_class;
6806 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
  if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
    return false;

  if (!is_gimple_assign (stmt))
    return false;

  if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
    return false;

  /* FORNOW. CHECKME. */
  if (nested_in_vect_loop_p (loop, stmt))
    return false;
6821 code = gimple_assign_rhs_code (stmt);
6822 op_type = TREE_CODE_LENGTH (code);
6823 rhs_class = get_gimple_rhs_class (code);
6824 gcc_assert (rhs_class != GIMPLE_UNARY_RHS || op_type == unary_op);
6825 gcc_assert (rhs_class != GIMPLE_BINARY_RHS || op_type == binary_op);
6827 /* FORNOW: support only if all uses are invariant. This means
6828 that the scalar operations can remain in place, unvectorized.
6829 The original last scalar value that they compute will be used. */
  for (i = 0; i < op_type; i++)
    {
      if (rhs_class == GIMPLE_SINGLE_RHS)
        op = TREE_OPERAND (gimple_op (stmt, 1), i);
      else
        op = gimple_op (stmt, i + 1);
      if (op && !vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt))
        {
          if (vect_print_dump_info (REPORT_DETAILS))
            fprintf (vect_dump, "use not simple.");
          return false;
        }

      if (dt != vect_invariant_def && dt != vect_constant_def)
        return false;
    }

  /* No transformation is required for the cases we currently support.  */
  return true;
}
6853 /* Function vect_is_simple_cond.
6856 LOOP - the loop that is being vectorized.
6857 COND - Condition that is checked for simple use.
6859 Returns whether a COND can be vectorized. Checks whether
6860 condition operands are supportable using vec_is_simple_use. */
static bool
vect_is_simple_cond (tree cond, loop_vec_info loop_vinfo)
{
  tree lhs, rhs;
  tree def;
  enum vect_def_type dt;

  if (!COMPARISON_CLASS_P (cond))
    return false;
6872 lhs = TREE_OPERAND (cond, 0);
6873 rhs = TREE_OPERAND (cond, 1);
  if (TREE_CODE (lhs) == SSA_NAME)
    {
      gimple lhs_def_stmt = SSA_NAME_DEF_STMT (lhs);
      if (!vect_is_simple_use (lhs, loop_vinfo, &lhs_def_stmt, &def, &dt))
        return false;
    }
  else if (TREE_CODE (lhs) != INTEGER_CST && TREE_CODE (lhs) != REAL_CST
           && TREE_CODE (lhs) != FIXED_CST)
    return false;

  if (TREE_CODE (rhs) == SSA_NAME)
    {
      gimple rhs_def_stmt = SSA_NAME_DEF_STMT (rhs);
      if (!vect_is_simple_use (rhs, loop_vinfo, &rhs_def_stmt, &def, &dt))
        return false;
    }
  else if (TREE_CODE (rhs) != INTEGER_CST && TREE_CODE (rhs) != REAL_CST
           && TREE_CODE (rhs) != FIXED_CST)
    return false;

  return true;
}
6898 /* vectorizable_condition.
6900 Check if STMT is conditional modify expression that can be vectorized.
6901 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
6902 stmt using VEC_COND_EXPR to replace it, put it in VEC_STMT, and insert it
6905 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
bool
vectorizable_condition (gimple stmt, gimple_stmt_iterator *gsi,
                        gimple *vec_stmt)
{
  tree scalar_dest = NULL_TREE;
6912 tree vec_dest = NULL_TREE;
6913 tree op = NULL_TREE;
6914 tree cond_expr, then_clause, else_clause;
6915 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6916 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6917 tree vec_cond_lhs, vec_cond_rhs, vec_then_clause, vec_else_clause;
  tree vec_compare, vec_cond_expr;
  tree new_temp;
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  enum machine_mode vec_mode;
  tree def;
  enum vect_def_type dt;
6924 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
6925 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
6926 enum tree_code code;
  gcc_assert (ncopies >= 1);
  if (ncopies > 1)
    return false; /* FORNOW */
  if (!STMT_VINFO_RELEVANT_P (stmt_info))
    return false;

  if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
    return false;

  /* FORNOW: SLP not supported.  */
  if (STMT_SLP_TYPE (stmt_info))
    return false;

  /* FORNOW: not yet supported.  */
  if (STMT_VINFO_LIVE_P (stmt_info))
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "value used after loop.");
      return false;
    }
  /* Is vectorizable conditional operation?  */
  if (!is_gimple_assign (stmt))
    return false;

  code = gimple_assign_rhs_code (stmt);

  if (code != COND_EXPR)
    return false;
6959 gcc_assert (gimple_assign_single_p (stmt));
6960 op = gimple_assign_rhs1 (stmt);
6961 cond_expr = TREE_OPERAND (op, 0);
6962 then_clause = TREE_OPERAND (op, 1);
6963 else_clause = TREE_OPERAND (op, 2);
  if (!vect_is_simple_cond (cond_expr, loop_vinfo))
    return false;

  /* We do not handle two different vector types for the condition
     and the values.  */
  if (TREE_TYPE (TREE_OPERAND (cond_expr, 0)) != TREE_TYPE (vectype))
    return false;
  if (TREE_CODE (then_clause) == SSA_NAME)
    {
      gimple then_def_stmt = SSA_NAME_DEF_STMT (then_clause);
      if (!vect_is_simple_use (then_clause, loop_vinfo,
                               &then_def_stmt, &def, &dt))
        return false;
    }
  else if (TREE_CODE (then_clause) != INTEGER_CST
           && TREE_CODE (then_clause) != REAL_CST
           && TREE_CODE (then_clause) != FIXED_CST)
    return false;

  if (TREE_CODE (else_clause) == SSA_NAME)
    {
      gimple else_def_stmt = SSA_NAME_DEF_STMT (else_clause);
      if (!vect_is_simple_use (else_clause, loop_vinfo,
                               &else_def_stmt, &def, &dt))
        return false;
    }
  else if (TREE_CODE (else_clause) != INTEGER_CST
           && TREE_CODE (else_clause) != REAL_CST
           && TREE_CODE (else_clause) != FIXED_CST)
    return false;
  vec_mode = TYPE_MODE (vectype);

  if (!vec_stmt)
    {
      STMT_VINFO_TYPE (stmt_info) = condition_vec_info_type;
      return expand_vec_cond_expr_p (op, vec_mode);
    }

  /* Transform.  */

  /* Handle def.  */
7009 scalar_dest = gimple_assign_lhs (stmt);
7010 vec_dest = vect_create_destination_var (scalar_dest, vectype);
  /* Handle cond expr.  */
  vec_cond_lhs =
    vect_get_vec_def_for_operand (TREE_OPERAND (cond_expr, 0), stmt, NULL);
  vec_cond_rhs =
    vect_get_vec_def_for_operand (TREE_OPERAND (cond_expr, 1), stmt, NULL);
7017 vec_then_clause = vect_get_vec_def_for_operand (then_clause, stmt, NULL);
7018 vec_else_clause = vect_get_vec_def_for_operand (else_clause, stmt, NULL);
7020 /* Arguments are ready. Create the new vector stmt. */
7021 vec_compare = build2 (TREE_CODE (cond_expr), vectype,
7022 vec_cond_lhs, vec_cond_rhs);
7023 vec_cond_expr = build3 (VEC_COND_EXPR, vectype,
7024 vec_compare, vec_then_clause, vec_else_clause);
7026 *vec_stmt = gimple_build_assign (vec_dest, vec_cond_expr);
7027 new_temp = make_ssa_name (vec_dest, *vec_stmt);
7028 gimple_assign_set_lhs (*vec_stmt, new_temp);
  vect_finish_stmt_generation (stmt, *vec_stmt, gsi);

  return true;
}
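/* Illustrative sketch of the element-wise semantics of the VEC_COND_EXPR
   built above, on plain int arrays: OUT[i] receives THEN_V[i] where the
   comparison holds and ELSE_V[i] otherwise.  demo_vec_cond and the fixed
   '<' comparison are assumptions made for this example only.  */

static void
demo_vec_cond (const int *cmp_a, const int *cmp_b, const int *then_v,
               const int *else_v, int *out, int n)
{
  int i;

  for (i = 0; i < n; i++)
    out[i] = cmp_a[i] < cmp_b[i] ? then_v[i] : else_v[i];
}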
7035 /* Function vect_transform_stmt.
7037 Create a vectorized stmt to replace STMT, and insert it at BSI. */
static bool
vect_transform_stmt (gimple stmt, gimple_stmt_iterator *gsi,
7041 bool *strided_store, slp_tree slp_node,
                     slp_instance slp_node_instance)
{
  bool is_store = false;
7045 gimple vec_stmt = NULL;
7046 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  gimple orig_stmt_in_pattern;
  bool done;

  switch (STMT_VINFO_TYPE (stmt_info))
    {
7052 case type_demotion_vec_info_type:
      done = vectorizable_type_demotion (stmt, gsi, &vec_stmt, slp_node);
      gcc_assert (done);
      break;
7057 case type_promotion_vec_info_type:
      done = vectorizable_type_promotion (stmt, gsi, &vec_stmt, slp_node);
      gcc_assert (done);
      break;
7062 case type_conversion_vec_info_type:
      done = vectorizable_conversion (stmt, gsi, &vec_stmt, slp_node);
      gcc_assert (done);
      break;
7067 case induc_vec_info_type:
7068 gcc_assert (!slp_node);
      done = vectorizable_induction (stmt, gsi, &vec_stmt);
      gcc_assert (done);
      break;
7073 case op_vec_info_type:
      done = vectorizable_operation (stmt, gsi, &vec_stmt, slp_node);
      gcc_assert (done);
      break;
7078 case assignment_vec_info_type:
      done = vectorizable_assignment (stmt, gsi, &vec_stmt, slp_node);
      gcc_assert (done);
      break;
7083 case load_vec_info_type:
      done = vectorizable_load (stmt, gsi, &vec_stmt, slp_node,
                                slp_node_instance);
      gcc_assert (done);
      break;
7089 case store_vec_info_type:
      done = vectorizable_store (stmt, gsi, &vec_stmt, slp_node);
      gcc_assert (done);
      if (STMT_VINFO_STRIDED_ACCESS (stmt_info) && !slp_node)
        {
          /* In case of interleaving, the whole chain is vectorized when the
             last store in the chain is reached. Store stmts before the last
             one are skipped, and their vec_stmt_info shouldn't be freed
             meanwhile.  */
          *strided_store = true;
          if (STMT_VINFO_VEC_STMT (stmt_info))
            is_store = true;
        }
      else
        is_store = true;
      break;
7106 case condition_vec_info_type:
7107 gcc_assert (!slp_node);
      done = vectorizable_condition (stmt, gsi, &vec_stmt);
      gcc_assert (done);
      break;
7112 case call_vec_info_type:
7113 gcc_assert (!slp_node);
      done = vectorizable_call (stmt, gsi, &vec_stmt);
      break;
7117 case reduc_vec_info_type:
7118 gcc_assert (!slp_node);
      done = vectorizable_reduction (stmt, gsi, &vec_stmt);
      gcc_assert (done);
      break;
    default:
      if (!STMT_VINFO_LIVE_P (stmt_info))
        {
          if (vect_print_dump_info (REPORT_DETAILS))
            fprintf (vect_dump, "stmt not supported.");
          gcc_unreachable ();
        }
    }
  if (STMT_VINFO_LIVE_P (stmt_info)
      && STMT_VINFO_TYPE (stmt_info) != reduc_vec_info_type)
    {
      done = vectorizable_live_operation (stmt, gsi, &vec_stmt);
      gcc_assert (done);
    }

  if (vec_stmt)
    {
7141 STMT_VINFO_VEC_STMT (stmt_info) = vec_stmt;
7142 orig_stmt_in_pattern = STMT_VINFO_RELATED_STMT (stmt_info);
      if (orig_stmt_in_pattern)
        {
          stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt_in_pattern);
7146 /* STMT was inserted by the vectorizer to replace a computation idiom.
7147 ORIG_STMT_IN_PATTERN is a stmt in the original sequence that
7148 computed this idiom. We need to record a pointer to VEC_STMT in
7149 the stmt_info of ORIG_STMT_IN_PATTERN. See more details in the
7150 documentation of vect_pattern_recog. */
          if (STMT_VINFO_IN_PATTERN_P (stmt_vinfo))
            {
              gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
              STMT_VINFO_VEC_STMT (stmt_vinfo) = vec_stmt;
            }
        }
    }

  return is_store;
}
7163 /* This function builds ni_name = number of iterations loop executes
7164 on the loop preheader. */
static tree
vect_build_loop_niters (loop_vec_info loop_vinfo)
{
  tree ni_name, var;
  gimple_seq stmts = NULL;
  edge pe;
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  tree ni = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo));
7175 var = create_tmp_var (TREE_TYPE (ni), "niters");
7176 add_referenced_var (var);
7177 ni_name = force_gimple_operand (ni, &stmts, false, var);
  pe = loop_preheader_edge (loop);
  if (stmts)
    {
      basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
      gcc_assert (!new_bb);
    }

  return ni_name;
}
7190 /* This function generates the following statements:
7192 ni_name = number of iterations loop executes
7193 ratio = ni_name / vf
7194 ratio_mult_vf_name = ratio * vf
7196 and places them at the loop preheader edge. */
static void
vect_generate_tmps_on_preheader (loop_vec_info loop_vinfo,
                                 tree *ni_name_ptr,
                                 tree *ratio_mult_vf_name_ptr,
                                 tree *ratio_name_ptr)
{
  edge pe;
  basic_block new_bb;
  gimple_seq stmts;
  tree ni_name;
  tree var;
  tree ratio_name;
  tree ratio_mult_vf_name;
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  tree ni = LOOP_VINFO_NITERS (loop_vinfo);
  int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
  tree log_vf;
7217 pe = loop_preheader_edge (loop);
7219 /* Generate temporary variable that contains
7220 number of iterations loop executes. */
7222 ni_name = vect_build_loop_niters (loop_vinfo);
7223 log_vf = build_int_cst (TREE_TYPE (ni), exact_log2 (vf));
7225 /* Create: ratio = ni >> log2(vf) */
7227 ratio_name = fold_build2 (RSHIFT_EXPR, TREE_TYPE (ni_name), ni_name, log_vf);
  if (!is_gimple_val (ratio_name))
    {
      var = create_tmp_var (TREE_TYPE (ni), "bnd");
      add_referenced_var (var);

      stmts = NULL;
      ratio_name = force_gimple_operand (ratio_name, &stmts, true, var);
      pe = loop_preheader_edge (loop);
      new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
      gcc_assert (!new_bb);
    }
7240 /* Create: ratio_mult_vf = ratio << log2 (vf). */
7242 ratio_mult_vf_name = fold_build2 (LSHIFT_EXPR, TREE_TYPE (ratio_name),
7243 ratio_name, log_vf);
  if (!is_gimple_val (ratio_mult_vf_name))
    {
      var = create_tmp_var (TREE_TYPE (ni), "ratio_mult_vf");
      add_referenced_var (var);

      stmts = NULL;
      ratio_mult_vf_name = force_gimple_operand (ratio_mult_vf_name, &stmts,
                                                 true, var);
      pe = loop_preheader_edge (loop);
      new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
      gcc_assert (!new_bb);
    }
7257 *ni_name_ptr = ni_name;
7258 *ratio_mult_vf_name_ptr = ratio_mult_vf_name;
  *ratio_name_ptr = ratio_name;

  return;
}
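/* Illustrative sketch of the computation above on plain integers: since
   VF is a power of two, both the ratio and ratio * vf reduce to shifts
   by log2 (vf).  demo_split_niters is a hypothetical name used only for
   this example.  */

static void
demo_split_niters (unsigned long ni, int vf,
                   unsigned long *ratio, unsigned long *ratio_mult_vf)
{
  int log_vf = 0;

  while ((1 << (log_vf + 1)) <= vf)
    log_vf++;                           /* log_vf = exact_log2 (vf)    */

  *ratio = ni >> log_vf;                /* ratio = ni / vf             */
  *ratio_mult_vf = *ratio << log_vf;    /* ratio_mult_vf = ratio * vf  */
}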
7265 /* Function vect_update_ivs_after_vectorizer.
7267 "Advance" the induction variables of LOOP to the value they should take
7268 after the execution of LOOP. This is currently necessary because the
7269 vectorizer does not handle induction variables that are used after the
   loop.  Such a situation occurs when the last few iterations of LOOP are
   peeled, because:
7272 1. We introduced new uses after LOOP for IVs that were not originally used
7273 after LOOP: the IVs of LOOP are now used by an epilog loop.
7274 2. LOOP is going to be vectorized; this means that it will iterate N/VF
7275 times, whereas the loop IVs should be bumped N times.
7278 - LOOP - a loop that is going to be vectorized. The last few iterations
7279 of LOOP were peeled.
7280 - NITERS - the number of iterations that LOOP executes (before it is
7281 vectorized). i.e, the number of times the ivs should be bumped.
7282 - UPDATE_E - a successor edge of LOOP->exit that is on the (only) path
7283 coming out from LOOP on which there are uses of the LOOP ivs
7284 (this is the path from LOOP->exit to epilog_loop->preheader).
7286 The new definitions of the ivs are placed in LOOP->exit.
7287 The phi args associated with the edge UPDATE_E in the bb
7288 UPDATE_E->dest are updated accordingly.
7290 Assumption 1: Like the rest of the vectorizer, this function assumes
7291 a single loop exit that has a single predecessor.
7293 Assumption 2: The phi nodes in the LOOP header and in update_bb are
7294 organized in the same order.
7296 Assumption 3: The access function of the ivs is simple enough (see
7297 vect_can_advance_ivs_p). This assumption will be relaxed in the future.
7299 Assumption 4: Exactly one of the successors of LOOP exit-bb is on a path
7300 coming out of LOOP on which the ivs of LOOP are used (this is the path
7301 that leads to the epilog loop; other paths skip the epilog loop). This
7302 path starts with the edge UPDATE_E, and its destination (denoted update_bb)
   needs to have its phis updated.  */

static void
vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo, tree niters,
                                  edge update_e)
{
7310 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7311 basic_block exit_bb = single_exit (loop)->dest;
  gimple phi, phi1;
  gimple_stmt_iterator gsi, gsi1;
7314 basic_block update_bb = update_e->dest;
7316 /* gcc_assert (vect_can_advance_ivs_p (loop_vinfo)); */
7318 /* Make sure there exists a single-predecessor exit bb: */
7319 gcc_assert (single_pred_p (exit_bb));
7321 for (gsi = gsi_start_phis (loop->header), gsi1 = gsi_start_phis (update_bb);
7322 !gsi_end_p (gsi) && !gsi_end_p (gsi1);
       gsi_next (&gsi), gsi_next (&gsi1))
    {
      tree access_fn = NULL;
      tree evolution_part;
      tree init_expr;
      tree step_expr;
      tree var, ni, ni_name;
7330 gimple_stmt_iterator last_gsi;
7332 phi = gsi_stmt (gsi);
7333 phi1 = gsi_stmt (gsi1);
      if (vect_print_dump_info (REPORT_DETAILS))
        {
          fprintf (vect_dump, "vect_update_ivs_after_vectorizer: phi: ");
          print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
        }

      /* Skip virtual phi's.  */
      if (!is_gimple_reg (SSA_NAME_VAR (PHI_RESULT (phi))))
        {
          if (vect_print_dump_info (REPORT_DETAILS))
            fprintf (vect_dump, "virtual phi. skip.");
          continue;
        }
      /* Skip reduction phis.  */
      if (STMT_VINFO_DEF_TYPE (vinfo_for_stmt (phi)) == vect_reduction_def)
        {
          if (vect_print_dump_info (REPORT_DETAILS))
            fprintf (vect_dump, "reduc phi. skip.");
          continue;
        }
7356 access_fn = analyze_scalar_evolution (loop, PHI_RESULT (phi));
7357 gcc_assert (access_fn);
      STRIP_NOPS (access_fn);
      evolution_part =
        unshare_expr (evolution_part_in_loop_num (access_fn, loop->num));
7361 gcc_assert (evolution_part != NULL_TREE);
7363 /* FORNOW: We do not support IVs whose evolution function is a polynomial
7364 of degree >= 2 or exponential. */
7365 gcc_assert (!tree_is_chrec (evolution_part));
7367 step_expr = evolution_part;
      init_expr = unshare_expr (initial_condition_in_loop_num (access_fn,
                                                               loop->num));

      if (POINTER_TYPE_P (TREE_TYPE (init_expr)))
        ni = fold_build2 (POINTER_PLUS_EXPR, TREE_TYPE (init_expr),
                          init_expr,
                          fold_convert (sizetype,
                                        fold_build2 (MULT_EXPR, TREE_TYPE (niters),
                                                     niters, step_expr)));
      else
        ni = fold_build2 (PLUS_EXPR, TREE_TYPE (init_expr),
                          fold_build2 (MULT_EXPR, TREE_TYPE (init_expr),
                                       fold_convert (TREE_TYPE (init_expr),
                                                     niters),
                                       step_expr),
                          init_expr);
7387 var = create_tmp_var (TREE_TYPE (init_expr), "tmp");
7388 add_referenced_var (var);
7390 last_gsi = gsi_last_bb (exit_bb);
7391 ni_name = force_gimple_operand_gsi (&last_gsi, ni, false, var,
7392 true, GSI_SAME_STMT);
      /* Fix phi expressions in the successor bb.  */
      SET_PHI_ARG_DEF (phi1, update_e->dest_idx, ni_name);
    }
}
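/* Illustrative sketch of the update above for one simple IV, on plain
   integers: after NITERS scalar iterations, an IV with initial value
   INIT and constant step STEP holds INIT + NITERS * STEP, which is the
   value the code generated above feeds to the epilog loop's phis.
   demo_advance_iv is a hypothetical name used only for this example.  */

static long
demo_advance_iv (long init, long step, long niters)
{
  return init + niters * step;
}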
7399 /* Return the more conservative threshold between the
7400 min_profitable_iters returned by the cost model and the user
7401 specified threshold, if provided. */
static unsigned int
conservative_cost_threshold (loop_vec_info loop_vinfo,
                             int min_profitable_iters)
{
  unsigned int th;
  int min_scalar_loop_bound;
7410 min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
7411 * LOOP_VINFO_VECT_FACTOR (loop_vinfo)) - 1);
  /* Use the cost model only if it is more conservative than the user
     specified threshold.  */
  th = (unsigned) min_scalar_loop_bound;
7416 if (min_profitable_iters
7417 && (!min_scalar_loop_bound
7418 || min_profitable_iters > min_scalar_loop_bound))
7419 th = (unsigned) min_profitable_iters;
  if (th && vect_print_dump_info (REPORT_COST))
    fprintf (vect_dump, "Vectorization may not be profitable.");

  return th;
}
7427 /* Function vect_do_peeling_for_loop_bound
7429 Peel the last iterations of the loop represented by LOOP_VINFO.
7430 The peeled iterations form a new epilog loop. Given that the loop now
7431 iterates NITERS times, the new epilog loop iterates
7432 NITERS % VECTORIZATION_FACTOR times.
7434 The original loop will later be made to iterate
7435 NITERS / VECTORIZATION_FACTOR times (this value is placed into RATIO). */
static void
vect_do_peeling_for_loop_bound (loop_vec_info loop_vinfo, tree *ratio)
{
  tree ni_name, ratio_mult_vf_name;
7441 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  struct loop *new_loop;
  edge update_e;
  basic_block preheader;
  int loop_num;
7446 bool check_profitability = false;
7447 unsigned int th = 0;
7448 int min_profitable_iters;
7450 if (vect_print_dump_info (REPORT_DETAILS))
7451 fprintf (vect_dump, "=== vect_do_peeling_for_loop_bound ===");
7453 initialize_original_copy_tables ();
7455 /* Generate the following variables on the preheader of original loop:
7457 ni_name = number of iteration the original loop executes
7458 ratio = ni_name / vf
7459 ratio_mult_vf_name = ratio * vf */
7460 vect_generate_tmps_on_preheader (loop_vinfo, &ni_name,
7461 &ratio_mult_vf_name, ratio);
7463 loop_num = loop->num;
7465 /* If cost model check not done during versioning and
7466 peeling for alignment. */
7467 if (!VEC_length (gimple, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo))
7468 && !VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo))
      && !LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
    {
      check_profitability = true;
7473 /* Get profitability threshold for vectorized loop. */
7474 min_profitable_iters = LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo);
      th = conservative_cost_threshold (loop_vinfo,
                                        min_profitable_iters);
    }
7480 new_loop = slpeel_tree_peel_loop_to_edge (loop, single_exit (loop),
7481 ratio_mult_vf_name, ni_name, false,
7482 th, check_profitability);
7483 gcc_assert (new_loop);
7484 gcc_assert (loop_num == loop->num);
7485 #ifdef ENABLE_CHECKING
  slpeel_verify_cfg_after_peeling (loop, new_loop);
#endif
7489 /* A guard that controls whether the new_loop is to be executed or skipped
7490 is placed in LOOP->exit. LOOP->exit therefore has two successors - one
7491 is the preheader of NEW_LOOP, where the IVs from LOOP are used. The other
7492 is a bb after NEW_LOOP, where these IVs are not used. Find the edge that
7493 is on the path where the LOOP IVs are used and need to be updated. */
7495 preheader = loop_preheader_edge (new_loop)->src;
7496 if (EDGE_PRED (preheader, 0)->src == single_exit (loop)->dest)
7497 update_e = EDGE_PRED (preheader, 0);
7499 update_e = EDGE_PRED (preheader, 1);
7501 /* Update IVs of original loop as if they were advanced
7502 by ratio_mult_vf_name steps. */
7503 vect_update_ivs_after_vectorizer (loop_vinfo, ratio_mult_vf_name, update_e);
  /* After peeling we have to reset scalar evolution analyzer.  */
  scev_reset ();

  free_original_copy_tables ();
}
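/* Illustrative sketch of the iteration split produced above, on plain
   integers: the vectorized loop covers the largest multiple of VF not
   exceeding NI, and the epilog executes the remaining NI % VF scalar
   iterations.  demo_peeled_iteration_split is a hypothetical name and
   PROCESS_ONE stands in for the loop body; one scalar call per
   iteration models the work covered by each loop version.  */

static void
demo_peeled_iteration_split (int ni, int vf, void (*process_one) (int))
{
  int ratio = ni / vf;
  int ratio_mult_vf = ratio * vf;
  int i;

  /* The loop that gets vectorized: RATIO vector iterations of VF
     elements each.  */
  for (i = 0; i < ratio_mult_vf; i++)
    process_one (i);

  /* The peeled epilog: the NI % VF leftover iterations.  */
  for (; i < ni; i++)
    process_one (i);
}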
7512 /* Function vect_gen_niters_for_prolog_loop
7514 Set the number of iterations for the loop represented by LOOP_VINFO
7515 to the minimum between LOOP_NITERS (the original iteration count of the loop)
7516 and the misalignment of DR - the data reference recorded in
7517 LOOP_VINFO_UNALIGNED_DR (LOOP_VINFO). As a result, after the execution of
7518 this loop, the data reference DR will refer to an aligned location.
7520 The following computation is generated:
   If the misalignment of DR is known at compile time:
     addr_mis = DR_MISALIGNMENT (dr);
   Else, compute address misalignment in bytes:
     addr_mis = addr & (vectype_size - 1)
7527 prolog_niters = min (LOOP_NITERS, ((VF - addr_mis/elem_size)&(VF-1))/step)
7529 (elem_size = element type size; an element is the scalar element whose type
7530 is the inner type of the vectype)
7532 When the step of the data-ref in the loop is not 1 (as in interleaved data
7533 and SLP), the number of iterations of the prolog must be divided by the step
7534 (which is equal to the size of interleaved group).
7536 The above formulas assume that VF == number of elements in the vector. This
7537 may not hold when there are multiple-types in the loop.
7538 In this case, for some data-references in the loop the VF does not represent
7539 the number of elements that fit in the vector. Therefore, instead of VF we
7540 use TYPE_VECTOR_SUBPARTS. */
static tree
vect_gen_niters_for_prolog_loop (loop_vec_info loop_vinfo, tree loop_niters)
{
  struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  tree var;
  gimple_seq stmts;
  tree iters, iters_name;
  edge pe;
  basic_block new_bb;
  gimple dr_stmt = DR_STMT (dr);
7553 stmt_vec_info stmt_info = vinfo_for_stmt (dr_stmt);
7554 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7555 int vectype_align = TYPE_ALIGN (vectype) / BITS_PER_UNIT;
7556 tree niters_type = TREE_TYPE (loop_niters);
  int step = 1;
  int element_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
7559 int nelements = TYPE_VECTOR_SUBPARTS (vectype);
7561 if (STMT_VINFO_STRIDED_ACCESS (stmt_info))
7562 step = DR_GROUP_SIZE (vinfo_for_stmt (DR_GROUP_FIRST_DR (stmt_info)));
7564 pe = loop_preheader_edge (loop);
  if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
    {
      int byte_misalign = LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo);
7569 int elem_misalign = byte_misalign / element_size;
7571 if (vect_print_dump_info (REPORT_DETAILS))
7572 fprintf (vect_dump, "known alignment = %d.", byte_misalign);
      iters = build_int_cst (niters_type,
                     (((nelements - elem_misalign) & (nelements - 1)) / step));
    }
  else
    {
      gimple_seq new_stmts = NULL;
7580 tree start_addr = vect_create_addr_base_for_vector_ref (dr_stmt,
7581 &new_stmts, NULL_TREE, loop);
7582 tree ptr_type = TREE_TYPE (start_addr);
7583 tree size = TYPE_SIZE (ptr_type);
7584 tree type = lang_hooks.types.type_for_size (tree_low_cst (size, 1), 1);
7585 tree vectype_size_minus_1 = build_int_cst (type, vectype_align - 1);
7586 tree elem_size_log =
7587 build_int_cst (type, exact_log2 (vectype_align/nelements));
7588 tree nelements_minus_1 = build_int_cst (type, nelements - 1);
      tree nelements_tree = build_int_cst (type, nelements);
      tree byte_misalign;
      tree elem_misalign;

      new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmts);
      gcc_assert (!new_bb);

      /* Create:  byte_misalign = addr & (vectype_size - 1)  */
      byte_misalign =
        fold_build2 (BIT_AND_EXPR, type, fold_convert (type, start_addr),
                     vectype_size_minus_1);
      /* Create:  elem_misalign = byte_misalign / element_size  */
      elem_misalign =
        fold_build2 (RSHIFT_EXPR, type, byte_misalign, elem_size_log);
7604 /* Create: (niters_type) (nelements - elem_misalign)&(nelements - 1) */
7605 iters = fold_build2 (MINUS_EXPR, type, nelements_tree, elem_misalign);
7606 iters = fold_build2 (BIT_AND_EXPR, type, iters, nelements_minus_1);
      iters = fold_convert (niters_type, iters);
    }
7610 /* Create: prolog_loop_niters = min (iters, loop_niters) */
7611 /* If the loop bound is known at compile time we already verified that it is
7612 greater than vf; since the misalignment ('iters') is at most vf, there's
7613 no need to generate the MIN_EXPR in this case. */
7614 if (TREE_CODE (loop_niters) != INTEGER_CST)
7615 iters = fold_build2 (MIN_EXPR, niters_type, iters, loop_niters);
  if (vect_print_dump_info (REPORT_DETAILS))
    {
      fprintf (vect_dump, "niters for prolog loop: ");
      print_generic_expr (vect_dump, iters, TDF_SLIM);
    }
7623 var = create_tmp_var (niters_type, "prolog_loop_niters");
7624 add_referenced_var (var);
  stmts = NULL;
  iters_name = force_gimple_operand (iters, &stmts, false, var);
  /* Insert stmt on loop preheader edge.  */
  if (stmts)
    {
      basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
      gcc_assert (!new_bb);
    }

  return iters_name;
}
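/* Illustrative sketch of the known-misalignment case above, on plain
   integers: the number of scalar prolog iterations that brings ADDR to
   a vector-aligned location, following
   ((nelements - addr_mis / elem_size) & (nelements - 1)) / step.
   demo_prolog_niters is a hypothetical name, and passing the address as
   an unsigned long is an assumption made for this example only.  */

static int
demo_prolog_niters (unsigned long addr, int elem_size, int nelements,
                    int step)
{
  /* Byte misalignment within one vector (the vector size is a power of
     two), then the same quantity expressed in elements.  */
  int byte_misalign
    = (int) (addr & (unsigned long) (nelements * elem_size - 1));
  int elem_misalign = byte_misalign / elem_size;

  return ((nelements - elem_misalign) & (nelements - 1)) / step;
}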
/* Function vect_update_init_of_dr

   NITERS iterations were peeled from LOOP.  DR represents a data reference
   in LOOP.  This function updates the information recorded in DR to
   account for the fact that the first NITERS iterations had already been
   executed.  Specifically, it updates the OFFSET field of DR.  */
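/* For instance, if NITERS is 3 and DR_STEP is 4 bytes, the update below adds
   3 * 4 == 12 bytes to DR_OFFSET, so the recorded first access matches what
   would have been the fourth scalar iteration.  */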
static void
vect_update_init_of_dr (struct data_reference *dr, tree niters)
{
  tree offset = DR_OFFSET (dr);

  niters = fold_build2 (MULT_EXPR, sizetype,
                        fold_convert (sizetype, niters),
                        fold_convert (sizetype, DR_STEP (dr)));
  offset = fold_build2 (PLUS_EXPR, sizetype, offset, niters);
  DR_OFFSET (dr) = offset;
}
/* Function vect_update_inits_of_drs

   NITERS iterations were peeled from the loop represented by LOOP_VINFO.
   This function updates the information recorded for the data references in
   the loop to account for the fact that the first NITERS iterations had
   already been executed.  Specifically, it updates the initial_condition of
   the access_function of all the data_references in the loop.  */

static void
vect_update_inits_of_drs (loop_vec_info loop_vinfo, tree niters)
{
  unsigned int i;
  VEC (data_reference_p, heap) *datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
  struct data_reference *dr;

  if (vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump, "=== vect_update_inits_of_drs ===");

  for (i = 0; VEC_iterate (data_reference_p, datarefs, i, dr); i++)
    vect_update_init_of_dr (dr, niters);
}
/* Function vect_do_peeling_for_alignment

   Peel the first 'niters' iterations of the loop represented by LOOP_VINFO.
   'niters' is set to the misalignment of one of the data references in the
   loop, thereby forcing it to refer to an aligned location at the beginning
   of the execution of this loop.  The data reference for which we are
   peeling is recorded in LOOP_VINFO_UNALIGNED_DR.  */

static void
vect_do_peeling_for_alignment (loop_vec_info loop_vinfo)
{
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  tree niters_of_prolog_loop, ni_name;
  tree n_iters;
  struct loop *new_loop;
  bool check_profitability = false;
  unsigned int th = 0;
  int min_profitable_iters;

  if (vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump, "=== vect_do_peeling_for_alignment ===");

  initialize_original_copy_tables ();

  ni_name = vect_build_loop_niters (loop_vinfo);
  niters_of_prolog_loop = vect_gen_niters_for_prolog_loop (loop_vinfo, ni_name);

  /* If the cost model check was not done during versioning, do it now.  */
  if (!VEC_length (gimple, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo))
      && !VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
    {
      check_profitability = true;

      /* Get the profitability threshold for the vectorized loop.  */
      min_profitable_iters = LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo);

      th = conservative_cost_threshold (loop_vinfo,
                                        min_profitable_iters);
    }

  /* Peel the prolog loop and make it iterate niters_of_prolog_loop times.  */
  new_loop =
    slpeel_tree_peel_loop_to_edge (loop, loop_preheader_edge (loop),
                                   niters_of_prolog_loop, ni_name, true,
                                   th, check_profitability);

  gcc_assert (new_loop);
#ifdef ENABLE_CHECKING
  slpeel_verify_cfg_after_peeling (new_loop, loop);
#endif

  /* Update the number of times the loop executes.  */
  n_iters = LOOP_VINFO_NITERS (loop_vinfo);
  LOOP_VINFO_NITERS (loop_vinfo) = fold_build2 (MINUS_EXPR,
                TREE_TYPE (n_iters), n_iters, niters_of_prolog_loop);

  /* Update the init conditions of the access functions of all data refs.  */
  vect_update_inits_of_drs (loop_vinfo, niters_of_prolog_loop);

  /* After peeling we have to reset the scalar evolution analyzer.  */
  scev_reset ();

  free_original_copy_tables ();
}
/* Function vect_create_cond_for_align_checks.

   Create a conditional expression that represents the alignment checks for
   all of the data references (array element references) whose alignment must
   be checked at runtime.

   Input:
   COND_EXPR  - input conditional expression.  New conditions will be chained
                with logical AND operation.
   LOOP_VINFO - two fields of the loop information are used.
                LOOP_VINFO_PTR_MASK is the mask used to check the alignment.
                LOOP_VINFO_MAY_MISALIGN_STMTS contains the refs to be checked.

   Output:
   COND_EXPR_STMT_LIST - statements needed to construct the conditional
                         expression.
   The returned value is the conditional expression to be used in the if
   statement that controls which version of the loop gets executed at runtime.

   The algorithm makes two assumptions:
     1) The number of bytes "n" in a vector is a power of 2.
     2) An address "a" is aligned if a%n is zero, and this test can be done
        as a&(n-1) == 0.  For example, for 16-byte vectors the test is
        a&0xf == 0.  */
static void
vect_create_cond_for_align_checks (loop_vec_info loop_vinfo,
                                   tree *cond_expr,
                                   gimple_seq *cond_expr_stmt_list)
{
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  VEC(gimple,heap) *may_misalign_stmts
    = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
  gimple ref_stmt;
  int mask = LOOP_VINFO_PTR_MASK (loop_vinfo);
  tree mask_cst;
  unsigned int i;
  tree psize;
  tree int_ptrsize_type;
  char tmp_name[20];
  tree or_tmp_name = NULL_TREE;
  tree and_tmp, and_tmp_name;
  gimple and_stmt;
  tree ptrsize_zero;
  tree part_cond_expr;

  /* Check that mask is one less than a power of 2, i.e., mask is
     all zeros followed by all ones.  */
  gcc_assert ((mask != 0) && ((mask & (mask + 1)) == 0));
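  /* For example, mask == 0xf passes this check (0xf & 0x10 == 0), whereas
     mask == 0xc fails it (0xc & 0xd == 0xc), because 0xc has a zero bit
     below its highest set bit.  */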
  /* CHECKME: what is the best integer or unsigned type to use to hold a
     cast from a pointer value?  */
  psize = TYPE_SIZE (ptr_type_node);
  int_ptrsize_type
    = lang_hooks.types.type_for_size (tree_low_cst (psize, 1), 0);

  /* Create expression (mask & (dr_1 | ... | dr_n)) where dr_i is the address
     of the first vector of the i'th data reference.  */

  for (i = 0; VEC_iterate (gimple, may_misalign_stmts, i, ref_stmt); i++)
    {
      gimple_seq new_stmt_list = NULL;
      tree addr_base;
      tree addr_tmp, addr_tmp_name;
      tree or_tmp, new_or_tmp_name;
      gimple addr_stmt, or_stmt;

      /* Create: addr_tmp = (int) (address_of_first_vector).  */
      addr_base =
        vect_create_addr_base_for_vector_ref (ref_stmt, &new_stmt_list,
                                              NULL_TREE, loop);
      if (new_stmt_list != NULL)
        gimple_seq_add_seq (cond_expr_stmt_list, new_stmt_list);

      sprintf (tmp_name, "%s%d", "addr2int", i);
      addr_tmp = create_tmp_var (int_ptrsize_type, tmp_name);
      add_referenced_var (addr_tmp);
      addr_tmp_name = make_ssa_name (addr_tmp, NULL);
      addr_stmt = gimple_build_assign_with_ops (NOP_EXPR, addr_tmp_name,
                                                addr_base, NULL_TREE);
      SSA_NAME_DEF_STMT (addr_tmp_name) = addr_stmt;
      gimple_seq_add_stmt (cond_expr_stmt_list, addr_stmt);

      /* The addresses are ORed together.  */
      if (or_tmp_name != NULL_TREE)
        {
          /* Create: or_tmp = or_tmp | addr_tmp.  */
          sprintf (tmp_name, "%s%d", "orptrs", i);
          or_tmp = create_tmp_var (int_ptrsize_type, tmp_name);
          add_referenced_var (or_tmp);
          new_or_tmp_name = make_ssa_name (or_tmp, NULL);
          or_stmt = gimple_build_assign_with_ops (BIT_IOR_EXPR,
                                                  new_or_tmp_name,
                                                  or_tmp_name, addr_tmp_name);
          SSA_NAME_DEF_STMT (new_or_tmp_name) = or_stmt;
          gimple_seq_add_stmt (cond_expr_stmt_list, or_stmt);
          or_tmp_name = new_or_tmp_name;
        }
      else
        or_tmp_name = addr_tmp_name;
    }

  mask_cst = build_int_cst (int_ptrsize_type, mask);

  /* Create: and_tmp = or_tmp & mask.  */
  and_tmp = create_tmp_var (int_ptrsize_type, "andmask");
  add_referenced_var (and_tmp);
  and_tmp_name = make_ssa_name (and_tmp, NULL);

  and_stmt = gimple_build_assign_with_ops (BIT_AND_EXPR, and_tmp_name,
                                           or_tmp_name, mask_cst);
  SSA_NAME_DEF_STMT (and_tmp_name) = and_stmt;
  gimple_seq_add_stmt (cond_expr_stmt_list, and_stmt);

  /* Make and_tmp the left operand of the conditional test against zero.
     If and_tmp has a nonzero bit then some address is unaligned.  */
  ptrsize_zero = build_int_cst (int_ptrsize_type, 0);
  part_cond_expr = fold_build2 (EQ_EXPR, boolean_type_node,
                                and_tmp_name, ptrsize_zero);
  if (*cond_expr)
    *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
                              *cond_expr, part_cond_expr);
  else
    *cond_expr = part_cond_expr;
}
/* Function vect_vfa_segment_size.

   Create an expression that computes the size of the segment that will be
   accessed for a data reference.  The function takes into account that
   realignment loads may access one extra vector.

   Input:
     DR: The data reference.
     VECT_FACTOR: vectorization factor.

   Return an expression whose value is the size of the segment which will be
   accessed by DR.  */
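/* For example, a data reference with a 4-byte step in a loop vectorized with
   VF == 4 yields a 16-byte segment; under the optimized realignment scheme
   one extra vector (16 more bytes for V4SI) is added for the read-ahead.  */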
static tree
vect_vfa_segment_size (struct data_reference *dr, tree vect_factor)
{
  tree segment_length = fold_build2 (MULT_EXPR, integer_type_node,
                                     DR_STEP (dr), vect_factor);

  if (vect_supportable_dr_alignment (dr) == dr_explicit_realign_optimized)
    {
      tree vector_size = TYPE_SIZE_UNIT
                          (STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr))));

      segment_length = fold_build2 (PLUS_EXPR, integer_type_node,
                                    segment_length, vector_size);
    }

  return fold_convert (sizetype, segment_length);
}
/* Function vect_create_cond_for_alias_checks.

   Create a conditional expression that represents the run-time checks for
   overlapping of the address ranges represented by a list of data reference
   relations passed as input.

   Input:
   COND_EXPR  - input conditional expression.  New conditions will be chained
                with logical AND operation.
   LOOP_VINFO - field LOOP_VINFO_MAY_ALIAS_DDRS contains the list of ddrs
                to be checked.

   Output:
   COND_EXPR - conditional expression.
   COND_EXPR_STMT_LIST - statements needed to construct the conditional
                         expression.

   The returned value is the conditional expression to be used in the if
   statement that controls which version of the loop gets executed at
   runtime.  */

static void
vect_create_cond_for_alias_checks (loop_vec_info loop_vinfo,
                                   tree *cond_expr,
                                   gimple_seq *cond_expr_stmt_list)
{
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  VEC (ddr_p, heap) *may_alias_ddrs =
    LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
  tree vect_factor =
    build_int_cst (integer_type_node, LOOP_VINFO_VECT_FACTOR (loop_vinfo));
  ddr_p ddr;
  unsigned int i;
  tree part_cond_expr;

  /* Create the expression:
     (((store_ptr_0 + store_segment_length_0) < load_ptr_0)
      || ((load_ptr_0 + load_segment_length_0) < store_ptr_0))
     &&
     ...
     &&
     (((store_ptr_n + store_segment_length_n) < load_ptr_n)
      || ((load_ptr_n + load_segment_length_n) < store_ptr_n))  */
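  /* For instance, if a store covers [0x1000, 0x103f] and a load covers
     [0x1020, 0x105f], neither 0x1000 + 0x40 < 0x1020 nor
     0x1020 + 0x40 < 0x1000 holds, so the check fails at runtime and the
     scalar version of the loop is executed.  */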
  if (VEC_empty (ddr_p, may_alias_ddrs))
    return;

  for (i = 0; VEC_iterate (ddr_p, may_alias_ddrs, i, ddr); i++)
    {
      struct data_reference *dr_a, *dr_b;
      gimple dr_group_first_a, dr_group_first_b;
      tree addr_base_a, addr_base_b;
      tree segment_length_a, segment_length_b;
      gimple stmt_a, stmt_b;

      dr_a = DDR_A (ddr);
      stmt_a = DR_STMT (DDR_A (ddr));
      dr_group_first_a = DR_GROUP_FIRST_DR (vinfo_for_stmt (stmt_a));
      if (dr_group_first_a)
        {
          stmt_a = dr_group_first_a;
          dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
        }

      dr_b = DDR_B (ddr);
      stmt_b = DR_STMT (DDR_B (ddr));
      dr_group_first_b = DR_GROUP_FIRST_DR (vinfo_for_stmt (stmt_b));
      if (dr_group_first_b)
        {
          stmt_b = dr_group_first_b;
          dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
        }

      addr_base_a =
        vect_create_addr_base_for_vector_ref (stmt_a, cond_expr_stmt_list,
                                              NULL_TREE, loop);
      addr_base_b =
        vect_create_addr_base_for_vector_ref (stmt_b, cond_expr_stmt_list,
                                              NULL_TREE, loop);

      segment_length_a = vect_vfa_segment_size (dr_a, vect_factor);
      segment_length_b = vect_vfa_segment_size (dr_b, vect_factor);

      if (vect_print_dump_info (REPORT_DR_DETAILS))
        {
          fprintf (vect_dump,
                   "create runtime check for data references ");
          print_generic_expr (vect_dump, DR_REF (dr_a), TDF_SLIM);
          fprintf (vect_dump, " and ");
          print_generic_expr (vect_dump, DR_REF (dr_b), TDF_SLIM);
        }

      part_cond_expr =
        fold_build2 (TRUTH_OR_EXPR, boolean_type_node,
          fold_build2 (LT_EXPR, boolean_type_node,
            fold_build2 (POINTER_PLUS_EXPR, TREE_TYPE (addr_base_a),
              addr_base_a,
              segment_length_a),
            addr_base_b),
          fold_build2 (LT_EXPR, boolean_type_node,
            fold_build2 (POINTER_PLUS_EXPR, TREE_TYPE (addr_base_b),
              addr_base_b,
              segment_length_b),
            addr_base_a));

      if (*cond_expr)
        *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
                                  *cond_expr, part_cond_expr);
      else
        *cond_expr = part_cond_expr;
    }

  if (vect_print_dump_info (REPORT_VECTORIZED_LOOPS))
    fprintf (vect_dump, "created %u versioning for alias checks.\n",
             VEC_length (ddr_p, may_alias_ddrs));
}
/* Function vect_loop_versioning.

   If the loop has data references that may or may not be aligned, and/or
   has data reference relations whose independence was not proven, then
   two versions of the loop need to be generated, one which is vectorized
   and one which isn't.  A test is then generated to control which of the
   loops is executed.  The test checks the alignment of all of the
   data references that may or may not be aligned.  An additional
   sequence of runtime tests is generated for each pair of DDRs whose
   independence was not proven.  The vectorized version of the loop is
   executed only if both the alias and the alignment tests pass.

   The test generated to choose which version of the loop is executed
   is also made to check the profitability threshold indicated by the
   cost model.  */
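/* Schematically, the control structure produced here is:

     if (scalar_loop_iters > th && addresses_aligned && ranges_disjoint)
       vectorized_loop;
     else
       scalar_loop;

   where the alignment and aliasing sub-conditions are built by
   vect_create_cond_for_align_checks and vect_create_cond_for_alias_checks
   above, and are chained to the profitability test with TRUTH_AND_EXPR.  */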
static void
vect_loop_versioning (loop_vec_info loop_vinfo)
{
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  struct loop *nloop;
  tree cond_expr = NULL_TREE;
  gimple_seq cond_expr_stmt_list = NULL;
  basic_block condition_bb;
  gimple_stmt_iterator gsi, cond_exp_gsi;
  basic_block merge_bb;
  basic_block new_exit_bb;
  edge new_exit_e, e;
  gimple orig_phi, new_phi;
  tree arg;
  unsigned prob = 4 * REG_BR_PROB_BASE / 5;
  gimple_seq gimplify_stmt_list = NULL;
  tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
  int min_profitable_iters = 0;
  unsigned int th;

  /* Get the profitability threshold for the vectorized loop.  */
  min_profitable_iters = LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo);

  th = conservative_cost_threshold (loop_vinfo,
                                    min_profitable_iters);

  cond_expr =
    build2 (GT_EXPR, boolean_type_node, scalar_loop_iters,
            build_int_cst (TREE_TYPE (scalar_loop_iters), th));

  cond_expr = force_gimple_operand (cond_expr, &cond_expr_stmt_list,
                                    false, NULL_TREE);

  if (VEC_length (gimple, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo)))
    vect_create_cond_for_align_checks (loop_vinfo, &cond_expr,
                                       &cond_expr_stmt_list);

  if (VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
    vect_create_cond_for_alias_checks (loop_vinfo, &cond_expr,
                                       &cond_expr_stmt_list);

  cond_expr =
    fold_build2 (NE_EXPR, boolean_type_node, cond_expr, integer_zero_node);
  cond_expr =
    force_gimple_operand (cond_expr, &gimplify_stmt_list, true, NULL_TREE);
  gimple_seq_add_seq (&cond_expr_stmt_list, gimplify_stmt_list);

  initialize_original_copy_tables ();
  nloop = loop_version (loop, cond_expr, &condition_bb,
                        prob, prob, REG_BR_PROB_BASE - prob, true);
  free_original_copy_tables ();

  /* Loop versioning violates an assumption we try to maintain during
     vectorization - that the loop exit block has a single predecessor.
     After versioning, the exit block of both loop versions is the same
     basic block (i.e. it has two predecessors).  Just in order to simplify
     following transformations in the vectorizer, we fix this situation
     here by adding a new (empty) block on the exit-edge of the loop,
     with the proper loop-exit phis to maintain loop-closed-form.  */
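  /* Schematically:

         before                     after

       loop1   loop2            loop1         loop2
          \     /                  |            |
          merge_bb            new_exit_bb       |
                                       \       /
                                       merge_bb

     where new_exit_bb receives fresh loop-exit phis whose results feed
     the corresponding phi arguments in merge_bb.  */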
  merge_bb = single_exit (loop)->dest;
  gcc_assert (EDGE_COUNT (merge_bb->preds) == 2);
  new_exit_bb = split_edge (single_exit (loop));
  new_exit_e = single_exit (loop);
  e = EDGE_SUCC (new_exit_bb, 0);

  for (gsi = gsi_start_phis (merge_bb); !gsi_end_p (gsi); gsi_next (&gsi))
    {
      orig_phi = gsi_stmt (gsi);
      new_phi = create_phi_node (SSA_NAME_VAR (PHI_RESULT (orig_phi)),
                                 new_exit_bb);
      arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, e);
      add_phi_arg (new_phi, arg, new_exit_e);
      SET_PHI_ARG_DEF (orig_phi, e->dest_idx, PHI_RESULT (new_phi));
    }

  /* End of loop-exit fixes after versioning.  */

  update_ssa (TODO_update_ssa);
  if (cond_expr_stmt_list)
    {
      cond_exp_gsi = gsi_last_bb (condition_bb);
      gsi_insert_seq_before (&cond_exp_gsi, cond_expr_stmt_list, GSI_SAME_STMT);
    }
}
/* Remove a group of stores (for SLP or interleaving), free their
   stmt_vec_info.  */

static void
vect_remove_stores (gimple first_stmt)
{
  gimple next = first_stmt;
  gimple tmp;
  gimple_stmt_iterator next_si;

  while (next)
    {
      /* Free the attached stmt_vec_info and remove the stmt.  */
      next_si = gsi_for_stmt (next);
      gsi_remove (&next_si, true);
      tmp = DR_GROUP_NEXT_DR (vinfo_for_stmt (next));
      free_stmt_vec_info (next);
      next = tmp;
    }
}
/* Vectorize the SLP instance tree rooted at NODE, in postorder.  */

static bool
vect_schedule_slp_instance (slp_tree node, slp_instance instance,
                            unsigned int vectorization_factor)
{
  gimple stmt;
  bool strided_store, is_store;
  gimple_stmt_iterator si;
  stmt_vec_info stmt_info;
  unsigned int vec_stmts_size, nunits, group_size;
  tree vectype;
  int i;
  slp_tree loads_node;

  if (!node)
    return false;

  vect_schedule_slp_instance (SLP_TREE_LEFT (node), instance,
                              vectorization_factor);
  vect_schedule_slp_instance (SLP_TREE_RIGHT (node), instance,
                              vectorization_factor);

  stmt = VEC_index (gimple, SLP_TREE_SCALAR_STMTS (node), 0);
  stmt_info = vinfo_for_stmt (stmt);

  /* VECTYPE is the type of the destination.  */
  vectype = get_vectype_for_scalar_type (TREE_TYPE (gimple_assign_lhs (stmt)));
  nunits = (unsigned int) TYPE_VECTOR_SUBPARTS (vectype);
  group_size = SLP_INSTANCE_GROUP_SIZE (instance);

  /* For each SLP instance calculate the number of vector stmts to be created
     for the scalar stmts in each node of the SLP tree.  The number of vector
     elements in one vector iteration is the number of scalar elements in
     one scalar iteration (GROUP_SIZE) multiplied by VF divided by the vector
     size.  */
  vec_stmts_size = (vectorization_factor * group_size) / nunits;
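  /* For example, a group of 2 scalar stmts in a loop with VF == 4 and V4SI
     vectors (nunits == 4) needs (4 * 2) / 4 == 2 vector stmts per node.  */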
  /* In case of load permutation we have to allocate vectorized statements for
     all the nodes that participate in that permutation.  */
  if (SLP_INSTANCE_LOAD_PERMUTATION (instance))
    {
      for (i = 0;
           VEC_iterate (slp_tree, SLP_INSTANCE_LOADS (instance), i, loads_node);
           i++)
        {
          if (!SLP_TREE_VEC_STMTS (loads_node))
            {
              SLP_TREE_VEC_STMTS (loads_node) = VEC_alloc (gimple, heap,
                                                           vec_stmts_size);
              SLP_TREE_NUMBER_OF_VEC_STMTS (loads_node) = vec_stmts_size;
            }
        }
    }

  if (!SLP_TREE_VEC_STMTS (node))
    {
      SLP_TREE_VEC_STMTS (node) = VEC_alloc (gimple, heap, vec_stmts_size);
      SLP_TREE_NUMBER_OF_VEC_STMTS (node) = vec_stmts_size;
    }

  if (vect_print_dump_info (REPORT_DETAILS))
    {
      fprintf (vect_dump, "------>vectorizing SLP node starting from: ");
      print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
    }

  /* Loads should be inserted before the first load.  */
  if (SLP_INSTANCE_FIRST_LOAD_STMT (instance)
      && STMT_VINFO_STRIDED_ACCESS (stmt_info)
      && !REFERENCE_CLASS_P (gimple_get_lhs (stmt)))
    si = gsi_for_stmt (SLP_INSTANCE_FIRST_LOAD_STMT (instance));
  else
    si = gsi_for_stmt (stmt);

  is_store = vect_transform_stmt (stmt, &si, &strided_store, node, instance);
  if (is_store)
    {
      if (DR_GROUP_FIRST_DR (stmt_info))
        /* If IS_STORE is TRUE, the vectorization of the
           interleaving chain was completed - free all the stores in
           the chain.  */
        vect_remove_stores (DR_GROUP_FIRST_DR (stmt_info));
      else
        /* FORNOW: SLP originates only from strided stores.  */
        gcc_unreachable ();

      return true;
    }

  /* FORNOW: SLP originates only from strided stores.  */
  return false;
}
/* Schedule all the SLP instances recorded for the loop.  */

static bool
vect_schedule_slp (loop_vec_info loop_vinfo)
{
  VEC (slp_instance, heap) *slp_instances =
    LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
  slp_instance instance;
  unsigned int i;
  bool is_store = false;

  for (i = 0; VEC_iterate (slp_instance, slp_instances, i, instance); i++)
    {
      /* Schedule the tree of INSTANCE.  */
      is_store = vect_schedule_slp_instance (SLP_INSTANCE_TREE (instance),
                              instance, LOOP_VINFO_VECT_FACTOR (loop_vinfo));

      if (vect_print_dump_info (REPORT_VECTORIZED_LOOPS)
          || vect_print_dump_info (REPORT_UNVECTORIZED_LOOPS))
        fprintf (vect_dump, "vectorizing stmts using SLP.");
    }

  return is_store;
}
/* Function vect_transform_loop.

   The analysis phase has determined that the loop is vectorizable.
   Vectorize the loop - create vectorized stmts to replace the scalar
   stmts in the loop, and update the loop exit condition.  */

void
vect_transform_loop (loop_vec_info loop_vinfo)
{
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
  int nbbs = loop->num_nodes;
  gimple_stmt_iterator si;
  int i;
  tree ratio = NULL;
  int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
  bool strided_store;
  bool slp_scheduled = false;
  unsigned int nunits;

  if (vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump, "=== vect_transform_loop ===");

  if (VEC_length (gimple, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo))
      || VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
    vect_loop_versioning (loop_vinfo);

  /* CHECKME: we wouldn't need this if we called update_ssa once
     for all loops.  */
  bitmap_zero (vect_memsyms_to_rename);

  /* Peel the loop if there are data refs with unknown alignment.
     Only one such data ref is handled; it is recorded in
     LOOP_VINFO_UNALIGNED_DR.  */
  if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
    vect_do_peeling_for_alignment (loop_vinfo);

  /* If the loop has a symbolic number of iterations 'n' (i.e. it is not a
     compile time constant), or it is a constant that is not divisible by the
     vectorization factor, then an epilog loop needs to be created.
     We therefore duplicate the loop: the original loop will be vectorized,
     and will compute the first (n/VF) iterations.  The second copy of the
     loop will remain scalar and will compute the remaining (n%VF) iterations.
     (VF is the vectorization factor.)  */
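  /* For example, with n == 10 and VF == 4 the vectorized loop executes
     10/4 == 2 iterations (covering 8 elements) and the scalar epilog loop
     executes the remaining 10%4 == 2 iterations.  */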
  if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
      || (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
          && LOOP_VINFO_INT_NITERS (loop_vinfo) % vectorization_factor != 0))
    vect_do_peeling_for_loop_bound (loop_vinfo, &ratio);
  else
    ratio = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
                LOOP_VINFO_INT_NITERS (loop_vinfo) / vectorization_factor);

  /* 1) Make sure the loop header has exactly two entries
     2) Make sure we have a preheader basic block.  */

  gcc_assert (EDGE_COUNT (loop->header->preds) == 2);

  split_edge (loop_preheader_edge (loop));

  /* FORNOW: the vectorizer supports only loops whose body consists
     of one basic block (header + empty latch).  When the vectorizer
     supports more involved loop forms, the order in which the BBs are
     traversed will need to be reconsidered.  */

  for (i = 0; i < nbbs; i++)
    {
      basic_block bb = bbs[i];
      stmt_vec_info stmt_info;
      gimple phi;

      for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
        {
          phi = gsi_stmt (si);
          if (vect_print_dump_info (REPORT_DETAILS))
            {
              fprintf (vect_dump, "------>vectorizing phi: ");
              print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
            }
          stmt_info = vinfo_for_stmt (phi);
          if (!stmt_info)
            continue;

          if (!STMT_VINFO_RELEVANT_P (stmt_info)
              && !STMT_VINFO_LIVE_P (stmt_info))
            continue;

          if ((TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
               != (unsigned HOST_WIDE_INT) vectorization_factor)
              && vect_print_dump_info (REPORT_DETAILS))
            fprintf (vect_dump, "multiple-types.");

          if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
            {
              if (vect_print_dump_info (REPORT_DETAILS))
                fprintf (vect_dump, "transform phi.");
              vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
            }
        }
      for (si = gsi_start_bb (bb); !gsi_end_p (si);)
        {
          gimple stmt = gsi_stmt (si);
          bool is_store;

          if (vect_print_dump_info (REPORT_DETAILS))
            {
              fprintf (vect_dump, "------>vectorizing statement: ");
              print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
            }

          stmt_info = vinfo_for_stmt (stmt);

          /* Vector stmts created in the outer-loop during vectorization of
             stmts in an inner-loop may not have a stmt_info, and do not
             need to be vectorized.  */
          if (!stmt_info)
            {
              gsi_next (&si);
              continue;
            }

          if (!STMT_VINFO_RELEVANT_P (stmt_info)
              && !STMT_VINFO_LIVE_P (stmt_info))
            {
              gsi_next (&si);
              continue;
            }

          gcc_assert (STMT_VINFO_VECTYPE (stmt_info));
          nunits =
            (unsigned int) TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
          if (!STMT_SLP_TYPE (stmt_info)
              && nunits != (unsigned int) vectorization_factor
              && vect_print_dump_info (REPORT_DETAILS))
            /* For SLP, VF is set according to the unrolling factor, and not
               to the vector size, hence for SLP this print is not valid.  */
            fprintf (vect_dump, "multiple-types.");

          /* SLP.  Schedule all the SLP instances when the first SLP stmt is
             reached.  */
          if (STMT_SLP_TYPE (stmt_info))
            {
              if (!slp_scheduled)
                {
                  slp_scheduled = true;

                  if (vect_print_dump_info (REPORT_DETAILS))
                    fprintf (vect_dump, "=== scheduling SLP instances ===");

                  is_store = vect_schedule_slp (loop_vinfo);

                  /* IS_STORE is true if STMT is a store.  Stores cannot be
                     of hybrid SLP type.  They are removed in
                     vect_schedule_slp_instance and their vinfo is
                     destroyed.  */
                  if (is_store)
                    {
                      gsi_next (&si);
                      continue;
                    }
                }

              /* Hybrid SLP stmts must be vectorized in addition to SLP.  */
              if (PURE_SLP_STMT (stmt_info))
                {
                  gsi_next (&si);
                  continue;
                }
            }

          /* -------- vectorize statement ------------ */
          if (vect_print_dump_info (REPORT_DETAILS))
            fprintf (vect_dump, "transform statement.");

          strided_store = false;
          is_store = vect_transform_stmt (stmt, &si, &strided_store, NULL, NULL);
          if (is_store)
            {
              if (STMT_VINFO_STRIDED_ACCESS (stmt_info))
                {
                  /* Interleaving.  If IS_STORE is TRUE, the vectorization of
                     the interleaving chain was completed - free all the
                     stores in the chain.  */
                  vect_remove_stores (DR_GROUP_FIRST_DR (stmt_info));
                  gsi_remove (&si, true);
                  continue;
                }
              else
                {
                  /* Free the attached stmt_vec_info and remove the stmt.  */
                  free_stmt_vec_info (stmt);
                  gsi_remove (&si, true);
                  continue;
                }
            }
          gsi_next (&si);
        }                       /* stmts in BB */
    }                           /* BBs in loop */

  slpeel_make_loop_iterate_ntimes (loop, ratio);
  mark_set_for_renaming (vect_memsyms_to_rename);

  /* The memory tags and pointers in vectorized statements need to
     have their SSA forms updated.  FIXME, why can't this be delayed
     until all the loops have been transformed?  */
  update_ssa (TODO_update_ssa);

  if (vect_print_dump_info (REPORT_VECTORIZED_LOOPS))
    fprintf (vect_dump, "LOOP VECTORIZED.");
  if (loop->inner && vect_print_dump_info (REPORT_VECTORIZED_LOOPS))
    fprintf (vect_dump, "OUTER LOOP VECTORIZED.");
}