1 /* Transformation Utilities for Loop Vectorization.
2 Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
5 This file is part of GCC.
7 GCC is free software; you can redistribute it and/or modify it under
8 the terms of the GNU General Public License as published by the Free
9 Software Foundation; either version 3, or (at your option) any later
12 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
13 WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
23 #include "coretypes.h"
29 #include "basic-block.h"
30 #include "diagnostic.h"
31 #include "tree-flow.h"
32 #include "tree-dump.h"
39 #include "tree-data-ref.h"
40 #include "tree-chrec.h"
41 #include "tree-scalar-evolution.h"
42 #include "tree-vectorizer.h"
43 #include "langhooks.h"
44 #include "tree-pass.h"
48 /* Utility functions for the code transformation. */
49 static bool vect_transform_stmt (gimple, gimple_stmt_iterator *, bool *,
50 slp_tree, slp_instance);
51 static tree vect_create_destination_var (tree, tree);
52 static tree vect_create_data_ref_ptr
53 (gimple, struct loop*, tree, tree *, gimple *, bool, bool *, tree);
54 static tree vect_create_addr_base_for_vector_ref
55 (gimple, gimple_seq *, tree, struct loop *);
56 static tree vect_get_new_vect_var (tree, enum vect_var_kind, const char *);
57 static tree vect_get_vec_def_for_operand (tree, gimple, tree *);
58 static tree vect_init_vector (gimple, tree, tree, gimple_stmt_iterator *);
59 static void vect_finish_stmt_generation
60 (gimple stmt, gimple vec_stmt, gimple_stmt_iterator *);
61 static bool vect_is_simple_cond (tree, loop_vec_info);
62 static void vect_create_epilog_for_reduction
63 (tree, gimple, int, enum tree_code, gimple);
64 static tree get_initial_def_for_reduction (gimple, tree, tree *);
66 /* Utility function dealing with loop peeling (not peeling itself). */
67 static void vect_generate_tmps_on_preheader
68 (loop_vec_info, tree *, tree *, tree *);
69 static tree vect_build_loop_niters (loop_vec_info);
70 static void vect_update_ivs_after_vectorizer (loop_vec_info, tree, edge);
71 static tree vect_gen_niters_for_prolog_loop (loop_vec_info, tree);
72 static void vect_update_init_of_dr (struct data_reference *, tree niters);
73 static void vect_update_inits_of_drs (loop_vec_info, tree);
74 static int vect_min_worthwhile_factor (enum tree_code);
78 cost_for_stmt (gimple stmt)
80 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
82 switch (STMT_VINFO_TYPE (stmt_info))
84 case load_vec_info_type:
85 return TARG_SCALAR_LOAD_COST;
86 case store_vec_info_type:
87 return TARG_SCALAR_STORE_COST;
88 case op_vec_info_type:
89 case condition_vec_info_type:
90 case assignment_vec_info_type:
91 case reduc_vec_info_type:
92 case induc_vec_info_type:
93 case type_promotion_vec_info_type:
94 case type_demotion_vec_info_type:
95 case type_conversion_vec_info_type:
96 case call_vec_info_type:
97 return TARG_SCALAR_STMT_COST;
98 case undef_vec_info_type:
105 /* Function vect_estimate_min_profitable_iters
107 Return the number of iterations required for the vector version of the
108 loop to be profitable relative to the cost of the scalar version of the
111 TODO: Take profile info into account before making vectorization
112 decisions, if available. */
115 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo)
118 int min_profitable_iters;
119 int peel_iters_prologue;
120 int peel_iters_epilogue;
121 int vec_inside_cost = 0;
122 int vec_outside_cost = 0;
123 int scalar_single_iter_cost = 0;
124 int scalar_outside_cost = 0;
125 bool runtime_test = false;
126 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
127 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
128 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
129 int nbbs = loop->num_nodes;
130 int byte_misalign = LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo);
131 int peel_guard_costs = 0;
132 int innerloop_iters = 0, factor;
133 VEC (slp_instance, heap) *slp_instances;
134 slp_instance instance;
136 /* Cost model disabled. */
137 if (!flag_vect_cost_model)
139 if (vect_print_dump_info (REPORT_COST))
140 fprintf (vect_dump, "cost model disabled.");
144 /* If the number of iterations is unknown, or the
145 peeling-for-misalignment amount is unknown, we will have to generate
146 a runtime test to test the loop count against the threshold. */
147 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
148 || (byte_misalign < 0))
151 /* Requires loop versioning tests to handle misalignment. */
153 if (VEC_length (gimple, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo)))
155 /* FIXME: Make cost depend on complexity of individual check. */
157 VEC_length (gimple, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));
158 if (vect_print_dump_info (REPORT_COST))
159 fprintf (vect_dump, "cost model: Adding cost of checks for loop "
160 "versioning to treat misalignment.\n");
163 if (VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
165 /* FIXME: Make cost depend on complexity of individual check. */
167 VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo));
168 if (vect_print_dump_info (REPORT_COST))
169 fprintf (vect_dump, "cost model: Adding cost of checks for loop "
170 "versioning aliasing.\n");
173 if (VEC_length (gimple, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo))
174 || VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
176 vec_outside_cost += TARG_COND_TAKEN_BRANCH_COST;
179 /* Count statements in scalar loop. Using this as scalar cost for a single
182 TODO: Add outer loop support.
184 TODO: Consider assigning different costs to different scalar
189 innerloop_iters = 50; /* FIXME */
191 for (i = 0; i < nbbs; i++)
193 gimple_stmt_iterator si;
194 basic_block bb = bbs[i];
196 if (bb->loop_father == loop->inner)
197 factor = innerloop_iters;
201 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
203 gimple stmt = gsi_stmt (si);
204 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
205 /* Skip stmts that are not vectorized inside the loop. */
206 if (!STMT_VINFO_RELEVANT_P (stmt_info)
207 && (!STMT_VINFO_LIVE_P (stmt_info)
208 || STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def))
210 scalar_single_iter_cost += cost_for_stmt (stmt) * factor;
211 vec_inside_cost += STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) * factor;
212 /* FIXME: for stmts in the inner-loop in outer-loop vectorization,
213 some of the "outside" costs are generated inside the outer-loop. */
214 vec_outside_cost += STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info);
218 /* Add additional cost for the peeled instructions in prologue and epilogue
221 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
222 at compile-time - we assume it's vf/2 (the worst would be vf-1).
224 TODO: Build an expression that represents peel_iters for prologue and
225 epilogue to be used in a run-time test. */
227 if (byte_misalign < 0)
229 peel_iters_prologue = vf/2;
230 if (vect_print_dump_info (REPORT_COST))
231 fprintf (vect_dump, "cost model: "
232 "prologue peel iters set to vf/2.");
234 /* If peeling for alignment is unknown, loop bound of main loop becomes
236 peel_iters_epilogue = vf/2;
237 if (vect_print_dump_info (REPORT_COST))
238 fprintf (vect_dump, "cost model: "
239 "epilogue peel iters set to vf/2 because "
240 "peeling for alignment is unknown .");
242 /* If peeled iterations are unknown, count a taken branch and a not taken
243 branch per peeled loop. Even if scalar loop iterations are known,
244 vector iterations are not known since peeled prologue iterations are
245 not known. Hence guards remain the same. */
246 peel_guard_costs += 2 * (TARG_COND_TAKEN_BRANCH_COST
247 + TARG_COND_NOT_TAKEN_BRANCH_COST);
254 struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
255 int element_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
256 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr)));
257 int nelements = TYPE_VECTOR_SUBPARTS (vectype);
259 peel_iters_prologue = nelements - (byte_misalign / element_size);
262 peel_iters_prologue = 0;
264 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
266 peel_iters_epilogue = vf/2;
267 if (vect_print_dump_info (REPORT_COST))
268 fprintf (vect_dump, "cost model: "
269 "epilogue peel iters set to vf/2 because "
270 "loop iterations are unknown .");
272 /* If peeled iterations are known but number of scalar loop
273 iterations are unknown, count a taken branch per peeled loop. */
274 peel_guard_costs += 2 * TARG_COND_TAKEN_BRANCH_COST;
279 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
280 peel_iters_prologue = niters < peel_iters_prologue ?
281 niters : peel_iters_prologue;
282 peel_iters_epilogue = (niters - peel_iters_prologue) % vf;
286 vec_outside_cost += (peel_iters_prologue * scalar_single_iter_cost)
287 + (peel_iters_epilogue * scalar_single_iter_cost)
290 /* FORNOW: The scalar outside cost is incremented in one of the
293 1. The vectorizer checks for alignment and aliasing and generates
294 a condition that allows dynamic vectorization. A cost model
295 check is ANDED with the versioning condition. Hence scalar code
296 path now has the added cost of the versioning check.
298 if (cost > th & versioning_check)
301 Hence run-time scalar is incremented by not-taken branch cost.
303 2. The vectorizer then checks if a prologue is required. If the
304 cost model check was not done before during versioning, it has to
305 be done before the prologue check.
308 prologue = scalar_iters
313 if (prologue == num_iters)
316 Hence the run-time scalar cost is incremented by a taken branch,
317 plus a not-taken branch, plus a taken branch cost.
319 3. The vectorizer then checks if an epilogue is required. If the
320 cost model check was not done before during prologue check, it
321 has to be done with the epilogue check.
327 if (prologue == num_iters)
330 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
333 Hence the run-time scalar cost should be incremented by 2 taken
336 TODO: The back end may reorder the BBS's differently and reverse
337 conditions/branch directions. Change the estimates below to
338 something more reasonable. */
342 /* Cost model check occurs at versioning. */
343 if (VEC_length (gimple, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo))
344 || VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
345 scalar_outside_cost += TARG_COND_NOT_TAKEN_BRANCH_COST;
348 /* Cost model occurs at prologue generation. */
349 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
350 scalar_outside_cost += 2 * TARG_COND_TAKEN_BRANCH_COST
351 + TARG_COND_NOT_TAKEN_BRANCH_COST;
352 /* Cost model check occurs at epilogue generation. */
354 scalar_outside_cost += 2 * TARG_COND_TAKEN_BRANCH_COST;
359 slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
360 for (i = 0; VEC_iterate (slp_instance, slp_instances, i, instance); i++)
362 vec_outside_cost += SLP_INSTANCE_OUTSIDE_OF_LOOP_COST (instance);
363 vec_inside_cost += SLP_INSTANCE_INSIDE_OF_LOOP_COST (instance);
366 /* Calculate number of iterations required to make the vector version
367 profitable, relative to the loop bodies only. The following condition
369 SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
371 SIC = scalar iteration cost, VIC = vector iteration cost,
372 VOC = vector outside cost, VF = vectorization factor,
373 PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
374 SOC = scalar outside cost for run time cost model check. */
376 if ((scalar_single_iter_cost * vf) > vec_inside_cost)
378 if (vec_outside_cost <= 0)
379 min_profitable_iters = 1;
382 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost) * vf
383 - vec_inside_cost * peel_iters_prologue
384 - vec_inside_cost * peel_iters_epilogue)
385 / ((scalar_single_iter_cost * vf)
388 if ((scalar_single_iter_cost * vf * min_profitable_iters)
389 <= ((vec_inside_cost * min_profitable_iters)
390 + ((vec_outside_cost - scalar_outside_cost) * vf)))
391 min_profitable_iters++;
394 /* vector version will never be profitable. */
397 if (vect_print_dump_info (REPORT_COST))
398 fprintf (vect_dump, "cost model: vector iteration cost = %d "
399 "is divisible by scalar iteration cost = %d by a factor "
400 "greater than or equal to the vectorization factor = %d .",
401 vec_inside_cost, scalar_single_iter_cost, vf);
405 if (vect_print_dump_info (REPORT_COST))
407 fprintf (vect_dump, "Cost model analysis: \n");
408 fprintf (vect_dump, " Vector inside of loop cost: %d\n",
410 fprintf (vect_dump, " Vector outside of loop cost: %d\n",
412 fprintf (vect_dump, " Scalar iteration cost: %d\n",
413 scalar_single_iter_cost);
414 fprintf (vect_dump, " Scalar outside cost: %d\n", scalar_outside_cost);
415 fprintf (vect_dump, " prologue iterations: %d\n",
416 peel_iters_prologue);
417 fprintf (vect_dump, " epilogue iterations: %d\n",
418 peel_iters_epilogue);
419 fprintf (vect_dump, " Calculated minimum iters for profitability: %d\n",
420 min_profitable_iters);
423 min_profitable_iters =
424 min_profitable_iters < vf ? vf : min_profitable_iters;
426 /* Because the condition we create is:
427 if (niters <= min_profitable_iters)
428 then skip the vectorized loop. */
429 min_profitable_iters--;
431 if (vect_print_dump_info (REPORT_COST))
432 fprintf (vect_dump, " Profitability threshold = %d\n",
433 min_profitable_iters);
435 return min_profitable_iters;
439 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
440 functions. Design better to avoid maintenance issues. */
442 /* Function vect_model_reduction_cost.
444 Models cost for a reduction operation, including the vector ops
445 generated within the strip-mine loop, the initial definition before
446 the loop, and the epilogue code that must be generated. */
449 vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
456 gimple stmt, orig_stmt;
458 enum machine_mode mode;
459 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
460 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
463 /* Cost of reduction op inside loop. */
464 STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) += ncopies * TARG_VEC_STMT_COST;
466 stmt = STMT_VINFO_STMT (stmt_info);
468 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
470 case GIMPLE_SINGLE_RHS:
471 gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt)) == ternary_op);
472 reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), 2);
474 case GIMPLE_UNARY_RHS:
475 reduction_op = gimple_assign_rhs1 (stmt);
477 case GIMPLE_BINARY_RHS:
478 reduction_op = gimple_assign_rhs2 (stmt);
484 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
487 if (vect_print_dump_info (REPORT_COST))
489 fprintf (vect_dump, "unsupported data-type ");
490 print_generic_expr (vect_dump, TREE_TYPE (reduction_op), TDF_SLIM);
495 mode = TYPE_MODE (vectype);
496 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
499 orig_stmt = STMT_VINFO_STMT (stmt_info);
501 code = gimple_assign_rhs_code (orig_stmt);
503 /* Add in cost for initial definition. */
504 outer_cost += TARG_SCALAR_TO_VEC_COST;
506 /* Determine cost of epilogue code.
508 We have a reduction operator that will reduce the vector in one statement.
509 Also requires scalar extract. */
511 if (!nested_in_vect_loop_p (loop, orig_stmt))
513 if (reduc_code < NUM_TREE_CODES)
514 outer_cost += TARG_VEC_STMT_COST + TARG_VEC_TO_SCALAR_COST;
517 int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
519 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
520 int element_bitsize = tree_low_cst (bitsize, 1);
521 int nelements = vec_size_in_bits / element_bitsize;
523 optab = optab_for_tree_code (code, vectype, optab_default);
525 /* We have a whole vector shift available. */
526 if (VECTOR_MODE_P (mode)
527 && optab_handler (optab, mode)->insn_code != CODE_FOR_nothing
528 && optab_handler (vec_shr_optab, mode)->insn_code != CODE_FOR_nothing)
529 /* Final reduction via vector shifts and the reduction operator. Also
530 requires scalar extract. */
531 outer_cost += ((exact_log2(nelements) * 2) * TARG_VEC_STMT_COST
532 + TARG_VEC_TO_SCALAR_COST);
534 /* Use extracts and reduction op for final reduction. For N elements,
535 we have N extracts and N-1 reduction ops. */
536 outer_cost += ((nelements + nelements - 1) * TARG_VEC_STMT_COST);
540 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info) = outer_cost;
542 if (vect_print_dump_info (REPORT_COST))
543 fprintf (vect_dump, "vect_model_reduction_cost: inside_cost = %d, "
544 "outside_cost = %d .", STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info),
545 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info));
551 /* Function vect_model_induction_cost.
553 Models cost for induction operations. */
556 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
558 /* loop cost for vec_loop. */
559 STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) = ncopies * TARG_VEC_STMT_COST;
560 /* prologue cost for vec_init and vec_step. */
561 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info) = 2 * TARG_SCALAR_TO_VEC_COST;
563 if (vect_print_dump_info (REPORT_COST))
564 fprintf (vect_dump, "vect_model_induction_cost: inside_cost = %d, "
565 "outside_cost = %d .", STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info),
566 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info));
570 /* Function vect_model_simple_cost.
572 Models cost for simple operations, i.e. those that only emit ncopies of a
573 single op. Right now, this does not account for multiple insns that could
574 be generated for the single vector op. We will handle that shortly. */
577 vect_model_simple_cost (stmt_vec_info stmt_info, int ncopies,
578 enum vect_def_type *dt, slp_tree slp_node)
581 int inside_cost = 0, outside_cost = 0;
583 /* The SLP costs were already calculated during SLP tree build. */
584 if (PURE_SLP_STMT (stmt_info))
587 inside_cost = ncopies * TARG_VEC_STMT_COST;
589 /* FORNOW: Assuming maximum 2 args per stmts. */
590 for (i = 0; i < 2; i++)
592 if (dt[i] == vect_constant_def || dt[i] == vect_invariant_def)
593 outside_cost += TARG_SCALAR_TO_VEC_COST;
596 if (vect_print_dump_info (REPORT_COST))
597 fprintf (vect_dump, "vect_model_simple_cost: inside_cost = %d, "
598 "outside_cost = %d .", inside_cost, outside_cost);
600 /* Set the costs either in STMT_INFO or SLP_NODE (if exists). */
601 stmt_vinfo_set_inside_of_loop_cost (stmt_info, slp_node, inside_cost);
602 stmt_vinfo_set_outside_of_loop_cost (stmt_info, slp_node, outside_cost);
606 /* Function vect_cost_strided_group_size
608 For strided load or store, return the group_size only if it is the first
609 load or store of a group, else return 1. This ensures that group size is
610 only returned once per group. */
613 vect_cost_strided_group_size (stmt_vec_info stmt_info)
615 gimple first_stmt = DR_GROUP_FIRST_DR (stmt_info);
617 if (first_stmt == STMT_VINFO_STMT (stmt_info))
618 return DR_GROUP_SIZE (stmt_info);
624 /* Function vect_model_store_cost
626 Models cost for stores. In the case of strided accesses, one access
627 has the overhead of the strided access attributed to it. */
630 vect_model_store_cost (stmt_vec_info stmt_info, int ncopies,
631 enum vect_def_type dt, slp_tree slp_node)
634 int inside_cost = 0, outside_cost = 0;
636 /* The SLP costs were already calculated during SLP tree build. */
637 if (PURE_SLP_STMT (stmt_info))
640 if (dt == vect_constant_def || dt == vect_invariant_def)
641 outside_cost = TARG_SCALAR_TO_VEC_COST;
643 /* Strided access? */
644 if (DR_GROUP_FIRST_DR (stmt_info) && !slp_node)
645 group_size = vect_cost_strided_group_size (stmt_info);
646 /* Not a strided access. */
650 /* Is this an access in a group of stores, which provide strided access?
651 If so, add in the cost of the permutes. */
654 /* Uses a high and low interleave operation for each needed permute. */
655 inside_cost = ncopies * exact_log2(group_size) * group_size
656 * TARG_VEC_STMT_COST;
658 if (vect_print_dump_info (REPORT_COST))
659 fprintf (vect_dump, "vect_model_store_cost: strided group_size = %d .",
664 /* Costs of the stores. */
665 inside_cost += ncopies * TARG_VEC_STORE_COST;
667 if (vect_print_dump_info (REPORT_COST))
668 fprintf (vect_dump, "vect_model_store_cost: inside_cost = %d, "
669 "outside_cost = %d .", inside_cost, outside_cost);
671 /* Set the costs either in STMT_INFO or SLP_NODE (if exists). */
672 stmt_vinfo_set_inside_of_loop_cost (stmt_info, slp_node, inside_cost);
673 stmt_vinfo_set_outside_of_loop_cost (stmt_info, slp_node, outside_cost);
677 /* Function vect_model_load_cost
679 Models cost for loads. In the case of strided accesses, the last access
680 has the overhead of the strided access attributed to it. Since unaligned
681 accesses are supported for loads, we also account for the costs of the
682 access scheme chosen. */
685 vect_model_load_cost (stmt_vec_info stmt_info, int ncopies, slp_tree slp_node)
689 int alignment_support_cheme;
691 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info), *first_dr;
692 int inside_cost = 0, outside_cost = 0;
694 /* The SLP costs were already calculated during SLP tree build. */
695 if (PURE_SLP_STMT (stmt_info))
698 /* Strided accesses? */
699 first_stmt = DR_GROUP_FIRST_DR (stmt_info);
700 if (first_stmt && !slp_node)
702 group_size = vect_cost_strided_group_size (stmt_info);
703 first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
705 /* Not a strided access. */
712 alignment_support_cheme = vect_supportable_dr_alignment (first_dr);
714 /* Is this an access in a group of loads providing strided access?
715 If so, add in the cost of the permutes. */
718 /* Uses an even and odd extract operations for each needed permute. */
719 inside_cost = ncopies * exact_log2(group_size) * group_size
720 * TARG_VEC_STMT_COST;
722 if (vect_print_dump_info (REPORT_COST))
723 fprintf (vect_dump, "vect_model_load_cost: strided group_size = %d .",
728 /* The loads themselves. */
729 switch (alignment_support_cheme)
733 inside_cost += ncopies * TARG_VEC_LOAD_COST;
735 if (vect_print_dump_info (REPORT_COST))
736 fprintf (vect_dump, "vect_model_load_cost: aligned.");
740 case dr_unaligned_supported:
742 /* Here, we assign an additional cost for the unaligned load. */
743 inside_cost += ncopies * TARG_VEC_UNALIGNED_LOAD_COST;
745 if (vect_print_dump_info (REPORT_COST))
746 fprintf (vect_dump, "vect_model_load_cost: unaligned supported by "
751 case dr_explicit_realign:
753 inside_cost += ncopies * (2*TARG_VEC_LOAD_COST + TARG_VEC_STMT_COST);
755 /* FIXME: If the misalignment remains fixed across the iterations of
756 the containing loop, the following cost should be added to the
758 if (targetm.vectorize.builtin_mask_for_load)
759 inside_cost += TARG_VEC_STMT_COST;
763 case dr_explicit_realign_optimized:
765 if (vect_print_dump_info (REPORT_COST))
766 fprintf (vect_dump, "vect_model_load_cost: unaligned software "
769 /* Unaligned software pipeline has a load of an address, an initial
770 load, and possibly a mask operation to "prime" the loop. However,
771 if this is an access in a group of loads, which provide strided
772 access, then the above cost should only be considered for one
773 access in the group. Inside the loop, there is a load op
774 and a realignment op. */
776 if ((!DR_GROUP_FIRST_DR (stmt_info)) || group_size > 1 || slp_node)
778 outside_cost = 2*TARG_VEC_STMT_COST;
779 if (targetm.vectorize.builtin_mask_for_load)
780 outside_cost += TARG_VEC_STMT_COST;
783 inside_cost += ncopies * (TARG_VEC_LOAD_COST + TARG_VEC_STMT_COST);
792 if (vect_print_dump_info (REPORT_COST))
793 fprintf (vect_dump, "vect_model_load_cost: inside_cost = %d, "
794 "outside_cost = %d .", inside_cost, outside_cost);
796 /* Set the costs either in STMT_INFO or SLP_NODE (if exists). */
797 stmt_vinfo_set_inside_of_loop_cost (stmt_info, slp_node, inside_cost);
798 stmt_vinfo_set_outside_of_loop_cost (stmt_info, slp_node, outside_cost);
802 /* Function vect_get_new_vect_var.
804 Returns a name for a new variable. The current naming scheme appends the
805 prefix "vect_" or "vect_p" (depending on the value of VAR_KIND) to
806 the name of vectorizer generated variables, and appends that to NAME if
810 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
817 case vect_simple_var:
820 case vect_scalar_var:
823 case vect_pointer_var:
832 char* tmp = concat (prefix, name, NULL);
833 new_vect_var = create_tmp_var (type, tmp);
837 new_vect_var = create_tmp_var (type, prefix);
839 /* Mark vector typed variable as a gimple register variable. */
840 if (TREE_CODE (type) == VECTOR_TYPE)
841 DECL_GIMPLE_REG_P (new_vect_var) = true;
847 /* Function vect_create_addr_base_for_vector_ref.
849 Create an expression that computes the address of the first memory location
850 that will be accessed for a data reference.
853 STMT: The statement containing the data reference.
854 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
855 OFFSET: Optional. If supplied, it is be added to the initial address.
856 LOOP: Specify relative to which loop-nest should the address be computed.
857 For example, when the dataref is in an inner-loop nested in an
858 outer-loop that is now being vectorized, LOOP can be either the
859 outer-loop, or the inner-loop. The first memory location accessed
860 by the following dataref ('in' points to short):
867 if LOOP=i_loop: &in (relative to i_loop)
868 if LOOP=j_loop: &in+i*2B (relative to j_loop)
871 1. Return an SSA_NAME whose value is the address of the memory location of
872 the first vector of the data reference.
873 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
874 these statement(s) which define the returned SSA_NAME.
876 FORNOW: We are only handling array accesses with step 1. */
879 vect_create_addr_base_for_vector_ref (gimple stmt,
880 gimple_seq *new_stmt_list,
884 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
885 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
886 struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
887 tree data_ref_base = unshare_expr (DR_BASE_ADDRESS (dr));
889 tree data_ref_base_var;
891 tree addr_base, addr_expr;
893 gimple_seq seq = NULL;
894 tree base_offset = unshare_expr (DR_OFFSET (dr));
895 tree init = unshare_expr (DR_INIT (dr));
896 tree vect_ptr_type, addr_expr2;
897 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
900 if (loop != containing_loop)
902 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
903 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
905 gcc_assert (nested_in_vect_loop_p (loop, stmt));
907 data_ref_base = unshare_expr (STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
908 base_offset = unshare_expr (STMT_VINFO_DR_OFFSET (stmt_info));
909 init = unshare_expr (STMT_VINFO_DR_INIT (stmt_info));
912 /* Create data_ref_base */
913 base_name = build_fold_indirect_ref (data_ref_base);
914 data_ref_base_var = create_tmp_var (TREE_TYPE (data_ref_base), "batmp");
915 add_referenced_var (data_ref_base_var);
916 data_ref_base = force_gimple_operand (data_ref_base, &seq, true,
918 gimple_seq_add_seq (new_stmt_list, seq);
920 /* Create base_offset */
921 base_offset = size_binop (PLUS_EXPR,
922 fold_convert (sizetype, base_offset),
923 fold_convert (sizetype, init));
924 dest = create_tmp_var (sizetype, "base_off");
925 add_referenced_var (dest);
926 base_offset = force_gimple_operand (base_offset, &seq, true, dest);
927 gimple_seq_add_seq (new_stmt_list, seq);
931 tree tmp = create_tmp_var (sizetype, "offset");
933 add_referenced_var (tmp);
934 offset = fold_build2 (MULT_EXPR, sizetype,
935 fold_convert (sizetype, offset), step);
936 base_offset = fold_build2 (PLUS_EXPR, sizetype,
937 base_offset, offset);
938 base_offset = force_gimple_operand (base_offset, &seq, false, tmp);
939 gimple_seq_add_seq (new_stmt_list, seq);
942 /* base + base_offset */
943 addr_base = fold_build2 (POINTER_PLUS_EXPR, TREE_TYPE (data_ref_base),
944 data_ref_base, base_offset);
946 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
948 /* addr_expr = addr_base */
949 addr_expr = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var,
950 get_name (base_name));
951 add_referenced_var (addr_expr);
952 vec_stmt = fold_convert (vect_ptr_type, addr_base);
953 addr_expr2 = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var,
954 get_name (base_name));
955 add_referenced_var (addr_expr2);
956 vec_stmt = force_gimple_operand (vec_stmt, &seq, false, addr_expr2);
957 gimple_seq_add_seq (new_stmt_list, seq);
959 if (vect_print_dump_info (REPORT_DETAILS))
961 fprintf (vect_dump, "created ");
962 print_generic_expr (vect_dump, vec_stmt, TDF_SLIM);
968 /* Function vect_create_data_ref_ptr.
970 Create a new pointer to vector type (vp), that points to the first location
971 accessed in the loop by STMT, along with the def-use update chain to
972 appropriately advance the pointer through the loop iterations. Also set
973 aliasing information for the pointer. This vector pointer is used by the
974 callers to this function to create a memory reference expression for vector
978 1. STMT: a stmt that references memory. Expected to be of the form
979 GIMPLE_ASSIGN <name, data-ref> or
980 GIMPLE_ASSIGN <data-ref, name>.
981 2. AT_LOOP: the loop where the vector memref is to be created.
982 3. OFFSET (optional): an offset to be added to the initial address accessed
983 by the data-ref in STMT.
984 4. ONLY_INIT: indicate if vp is to be updated in the loop, or remain
985 pointing to the initial address.
986 5. TYPE: if not NULL indicates the required type of the data-ref.
989 1. Declare a new ptr to vector_type, and have it point to the base of the
990 data reference (initial addressed accessed by the data reference).
991 For example, for vector of type V8HI, the following code is generated:
994 vp = (v8hi *)initial_address;
996 if OFFSET is not supplied:
997 initial_address = &a[init];
998 if OFFSET is supplied:
999 initial_address = &a[init + OFFSET];
1001 Return the initial_address in INITIAL_ADDRESS.
1003 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
1004 update the pointer in each iteration of the loop.
1006 Return the increment stmt that updates the pointer in PTR_INCR.
1008 3. Set INV_P to true if the access pattern of the data reference in the
1009 vectorized loop is invariant. Set it to false otherwise.
1011 4. Return the pointer. */
1014 vect_create_data_ref_ptr (gimple stmt, struct loop *at_loop,
1015 tree offset, tree *initial_address, gimple *ptr_incr,
1016 bool only_init, bool *inv_p, tree type)
1019 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1020 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1021 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1022 bool nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
1023 struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
1024 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1030 gimple_seq new_stmt_list = NULL;
1034 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
1036 gimple_stmt_iterator incr_gsi;
1038 tree indx_before_incr, indx_after_incr;
1042 /* Check the step (evolution) of the load in LOOP, and record
1043 whether it's invariant. */
1044 if (nested_in_vect_loop)
1045 step = STMT_VINFO_DR_STEP (stmt_info);
1047 step = DR_STEP (STMT_VINFO_DATA_REF (stmt_info));
1049 if (tree_int_cst_compare (step, size_zero_node) == 0)
1054 /* Create an expression for the first address accessed by this load
1056 base_name = build_fold_indirect_ref (unshare_expr (DR_BASE_ADDRESS (dr)));
1058 if (vect_print_dump_info (REPORT_DETAILS))
1060 tree data_ref_base = base_name;
1061 fprintf (vect_dump, "create vector-pointer variable to type: ");
1062 print_generic_expr (vect_dump, vectype, TDF_SLIM);
1063 if (TREE_CODE (data_ref_base) == VAR_DECL)
1064 fprintf (vect_dump, " vectorizing a one dimensional array ref: ");
1065 else if (TREE_CODE (data_ref_base) == ARRAY_REF)
1066 fprintf (vect_dump, " vectorizing a multidimensional array ref: ");
1067 else if (TREE_CODE (data_ref_base) == COMPONENT_REF)
1068 fprintf (vect_dump, " vectorizing a record based array ref: ");
1069 else if (TREE_CODE (data_ref_base) == SSA_NAME)
1070 fprintf (vect_dump, " vectorizing a pointer ref: ");
1071 print_generic_expr (vect_dump, base_name, TDF_SLIM);
1074 /** (1) Create the new vector-pointer variable: **/
1076 vect_ptr_type = build_pointer_type (type);
1078 vect_ptr_type = build_pointer_type (vectype);
1080 if (TREE_CODE (DR_BASE_ADDRESS (dr)) == SSA_NAME
1081 && TYPE_RESTRICT (TREE_TYPE (DR_BASE_ADDRESS (dr))))
1082 vect_ptr_type = build_qualified_type (vect_ptr_type, TYPE_QUAL_RESTRICT);
1083 vect_ptr = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var,
1084 get_name (base_name));
1085 if (TREE_CODE (DR_BASE_ADDRESS (dr)) == SSA_NAME
1086 && TYPE_RESTRICT (TREE_TYPE (DR_BASE_ADDRESS (dr))))
1088 get_alias_set (base_name);
1089 DECL_POINTER_ALIAS_SET (vect_ptr)
1090 = DECL_POINTER_ALIAS_SET (SSA_NAME_VAR (DR_BASE_ADDRESS (dr)));
1093 add_referenced_var (vect_ptr);
1095 /** (2) Add aliasing information to the new vector-pointer:
1096 (The points-to info (DR_PTR_INFO) may be defined later.) **/
1098 tag = DR_SYMBOL_TAG (dr);
1101 /* If tag is a variable (and NOT_A_TAG) than a new symbol memory
1102 tag must be created with tag added to its may alias list. */
1104 new_type_alias (vect_ptr, tag, DR_REF (dr));
1106 set_symbol_mem_tag (vect_ptr, tag);
1108 /** Note: If the dataref is in an inner-loop nested in LOOP, and we are
1109 vectorizing LOOP (i.e. outer-loop vectorization), we need to create two
1110 def-use update cycles for the pointer: One relative to the outer-loop
1111 (LOOP), which is what steps (3) and (4) below do. The other is relative
1112 to the inner-loop (which is the inner-most loop containing the dataref),
1113 and this is done be step (5) below.
1115 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
1116 inner-most loop, and so steps (3),(4) work the same, and step (5) is
1117 redundant. Steps (3),(4) create the following:
1120 LOOP: vp1 = phi(vp0,vp2)
1126 If there is an inner-loop nested in loop, then step (5) will also be
1127 applied, and an additional update in the inner-loop will be created:
1130 LOOP: vp1 = phi(vp0,vp2)
1132 inner: vp3 = phi(vp1,vp4)
1133 vp4 = vp3 + inner_step
1139 /** (3) Calculate the initial address the vector-pointer, and set
1140 the vector-pointer to point to it before the loop: **/
1142 /* Create: (&(base[init_val+offset]) in the loop preheader. */
1144 new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
1146 pe = loop_preheader_edge (loop);
1149 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
1150 gcc_assert (!new_bb);
1153 *initial_address = new_temp;
1155 /* Create: p = (vectype *) initial_base */
1156 vec_stmt = gimple_build_assign (vect_ptr,
1157 fold_convert (vect_ptr_type, new_temp));
1158 vect_ptr_init = make_ssa_name (vect_ptr, vec_stmt);
1159 gimple_assign_set_lhs (vec_stmt, vect_ptr_init);
1160 new_bb = gsi_insert_on_edge_immediate (pe, vec_stmt);
1161 gcc_assert (!new_bb);
1164 /** (4) Handle the updating of the vector-pointer inside the loop.
1165 This is needed when ONLY_INIT is false, and also when AT_LOOP
1166 is the inner-loop nested in LOOP (during outer-loop vectorization).
1169 if (only_init && at_loop == loop) /* No update in loop is required. */
1171 /* Copy the points-to information if it exists. */
1172 if (DR_PTR_INFO (dr))
1173 duplicate_ssa_name_ptr_info (vect_ptr_init, DR_PTR_INFO (dr));
1174 vptr = vect_ptr_init;
1178 /* The step of the vector pointer is the Vector Size. */
1179 tree step = TYPE_SIZE_UNIT (vectype);
1180 /* One exception to the above is when the scalar step of the load in
1181 LOOP is zero. In this case the step here is also zero. */
1183 step = size_zero_node;
1185 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
1187 create_iv (vect_ptr_init,
1188 fold_convert (vect_ptr_type, step),
1189 vect_ptr, loop, &incr_gsi, insert_after,
1190 &indx_before_incr, &indx_after_incr);
1191 incr = gsi_stmt (incr_gsi);
1192 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
1194 /* Copy the points-to information if it exists. */
1195 if (DR_PTR_INFO (dr))
1197 duplicate_ssa_name_ptr_info (indx_before_incr, DR_PTR_INFO (dr));
1198 duplicate_ssa_name_ptr_info (indx_after_incr, DR_PTR_INFO (dr));
1200 merge_alias_info (vect_ptr_init, indx_before_incr);
1201 merge_alias_info (vect_ptr_init, indx_after_incr);
1205 vptr = indx_before_incr;
1208 if (!nested_in_vect_loop || only_init)
1212 /** (5) Handle the updating of the vector-pointer inside the inner-loop
1213 nested in LOOP, if exists: **/
1215 gcc_assert (nested_in_vect_loop);
1218 standard_iv_increment_position (containing_loop, &incr_gsi,
1220 create_iv (vptr, fold_convert (vect_ptr_type, DR_STEP (dr)), vect_ptr,
1221 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
1223 incr = gsi_stmt (incr_gsi);
1224 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
1226 /* Copy the points-to information if it exists. */
1227 if (DR_PTR_INFO (dr))
1229 duplicate_ssa_name_ptr_info (indx_before_incr, DR_PTR_INFO (dr));
1230 duplicate_ssa_name_ptr_info (indx_after_incr, DR_PTR_INFO (dr));
1232 merge_alias_info (vect_ptr_init, indx_before_incr);
1233 merge_alias_info (vect_ptr_init, indx_after_incr);
1237 return indx_before_incr;
1244 /* Function bump_vector_ptr
1246 Increment a pointer (to a vector type) by vector-size. If requested,
1247 i.e. if PTR-INCR is given, then also connect the new increment stmt
1248 to the existing def-use update-chain of the pointer, by modifying
1249 the PTR_INCR as illustrated below:
1251 The pointer def-use update-chain before this function:
1252 DATAREF_PTR = phi (p_0, p_2)
1254 PTR_INCR: p_2 = DATAREF_PTR + step
1256 The pointer def-use update-chain after this function:
1257 DATAREF_PTR = phi (p_0, p_2)
1259 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
1261 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
1264 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
1266 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
1267 the loop. The increment amount across iterations is expected
1269 BSI - location where the new update stmt is to be placed.
1270 STMT - the original scalar memory-access stmt that is being vectorized.
1271 BUMP - optional. The offset by which to bump the pointer. If not given,
1272 the offset is assumed to be vector_size.
1274 Output: Return NEW_DATAREF_PTR as illustrated above.
1279 bump_vector_ptr (tree dataref_ptr, gimple ptr_incr, gimple_stmt_iterator *gsi,
1280 gimple stmt, tree bump)
1282 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1283 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
1284 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1285 tree ptr_var = SSA_NAME_VAR (dataref_ptr);
1286 tree update = TYPE_SIZE_UNIT (vectype);
1289 use_operand_p use_p;
1290 tree new_dataref_ptr;
1295 incr_stmt = gimple_build_assign_with_ops (POINTER_PLUS_EXPR, ptr_var,
1296 dataref_ptr, update);
1297 new_dataref_ptr = make_ssa_name (ptr_var, incr_stmt);
1298 gimple_assign_set_lhs (incr_stmt, new_dataref_ptr);
1299 vect_finish_stmt_generation (stmt, incr_stmt, gsi);
1301 /* Copy the points-to information if it exists. */
1302 if (DR_PTR_INFO (dr))
1303 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
1304 merge_alias_info (new_dataref_ptr, dataref_ptr);
1307 return new_dataref_ptr;
1309 /* Update the vector-pointer's cross-iteration increment. */
1310 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
1312 tree use = USE_FROM_PTR (use_p);
1314 if (use == dataref_ptr)
1315 SET_USE (use_p, new_dataref_ptr);
1317 gcc_assert (tree_int_cst_compare (use, update) == 0);
1320 return new_dataref_ptr;
1324 /* Function vect_create_destination_var.
1326 Create a new temporary of type VECTYPE. */
1329 vect_create_destination_var (tree scalar_dest, tree vectype)
1332 const char *new_name;
1334 enum vect_var_kind kind;
1336 kind = vectype ? vect_simple_var : vect_scalar_var;
1337 type = vectype ? vectype : TREE_TYPE (scalar_dest);
1339 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
1341 new_name = get_name (scalar_dest);
1344 vec_dest = vect_get_new_vect_var (type, kind, new_name);
1345 add_referenced_var (vec_dest);
1351 /* Function vect_init_vector.
1353 Insert a new stmt (INIT_STMT) that initializes a new vector variable with
1354 the vector elements of VECTOR_VAR. Place the initialization at BSI if it
1355 is not NULL. Otherwise, place the initialization at the loop preheader.
1356 Return the DEF of INIT_STMT.
1357 It will be used in the vectorization of STMT. */
1360 vect_init_vector (gimple stmt, tree vector_var, tree vector_type,
1361 gimple_stmt_iterator *gsi)
1363 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
1371 new_var = vect_get_new_vect_var (vector_type, vect_simple_var, "cst_");
1372 add_referenced_var (new_var);
1373 init_stmt = gimple_build_assign (new_var, vector_var);
1374 new_temp = make_ssa_name (new_var, init_stmt);
1375 gimple_assign_set_lhs (init_stmt, new_temp);
1378 vect_finish_stmt_generation (stmt, init_stmt, gsi);
1381 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
1382 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1384 if (nested_in_vect_loop_p (loop, stmt))
1386 pe = loop_preheader_edge (loop);
1387 new_bb = gsi_insert_on_edge_immediate (pe, init_stmt);
1388 gcc_assert (!new_bb);
1391 if (vect_print_dump_info (REPORT_DETAILS))
1393 fprintf (vect_dump, "created new init_stmt: ");
1394 print_gimple_stmt (vect_dump, init_stmt, 0, TDF_SLIM);
1397 vec_oprnd = gimple_assign_lhs (init_stmt);
1402 /* For constant and loop invariant defs of SLP_NODE this function returns
1403 (vector) defs (VEC_OPRNDS) that will be used in the vectorized stmts.
1404 OP_NUM determines if we gather defs for operand 0 or operand 1 of the scalar
1405 stmts. NUMBER_OF_VECTORS is the number of vector defs to create. */
1408 vect_get_constant_vectors (slp_tree slp_node, VEC(tree,heap) **vec_oprnds,
1409 unsigned int op_num, unsigned int number_of_vectors)
1411 VEC (gimple, heap) *stmts = SLP_TREE_SCALAR_STMTS (slp_node);
1412 gimple stmt = VEC_index (gimple, stmts, 0);
1413 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
1414 tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
1418 int j, number_of_places_left_in_vector;
1421 int group_size = VEC_length (gimple, stmts);
1422 unsigned int vec_num, i;
1423 int number_of_copies = 1;
1424 VEC (tree, heap) *voprnds = VEC_alloc (tree, heap, number_of_vectors);
1425 bool constant_p, is_store;
1427 if (STMT_VINFO_DATA_REF (stmt_vinfo))
1430 op = gimple_assign_rhs1 (stmt);
1435 op = gimple_op (stmt, op_num + 1);
1438 if (CONSTANT_CLASS_P (op))
1440 vector_type = vectype;
1445 vector_type = get_vectype_for_scalar_type (TREE_TYPE (op));
1446 gcc_assert (vector_type);
1450 nunits = TYPE_VECTOR_SUBPARTS (vector_type);
1452 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
1453 created vectors. It is greater than 1 if unrolling is performed.
1455 For example, we have two scalar operands, s1 and s2 (e.g., group of
1456 strided accesses of size two), while NUNITS is four (i.e., four scalars
1457 of this type can be packed in a vector). The output vector will contain
1458 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
1461 If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
1462 containing the operands.
1464 For example, NUNITS is four as before, and the group size is 8
1465 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
1466 {s5, s6, s7, s8}. */
1468 number_of_copies = least_common_multiple (nunits, group_size) / group_size;
1470 number_of_places_left_in_vector = nunits;
1471 for (j = 0; j < number_of_copies; j++)
1473 for (i = group_size - 1; VEC_iterate (gimple, stmts, i, stmt); i--)
1476 op = gimple_assign_rhs1 (stmt);
1478 op = gimple_op (stmt, op_num + 1);
1480 /* Create 'vect_ = {op0,op1,...,opn}'. */
1481 t = tree_cons (NULL_TREE, op, t);
1483 number_of_places_left_in_vector--;
1485 if (number_of_places_left_in_vector == 0)
1487 number_of_places_left_in_vector = nunits;
1490 vec_cst = build_vector (vector_type, t);
1492 vec_cst = build_constructor_from_list (vector_type, t);
1493 VEC_quick_push (tree, voprnds,
1494 vect_init_vector (stmt, vec_cst, vector_type, NULL));
1500 /* Since the vectors are created in the reverse order, we should invert
1502 vec_num = VEC_length (tree, voprnds);
1503 for (j = vec_num - 1; j >= 0; j--)
1505 vop = VEC_index (tree, voprnds, j);
1506 VEC_quick_push (tree, *vec_oprnds, vop);
1509 VEC_free (tree, heap, voprnds);
1511 /* In case that VF is greater than the unrolling factor needed for the SLP
1512 group of stmts, NUMBER_OF_VECTORS to be created is greater than
1513 NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
1514 to replicate the vectors. */
1515 while (number_of_vectors > VEC_length (tree, *vec_oprnds))
1517 for (i = 0; VEC_iterate (tree, *vec_oprnds, i, vop) && i < vec_num; i++)
1518 VEC_quick_push (tree, *vec_oprnds, vop);
1523 /* Get vectorized definitions from SLP_NODE that contains corresponding
1524 vectorized def-stmts. */
1527 vect_get_slp_vect_defs (slp_tree slp_node, VEC (tree,heap) **vec_oprnds)
1530 gimple vec_def_stmt;
1533 gcc_assert (SLP_TREE_VEC_STMTS (slp_node));
1536 VEC_iterate (gimple, SLP_TREE_VEC_STMTS (slp_node), i, vec_def_stmt);
1539 gcc_assert (vec_def_stmt);
1540 vec_oprnd = gimple_get_lhs (vec_def_stmt);
1541 VEC_quick_push (tree, *vec_oprnds, vec_oprnd);
1546 /* Get vectorized definitions for SLP_NODE.
1547 If the scalar definitions are loop invariants or constants, collect them and
1548 call vect_get_constant_vectors() to create vector stmts.
1549 Otherwise, the def-stmts must be already vectorized and the vectorized stmts
1550 must be stored in the LEFT/RIGHT node of SLP_NODE, and we call
1551 vect_get_slp_vect_defs() to retrieve them.
1552 If VEC_OPRNDS1 is NULL, don't get vector defs for the second operand (from
1553 the right node. This is used when the second operand must remain scalar. */
1556 vect_get_slp_defs (slp_tree slp_node, VEC (tree,heap) **vec_oprnds0,
1557 VEC (tree,heap) **vec_oprnds1)
1560 enum tree_code code;
1561 int number_of_vects;
1562 HOST_WIDE_INT lhs_size_unit, rhs_size_unit;
1564 first_stmt = VEC_index (gimple, SLP_TREE_SCALAR_STMTS (slp_node), 0);
1565 /* The number of vector defs is determined by the number of vector statements
1566 in the node from which we get those statements. */
1567 if (SLP_TREE_LEFT (slp_node))
1568 number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_LEFT (slp_node));
1571 number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
1572 /* Number of vector stmts was calculated according to LHS in
1573 vect_schedule_slp_instance(), fix it by replacing LHS with RHS, if
1574 necessary. See vect_get_smallest_scalar_type() for details. */
1575 vect_get_smallest_scalar_type (first_stmt, &lhs_size_unit,
1577 if (rhs_size_unit != lhs_size_unit)
1579 number_of_vects *= rhs_size_unit;
1580 number_of_vects /= lhs_size_unit;
1584 /* Allocate memory for vectorized defs. */
1585 *vec_oprnds0 = VEC_alloc (tree, heap, number_of_vects);
1587 /* SLP_NODE corresponds either to a group of stores or to a group of
1588 unary/binary operations. We don't call this function for loads. */
1589 if (SLP_TREE_LEFT (slp_node))
1590 /* The defs are already vectorized. */
1591 vect_get_slp_vect_defs (SLP_TREE_LEFT (slp_node), vec_oprnds0);
1593 /* Build vectors from scalar defs. */
1594 vect_get_constant_vectors (slp_node, vec_oprnds0, 0, number_of_vects);
1596 if (STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt)))
1597 /* Since we don't call this function with loads, this is a group of
1601 code = gimple_assign_rhs_code (first_stmt);
1602 if (get_gimple_rhs_class (code) != GIMPLE_BINARY_RHS || !vec_oprnds1)
1605 /* The number of vector defs is determined by the number of vector statements
1606 in the node from which we get those statements. */
1607 if (SLP_TREE_RIGHT (slp_node))
1608 number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_RIGHT (slp_node));
1610 number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
1612 *vec_oprnds1 = VEC_alloc (tree, heap, number_of_vects);
1614 if (SLP_TREE_RIGHT (slp_node))
1615 /* The defs are already vectorized. */
1616 vect_get_slp_vect_defs (SLP_TREE_RIGHT (slp_node), vec_oprnds1);
1618 /* Build vectors from scalar defs. */
1619 vect_get_constant_vectors (slp_node, vec_oprnds1, 1, number_of_vects);
1623 /* Function get_initial_def_for_induction
1626 STMT - a stmt that performs an induction operation in the loop.
1627 IV_PHI - the initial value of the induction variable
1630 Return a vector variable, initialized with the first VF values of
1631 the induction variable. E.g., for an iv with IV_PHI='X' and
1632 evolution S, for a vector of 4 units, we want to return:
1633 [X, X + S, X + 2*S, X + 3*S]. */
1636 get_initial_def_for_induction (gimple iv_phi)
1638 stmt_vec_info stmt_vinfo = vinfo_for_stmt (iv_phi);
1639 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
1640 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1641 tree scalar_type = TREE_TYPE (gimple_phi_result (iv_phi));
1644 edge pe = loop_preheader_edge (loop);
1645 struct loop *iv_loop;
1647 tree vec, vec_init, vec_step, t;
1651 gimple init_stmt, induction_phi, new_stmt;
1652 tree induc_def, vec_def, vec_dest;
1653 tree init_expr, step_expr;
1654 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1659 stmt_vec_info phi_info = vinfo_for_stmt (iv_phi);
1660 bool nested_in_vect_loop = false;
1661 gimple_seq stmts = NULL;
1662 imm_use_iterator imm_iter;
1663 use_operand_p use_p;
1667 gimple_stmt_iterator si;
1668 basic_block bb = gimple_bb (iv_phi);
1670 vectype = get_vectype_for_scalar_type (scalar_type);
1671 gcc_assert (vectype);
1672 nunits = TYPE_VECTOR_SUBPARTS (vectype);
1673 ncopies = vf / nunits;
1675 gcc_assert (phi_info);
1676 gcc_assert (ncopies >= 1);
1678 /* Find the first insertion point in the BB. */
1679 si = gsi_after_labels (bb);
1681 if (INTEGRAL_TYPE_P (scalar_type) || POINTER_TYPE_P (scalar_type))
1682 step_expr = build_int_cst (scalar_type, 0);
1684 step_expr = build_real (scalar_type, dconst0);
1686 /* Is phi in an inner-loop, while vectorizing an enclosing outer-loop? */
1687 if (nested_in_vect_loop_p (loop, iv_phi))
1689 nested_in_vect_loop = true;
1690 iv_loop = loop->inner;
1694 gcc_assert (iv_loop == (gimple_bb (iv_phi))->loop_father);
1696 latch_e = loop_latch_edge (iv_loop);
1697 loop_arg = PHI_ARG_DEF_FROM_EDGE (iv_phi, latch_e);
1699 access_fn = analyze_scalar_evolution (iv_loop, PHI_RESULT (iv_phi));
1700 gcc_assert (access_fn);
1701 ok = vect_is_simple_iv_evolution (iv_loop->num, access_fn,
1702 &init_expr, &step_expr);
1704 pe = loop_preheader_edge (iv_loop);
1706 /* Create the vector that holds the initial_value of the induction. */
1707 if (nested_in_vect_loop)
1709 /* iv_loop is nested in the loop to be vectorized. init_expr had already
1710 been created during vectorization of previous stmts; We obtain it from
1711 the STMT_VINFO_VEC_STMT of the defining stmt. */
1712 tree iv_def = PHI_ARG_DEF_FROM_EDGE (iv_phi, loop_preheader_edge (iv_loop));
1713 vec_init = vect_get_vec_def_for_operand (iv_def, iv_phi, NULL);
1717 /* iv_loop is the loop to be vectorized. Create:
1718 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
1719 new_var = vect_get_new_vect_var (scalar_type, vect_scalar_var, "var_");
1720 add_referenced_var (new_var);
1722 new_name = force_gimple_operand (init_expr, &stmts, false, new_var);
1725 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
1726 gcc_assert (!new_bb);
1730 t = tree_cons (NULL_TREE, init_expr, t);
1731 for (i = 1; i < nunits; i++)
1733 /* Create: new_name_i = new_name + step_expr */
1734 enum tree_code code = POINTER_TYPE_P (scalar_type)
1735 ? POINTER_PLUS_EXPR : PLUS_EXPR;
1736 init_stmt = gimple_build_assign_with_ops (code, new_var,
1737 new_name, step_expr);
1738 new_name = make_ssa_name (new_var, init_stmt);
1739 gimple_assign_set_lhs (init_stmt, new_name);
1741 new_bb = gsi_insert_on_edge_immediate (pe, init_stmt);
1742 gcc_assert (!new_bb);
1744 if (vect_print_dump_info (REPORT_DETAILS))
1746 fprintf (vect_dump, "created new init_stmt: ");
1747 print_gimple_stmt (vect_dump, init_stmt, 0, TDF_SLIM);
1749 t = tree_cons (NULL_TREE, new_name, t);
1751 /* Create a vector from [new_name_0, new_name_1, ..., new_name_nunits-1] */
1752 vec = build_constructor_from_list (vectype, nreverse (t));
1753 vec_init = vect_init_vector (iv_phi, vec, vectype, NULL);
1757 /* Create the vector that holds the step of the induction. */
1758 if (nested_in_vect_loop)
1759 /* iv_loop is nested in the loop to be vectorized. Generate:
1760 vec_step = [S, S, S, S] */
1761 new_name = step_expr;
1764 /* iv_loop is the loop to be vectorized. Generate:
1765 vec_step = [VF*S, VF*S, VF*S, VF*S] */
1766 expr = build_int_cst (scalar_type, vf);
1767 new_name = fold_build2 (MULT_EXPR, scalar_type, expr, step_expr);
1771 for (i = 0; i < nunits; i++)
1772 t = tree_cons (NULL_TREE, unshare_expr (new_name), t);
1773 gcc_assert (CONSTANT_CLASS_P (new_name));
1774 vec = build_vector (vectype, t);
1775 vec_step = vect_init_vector (iv_phi, vec, vectype, NULL);
1778 /* Create the following def-use cycle:
1783 vec_iv = PHI <vec_init, vec_loop>
1787 vec_loop = vec_iv + vec_step; */
1789 /* Create the induction-phi that defines the induction-operand. */
1790 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
1791 add_referenced_var (vec_dest);
1792 induction_phi = create_phi_node (vec_dest, iv_loop->header);
1793 set_vinfo_for_stmt (induction_phi,
1794 new_stmt_vec_info (induction_phi, loop_vinfo));
1795 induc_def = PHI_RESULT (induction_phi);
1797 /* Create the iv update inside the loop */
1798 new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, vec_dest,
1799 induc_def, vec_step);
1800 vec_def = make_ssa_name (vec_dest, new_stmt);
1801 gimple_assign_set_lhs (new_stmt, vec_def);
1802 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
1803 set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo));
1805 /* Set the arguments of the phi node: */
1806 add_phi_arg (induction_phi, vec_init, pe);
1807 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop));
1810 /* In case that vectorization factor (VF) is bigger than the number
1811 of elements that we can fit in a vectype (nunits), we have to generate
1812 more than one vector stmt - i.e - we need to "unroll" the
1813 vector stmt by a factor VF/nunits. For more details see documentation
1814 in vectorizable_operation. */
1818 stmt_vec_info prev_stmt_vinfo;
1819 /* FORNOW. This restriction should be relaxed. */
1820 gcc_assert (!nested_in_vect_loop);
1822 /* Create the vector that holds the step of the induction. */
1823 expr = build_int_cst (scalar_type, nunits);
1824 new_name = fold_build2 (MULT_EXPR, scalar_type, expr, step_expr);
1826 for (i = 0; i < nunits; i++)
1827 t = tree_cons (NULL_TREE, unshare_expr (new_name), t);
1828 gcc_assert (CONSTANT_CLASS_P (new_name));
1829 vec = build_vector (vectype, t);
1830 vec_step = vect_init_vector (iv_phi, vec, vectype, NULL);
1832 vec_def = induc_def;
1833 prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
1834 for (i = 1; i < ncopies; i++)
1836 /* vec_i = vec_prev + vec_step */
1837 new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, vec_dest,
1839 vec_def = make_ssa_name (vec_dest, new_stmt);
1840 gimple_assign_set_lhs (new_stmt, vec_def);
1842 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
1843 set_vinfo_for_stmt (new_stmt,
1844 new_stmt_vec_info (new_stmt, loop_vinfo));
1845 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
1846 prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
1850 if (nested_in_vect_loop)
1852 /* Find the loop-closed exit-phi of the induction, and record
1853 the final vector of induction results: */
1855 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
1857 if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (USE_STMT (use_p))))
1859 exit_phi = USE_STMT (use_p);
1865 stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
1866 /* FORNOW. Currently not supporting the case that an inner-loop induction
1867 is not used in the outer-loop (i.e. only outside the outer-loop). */
1868 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
1869 && !STMT_VINFO_LIVE_P (stmt_vinfo));
1871 STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
1872 if (vect_print_dump_info (REPORT_DETAILS))
1874 fprintf (vect_dump, "vector of inductions after inner-loop:");
1875 print_gimple_stmt (vect_dump, new_stmt, 0, TDF_SLIM);
1881 if (vect_print_dump_info (REPORT_DETAILS))
1883 fprintf (vect_dump, "transform induction: created def-use cycle: ");
1884 print_gimple_stmt (vect_dump, induction_phi, 0, TDF_SLIM);
1885 fprintf (vect_dump, "\n");
1886 print_gimple_stmt (vect_dump, SSA_NAME_DEF_STMT (vec_def), 0, TDF_SLIM);
1889 STMT_VINFO_VEC_STMT (phi_info) = induction_phi;
1894 /* Function vect_get_vec_def_for_operand.
1896 OP is an operand in STMT. This function returns a (vector) def that will be
1897 used in the vectorized stmt for STMT.
1899 In the case that OP is an SSA_NAME which is defined in the loop, then
1900 STMT_VINFO_VEC_STMT of the defining stmt holds the relevant def.
1902 In case OP is an invariant or constant, a new stmt that creates a vector def
1903 needs to be introduced. */
1906 vect_get_vec_def_for_operand (tree op, gimple stmt, tree *scalar_def)
1911 stmt_vec_info def_stmt_info = NULL;
1912 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
1913 tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
1914 unsigned int nunits = TYPE_VECTOR_SUBPARTS (vectype);
1915 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
1921 enum vect_def_type dt;
1925 if (vect_print_dump_info (REPORT_DETAILS))
1927 fprintf (vect_dump, "vect_get_vec_def_for_operand: ");
1928 print_generic_expr (vect_dump, op, TDF_SLIM);
1931 is_simple_use = vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt);
1932 gcc_assert (is_simple_use);
1933 if (vect_print_dump_info (REPORT_DETAILS))
1937 fprintf (vect_dump, "def = ");
1938 print_generic_expr (vect_dump, def, TDF_SLIM);
1942 fprintf (vect_dump, " def_stmt = ");
1943 print_gimple_stmt (vect_dump, def_stmt, 0, TDF_SLIM);
1949 /* Case 1: operand is a constant. */
1950 case vect_constant_def:
1955 /* Create 'vect_cst_ = {cst,cst,...,cst}' */
1956 if (vect_print_dump_info (REPORT_DETAILS))
1957 fprintf (vect_dump, "Create vector_cst. nunits = %d", nunits);
1959 for (i = nunits - 1; i >= 0; --i)
1961 t = tree_cons (NULL_TREE, op, t);
1963 vec_cst = build_vector (vectype, t);
1964 return vect_init_vector (stmt, vec_cst, vectype, NULL);
1967 /* Case 2: operand is defined outside the loop - loop invariant. */
1968 case vect_invariant_def:
1970 vector_type = get_vectype_for_scalar_type (TREE_TYPE (def));
1971 gcc_assert (vector_type);
1972 nunits = TYPE_VECTOR_SUBPARTS (vector_type);
1977 /* Create 'vec_inv = {inv,inv,..,inv}' */
1978 if (vect_print_dump_info (REPORT_DETAILS))
1979 fprintf (vect_dump, "Create vector_inv.");
1981 for (i = nunits - 1; i >= 0; --i)
1983 t = tree_cons (NULL_TREE, def, t);
1986 /* FIXME: use build_constructor directly. */
1987 vec_inv = build_constructor_from_list (vector_type, t);
1988 return vect_init_vector (stmt, vec_inv, vector_type, NULL);
1991 /* Case 3: operand is defined inside the loop. */
1995 *scalar_def = NULL/* FIXME tuples: def_stmt*/;
1997 /* Get the def from the vectorized stmt. */
1998 def_stmt_info = vinfo_for_stmt (def_stmt);
1999 vec_stmt = STMT_VINFO_VEC_STMT (def_stmt_info);
2000 gcc_assert (vec_stmt);
2001 if (gimple_code (vec_stmt) == GIMPLE_PHI)
2002 vec_oprnd = PHI_RESULT (vec_stmt);
2003 else if (is_gimple_call (vec_stmt))
2004 vec_oprnd = gimple_call_lhs (vec_stmt);
2006 vec_oprnd = gimple_assign_lhs (vec_stmt);
2010 /* Case 4: operand is defined by a loop header phi - reduction */
2011 case vect_reduction_def:
2015 gcc_assert (gimple_code (def_stmt) == GIMPLE_PHI);
2016 loop = (gimple_bb (def_stmt))->loop_father;
2018 /* Get the def before the loop */
2019 op = PHI_ARG_DEF_FROM_EDGE (def_stmt, loop_preheader_edge (loop));
2020 return get_initial_def_for_reduction (stmt, op, scalar_def);
2023 /* Case 5: operand is defined by loop-header phi - induction. */
2024 case vect_induction_def:
2026 gcc_assert (gimple_code (def_stmt) == GIMPLE_PHI);
2028 /* Get the def from the vectorized stmt. */
2029 def_stmt_info = vinfo_for_stmt (def_stmt);
2030 vec_stmt = STMT_VINFO_VEC_STMT (def_stmt_info);
2031 gcc_assert (vec_stmt && gimple_code (vec_stmt) == GIMPLE_PHI);
2032 vec_oprnd = PHI_RESULT (vec_stmt);
2042 /* Function vect_get_vec_def_for_stmt_copy
2044 Return a vector-def for an operand. This function is used when the
2045 vectorized stmt to be created (by the caller to this function) is a "copy"
2046 created in case the vectorized result cannot fit in one vector, and several
2047 copies of the vector-stmt are required. In this case the vector-def is
2048 retrieved from the vector stmt recorded in the STMT_VINFO_RELATED_STMT field
2049 of the stmt that defines VEC_OPRND.
2050 DT is the type of the vector def VEC_OPRND.
2053 In case the vectorization factor (VF) is bigger than the number
2054 of elements that can fit in a vectype (nunits), we have to generate
2055 more than one vector stmt to vectorize the scalar stmt. This situation
2056 arises when there are multiple data-types operated upon in the loop; the
2057 smallest data-type determines the VF, and as a result, when vectorizing
2058 stmts operating on wider types we need to create 'VF/nunits' "copies" of the
2059 vector stmt (each computing a vector of 'nunits' results, and together
2060 computing 'VF' results in each iteration). This function is called when
2061 vectorizing such a stmt (e.g. vectorizing S2 in the illustration below, in
2062 which VF=16 and nunits=4, so the number of copies required is 4):
2064 scalar stmt: vectorized into: STMT_VINFO_RELATED_STMT
2066 S1: x = load VS1.0: vx.0 = memref0 VS1.1
2067 VS1.1: vx.1 = memref1 VS1.2
2068 VS1.2: vx.2 = memref2 VS1.3
2069 VS1.3: vx.3 = memref3
2071 S2: z = x + ... VSnew.0: vz0 = vx.0 + ... VSnew.1
2072 VSnew.1: vz1 = vx.1 + ... VSnew.2
2073 VSnew.2: vz2 = vx.2 + ... VSnew.3
2074 VSnew.3: vz3 = vx.3 + ...
2076 The vectorization of S1 is explained in vectorizable_load.
2077 The vectorization of S2:
2078 To create the first vector-stmt out of the 4 copies - VSnew.0 -
2079 the function 'vect_get_vec_def_for_operand' is called to
2080 get the relevant vector-def for each operand of S2. For operand x it
2081 returns the vector-def 'vx.0'.
2083 To create the remaining copies of the vector-stmt (VSnew.j), this
2084 function is called to get the relevant vector-def for each operand. It is
2085 obtained from the respective VS1.j stmt, which is recorded in the
2086 STMT_VINFO_RELATED_STMT field of the stmt that defines VEC_OPRND.
2088 For example, to obtain the vector-def 'vx.1' in order to create the
2089 vector stmt 'VSnew.1', this function is called with VEC_OPRND='vx.0'.
2090 Given 'vx0' we obtain the stmt that defines it ('VS1.0'); from the
2091 STMT_VINFO_RELATED_STMT field of 'VS1.0' we obtain the next copy - 'VS1.1',
2092 and return its def ('vx.1').
2093 Overall, to create the above sequence this function will be called 3 times:
2094 vx.1 = vect_get_vec_def_for_stmt_copy (dt, vx.0);
2095 vx.2 = vect_get_vec_def_for_stmt_copy (dt, vx.1);
2096 vx.3 = vect_get_vec_def_for_stmt_copy (dt, vx.2); */
2099 vect_get_vec_def_for_stmt_copy (enum vect_def_type dt, tree vec_oprnd)
2101 gimple vec_stmt_for_operand;
2102 stmt_vec_info def_stmt_info;
2104 /* Do nothing; can reuse same def. */
2105 if (dt == vect_invariant_def || dt == vect_constant_def )
2108 vec_stmt_for_operand = SSA_NAME_DEF_STMT (vec_oprnd);
2109 def_stmt_info = vinfo_for_stmt (vec_stmt_for_operand);
2110 gcc_assert (def_stmt_info);
2111 vec_stmt_for_operand = STMT_VINFO_RELATED_STMT (def_stmt_info);
2112 gcc_assert (vec_stmt_for_operand);
2113 vec_oprnd = gimple_get_lhs (vec_stmt_for_operand);
2114 if (gimple_code (vec_stmt_for_operand) == GIMPLE_PHI)
2115 vec_oprnd = PHI_RESULT (vec_stmt_for_operand);
2117 vec_oprnd = gimple_get_lhs (vec_stmt_for_operand);
2122 /* Get vectorized definitions for the operands to create a copy of an original
2123 stmt. See vect_get_vec_def_for_stmt_copy() for details. */
2126 vect_get_vec_defs_for_stmt_copy (enum vect_def_type *dt,
2127 VEC(tree,heap) **vec_oprnds0,
2128 VEC(tree,heap) **vec_oprnds1)
2130 tree vec_oprnd = VEC_pop (tree, *vec_oprnds0);
2132 vec_oprnd = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd);
2133 VEC_quick_push (tree, *vec_oprnds0, vec_oprnd);
2135 if (vec_oprnds1 && *vec_oprnds1)
2137 vec_oprnd = VEC_pop (tree, *vec_oprnds1);
2138 vec_oprnd = vect_get_vec_def_for_stmt_copy (dt[1], vec_oprnd);
2139 VEC_quick_push (tree, *vec_oprnds1, vec_oprnd);
2144 /* Get vectorized definitions for OP0 and OP1, or SLP_NODE if it is not NULL. */
2147 vect_get_vec_defs (tree op0, tree op1, gimple stmt,
2148 VEC(tree,heap) **vec_oprnds0, VEC(tree,heap) **vec_oprnds1,
2152 vect_get_slp_defs (slp_node, vec_oprnds0, vec_oprnds1);
2157 *vec_oprnds0 = VEC_alloc (tree, heap, 1);
2158 vec_oprnd = vect_get_vec_def_for_operand (op0, stmt, NULL);
2159 VEC_quick_push (tree, *vec_oprnds0, vec_oprnd);
2163 *vec_oprnds1 = VEC_alloc (tree, heap, 1);
2164 vec_oprnd = vect_get_vec_def_for_operand (op1, stmt, NULL);
2165 VEC_quick_push (tree, *vec_oprnds1, vec_oprnd);
2171 /* Function vect_finish_stmt_generation.
2173 Insert a new stmt. */
2176 vect_finish_stmt_generation (gimple stmt, gimple vec_stmt,
2177 gimple_stmt_iterator *gsi)
2179 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2180 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2182 gcc_assert (gimple_code (stmt) != GIMPLE_LABEL);
2184 gsi_insert_before (gsi, vec_stmt, GSI_SAME_STMT);
2186 set_vinfo_for_stmt (vec_stmt, new_stmt_vec_info (vec_stmt, loop_vinfo));
2188 if (vect_print_dump_info (REPORT_DETAILS))
2190 fprintf (vect_dump, "add new stmt: ");
2191 print_gimple_stmt (vect_dump, vec_stmt, 0, TDF_SLIM);
2194 gimple_set_location (vec_stmt, gimple_location (gsi_stmt (*gsi)));
2198 /* Function get_initial_def_for_reduction
2201 STMT - a stmt that performs a reduction operation in the loop.
2202 INIT_VAL - the initial value of the reduction variable
2205 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
2206 of the reduction (used for adjusting the epilog - see below).
2207 Return a vector variable, initialized according to the operation that STMT
2208 performs. This vector will be used as the initial value of the
2209 vector of partial results.
2211 Option1 (adjust in epilog): Initialize the vector as follows:
2214 min/max: [init_val,init_val,..,init_val,init_val]
2215 bit and/or: [init_val,init_val,..,init_val,init_val]
2216 and when necessary (e.g. add/mult case) let the caller know
2217 that it needs to adjust the result by init_val.
2219 Option2: Initialize the vector as follows:
2220 add: [0,0,...,0,init_val]
2221 mult: [1,1,...,1,init_val]
2222 min/max: [init_val,init_val,...,init_val]
2223 bit and/or: [init_val,init_val,...,init_val]
2224 and no adjustments are needed.
2226 For example, for the following code:
2232 STMT is 's = s + a[i]', and the reduction variable is 's'.
2233 For a vector of 4 units, we want to return either [0,0,0,init_val],
2234 or [0,0,0,0] and let the caller know that it needs to adjust
2235 the result at the end by 'init_val'.
2237 FORNOW, we are using the 'adjust in epilog' scheme, because this way the
2238 initialization vector is simpler (same element in all entries).
2239 A cost model should help decide between these two schemes. */
2242 get_initial_def_for_reduction (gimple stmt, tree init_val, tree *adjustment_def)
2244 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
2245 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
2246 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2247 tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
2248 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
2249 tree scalar_type = TREE_TYPE (vectype);
2250 enum tree_code code = gimple_assign_rhs_code (stmt);
2251 tree type = TREE_TYPE (init_val);
2257 bool nested_in_vect_loop = false;
2259 gcc_assert (POINTER_TYPE_P (type) || INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type));
2260 if (nested_in_vect_loop_p (loop, stmt))
2261 nested_in_vect_loop = true;
2263 gcc_assert (loop == (gimple_bb (stmt))->loop_father);
2265 vecdef = vect_get_vec_def_for_operand (init_val, stmt, NULL);
2269 case WIDEN_SUM_EXPR:
2272 if (nested_in_vect_loop)
2273 *adjustment_def = vecdef;
2275 *adjustment_def = init_val;
2276 /* Create a vector of zeros for init_def. */
2277 if (SCALAR_FLOAT_TYPE_P (scalar_type))
2278 def_for_init = build_real (scalar_type, dconst0);
2280 def_for_init = build_int_cst (scalar_type, 0);
2282 for (i = nunits - 1; i >= 0; --i)
2283 t = tree_cons (NULL_TREE, def_for_init, t);
2284 init_def = build_vector (vectype, t);
2289 *adjustment_def = NULL_TREE;
2301 /* Function vect_create_epilog_for_reduction
2303 Create code at the loop-epilog to finalize the result of a reduction
2306 VECT_DEF is a vector of partial results.
2307 REDUC_CODE is the tree-code for the epilog reduction.
2308 NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
2309 number of elements that we can fit in a vectype (nunits). In this case
2310 we have to generate more than one vector stmt - i.e - we need to "unroll"
2311 the vector stmt by a factor VF/nunits. For more details see documentation
2312 in vectorizable_operation.
2313 STMT is the scalar reduction stmt that is being vectorized.
2314 REDUCTION_PHI is the phi-node that carries the reduction computation.
2317 1. Creates the reduction def-use cycle: sets the arguments for
2319 The loop-entry argument is the vectorized initial-value of the reduction.
2320 The loop-latch argument is VECT_DEF - the vector of partial sums.
2321 2. "Reduces" the vector of partial results VECT_DEF into a single result,
2322 by applying the operation specified by REDUC_CODE if available, or by
2323 other means (whole-vector shifts or a scalar loop).
2324 The function also creates a new phi node at the loop exit to preserve
2325 loop-closed form, as illustrated below.
2327 The flow at the entry to this function:
2330 vec_def = phi <null, null> # REDUCTION_PHI
2331 VECT_DEF = vector_stmt # vectorized form of STMT
2332 s_loop = scalar_stmt # (scalar) STMT
2334 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
2338 The above is transformed by this function into:
2341 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
2342 VECT_DEF = vector_stmt # vectorized form of STMT
2343 s_loop = scalar_stmt # (scalar) STMT
2345 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
2346 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
2347 v_out2 = reduce <v_out1>
2348 s_out3 = extract_field <v_out2, 0>
2349 s_out4 = adjust_result <s_out3>
2355 vect_create_epilog_for_reduction (tree vect_def, gimple stmt,
2357 enum tree_code reduc_code,
2358 gimple reduction_phi)
2360 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2361 stmt_vec_info prev_phi_info;
2363 enum machine_mode mode;
2364 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2365 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2366 basic_block exit_bb;
2369 gimple new_phi = NULL, phi;
2370 gimple_stmt_iterator exit_gsi;
2372 tree new_temp = NULL_TREE;
2374 gimple epilog_stmt = NULL;
2375 tree new_scalar_dest, new_dest;
2377 tree bitsize, bitpos, bytesize;
2378 enum tree_code code = gimple_assign_rhs_code (stmt);
2379 tree adjustment_def;
2380 tree vec_initial_def, def;
2382 imm_use_iterator imm_iter;
2383 use_operand_p use_p;
2384 bool extract_scalar_result = false;
2385 tree reduction_op, expr;
2388 bool nested_in_vect_loop = false;
2389 VEC(gimple,heap) *phis = NULL;
2390 enum vect_def_type dt = vect_unknown_def_type;
2393 if (nested_in_vect_loop_p (loop, stmt))
2396 nested_in_vect_loop = true;
2399 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
2401 case GIMPLE_SINGLE_RHS:
2402 gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt)) == ternary_op);
2403 reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), 2);
2405 case GIMPLE_UNARY_RHS:
2406 reduction_op = gimple_assign_rhs1 (stmt);
2408 case GIMPLE_BINARY_RHS:
2409 reduction_op = gimple_assign_rhs2 (stmt);
2415 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
2416 gcc_assert (vectype);
2417 mode = TYPE_MODE (vectype);
2419 /*** 1. Create the reduction def-use cycle ***/
2421 /* For the case of reduction, vect_get_vec_def_for_operand returns
2422 the scalar def before the loop, that defines the initial value
2423 of the reduction variable. */
2424 vec_initial_def = vect_get_vec_def_for_operand (reduction_op, stmt,
2427 phi = reduction_phi;
2429 for (j = 0; j < ncopies; j++)
2431 /* 1.1 set the loop-entry arg of the reduction-phi: */
2432 add_phi_arg (phi, vec_initial_def, loop_preheader_edge (loop));
2434 /* 1.2 set the loop-latch arg for the reduction-phi: */
2436 def = vect_get_vec_def_for_stmt_copy (dt, def);
2437 add_phi_arg (phi, def, loop_latch_edge (loop));
2439 if (vect_print_dump_info (REPORT_DETAILS))
2441 fprintf (vect_dump, "transform reduction: created def-use cycle: ");
2442 print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
2443 fprintf (vect_dump, "\n");
2444 print_gimple_stmt (vect_dump, SSA_NAME_DEF_STMT (def), 0, TDF_SLIM);
2447 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
2450 /*** 2. Create epilog code
2451 The reduction epilog code operates across the elements of the vector
2452 of partial results computed by the vectorized loop.
2453 The reduction epilog code consists of:
2454 step 1: compute the scalar result in a vector (v_out2)
2455 step 2: extract the scalar result (s_out3) from the vector (v_out2)
2456 step 3: adjust the scalar result (s_out3) if needed.
2458 Step 1 can be accomplished using one the following three schemes:
2459 (scheme 1) using reduc_code, if available.
2460 (scheme 2) using whole-vector shifts, if available.
2461 (scheme 3) using a scalar loop. In this case steps 1+2 above are
2464 The overall epilog code looks like this:
2466 s_out0 = phi <s_loop> # original EXIT_PHI
2467 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
2468 v_out2 = reduce <v_out1> # step 1
2469 s_out3 = extract_field <v_out2, 0> # step 2
2470 s_out4 = adjust_result <s_out3> # step 3
2472 (step 3 is optional, and steps 1 and 2 may be combined).
2473 Lastly, the uses of s_out0 are replaced by s_out4.
2477 /* 2.1 Create new loop-exit-phi to preserve loop-closed form:
2478 v_out1 = phi <v_loop> */
2480 exit_bb = single_exit (loop)->dest;
2482 prev_phi_info = NULL;
2483 for (j = 0; j < ncopies; j++)
2485 phi = create_phi_node (SSA_NAME_VAR (vect_def), exit_bb);
2486 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo));
2491 def = vect_get_vec_def_for_stmt_copy (dt, def);
2492 STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
2494 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
2495 prev_phi_info = vinfo_for_stmt (phi);
2497 exit_gsi = gsi_after_labels (exit_bb);
2499 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
2500 (i.e. when reduc_code is not available) and in the final adjustment
2501 code (if needed). Also get the original scalar reduction variable as
2502 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
2503 represents a reduction pattern), the tree-code and scalar-def are
2504 taken from the original stmt that the pattern-stmt (STMT) replaces.
2505 Otherwise (it is a regular reduction) - the tree-code and scalar-def
2506 are taken from STMT. */
2508 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
2511 /* Regular reduction */
2516 /* Reduction pattern */
2517 stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
2518 gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
2519 gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
2521 code = gimple_assign_rhs_code (orig_stmt);
2522 scalar_dest = gimple_assign_lhs (orig_stmt);
2523 scalar_type = TREE_TYPE (scalar_dest);
2524 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
2525 bitsize = TYPE_SIZE (scalar_type);
2526 bytesize = TYPE_SIZE_UNIT (scalar_type);
2529 /* In case this is a reduction in an inner-loop while vectorizing an outer
2530 loop - we don't need to extract a single scalar result at the end of the
2531 inner-loop. The final vector of partial results will be used in the
2532 vectorized outer-loop, or reduced to a scalar result at the end of the
2534 if (nested_in_vect_loop)
2535 goto vect_finalize_reduction;
2538 gcc_assert (ncopies == 1);
2540 /* 2.3 Create the reduction code, using one of the three schemes described
2543 if (reduc_code < NUM_TREE_CODES)
2547 /*** Case 1: Create:
2548 v_out2 = reduc_expr <v_out1> */
2550 if (vect_print_dump_info (REPORT_DETAILS))
2551 fprintf (vect_dump, "Reduce using direct vector reduction.");
2553 vec_dest = vect_create_destination_var (scalar_dest, vectype);
2554 tmp = build1 (reduc_code, vectype, PHI_RESULT (new_phi));
2555 epilog_stmt = gimple_build_assign (vec_dest, tmp);
2556 new_temp = make_ssa_name (vec_dest, epilog_stmt);
2557 gimple_assign_set_lhs (epilog_stmt, new_temp);
2558 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
2560 extract_scalar_result = true;
2564 enum tree_code shift_code = 0;
2565 bool have_whole_vector_shift = true;
2567 int element_bitsize = tree_low_cst (bitsize, 1);
2568 int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
2571 if (optab_handler (vec_shr_optab, mode)->insn_code != CODE_FOR_nothing)
2572 shift_code = VEC_RSHIFT_EXPR;
2574 have_whole_vector_shift = false;
2576 /* Regardless of whether we have a whole vector shift, if we're
2577 emulating the operation via tree-vect-generic, we don't want
2578 to use it. Only the first round of the reduction is likely
2579 to still be profitable via emulation. */
2580 /* ??? It might be better to emit a reduction tree code here, so that
2581 tree-vect-generic can expand the first round via bit tricks. */
2582 if (!VECTOR_MODE_P (mode))
2583 have_whole_vector_shift = false;
2586 optab optab = optab_for_tree_code (code, vectype, optab_default);
2587 if (optab_handler (optab, mode)->insn_code == CODE_FOR_nothing)
2588 have_whole_vector_shift = false;
2591 if (have_whole_vector_shift)
2593 /*** Case 2: Create:
2594 for (offset = VS/2; offset >= element_size; offset/=2)
2596 Create: va' = vec_shift <va, offset>
2597 Create: va = vop <va, va'>
2600 if (vect_print_dump_info (REPORT_DETAILS))
2601 fprintf (vect_dump, "Reduce using vector shifts");
2603 vec_dest = vect_create_destination_var (scalar_dest, vectype);
2604 new_temp = PHI_RESULT (new_phi);
2606 for (bit_offset = vec_size_in_bits/2;
2607 bit_offset >= element_bitsize;
2610 tree bitpos = size_int (bit_offset);
2611 epilog_stmt = gimple_build_assign_with_ops (shift_code, vec_dest,
2613 new_name = make_ssa_name (vec_dest, epilog_stmt);
2614 gimple_assign_set_lhs (epilog_stmt, new_name);
2615 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
2617 epilog_stmt = gimple_build_assign_with_ops (code, vec_dest,
2618 new_name, new_temp);
2619 new_temp = make_ssa_name (vec_dest, epilog_stmt);
2620 gimple_assign_set_lhs (epilog_stmt, new_temp);
2621 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
2624 extract_scalar_result = true;
2630 /*** Case 3: Create:
2631 s = extract_field <v_out2, 0>
2632 for (offset = element_size;
2633 offset < vector_size;
2634 offset += element_size;)
2636 Create: s' = extract_field <v_out2, offset>
2637 Create: s = op <s, s'>
2640 if (vect_print_dump_info (REPORT_DETAILS))
2641 fprintf (vect_dump, "Reduce using scalar code. ");
2643 vec_temp = PHI_RESULT (new_phi);
2644 vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
2645 rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
2647 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
2648 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
2649 gimple_assign_set_lhs (epilog_stmt, new_temp);
2650 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
2652 for (bit_offset = element_bitsize;
2653 bit_offset < vec_size_in_bits;
2654 bit_offset += element_bitsize)
2656 tree bitpos = bitsize_int (bit_offset);
2657 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
2660 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
2661 new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
2662 gimple_assign_set_lhs (epilog_stmt, new_name);
2663 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
2665 epilog_stmt = gimple_build_assign_with_ops (code,
2667 new_name, new_temp);
2668 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
2669 gimple_assign_set_lhs (epilog_stmt, new_temp);
2670 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
2673 extract_scalar_result = false;
2677 /* 2.4 Extract the final scalar result. Create:
2678 s_out3 = extract_field <v_out2, bitpos> */
2680 if (extract_scalar_result)
2684 gcc_assert (!nested_in_vect_loop);
2685 if (vect_print_dump_info (REPORT_DETAILS))
2686 fprintf (vect_dump, "extract scalar result");
2688 if (BYTES_BIG_ENDIAN)
2689 bitpos = size_binop (MULT_EXPR,
2690 bitsize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1),
2691 TYPE_SIZE (scalar_type));
2693 bitpos = bitsize_zero_node;
2695 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp, bitsize, bitpos);
2696 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
2697 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
2698 gimple_assign_set_lhs (epilog_stmt, new_temp);
2699 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
2702 vect_finalize_reduction:
2704 /* 2.5 Adjust the final result by the initial value of the reduction
2705 variable. (When such adjustment is not needed, then
2706 'adjustment_def' is zero). For example, if code is PLUS we create:
2707 new_temp = loop_exit_def + adjustment_def */
2711 if (nested_in_vect_loop)
2713 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
2714 expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
2715 new_dest = vect_create_destination_var (scalar_dest, vectype);
2719 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
2720 expr = build2 (code, scalar_type, new_temp, adjustment_def);
2721 new_dest = vect_create_destination_var (scalar_dest, scalar_type);
2723 epilog_stmt = gimple_build_assign (new_dest, expr);
2724 new_temp = make_ssa_name (new_dest, epilog_stmt);
2725 gimple_assign_set_lhs (epilog_stmt, new_temp);
2726 SSA_NAME_DEF_STMT (new_temp) = epilog_stmt;
2727 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
2731 /* 2.6 Handle the loop-exit phi */
2733 /* Replace uses of s_out0 with uses of s_out3:
2734 Find the loop-closed-use at the loop exit of the original scalar result.
2735 (The reduction result is expected to have two immediate uses - one at the
2736 latch block, and one at the loop exit). */
2737 phis = VEC_alloc (gimple, heap, 10);
2738 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
2740 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
2742 exit_phi = USE_STMT (use_p);
2743 VEC_quick_push (gimple, phis, exit_phi);
2746 /* We expect to have found an exit_phi because of loop-closed-ssa form. */
2747 gcc_assert (!VEC_empty (gimple, phis));
2749 for (i = 0; VEC_iterate (gimple, phis, i, exit_phi); i++)
2751 if (nested_in_vect_loop)
2753 stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
2755 /* FORNOW. Currently not supporting the case that an inner-loop
2756 reduction is not used in the outer-loop (but only outside the
2758 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
2759 && !STMT_VINFO_LIVE_P (stmt_vinfo));
2761 epilog_stmt = adjustment_def ? epilog_stmt : new_phi;
2762 STMT_VINFO_VEC_STMT (stmt_vinfo) = epilog_stmt;
2763 set_vinfo_for_stmt (epilog_stmt,
2764 new_stmt_vec_info (epilog_stmt, loop_vinfo));
2766 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
2767 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
2771 /* Replace the uses: */
2772 orig_name = PHI_RESULT (exit_phi);
2773 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
2774 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
2775 SET_USE (use_p, new_temp);
2777 VEC_free (gimple, heap, phis);
2781 /* Function vectorizable_reduction.
2783 Check if STMT performs a reduction operation that can be vectorized.
2784 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
2785 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
2786 Return FALSE if not a vectorizable STMT, TRUE otherwise.
2788 This function also handles reduction idioms (patterns) that have been
2789 recognized in advance during vect_pattern_recog. In this case, STMT may be
2791 X = pattern_expr (arg0, arg1, ..., X)
2792 and it's STMT_VINFO_RELATED_STMT points to the last stmt in the original
2793 sequence that had been detected and replaced by the pattern-stmt (STMT).
2795 In some cases of reduction patterns, the type of the reduction variable X is
2796 different than the type of the other arguments of STMT.
2797 In such cases, the vectype that is used when transforming STMT into a vector
2798 stmt is different than the vectype that is used to determine the
2799 vectorization factor, because it consists of a different number of elements
2800 than the actual number of elements that are being operated upon in parallel.
2802 For example, consider an accumulation of shorts into an int accumulator.
2803 On some targets it's possible to vectorize this pattern operating on 8
2804 shorts at a time (hence, the vectype for purposes of determining the
2805 vectorization factor should be V8HI); on the other hand, the vectype that
2806 is used to create the vector form is actually V4SI (the type of the result).
2808 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
2809 indicates what is the actual level of parallelism (V8HI in the example), so
2810 that the right vectorization factor would be derived. This vectype
2811 corresponds to the type of arguments to the reduction stmt, and should *NOT*
2812 be used to create the vectorized stmt. The right vectype for the vectorized
2813 stmt is obtained from the type of the result X:
2814 get_vectype_for_scalar_type (TREE_TYPE (X))
2816 This means that, contrary to "regular" reductions (or "regular" stmts in
2817 general), the following equation:
2818 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
2819 does *NOT* necessarily hold for reduction patterns. */
2822 vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
2827 tree loop_vec_def0 = NULL_TREE, loop_vec_def1 = NULL_TREE;
2828 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2829 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2830 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2831 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2832 enum tree_code code, orig_code, epilog_reduc_code = 0;
2833 enum machine_mode vec_mode;
2835 optab optab, reduc_optab;
2836 tree new_temp = NULL_TREE;
2839 enum vect_def_type dt;
2840 gimple new_phi = NULL;
2844 stmt_vec_info orig_stmt_info;
2845 tree expr = NULL_TREE;
2847 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
2848 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
2850 stmt_vec_info prev_stmt_info, prev_phi_info;
2851 gimple first_phi = NULL;
2852 bool single_defuse_cycle = false;
2854 gimple new_stmt = NULL;
2858 if (nested_in_vect_loop_p (loop, stmt))
2861 gcc_assert (ncopies >= 1);
2863 /* FORNOW: SLP not supported. */
2864 if (STMT_SLP_TYPE (stmt_info))
2867 /* 1. Is vectorizable reduction? */
2869 /* Not supportable if the reduction variable is used in the loop. */
2870 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer)
2873 /* Reductions that are not used even in an enclosing outer-loop,
2874 are expected to be "live" (used out of the loop). */
2875 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_loop
2876 && !STMT_VINFO_LIVE_P (stmt_info))
2879 /* Make sure it was already recognized as a reduction computation. */
2880 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def)
2883 /* 2. Has this been recognized as a reduction pattern?
2885 Check if STMT represents a pattern that has been recognized
2886 in earlier analysis stages. For stmts that represent a pattern,
2887 the STMT_VINFO_RELATED_STMT field records the last stmt in
2888 the original sequence that constitutes the pattern. */
2890 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
2893 orig_stmt_info = vinfo_for_stmt (orig_stmt);
2894 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt);
2895 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
2896 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
2899 /* 3. Check the operands of the operation. The first operands are defined
2900 inside the loop body. The last operand is the reduction variable,
2901 which is defined by the loop-header-phi. */
2903 gcc_assert (is_gimple_assign (stmt));
2906 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
2908 case GIMPLE_SINGLE_RHS:
2909 op_type = TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt));
2910 if (op_type == ternary_op)
2912 tree rhs = gimple_assign_rhs1 (stmt);
2913 ops[0] = TREE_OPERAND (rhs, 0);
2914 ops[1] = TREE_OPERAND (rhs, 1);
2915 ops[2] = TREE_OPERAND (rhs, 2);
2916 code = TREE_CODE (rhs);
2922 case GIMPLE_BINARY_RHS:
2923 code = gimple_assign_rhs_code (stmt);
2924 op_type = TREE_CODE_LENGTH (code);
2925 gcc_assert (op_type == binary_op);
2926 ops[0] = gimple_assign_rhs1 (stmt);
2927 ops[1] = gimple_assign_rhs2 (stmt);
2930 case GIMPLE_UNARY_RHS:
2937 scalar_dest = gimple_assign_lhs (stmt);
2938 scalar_type = TREE_TYPE (scalar_dest);
2939 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
2940 && !SCALAR_FLOAT_TYPE_P (scalar_type))
2943 /* All uses but the last are expected to be defined in the loop.
2944 The last use is the reduction variable. */
2945 for (i = 0; i < op_type-1; i++)
2947 is_simple_use = vect_is_simple_use (ops[i], loop_vinfo, &def_stmt,
2949 gcc_assert (is_simple_use);
2950 if (dt != vect_loop_def
2951 && dt != vect_invariant_def
2952 && dt != vect_constant_def
2953 && dt != vect_induction_def)
2957 is_simple_use = vect_is_simple_use (ops[i], loop_vinfo, &def_stmt, &def, &dt);
2958 gcc_assert (is_simple_use);
2959 gcc_assert (dt == vect_reduction_def);
2960 gcc_assert (gimple_code (def_stmt) == GIMPLE_PHI);
2962 gcc_assert (orig_stmt == vect_is_simple_reduction (loop_vinfo, def_stmt));
2964 gcc_assert (stmt == vect_is_simple_reduction (loop_vinfo, def_stmt));
2966 if (STMT_VINFO_LIVE_P (vinfo_for_stmt (def_stmt)))
2969 /* 4. Supportable by target? */
2971 /* 4.1. check support for the operation in the loop */
2972 optab = optab_for_tree_code (code, vectype, optab_default);
2975 if (vect_print_dump_info (REPORT_DETAILS))
2976 fprintf (vect_dump, "no optab.");
2979 vec_mode = TYPE_MODE (vectype);
2980 if (optab_handler (optab, vec_mode)->insn_code == CODE_FOR_nothing)
2982 if (vect_print_dump_info (REPORT_DETAILS))
2983 fprintf (vect_dump, "op not supported by target.");
2984 if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
2985 || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
2986 < vect_min_worthwhile_factor (code))
2988 if (vect_print_dump_info (REPORT_DETAILS))
2989 fprintf (vect_dump, "proceeding using word mode.");
2992 /* Worthwhile without SIMD support? */
2993 if (!VECTOR_MODE_P (TYPE_MODE (vectype))
2994 && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
2995 < vect_min_worthwhile_factor (code))
2997 if (vect_print_dump_info (REPORT_DETAILS))
2998 fprintf (vect_dump, "not worthwhile without SIMD support.");
3002 /* 4.2. Check support for the epilog operation.
3004 If STMT represents a reduction pattern, then the type of the
3005 reduction variable may be different than the type of the rest
3006 of the arguments. For example, consider the case of accumulation
3007 of shorts into an int accumulator; The original code:
3008 S1: int_a = (int) short_a;
3009 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
3012 STMT: int_acc = widen_sum <short_a, int_acc>
3015 1. The tree-code that is used to create the vector operation in the
3016 epilog code (that reduces the partial results) is not the
3017 tree-code of STMT, but is rather the tree-code of the original
3018 stmt from the pattern that STMT is replacing. I.e, in the example
3019 above we want to use 'widen_sum' in the loop, but 'plus' in the
3021 2. The type (mode) we use to check available target support
3022 for the vector operation to be created in the *epilog*, is
3023 determined by the type of the reduction variable (in the example
3024 above we'd check this: plus_optab[vect_int_mode]).
3025 However the type (mode) we use to check available target support
3026 for the vector operation to be created *inside the loop*, is
3027 determined by the type of the other arguments to STMT (in the
3028 example we'd check this: widen_sum_optab[vect_short_mode]).
3030 This is contrary to "regular" reductions, in which the types of all
3031 the arguments are the same as the type of the reduction variable.
3032 For "regular" reductions we can therefore use the same vector type
3033 (and also the same tree-code) when generating the epilog code and
3034 when generating the code inside the loop. */
3038 /* This is a reduction pattern: get the vectype from the type of the
3039 reduction variable, and get the tree-code from orig_stmt. */
3040 orig_code = gimple_assign_rhs_code (orig_stmt);
3041 vectype = get_vectype_for_scalar_type (TREE_TYPE (def));
3044 if (vect_print_dump_info (REPORT_DETAILS))
3046 fprintf (vect_dump, "unsupported data-type ");
3047 print_generic_expr (vect_dump, TREE_TYPE (def), TDF_SLIM);
3052 vec_mode = TYPE_MODE (vectype);
3056 /* Regular reduction: use the same vectype and tree-code as used for
3057 the vector code inside the loop can be used for the epilog code. */
3061 if (!reduction_code_for_scalar_code (orig_code, &epilog_reduc_code))
3063 reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype, optab_default);
3066 if (vect_print_dump_info (REPORT_DETAILS))
3067 fprintf (vect_dump, "no optab for reduction.");
3068 epilog_reduc_code = NUM_TREE_CODES;
3070 if (optab_handler (reduc_optab, vec_mode)->insn_code == CODE_FOR_nothing)
3072 if (vect_print_dump_info (REPORT_DETAILS))
3073 fprintf (vect_dump, "reduc op not supported by target.");
3074 epilog_reduc_code = NUM_TREE_CODES;
3077 if (!vec_stmt) /* transformation not required. */
3079 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
3080 if (!vect_model_reduction_cost (stmt_info, epilog_reduc_code, ncopies))
3087 if (vect_print_dump_info (REPORT_DETAILS))
3088 fprintf (vect_dump, "transform reduction.");
3090 /* Create the destination vector */
3091 vec_dest = vect_create_destination_var (scalar_dest, vectype);
3093 /* In case the vectorization factor (VF) is bigger than the number
3094 of elements that we can fit in a vectype (nunits), we have to generate
3095 more than one vector stmt - i.e - we need to "unroll" the
3096 vector stmt by a factor VF/nunits. For more details see documentation
3097 in vectorizable_operation. */
3099 /* If the reduction is used in an outer loop we need to generate
3100 VF intermediate results, like so (e.g. for ncopies=2):
3105 (i.e. we generate VF results in 2 registers).
3106 In this case we have a separate def-use cycle for each copy, and therefore
3107 for each copy we get the vector def for the reduction variable from the
3108 respective phi node created for this copy.
3110 Otherwise (the reduction is unused in the loop nest), we can combine
3111 together intermediate results, like so (e.g. for ncopies=2):
3115 (i.e. we generate VF/2 results in a single register).
3116 In this case for each copy we get the vector def for the reduction variable
3117 from the vectorized reduction operation generated in the previous iteration.
3120 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_loop)
3122 single_defuse_cycle = true;
3126 epilog_copies = ncopies;
3128 prev_stmt_info = NULL;
3129 prev_phi_info = NULL;
3130 for (j = 0; j < ncopies; j++)
3132 if (j == 0 || !single_defuse_cycle)
3134 /* Create the reduction-phi that defines the reduction-operand. */
3135 new_phi = create_phi_node (vec_dest, loop->header);
3136 set_vinfo_for_stmt (new_phi, new_stmt_vec_info (new_phi, loop_vinfo));
3142 loop_vec_def0 = vect_get_vec_def_for_operand (ops[0], stmt, NULL);
3143 if (op_type == ternary_op)
3145 loop_vec_def1 = vect_get_vec_def_for_operand (ops[1], stmt, NULL);
3148 /* Get the vector def for the reduction variable from the phi node */
3149 reduc_def = PHI_RESULT (new_phi);
3150 first_phi = new_phi;
3154 enum vect_def_type dt = vect_unknown_def_type; /* Dummy */
3155 loop_vec_def0 = vect_get_vec_def_for_stmt_copy (dt, loop_vec_def0);
3156 if (op_type == ternary_op)
3157 loop_vec_def1 = vect_get_vec_def_for_stmt_copy (dt, loop_vec_def1);
3159 if (single_defuse_cycle)
3160 reduc_def = gimple_assign_lhs (new_stmt);
3162 reduc_def = PHI_RESULT (new_phi);
3164 STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
3167 /* Arguments are ready. create the new vector stmt. */
3168 if (op_type == binary_op)
3169 expr = build2 (code, vectype, loop_vec_def0, reduc_def);
3171 expr = build3 (code, vectype, loop_vec_def0, loop_vec_def1,
3173 new_stmt = gimple_build_assign (vec_dest, expr);
3174 new_temp = make_ssa_name (vec_dest, new_stmt);
3175 gimple_assign_set_lhs (new_stmt, new_temp);
3176 vect_finish_stmt_generation (stmt, new_stmt, gsi);
3179 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
3181 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3182 prev_stmt_info = vinfo_for_stmt (new_stmt);
3183 prev_phi_info = vinfo_for_stmt (new_phi);
3186 /* Finalize the reduction-phi (set its arguments) and create the
3187 epilog reduction code. */
3188 if (!single_defuse_cycle)
3189 new_temp = gimple_assign_lhs (*vec_stmt);
3190 vect_create_epilog_for_reduction (new_temp, stmt, epilog_copies,
3191 epilog_reduc_code, first_phi);
3195 /* Checks if CALL can be vectorized in type VECTYPE. Returns
3196 a function declaration if the target has a vectorized version
3197 of the function, or NULL_TREE if the function cannot be vectorized. */
3200 vectorizable_function (gimple call, tree vectype_out, tree vectype_in)
3202 tree fndecl = gimple_call_fndecl (call);
3203 enum built_in_function code;
3205 /* We only handle functions that do not read or clobber memory -- i.e.
3206 const or novops ones. */
3207 if (!(gimple_call_flags (call) & (ECF_CONST | ECF_NOVOPS)))
3211 || TREE_CODE (fndecl) != FUNCTION_DECL
3212 || !DECL_BUILT_IN (fndecl))
3215 code = DECL_FUNCTION_CODE (fndecl);
3216 return targetm.vectorize.builtin_vectorized_function (code, vectype_out,
3220 /* Function vectorizable_call.
3222 Check if STMT performs a function call that can be vectorized.
3223 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
3224 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
3225 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
3228 vectorizable_call (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt)
3233 tree vec_oprnd0 = NULL_TREE, vec_oprnd1 = NULL_TREE;
3234 stmt_vec_info stmt_info = vinfo_for_stmt (stmt), prev_stmt_info;
3235 tree vectype_out, vectype_in;
3238 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3239 tree fndecl, new_temp, def, rhs_type, lhs_type;
3241 enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
3244 VEC(tree, heap) *vargs = NULL;
3245 enum { NARROW, NONE, WIDEN } modifier;
3248 if (!STMT_VINFO_RELEVANT_P (stmt_info))
3251 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
3254 /* FORNOW: SLP not supported. */
3255 if (STMT_SLP_TYPE (stmt_info))
3258 /* Is STMT a vectorizable call? */
3259 if (!is_gimple_call (stmt))
3262 if (TREE_CODE (gimple_call_lhs (stmt)) != SSA_NAME)
3265 /* Process function arguments. */
3266 rhs_type = NULL_TREE;
3267 nargs = gimple_call_num_args (stmt);
3269 /* Bail out if the function has more than two arguments, we
3270 do not have interesting builtin functions to vectorize with
3271 more than two arguments. No arguments is also not good. */
3272 if (nargs == 0 || nargs > 2)
3275 for (i = 0; i < nargs; i++)
3277 op = gimple_call_arg (stmt, i);
3279 /* We can only handle calls with arguments of the same type. */
3281 && rhs_type != TREE_TYPE (op))
3283 if (vect_print_dump_info (REPORT_DETAILS))
3284 fprintf (vect_dump, "argument types differ.");
3287 rhs_type = TREE_TYPE (op);
3289 if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt[i]))
3291 if (vect_print_dump_info (REPORT_DETAILS))
3292 fprintf (vect_dump, "use not simple.");
3297 vectype_in = get_vectype_for_scalar_type (rhs_type);
3300 nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
3302 lhs_type = TREE_TYPE (gimple_call_lhs (stmt));
3303 vectype_out = get_vectype_for_scalar_type (lhs_type);
3306 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
3309 if (nunits_in == nunits_out / 2)
3311 else if (nunits_out == nunits_in)
3313 else if (nunits_out == nunits_in / 2)
3318 /* For now, we only vectorize functions if a target specific builtin
3319 is available. TODO -- in some cases, it might be profitable to
3320 insert the calls for pieces of the vector, in order to be able
3321 to vectorize other operations in the loop. */
3322 fndecl = vectorizable_function (stmt, vectype_out, vectype_in);
3323 if (fndecl == NULL_TREE)
3325 if (vect_print_dump_info (REPORT_DETAILS))
3326 fprintf (vect_dump, "function is not vectorizable.");
3331 gcc_assert (ZERO_SSA_OPERANDS (stmt, SSA_OP_ALL_VIRTUALS));
3333 if (modifier == NARROW)
3334 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_out;
3336 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in;
3338 /* Sanity check: make sure that at least one copy of the vectorized stmt
3339 needs to be generated. */
3340 gcc_assert (ncopies >= 1);
3342 if (!vec_stmt) /* transformation not required. */
3344 STMT_VINFO_TYPE (stmt_info) = call_vec_info_type;
3345 if (vect_print_dump_info (REPORT_DETAILS))
3346 fprintf (vect_dump, "=== vectorizable_call ===");
3347 vect_model_simple_cost (stmt_info, ncopies, dt, NULL);
3353 if (vect_print_dump_info (REPORT_DETAILS))
3354 fprintf (vect_dump, "transform operation.");
3357 scalar_dest = gimple_call_lhs (stmt);
3358 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
3360 prev_stmt_info = NULL;
3364 for (j = 0; j < ncopies; ++j)
3366 /* Build argument list for the vectorized call. */
3368 vargs = VEC_alloc (tree, heap, nargs);
3370 VEC_truncate (tree, vargs, 0);
3372 for (i = 0; i < nargs; i++)
3374 op = gimple_call_arg (stmt, i);
3377 = vect_get_vec_def_for_operand (op, stmt, NULL);
3380 = vect_get_vec_def_for_stmt_copy (dt[nargs], vec_oprnd0);
3382 VEC_quick_push (tree, vargs, vec_oprnd0);
3385 new_stmt = gimple_build_call_vec (fndecl, vargs);
3386 new_temp = make_ssa_name (vec_dest, new_stmt);
3387 gimple_call_set_lhs (new_stmt, new_temp);
3389 vect_finish_stmt_generation (stmt, new_stmt, gsi);
3392 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
3394 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3396 prev_stmt_info = vinfo_for_stmt (new_stmt);
3402 for (j = 0; j < ncopies; ++j)
3404 /* Build argument list for the vectorized call. */
3406 vargs = VEC_alloc (tree, heap, nargs * 2);
3408 VEC_truncate (tree, vargs, 0);
3410 for (i = 0; i < nargs; i++)
3412 op = gimple_call_arg (stmt, i);
3416 = vect_get_vec_def_for_operand (op, stmt, NULL);
3418 = vect_get_vec_def_for_stmt_copy (dt[nargs], vec_oprnd0);
3423 = vect_get_vec_def_for_stmt_copy (dt[nargs], vec_oprnd1);
3425 = vect_get_vec_def_for_stmt_copy (dt[nargs], vec_oprnd0);
3428 VEC_quick_push (tree, vargs, vec_oprnd0);
3429 VEC_quick_push (tree, vargs, vec_oprnd1);
3432 new_stmt = gimple_build_call_vec (fndecl, vargs);
3433 new_temp = make_ssa_name (vec_dest, new_stmt);
3434 gimple_call_set_lhs (new_stmt, new_temp);
3436 vect_finish_stmt_generation (stmt, new_stmt, gsi);
3439 STMT_VINFO_VEC_STMT (stmt_info) = new_stmt;
3441 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3443 prev_stmt_info = vinfo_for_stmt (new_stmt);
3446 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
3451 /* No current target implements this case. */
3455 VEC_free (tree, heap, vargs);
3457 /* The call in STMT might prevent it from being removed in dce.
3458 We however cannot remove it here, due to the way the ssa name
3459 it defines is mapped to the new definition. So just replace
3460 rhs of the statement with something harmless. */
3462 type = TREE_TYPE (scalar_dest);
3463 new_stmt = gimple_build_assign (gimple_call_lhs (stmt),
3464 fold_convert (type, integer_zero_node));
3465 set_vinfo_for_stmt (new_stmt, stmt_info);
3466 set_vinfo_for_stmt (stmt, NULL);
3467 STMT_VINFO_STMT (stmt_info) = new_stmt;
3468 gsi_replace (gsi, new_stmt, false);
3469 SSA_NAME_DEF_STMT (gimple_assign_lhs (new_stmt)) = new_stmt;
3475 /* Function vect_gen_widened_results_half
3477 Create a vector stmt whose code, type, number of arguments, and result
3478 variable are CODE, OP_TYPE, and VEC_DEST, and its arguments are
3479 VEC_OPRND0 and VEC_OPRND1. The new vector stmt is to be inserted at BSI.
3480 In the case that CODE is a CALL_EXPR, this means that a call to DECL
3481 needs to be created (DECL is a function-decl of a target-builtin).
3482 STMT is the original scalar stmt that we are vectorizing. */
3485 vect_gen_widened_results_half (enum tree_code code,
3487 tree vec_oprnd0, tree vec_oprnd1, int op_type,
3488 tree vec_dest, gimple_stmt_iterator *gsi,
3496 /* Generate half of the widened result: */
3497 if (code == CALL_EXPR)
3499 /* Target specific support */
3500 if (op_type == binary_op)
3501 new_stmt = gimple_build_call (decl, 2, vec_oprnd0, vec_oprnd1);
3503 new_stmt = gimple_build_call (decl, 1, vec_oprnd0);
3504 new_temp = make_ssa_name (vec_dest, new_stmt);
3505 gimple_call_set_lhs (new_stmt, new_temp);
3509 /* Generic support */
3510 gcc_assert (op_type == TREE_CODE_LENGTH (code));
3511 if (op_type != binary_op)
3513 new_stmt = gimple_build_assign_with_ops (code, vec_dest, vec_oprnd0,
3515 new_temp = make_ssa_name (vec_dest, new_stmt);
3516 gimple_assign_set_lhs (new_stmt, new_temp);
3518 vect_finish_stmt_generation (stmt, new_stmt, gsi);
3520 if (code == CALL_EXPR)
3522 FOR_EACH_SSA_TREE_OPERAND (sym, new_stmt, iter, SSA_OP_ALL_VIRTUALS)
3524 if (TREE_CODE (sym) == SSA_NAME)
3525 sym = SSA_NAME_VAR (sym);
3526 mark_sym_for_renaming (sym);
3534 /* Check if STMT performs a conversion operation, that can be vectorized.
3535 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
3536 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
3537 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
3540 vectorizable_conversion (gimple stmt, gimple_stmt_iterator *gsi,
3541 gimple *vec_stmt, slp_tree slp_node)
3546 tree vec_oprnd0 = NULL_TREE, vec_oprnd1 = NULL_TREE;
3547 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3548 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3549 enum tree_code code, code1 = ERROR_MARK, code2 = ERROR_MARK;
3550 tree decl1 = NULL_TREE, decl2 = NULL_TREE;
3554 enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
3555 gimple new_stmt = NULL;
3556 stmt_vec_info prev_stmt_info;
3559 tree vectype_out, vectype_in;
3562 tree rhs_type, lhs_type;
3564 enum { NARROW, NONE, WIDEN } modifier;
3566 VEC(tree,heap) *vec_oprnds0 = NULL;
3569 VEC(tree,heap) *dummy = NULL;
3572 /* Is STMT a vectorizable conversion? */
3574 if (!STMT_VINFO_RELEVANT_P (stmt_info))
3577 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
3580 if (!is_gimple_assign (stmt))
3583 if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
3586 code = gimple_assign_rhs_code (stmt);
3587 if (code != FIX_TRUNC_EXPR && code != FLOAT_EXPR)
3590 /* Check types of lhs and rhs. */
3591 op0 = gimple_assign_rhs1 (stmt);
3592 rhs_type = TREE_TYPE (op0);
3593 vectype_in = get_vectype_for_scalar_type (rhs_type);
3596 nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
3598 scalar_dest = gimple_assign_lhs (stmt);
3599 lhs_type = TREE_TYPE (scalar_dest);
3600 vectype_out = get_vectype_for_scalar_type (lhs_type);
3603 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
3606 if (nunits_in == nunits_out / 2)
3608 else if (nunits_out == nunits_in)
3610 else if (nunits_out == nunits_in / 2)
3615 if (modifier == NONE)
3616 gcc_assert (STMT_VINFO_VECTYPE (stmt_info) == vectype_out);
3618 /* Bail out if the types are both integral or non-integral. */
3619 if ((INTEGRAL_TYPE_P (rhs_type) && INTEGRAL_TYPE_P (lhs_type))
3620 || (!INTEGRAL_TYPE_P (rhs_type) && !INTEGRAL_TYPE_P (lhs_type)))
3623 integral_type = INTEGRAL_TYPE_P (rhs_type) ? vectype_in : vectype_out;
3625 if (modifier == NARROW)
3626 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_out;
3628 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in;
3630 /* FORNOW: SLP with multiple types is not supported. The SLP analysis verifies
3631 this, so we can safely override NCOPIES with 1 here. */
3635 /* Sanity check: make sure that at least one copy of the vectorized stmt
3636 needs to be generated. */
3637 gcc_assert (ncopies >= 1);
3639 /* Check the operands of the operation. */
3640 if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt[0]))
3642 if (vect_print_dump_info (REPORT_DETAILS))
3643 fprintf (vect_dump, "use not simple.");
3647 /* Supportable by target? */
3648 if ((modifier == NONE
3649 && !targetm.vectorize.builtin_conversion (code, integral_type))
3650 || (modifier == WIDEN
3651 && !supportable_widening_operation (code, stmt, vectype_in,
3654 &dummy_int, &dummy))
3655 || (modifier == NARROW
3656 && !supportable_narrowing_operation (code, stmt, vectype_in,
3657 &code1, &dummy_int, &dummy)))
3659 if (vect_print_dump_info (REPORT_DETAILS))
3660 fprintf (vect_dump, "conversion not supported by target.");
3664 if (modifier != NONE)
3666 STMT_VINFO_VECTYPE (stmt_info) = vectype_in;
3667 /* FORNOW: SLP not supported. */
3668 if (STMT_SLP_TYPE (stmt_info))
3672 if (!vec_stmt) /* transformation not required. */
3674 STMT_VINFO_TYPE (stmt_info) = type_conversion_vec_info_type;
3679 if (vect_print_dump_info (REPORT_DETAILS))
3680 fprintf (vect_dump, "transform conversion.");
3683 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
3685 if (modifier == NONE && !slp_node)
3686 vec_oprnds0 = VEC_alloc (tree, heap, 1);
3688 prev_stmt_info = NULL;
3692 for (j = 0; j < ncopies; j++)
3698 vect_get_vec_defs (op0, NULL, stmt, &vec_oprnds0, NULL, slp_node);
3700 vect_get_vec_defs_for_stmt_copy (dt, &vec_oprnds0, NULL);
3703 targetm.vectorize.builtin_conversion (code, integral_type);
3704 for (i = 0; VEC_iterate (tree, vec_oprnds0, i, vop0); i++)
3706 /* Arguments are ready. create the new vector stmt. */
3707 new_stmt = gimple_build_call (builtin_decl, 1, vop0);
3708 new_temp = make_ssa_name (vec_dest, new_stmt);
3709 gimple_call_set_lhs (new_stmt, new_temp);
3710 vect_finish_stmt_generation (stmt, new_stmt, gsi);
3711 FOR_EACH_SSA_TREE_OPERAND (sym, new_stmt, iter,
3712 SSA_OP_ALL_VIRTUALS)
3714 if (TREE_CODE (sym) == SSA_NAME)
3715 sym = SSA_NAME_VAR (sym);
3716 mark_sym_for_renaming (sym);
3719 VEC_quick_push (gimple, SLP_TREE_VEC_STMTS (slp_node), new_stmt);
3723 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
3725 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3726 prev_stmt_info = vinfo_for_stmt (new_stmt);
3731 /* In case the vectorization factor (VF) is bigger than the number
3732 of elements that we can fit in a vectype (nunits), we have to
3733 generate more than one vector stmt - i.e - we need to "unroll"
3734 the vector stmt by a factor VF/nunits. */
3735 for (j = 0; j < ncopies; j++)
3738 vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
3740 vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd0);
3742 STMT_VINFO_VECTYPE (stmt_info) = vectype_in;
3744 /* Generate first half of the widened result: */
3746 = vect_gen_widened_results_half (code1, decl1,
3747 vec_oprnd0, vec_oprnd1,
3748 unary_op, vec_dest, gsi, stmt);
3750 STMT_VINFO_VEC_STMT (stmt_info) = new_stmt;
3752 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3753 prev_stmt_info = vinfo_for_stmt (new_stmt);
3755 /* Generate second half of the widened result: */
3757 = vect_gen_widened_results_half (code2, decl2,
3758 vec_oprnd0, vec_oprnd1,
3759 unary_op, vec_dest, gsi, stmt);
3760 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3761 prev_stmt_info = vinfo_for_stmt (new_stmt);
3766 /* In case the vectorization factor (VF) is bigger than the number
3767 of elements that we can fit in a vectype (nunits), we have to
3768 generate more than one vector stmt - i.e - we need to "unroll"
3769 the vector stmt by a factor VF/nunits. */
3770 for (j = 0; j < ncopies; j++)
3775 vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
3776 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd0);
3780 vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd1);
3781 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd0);
3784 /* Arguments are ready. Create the new vector stmt. */
3785 expr = build2 (code1, vectype_out, vec_oprnd0, vec_oprnd1);
3786 new_stmt = gimple_build_assign_with_ops (code1, vec_dest, vec_oprnd0,
3788 new_temp = make_ssa_name (vec_dest, new_stmt);
3789 gimple_assign_set_lhs (new_stmt, new_temp);
3790 vect_finish_stmt_generation (stmt, new_stmt, gsi);
3793 STMT_VINFO_VEC_STMT (stmt_info) = new_stmt;
3795 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3797 prev_stmt_info = vinfo_for_stmt (new_stmt);
3800 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
3804 VEC_free (tree, heap, vec_oprnds0);
3810 /* Function vectorizable_assignment.
3812 Check if STMT performs an assignment (copy) that can be vectorized.
3813 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
3814 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
3815 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
3818 vectorizable_assignment (gimple stmt, gimple_stmt_iterator *gsi,
3819 gimple *vec_stmt, slp_tree slp_node)
3824 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3825 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3826 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3830 enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
3831 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
3834 VEC(tree,heap) *vec_oprnds = NULL;
3837 /* Multiple types in SLP are handled by creating the appropriate number of
3838 vectorized stmts for each SLP node. Hence, NCOPIES is always 1 in
3843 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
3845 gcc_assert (ncopies >= 1);
3847 return false; /* FORNOW */
3849 if (!STMT_VINFO_RELEVANT_P (stmt_info))
3852 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
3855 /* Is vectorizable assignment? */
3856 if (!is_gimple_assign (stmt))
3859 scalar_dest = gimple_assign_lhs (stmt);
3860 if (TREE_CODE (scalar_dest) != SSA_NAME)
3863 if (gimple_assign_single_p (stmt)
3864 || gimple_assign_rhs_code (stmt) == PAREN_EXPR)
3865 op = gimple_assign_rhs1 (stmt);
3869 if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt[0]))
3871 if (vect_print_dump_info (REPORT_DETAILS))
3872 fprintf (vect_dump, "use not simple.");
3876 if (!vec_stmt) /* transformation not required. */
3878 STMT_VINFO_TYPE (stmt_info) = assignment_vec_info_type;
3879 if (vect_print_dump_info (REPORT_DETAILS))
3880 fprintf (vect_dump, "=== vectorizable_assignment ===");
3881 vect_model_simple_cost (stmt_info, ncopies, dt, NULL);
3886 if (vect_print_dump_info (REPORT_DETAILS))
3887 fprintf (vect_dump, "transform assignment.");
3890 vec_dest = vect_create_destination_var (scalar_dest, vectype);
3893 vect_get_vec_defs (op, NULL, stmt, &vec_oprnds, NULL, slp_node);
3895 /* Arguments are ready. create the new vector stmt. */
3896 for (i = 0; VEC_iterate (tree, vec_oprnds, i, vop); i++)
3898 *vec_stmt = gimple_build_assign (vec_dest, vop);
3899 new_temp = make_ssa_name (vec_dest, *vec_stmt);
3900 gimple_assign_set_lhs (*vec_stmt, new_temp);
3901 vect_finish_stmt_generation (stmt, *vec_stmt, gsi);
3902 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt;
3905 VEC_quick_push (gimple, SLP_TREE_VEC_STMTS (slp_node), *vec_stmt);
3908 VEC_free (tree, heap, vec_oprnds);
3913 /* Function vect_min_worthwhile_factor.
3915 For a loop where we could vectorize the operation indicated by CODE,
3916 return the minimum vectorization factor that makes it worthwhile
3917 to use generic vectors. */
3919 vect_min_worthwhile_factor (enum tree_code code)
3940 /* Function vectorizable_induction
3942 Check if PHI performs an induction computation that can be vectorized.
3943 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
3944 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
3945 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
3948 vectorizable_induction (gimple phi, gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
3951 stmt_vec_info stmt_info = vinfo_for_stmt (phi);
3952 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3953 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3954 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3955 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
3956 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
3959 gcc_assert (ncopies >= 1);
3960 /* FORNOW. This restriction should be relaxed. */
3961 if (nested_in_vect_loop_p (loop, phi) && ncopies > 1)
3963 if (vect_print_dump_info (REPORT_DETAILS))
3964 fprintf (vect_dump, "multiple types in nested loop.");
3968 if (!STMT_VINFO_RELEVANT_P (stmt_info))
3971 /* FORNOW: SLP not supported. */
3972 if (STMT_SLP_TYPE (stmt_info))
3975 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def);
3977 if (gimple_code (phi) != GIMPLE_PHI)
3980 if (!vec_stmt) /* transformation not required. */
3982 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
3983 if (vect_print_dump_info (REPORT_DETAILS))
3984 fprintf (vect_dump, "=== vectorizable_induction ===");
3985 vect_model_induction_cost (stmt_info, ncopies);
3991 if (vect_print_dump_info (REPORT_DETAILS))
3992 fprintf (vect_dump, "transform induction phi.");
3994 vec_def = get_initial_def_for_induction (phi);
3995 *vec_stmt = SSA_NAME_DEF_STMT (vec_def);
4000 /* Function vectorizable_operation.
4002 Check if STMT performs a binary or unary operation that can be vectorized.
4003 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
4004 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
4005 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
4008 vectorizable_operation (gimple stmt, gimple_stmt_iterator *gsi,
4009 gimple *vec_stmt, slp_tree slp_node)
4013 tree op0, op1 = NULL;
4014 tree vec_oprnd1 = NULL_TREE;
4015 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4016 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4017 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4018 enum tree_code code;
4019 enum machine_mode vec_mode;
4024 enum machine_mode optab_op2_mode;
4027 enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
4028 gimple new_stmt = NULL;
4029 stmt_vec_info prev_stmt_info;
4030 int nunits_in = TYPE_VECTOR_SUBPARTS (vectype);
4035 VEC(tree,heap) *vec_oprnds0 = NULL, *vec_oprnds1 = NULL;
4038 bool shift_p = false;
4039 bool scalar_shift_arg = false;
4041 /* Multiple types in SLP are handled by creating the appropriate number of
4042 vectorized stmts for each SLP node. Hence, NCOPIES is always 1 in
4047 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in;
4049 gcc_assert (ncopies >= 1);
4051 if (!STMT_VINFO_RELEVANT_P (stmt_info))
4054 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
4057 /* Is STMT a vectorizable binary/unary operation? */
4058 if (!is_gimple_assign (stmt))
4061 if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
4064 scalar_dest = gimple_assign_lhs (stmt);
4065 vectype_out = get_vectype_for_scalar_type (TREE_TYPE (scalar_dest));
4068 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
4069 if (nunits_out != nunits_in)
4072 code = gimple_assign_rhs_code (stmt);
4074 /* For pointer addition, we should use the normal plus for
4075 the vector addition. */
4076 if (code == POINTER_PLUS_EXPR)
4079 /* Support only unary or binary operations. */
4080 op_type = TREE_CODE_LENGTH (code);
4081 if (op_type != unary_op && op_type != binary_op)
4083 if (vect_print_dump_info (REPORT_DETAILS))
4084 fprintf (vect_dump, "num. args = %d (not unary/binary op).", op_type);
4088 op0 = gimple_assign_rhs1 (stmt);
4089 if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt[0]))
4091 if (vect_print_dump_info (REPORT_DETAILS))
4092 fprintf (vect_dump, "use not simple.");
4096 if (op_type == binary_op)
4098 op1 = gimple_assign_rhs2 (stmt);
4099 if (!vect_is_simple_use (op1, loop_vinfo, &def_stmt, &def, &dt[1]))
4101 if (vect_print_dump_info (REPORT_DETAILS))
4102 fprintf (vect_dump, "use not simple.");
4107 /* If this is a shift/rotate, determine whether the shift amount is a vector,
4108 or scalar. If the shift/rotate amount is a vector, use the vector/vector
4110 if (code == LSHIFT_EXPR || code == RSHIFT_EXPR || code == LROTATE_EXPR
4111 || code == RROTATE_EXPR)
4115 /* vector shifted by vector */
4116 if (dt[1] == vect_loop_def)
4118 optab = optab_for_tree_code (code, vectype, optab_vector);
4119 if (vect_print_dump_info (REPORT_DETAILS))
4120 fprintf (vect_dump, "vector/vector shift/rotate found.");
4123 /* See if the machine has a vector shifted by scalar insn and if not
4124 then see if it has a vector shifted by vector insn */
4125 else if (dt[1] == vect_constant_def || dt[1] == vect_invariant_def)
4127 optab = optab_for_tree_code (code, vectype, optab_scalar);
4129 && (optab_handler (optab, TYPE_MODE (vectype))->insn_code
4130 != CODE_FOR_nothing))
4132 scalar_shift_arg = true;
4133 if (vect_print_dump_info (REPORT_DETAILS))
4134 fprintf (vect_dump, "vector/scalar shift/rotate found.");
4138 optab = optab_for_tree_code (code, vectype, optab_vector);
4139 if (vect_print_dump_info (REPORT_DETAILS)
4141 && (optab_handler (optab, TYPE_MODE (vectype))->insn_code
4142 != CODE_FOR_nothing))
4143 fprintf (vect_dump, "vector/vector shift/rotate found.");
4149 if (vect_print_dump_info (REPORT_DETAILS))
4150 fprintf (vect_dump, "operand mode requires invariant argument.");
4155 optab = optab_for_tree_code (code, vectype, optab_default);
4157 /* Supportable by target? */
4160 if (vect_print_dump_info (REPORT_DETAILS))
4161 fprintf (vect_dump, "no optab.");
4164 vec_mode = TYPE_MODE (vectype);
4165 icode = (int) optab_handler (optab, vec_mode)->insn_code;
4166 if (icode == CODE_FOR_nothing)
4168 if (vect_print_dump_info (REPORT_DETAILS))
4169 fprintf (vect_dump, "op not supported by target.");
4170 /* Check only during analysis. */
4171 if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
4172 || (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4173 < vect_min_worthwhile_factor (code)
4176 if (vect_print_dump_info (REPORT_DETAILS))
4177 fprintf (vect_dump, "proceeding using word mode.");
4180 /* Worthwhile without SIMD support? Check only during analysis. */
4181 if (!VECTOR_MODE_P (TYPE_MODE (vectype))
4182 && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4183 < vect_min_worthwhile_factor (code)
4186 if (vect_print_dump_info (REPORT_DETAILS))
4187 fprintf (vect_dump, "not worthwhile without SIMD support.");
4191 if (!vec_stmt) /* transformation not required. */
4193 STMT_VINFO_TYPE (stmt_info) = op_vec_info_type;
4194 if (vect_print_dump_info (REPORT_DETAILS))
4195 fprintf (vect_dump, "=== vectorizable_operation ===");
4196 vect_model_simple_cost (stmt_info, ncopies, dt, NULL);
4202 if (vect_print_dump_info (REPORT_DETAILS))
4203 fprintf (vect_dump, "transform binary/unary operation.");
4206 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4208 /* Allocate VECs for vector operands. In case of SLP, vector operands are
4209 created in the previous stages of the recursion, so no allocation is
4210 needed, except for the case of shift with scalar shift argument. In that
4211 case we store the scalar operand in VEC_OPRNDS1 for every vector stmt to
4212 be created to vectorize the SLP group, i.e., SLP_NODE->VEC_STMTS_SIZE.
4213 In case of loop-based vectorization we allocate VECs of size 1. We
4214 allocate VEC_OPRNDS1 only in case of binary operation. */
4217 vec_oprnds0 = VEC_alloc (tree, heap, 1);
4218 if (op_type == binary_op)
4219 vec_oprnds1 = VEC_alloc (tree, heap, 1);
4221 else if (scalar_shift_arg)
4222 vec_oprnds1 = VEC_alloc (tree, heap, slp_node->vec_stmts_size);
4224 /* In case the vectorization factor (VF) is bigger than the number
4225 of elements that we can fit in a vectype (nunits), we have to generate
4226 more than one vector stmt - i.e - we need to "unroll" the
4227 vector stmt by a factor VF/nunits. In doing so, we record a pointer
4228 from one copy of the vector stmt to the next, in the field
4229 STMT_VINFO_RELATED_STMT. This is necessary in order to allow following
4230 stages to find the correct vector defs to be used when vectorizing
4231 stmts that use the defs of the current stmt. The example below illustrates
4232 the vectorization process when VF=16 and nunits=4 (i.e - we need to create
4233 4 vectorized stmts):
4235 before vectorization:
4236 RELATED_STMT VEC_STMT
4240 step 1: vectorize stmt S1 (done in vectorizable_load. See more details
4242 RELATED_STMT VEC_STMT
4243 VS1_0: vx0 = memref0 VS1_1 -
4244 VS1_1: vx1 = memref1 VS1_2 -
4245 VS1_2: vx2 = memref2 VS1_3 -
4246 VS1_3: vx3 = memref3 - -
4247 S1: x = load - VS1_0
4250 step2: vectorize stmt S2 (done here):
4251 To vectorize stmt S2 we first need to find the relevant vector
4252 def for the first operand 'x'. This is, as usual, obtained from
4253 the vector stmt recorded in the STMT_VINFO_VEC_STMT of the stmt
4254 that defines 'x' (S1). This way we find the stmt VS1_0, and the
4255 relevant vector def 'vx0'. Having found 'vx0' we can generate
4256 the vector stmt VS2_0, and as usual, record it in the
4257 STMT_VINFO_VEC_STMT of stmt S2.
4258 When creating the second copy (VS2_1), we obtain the relevant vector
4259 def from the vector stmt recorded in the STMT_VINFO_RELATED_STMT of
4260 stmt VS1_0. This way we find the stmt VS1_1 and the relevant
4261 vector def 'vx1'. Using 'vx1' we create stmt VS2_1 and record a
4262 pointer to it in the STMT_VINFO_RELATED_STMT of the vector stmt VS2_0.
4263 Similarly when creating stmts VS2_2 and VS2_3. This is the resulting
4264 chain of stmts and pointers:
4265 RELATED_STMT VEC_STMT
4266 VS1_0: vx0 = memref0 VS1_1 -
4267 VS1_1: vx1 = memref1 VS1_2 -
4268 VS1_2: vx2 = memref2 VS1_3 -
4269 VS1_3: vx3 = memref3 - -
4270 S1: x = load - VS1_0
4271 VS2_0: vz0 = vx0 + v1 VS2_1 -
4272 VS2_1: vz1 = vx1 + v1 VS2_2 -
4273 VS2_2: vz2 = vx2 + v1 VS2_3 -
4274 VS2_3: vz3 = vx3 + v1 - -
4275 S2: z = x + 1 - VS2_0 */
4277 prev_stmt_info = NULL;
4278 for (j = 0; j < ncopies; j++)
4283 if (op_type == binary_op && scalar_shift_arg)
4285 /* Vector shl and shr insn patterns can be defined with scalar
4286 operand 2 (shift operand). In this case, use constant or loop
4287 invariant op1 directly, without extending it to vector mode
4289 optab_op2_mode = insn_data[icode].operand[2].mode;
4290 if (!VECTOR_MODE_P (optab_op2_mode))
4292 if (vect_print_dump_info (REPORT_DETAILS))
4293 fprintf (vect_dump, "operand 1 using scalar mode.");
4295 VEC_quick_push (tree, vec_oprnds1, vec_oprnd1);
4298 /* Store vec_oprnd1 for every vector stmt to be created
4299 for SLP_NODE. We check during the analysis that all the
4300 shift arguments are the same.
4301 TODO: Allow different constants for different vector
4302 stmts generated for an SLP instance. */
4303 for (k = 0; k < slp_node->vec_stmts_size - 1; k++)
4304 VEC_quick_push (tree, vec_oprnds1, vec_oprnd1);
4309 /* vec_oprnd1 is available if operand 1 should be of a scalar-type
4310 (a special case for certain kind of vector shifts); otherwise,
4311 operand 1 should be of a vector type (the usual case). */
4312 if (op_type == binary_op && !vec_oprnd1)
4313 vect_get_vec_defs (op0, op1, stmt, &vec_oprnds0, &vec_oprnds1,
4316 vect_get_vec_defs (op0, NULL_TREE, stmt, &vec_oprnds0, NULL,
4320 vect_get_vec_defs_for_stmt_copy (dt, &vec_oprnds0, &vec_oprnds1);
4322 /* Arguments are ready. Create the new vector stmt. */
4323 for (i = 0; VEC_iterate (tree, vec_oprnds0, i, vop0); i++)
4325 vop1 = ((op_type == binary_op)
4326 ? VEC_index (tree, vec_oprnds1, i) : NULL);
4327 new_stmt = gimple_build_assign_with_ops (code, vec_dest, vop0, vop1);
4328 new_temp = make_ssa_name (vec_dest, new_stmt);
4329 gimple_assign_set_lhs (new_stmt, new_temp);
4330 vect_finish_stmt_generation (stmt, new_stmt, gsi);
4332 VEC_quick_push (gimple, SLP_TREE_VEC_STMTS (slp_node), new_stmt);
4339 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
4341 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
4342 prev_stmt_info = vinfo_for_stmt (new_stmt);
4345 VEC_free (tree, heap, vec_oprnds0);
4347 VEC_free (tree, heap, vec_oprnds1);
4353 /* Get vectorized definitions for loop-based vectorization. For the first
4354 operand we call vect_get_vec_def_for_operand() (with OPRND containing
4355 scalar operand), and for the rest we get a copy with
4356 vect_get_vec_def_for_stmt_copy() using the previous vector definition
4357 (stored in OPRND). See vect_get_vec_def_for_stmt_copy() for details.
4358 The vectors are collected into VEC_OPRNDS. */
4361 vect_get_loop_based_defs (tree *oprnd, gimple stmt, enum vect_def_type dt,
4362 VEC (tree, heap) **vec_oprnds, int multi_step_cvt)
4366 /* Get first vector operand. */
4367 /* All the vector operands except the very first one (that is scalar oprnd)
4369 if (TREE_CODE (TREE_TYPE (*oprnd)) != VECTOR_TYPE)
4370 vec_oprnd = vect_get_vec_def_for_operand (*oprnd, stmt, NULL);
4372 vec_oprnd = vect_get_vec_def_for_stmt_copy (dt, *oprnd);
4374 VEC_quick_push (tree, *vec_oprnds, vec_oprnd);
4376 /* Get second vector operand. */
4377 vec_oprnd = vect_get_vec_def_for_stmt_copy (dt, vec_oprnd);
4378 VEC_quick_push (tree, *vec_oprnds, vec_oprnd);
4382 /* For conversion in multiple steps, continue to get operands
4385 vect_get_loop_based_defs (oprnd, stmt, dt, vec_oprnds, multi_step_cvt - 1);
4389 /* Create vectorized demotion statements for vector operands from VEC_OPRNDS.
4390 For multi-step conversions store the resulting vectors and call the function
4394 vect_create_vectorized_demotion_stmts (VEC (tree, heap) **vec_oprnds,
4395 int multi_step_cvt, gimple stmt,
4396 VEC (tree, heap) *vec_dsts,
4397 gimple_stmt_iterator *gsi,
4398 slp_tree slp_node, enum tree_code code,
4399 stmt_vec_info *prev_stmt_info)
4402 tree vop0, vop1, new_tmp, vec_dest;
4404 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4406 vec_dest = VEC_pop (tree, vec_dsts);
4408 for (i = 0; i < VEC_length (tree, *vec_oprnds); i += 2)
4410 /* Create demotion operation. */
4411 vop0 = VEC_index (tree, *vec_oprnds, i);
4412 vop1 = VEC_index (tree, *vec_oprnds, i + 1);
4413 new_stmt = gimple_build_assign_with_ops (code, vec_dest, vop0, vop1);
4414 new_tmp = make_ssa_name (vec_dest, new_stmt);
4415 gimple_assign_set_lhs (new_stmt, new_tmp);
4416 vect_finish_stmt_generation (stmt, new_stmt, gsi);
4419 /* Store the resulting vector for next recursive call. */
4420 VEC_replace (tree, *vec_oprnds, i/2, new_tmp);
4423 /* This is the last step of the conversion sequence. Store the
4424 vectors in SLP_NODE or in vector info of the scalar statement
4425 (or in STMT_VINFO_RELATED_STMT chain). */
4427 VEC_quick_push (gimple, SLP_TREE_VEC_STMTS (slp_node), new_stmt);
4430 if (!*prev_stmt_info)
4431 STMT_VINFO_VEC_STMT (stmt_info) = new_stmt;
4433 STMT_VINFO_RELATED_STMT (*prev_stmt_info) = new_stmt;
4435 *prev_stmt_info = vinfo_for_stmt (new_stmt);
4440 /* For multi-step demotion operations we first generate demotion operations
4441 from the source type to the intermediate types, and then combine the
4442 results (stored in VEC_OPRNDS) in demotion operation to the destination
4446 /* At each level of recursion we have have of the operands we had at the
4448 VEC_truncate (tree, *vec_oprnds, (i+1)/2);
4449 vect_create_vectorized_demotion_stmts (vec_oprnds, multi_step_cvt - 1,
4450 stmt, vec_dsts, gsi, slp_node,
4451 code, prev_stmt_info);
4456 /* Function vectorizable_type_demotion
4458 Check if STMT performs a binary or unary operation that involves
4459 type demotion, and if it can be vectorized.
4460 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
4461 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
4462 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
4465 vectorizable_type_demotion (gimple stmt, gimple_stmt_iterator *gsi,
4466 gimple *vec_stmt, slp_tree slp_node)
4471 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4472 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4473 enum tree_code code, code1 = ERROR_MARK;
4476 enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
4477 stmt_vec_info prev_stmt_info;
4484 int multi_step_cvt = 0;
4485 VEC (tree, heap) *vec_oprnds0 = NULL;
4486 VEC (tree, heap) *vec_dsts = NULL, *interm_types = NULL, *tmp_vec_dsts = NULL;
4487 tree last_oprnd, intermediate_type;
4489 if (!STMT_VINFO_RELEVANT_P (stmt_info))
4492 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
4495 /* Is STMT a vectorizable type-demotion operation? */
4496 if (!is_gimple_assign (stmt))
4499 if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
4502 code = gimple_assign_rhs_code (stmt);
4503 if (!CONVERT_EXPR_CODE_P (code))
4506 op0 = gimple_assign_rhs1 (stmt);
4507 vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op0));
4510 nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
4512 scalar_dest = gimple_assign_lhs (stmt);
4513 vectype_out = get_vectype_for_scalar_type (TREE_TYPE (scalar_dest));
4516 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
4517 if (nunits_in >= nunits_out)
4520 /* Multiple types in SLP are handled by creating the appropriate number of
4521 vectorized stmts for each SLP node. Hence, NCOPIES is always 1 in
4526 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_out;
4528 gcc_assert (ncopies >= 1);
4530 if (! ((INTEGRAL_TYPE_P (TREE_TYPE (scalar_dest))
4531 && INTEGRAL_TYPE_P (TREE_TYPE (op0)))
4532 || (SCALAR_FLOAT_TYPE_P (TREE_TYPE (scalar_dest))
4533 && SCALAR_FLOAT_TYPE_P (TREE_TYPE (op0))
4534 && CONVERT_EXPR_CODE_P (code))))
4537 /* Check the operands of the operation. */
4538 if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt[0]))
4540 if (vect_print_dump_info (REPORT_DETAILS))
4541 fprintf (vect_dump, "use not simple.");
4545 /* Supportable by target? */
4546 if (!supportable_narrowing_operation (code, stmt, vectype_in, &code1,
4547 &multi_step_cvt, &interm_types))
4550 STMT_VINFO_VECTYPE (stmt_info) = vectype_in;
4552 if (!vec_stmt) /* transformation not required. */
4554 STMT_VINFO_TYPE (stmt_info) = type_demotion_vec_info_type;
4555 if (vect_print_dump_info (REPORT_DETAILS))
4556 fprintf (vect_dump, "=== vectorizable_demotion ===");
4557 vect_model_simple_cost (stmt_info, ncopies, dt, NULL);
4562 if (vect_print_dump_info (REPORT_DETAILS))
4563 fprintf (vect_dump, "transform type demotion operation. ncopies = %d.",
4566 /* In case of multi-step demotion, we first generate demotion operations to
4567 the intermediate types, and then from that types to the final one.
4568 We create vector destinations for the intermediate type (TYPES) received
4569 from supportable_narrowing_operation, and store them in the correct order
4570 for future use in vect_create_vectorized_demotion_stmts(). */
4572 vec_dsts = VEC_alloc (tree, heap, multi_step_cvt + 1);
4574 vec_dsts = VEC_alloc (tree, heap, 1);
4576 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
4577 VEC_quick_push (tree, vec_dsts, vec_dest);
4581 for (i = VEC_length (tree, interm_types) - 1;
4582 VEC_iterate (tree, interm_types, i, intermediate_type); i--)
4584 vec_dest = vect_create_destination_var (scalar_dest,
4586 VEC_quick_push (tree, vec_dsts, vec_dest);
4590 /* In case the vectorization factor (VF) is bigger than the number
4591 of elements that we can fit in a vectype (nunits), we have to generate
4592 more than one vector stmt - i.e - we need to "unroll" the
4593 vector stmt by a factor VF/nunits. */
4595 prev_stmt_info = NULL;
4596 for (j = 0; j < ncopies; j++)
4600 vect_get_slp_defs (slp_node, &vec_oprnds0, NULL);
4603 VEC_free (tree, heap, vec_oprnds0);
4604 vec_oprnds0 = VEC_alloc (tree, heap,
4605 (multi_step_cvt ? vect_pow2 (multi_step_cvt) * 2 : 2));
4606 vect_get_loop_based_defs (&last_oprnd, stmt, dt[0], &vec_oprnds0,
4607 vect_pow2 (multi_step_cvt) - 1);
4610 /* Arguments are ready. Create the new vector stmts. */
4611 tmp_vec_dsts = VEC_copy (tree, heap, vec_dsts);
4612 vect_create_vectorized_demotion_stmts (&vec_oprnds0,
4613 multi_step_cvt, stmt, tmp_vec_dsts,
4614 gsi, slp_node, code1,
4618 VEC_free (tree, heap, vec_oprnds0);
4619 VEC_free (tree, heap, vec_dsts);
4620 VEC_free (tree, heap, tmp_vec_dsts);
4621 VEC_free (tree, heap, interm_types);
4623 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
4628 /* Create vectorized promotion statements for vector operands from VEC_OPRNDS0
4629 and VEC_OPRNDS1 (for binary operations). For multi-step conversions store
4630 the resulting vectors and call the function recursively. */
4633 vect_create_vectorized_promotion_stmts (VEC (tree, heap) **vec_oprnds0,
4634 VEC (tree, heap) **vec_oprnds1,
4635 int multi_step_cvt, gimple stmt,
4636 VEC (tree, heap) *vec_dsts,
4637 gimple_stmt_iterator *gsi,
4638 slp_tree slp_node, enum tree_code code1,
4639 enum tree_code code2, tree decl1,
4640 tree decl2, int op_type,
4641 stmt_vec_info *prev_stmt_info)
4644 tree vop0, vop1, new_tmp1, new_tmp2, vec_dest;
4645 gimple new_stmt1, new_stmt2;
4646 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4647 VEC (tree, heap) *vec_tmp;
4649 vec_dest = VEC_pop (tree, vec_dsts);
4650 vec_tmp = VEC_alloc (tree, heap, VEC_length (tree, *vec_oprnds0) * 2);
4652 for (i = 0; VEC_iterate (tree, *vec_oprnds0, i, vop0); i++)
4654 if (op_type == binary_op)
4655 vop1 = VEC_index (tree, *vec_oprnds1, i);
4659 /* Generate the two halves of promotion operation. */
4660 new_stmt1 = vect_gen_widened_results_half (code1, decl1, vop0, vop1,
4661 op_type, vec_dest, gsi, stmt);
4662 new_stmt2 = vect_gen_widened_results_half (code2, decl2, vop0, vop1,
4663 op_type, vec_dest, gsi, stmt);
4664 if (is_gimple_call (new_stmt1))
4666 new_tmp1 = gimple_call_lhs (new_stmt1);
4667 new_tmp2 = gimple_call_lhs (new_stmt2);
4671 new_tmp1 = gimple_assign_lhs (new_stmt1);
4672 new_tmp2 = gimple_assign_lhs (new_stmt2);
4677 /* Store the results for the recursive call. */
4678 VEC_quick_push (tree, vec_tmp, new_tmp1);
4679 VEC_quick_push (tree, vec_tmp, new_tmp2);
4683 /* Last step of promotion sequience - store the results. */
4686 VEC_quick_push (gimple, SLP_TREE_VEC_STMTS (slp_node), new_stmt1);
4687 VEC_quick_push (gimple, SLP_TREE_VEC_STMTS (slp_node), new_stmt2);
4691 if (!*prev_stmt_info)
4692 STMT_VINFO_VEC_STMT (stmt_info) = new_stmt1;
4694 STMT_VINFO_RELATED_STMT (*prev_stmt_info) = new_stmt1;
4696 *prev_stmt_info = vinfo_for_stmt (new_stmt1);
4697 STMT_VINFO_RELATED_STMT (*prev_stmt_info) = new_stmt2;
4698 *prev_stmt_info = vinfo_for_stmt (new_stmt2);
4705 /* For multi-step promotion operation we first generate we call the
4706 function recurcively for every stage. We start from the input type,
4707 create promotion operations to the intermediate types, and then
4708 create promotions to the output type. */
4709 *vec_oprnds0 = VEC_copy (tree, heap, vec_tmp);
4710 VEC_free (tree, heap, vec_tmp);
4711 vect_create_vectorized_promotion_stmts (vec_oprnds0, vec_oprnds1,
4712 multi_step_cvt - 1, stmt,
4713 vec_dsts, gsi, slp_node, code1,
4714 code2, decl2, decl2, op_type,
4720 /* Function vectorizable_type_promotion
4722 Check if STMT performs a binary or unary operation that involves
4723 type promotion, and if it can be vectorized.
4724 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
4725 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
4726 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
4729 vectorizable_type_promotion (gimple stmt, gimple_stmt_iterator *gsi,
4730 gimple *vec_stmt, slp_tree slp_node)
4734 tree op0, op1 = NULL;
4735 tree vec_oprnd0=NULL, vec_oprnd1=NULL;
4736 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4737 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4738 enum tree_code code, code1 = ERROR_MARK, code2 = ERROR_MARK;
4739 tree decl1 = NULL_TREE, decl2 = NULL_TREE;
4743 enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
4744 stmt_vec_info prev_stmt_info;
4751 tree intermediate_type = NULL_TREE;
4752 int multi_step_cvt = 0;
4753 VEC (tree, heap) *vec_oprnds0 = NULL, *vec_oprnds1 = NULL;
4754 VEC (tree, heap) *vec_dsts = NULL, *interm_types = NULL, *tmp_vec_dsts = NULL;
4756 if (!STMT_VINFO_RELEVANT_P (stmt_info))
4759 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
4762 /* Is STMT a vectorizable type-promotion operation? */
4763 if (!is_gimple_assign (stmt))
4766 if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
4769 code = gimple_assign_rhs_code (stmt);
4770 if (!CONVERT_EXPR_CODE_P (code)
4771 && code != WIDEN_MULT_EXPR)
4774 op0 = gimple_assign_rhs1 (stmt);
4775 vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op0));
4778 nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
4780 scalar_dest = gimple_assign_lhs (stmt);
4781 vectype_out = get_vectype_for_scalar_type (TREE_TYPE (scalar_dest));
4784 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
4785 if (nunits_in <= nunits_out)
4788 /* Multiple types in SLP are handled by creating the appropriate number of
4789 vectorized stmts for each SLP node. Hence, NCOPIES is always 1 in
4794 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in;
4796 gcc_assert (ncopies >= 1);
4798 if (! ((INTEGRAL_TYPE_P (TREE_TYPE (scalar_dest))
4799 && INTEGRAL_TYPE_P (TREE_TYPE (op0)))
4800 || (SCALAR_FLOAT_TYPE_P (TREE_TYPE (scalar_dest))
4801 && SCALAR_FLOAT_TYPE_P (TREE_TYPE (op0))
4802 && CONVERT_EXPR_CODE_P (code))))
4805 /* Check the operands of the operation. */
4806 if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt[0]))
4808 if (vect_print_dump_info (REPORT_DETAILS))
4809 fprintf (vect_dump, "use not simple.");
4813 op_type = TREE_CODE_LENGTH (code);
4814 if (op_type == binary_op)
4816 op1 = gimple_assign_rhs2 (stmt);
4817 if (!vect_is_simple_use (op1, loop_vinfo, &def_stmt, &def, &dt[1]))
4819 if (vect_print_dump_info (REPORT_DETAILS))
4820 fprintf (vect_dump, "use not simple.");
4825 /* Supportable by target? */
4826 if (!supportable_widening_operation (code, stmt, vectype_in,
4827 &decl1, &decl2, &code1, &code2,
4828 &multi_step_cvt, &interm_types))
4831 /* Binary widening operation can only be supported directly by the
4833 gcc_assert (!(multi_step_cvt && op_type == binary_op));
4835 STMT_VINFO_VECTYPE (stmt_info) = vectype_in;
4837 if (!vec_stmt) /* transformation not required. */
4839 STMT_VINFO_TYPE (stmt_info) = type_promotion_vec_info_type;
4840 if (vect_print_dump_info (REPORT_DETAILS))
4841 fprintf (vect_dump, "=== vectorizable_promotion ===");
4842 vect_model_simple_cost (stmt_info, 2*ncopies, dt, NULL);
4848 if (vect_print_dump_info (REPORT_DETAILS))
4849 fprintf (vect_dump, "transform type promotion operation. ncopies = %d.",
4853 /* In case of multi-step promotion, we first generate promotion operations
4854 to the intermediate types, and then from that types to the final one.
4855 We store vector destination in VEC_DSTS in the correct order for
4856 recursive creation of promotion operations in
4857 vect_create_vectorized_promotion_stmts(). Vector destinations are created
4858 according to TYPES recieved from supportable_widening_operation(). */
4860 vec_dsts = VEC_alloc (tree, heap, multi_step_cvt + 1);
4862 vec_dsts = VEC_alloc (tree, heap, 1);
4864 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
4865 VEC_quick_push (tree, vec_dsts, vec_dest);
4869 for (i = VEC_length (tree, interm_types) - 1;
4870 VEC_iterate (tree, interm_types, i, intermediate_type); i--)
4872 vec_dest = vect_create_destination_var (scalar_dest,
4874 VEC_quick_push (tree, vec_dsts, vec_dest);
4880 vec_oprnds0 = VEC_alloc (tree, heap,
4881 (multi_step_cvt ? vect_pow2 (multi_step_cvt) : 1));
4882 if (op_type == binary_op)
4883 vec_oprnds1 = VEC_alloc (tree, heap, 1);
4886 /* In case the vectorization factor (VF) is bigger than the number
4887 of elements that we can fit in a vectype (nunits), we have to generate
4888 more than one vector stmt - i.e - we need to "unroll" the
4889 vector stmt by a factor VF/nunits. */
4891 prev_stmt_info = NULL;
4892 for (j = 0; j < ncopies; j++)
4898 vect_get_slp_defs (slp_node, &vec_oprnds0, &vec_oprnds1);
4901 vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
4902 VEC_quick_push (tree, vec_oprnds0, vec_oprnd0);
4903 if (op_type == binary_op)
4905 vec_oprnd1 = vect_get_vec_def_for_operand (op1, stmt, NULL);
4906 VEC_quick_push (tree, vec_oprnds1, vec_oprnd1);
4912 vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd0);
4913 VEC_replace (tree, vec_oprnds0, 0, vec_oprnd0);
4914 if (op_type == binary_op)
4916 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt[1], vec_oprnd1);
4917 VEC_replace (tree, vec_oprnds1, 0, vec_oprnd1);
4921 /* Arguments are ready. Create the new vector stmts. */
4922 tmp_vec_dsts = VEC_copy (tree, heap, vec_dsts);
4923 vect_create_vectorized_promotion_stmts (&vec_oprnds0, &vec_oprnds1,
4924 multi_step_cvt, stmt,
4926 gsi, slp_node, code1, code2,
4927 decl1, decl2, op_type,
4931 VEC_free (tree, heap, vec_dsts);
4932 VEC_free (tree, heap, tmp_vec_dsts);
4933 VEC_free (tree, heap, interm_types);
4934 VEC_free (tree, heap, vec_oprnds0);
4935 VEC_free (tree, heap, vec_oprnds1);
4937 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
4942 /* Function vect_strided_store_supported.
4944 Returns TRUE is INTERLEAVE_HIGH and INTERLEAVE_LOW operations are supported,
4945 and FALSE otherwise. */
4948 vect_strided_store_supported (tree vectype)
4950 optab interleave_high_optab, interleave_low_optab;
4953 mode = (int) TYPE_MODE (vectype);
4955 /* Check that the operation is supported. */
4956 interleave_high_optab = optab_for_tree_code (VEC_INTERLEAVE_HIGH_EXPR,
4957 vectype, optab_default);
4958 interleave_low_optab = optab_for_tree_code (VEC_INTERLEAVE_LOW_EXPR,
4959 vectype, optab_default);
4960 if (!interleave_high_optab || !interleave_low_optab)
4962 if (vect_print_dump_info (REPORT_DETAILS))
4963 fprintf (vect_dump, "no optab for interleave.");
4967 if (optab_handler (interleave_high_optab, mode)->insn_code
4969 || optab_handler (interleave_low_optab, mode)->insn_code
4970 == CODE_FOR_nothing)
4972 if (vect_print_dump_info (REPORT_DETAILS))
4973 fprintf (vect_dump, "interleave op not supported by target.");
4981 /* Function vect_permute_store_chain.
4983 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
4984 a power of 2, generate interleave_high/low stmts to reorder the data
4985 correctly for the stores. Return the final references for stores in
4988 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4989 The input is 4 vectors each containing 8 elements. We assign a number to each
4990 element, the input sequence is:
4992 1st vec: 0 1 2 3 4 5 6 7
4993 2nd vec: 8 9 10 11 12 13 14 15
4994 3rd vec: 16 17 18 19 20 21 22 23
4995 4th vec: 24 25 26 27 28 29 30 31
4997 The output sequence should be:
4999 1st vec: 0 8 16 24 1 9 17 25
5000 2nd vec: 2 10 18 26 3 11 19 27
5001 3rd vec: 4 12 20 28 5 13 21 30
5002 4th vec: 6 14 22 30 7 15 23 31
5004 i.e., we interleave the contents of the four vectors in their order.
5006 We use interleave_high/low instructions to create such output. The input of
5007 each interleave_high/low operation is two vectors:
5010 the even elements of the result vector are obtained left-to-right from the
5011 high/low elements of the first vector. The odd elements of the result are
5012 obtained left-to-right from the high/low elements of the second vector.
5013 The output of interleave_high will be: 0 4 1 5
5014 and of interleave_low: 2 6 3 7
5017 The permutation is done in log LENGTH stages. In each stage interleave_high
5018 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
5019 where the first argument is taken from the first half of DR_CHAIN and the
5020 second argument from it's second half.
5023 I1: interleave_high (1st vec, 3rd vec)
5024 I2: interleave_low (1st vec, 3rd vec)
5025 I3: interleave_high (2nd vec, 4th vec)
5026 I4: interleave_low (2nd vec, 4th vec)
5028 The output for the first stage is:
5030 I1: 0 16 1 17 2 18 3 19
5031 I2: 4 20 5 21 6 22 7 23
5032 I3: 8 24 9 25 10 26 11 27
5033 I4: 12 28 13 29 14 30 15 31
5035 The output of the second stage, i.e. the final result is:
5037 I1: 0 8 16 24 1 9 17 25
5038 I2: 2 10 18 26 3 11 19 27
5039 I3: 4 12 20 28 5 13 21 30
5040 I4: 6 14 22 30 7 15 23 31. */
5043 vect_permute_store_chain (VEC(tree,heap) *dr_chain,
5044 unsigned int length,
5046 gimple_stmt_iterator *gsi,
5047 VEC(tree,heap) **result_chain)
5049 tree perm_dest, vect1, vect2, high, low;
5051 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5055 enum tree_code high_code, low_code;
5057 scalar_dest = gimple_assign_lhs (stmt);
5059 /* Check that the operation is supported. */
5060 if (!vect_strided_store_supported (vectype))
5063 *result_chain = VEC_copy (tree, heap, dr_chain);
5065 for (i = 0; i < exact_log2 (length); i++)
5067 for (j = 0; j < length/2; j++)
5069 vect1 = VEC_index (tree, dr_chain, j);
5070 vect2 = VEC_index (tree, dr_chain, j+length/2);
5072 /* Create interleaving stmt:
5073 in the case of big endian:
5074 high = interleave_high (vect1, vect2)
5075 and in the case of little endian:
5076 high = interleave_low (vect1, vect2). */
5077 perm_dest = create_tmp_var (vectype, "vect_inter_high");
5078 DECL_GIMPLE_REG_P (perm_dest) = 1;
5079 add_referenced_var (perm_dest);
5080 if (BYTES_BIG_ENDIAN)
5082 high_code = VEC_INTERLEAVE_HIGH_EXPR;
5083 low_code = VEC_INTERLEAVE_LOW_EXPR;
5087 low_code = VEC_INTERLEAVE_HIGH_EXPR;
5088 high_code = VEC_INTERLEAVE_LOW_EXPR;
5090 perm_stmt = gimple_build_assign_with_ops (high_code, perm_dest,
5092 high = make_ssa_name (perm_dest, perm_stmt);
5093 gimple_assign_set_lhs (perm_stmt, high);
5094 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5095 VEC_replace (tree, *result_chain, 2*j, high);
5097 /* Create interleaving stmt:
5098 in the case of big endian:
5099 low = interleave_low (vect1, vect2)
5100 and in the case of little endian:
5101 low = interleave_high (vect1, vect2). */
5102 perm_dest = create_tmp_var (vectype, "vect_inter_low");
5103 DECL_GIMPLE_REG_P (perm_dest) = 1;
5104 add_referenced_var (perm_dest);
5105 perm_stmt = gimple_build_assign_with_ops (low_code, perm_dest,
5107 low = make_ssa_name (perm_dest, perm_stmt);
5108 gimple_assign_set_lhs (perm_stmt, low);
5109 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5110 VEC_replace (tree, *result_chain, 2*j+1, low);
5112 dr_chain = VEC_copy (tree, heap, *result_chain);
5118 /* Function vectorizable_store.
5120 Check if STMT defines a non scalar data-ref (array/pointer/structure) that
5122 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
5123 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
5124 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
5127 vectorizable_store (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt,
5133 tree vec_oprnd = NULL_TREE;
5134 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5135 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info), *first_dr = NULL;
5136 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5137 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5138 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5139 enum machine_mode vec_mode;
5141 enum dr_alignment_support alignment_support_scheme;
5144 enum vect_def_type dt;
5145 stmt_vec_info prev_stmt_info = NULL;
5146 tree dataref_ptr = NULL_TREE;
5147 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
5150 gimple next_stmt, first_stmt = NULL;
5151 bool strided_store = false;
5152 unsigned int group_size, i;
5153 VEC(tree,heap) *dr_chain = NULL, *oprnds = NULL, *result_chain = NULL;
5155 VEC(tree,heap) *vec_oprnds = NULL;
5156 bool slp = (slp_node != NULL);
5157 stmt_vec_info first_stmt_vinfo;
5158 unsigned int vec_num;
5160 /* Multiple types in SLP are handled by creating the appropriate number of
5161 vectorized stmts for each SLP node. Hence, NCOPIES is always 1 in
5166 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
5168 gcc_assert (ncopies >= 1);
5170 /* FORNOW. This restriction should be relaxed. */
5171 if (nested_in_vect_loop_p (loop, stmt) && ncopies > 1)
5173 if (vect_print_dump_info (REPORT_DETAILS))
5174 fprintf (vect_dump, "multiple types in nested loop.");
5178 if (!STMT_VINFO_RELEVANT_P (stmt_info))
5181 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
5184 /* Is vectorizable store? */
5186 if (!is_gimple_assign (stmt))
5189 scalar_dest = gimple_assign_lhs (stmt);
5190 if (TREE_CODE (scalar_dest) != ARRAY_REF
5191 && TREE_CODE (scalar_dest) != INDIRECT_REF
5192 && !STMT_VINFO_STRIDED_ACCESS (stmt_info))
5195 gcc_assert (gimple_assign_single_p (stmt));
5196 op = gimple_assign_rhs1 (stmt);
5197 if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt))
5199 if (vect_print_dump_info (REPORT_DETAILS))
5200 fprintf (vect_dump, "use not simple.");
5204 /* The scalar rhs type needs to be trivially convertible to the vector
5205 component type. This should always be the case. */
5206 if (!useless_type_conversion_p (TREE_TYPE (vectype), TREE_TYPE (op)))
5208 if (vect_print_dump_info (REPORT_DETAILS))
5209 fprintf (vect_dump, "??? operands of different types");
5213 vec_mode = TYPE_MODE (vectype);
5214 /* FORNOW. In some cases can vectorize even if data-type not supported
5215 (e.g. - array initialization with 0). */
5216 if (optab_handler (mov_optab, (int)vec_mode)->insn_code == CODE_FOR_nothing)
5219 if (!STMT_VINFO_DATA_REF (stmt_info))
5222 if (STMT_VINFO_STRIDED_ACCESS (stmt_info))
5224 strided_store = true;
5225 first_stmt = DR_GROUP_FIRST_DR (stmt_info);
5226 if (!vect_strided_store_supported (vectype)
5227 && !PURE_SLP_STMT (stmt_info) && !slp)
5230 if (first_stmt == stmt)
5232 /* STMT is the leader of the group. Check the operands of all the
5233 stmts of the group. */
5234 next_stmt = DR_GROUP_NEXT_DR (stmt_info);
5237 gcc_assert (gimple_assign_single_p (next_stmt));
5238 op = gimple_assign_rhs1 (next_stmt);
5239 if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt))
5241 if (vect_print_dump_info (REPORT_DETAILS))
5242 fprintf (vect_dump, "use not simple.");
5245 next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt));
5250 if (!vec_stmt) /* transformation not required. */
5252 STMT_VINFO_TYPE (stmt_info) = store_vec_info_type;
5253 vect_model_store_cost (stmt_info, ncopies, dt, NULL);
5261 first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
5262 group_size = DR_GROUP_SIZE (vinfo_for_stmt (first_stmt));
5264 DR_GROUP_STORE_COUNT (vinfo_for_stmt (first_stmt))++;
5267 gcc_assert (!nested_in_vect_loop_p (loop, stmt));
5269 /* We vectorize all the stmts of the interleaving group when we
5270 reach the last stmt in the group. */
5271 if (DR_GROUP_STORE_COUNT (vinfo_for_stmt (first_stmt))
5272 < DR_GROUP_SIZE (vinfo_for_stmt (first_stmt))
5280 strided_store = false;
5282 /* VEC_NUM is the number of vect stmts to be created for this group. */
5284 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
5286 vec_num = group_size;
5292 group_size = vec_num = 1;
5293 first_stmt_vinfo = stmt_info;
5296 if (vect_print_dump_info (REPORT_DETAILS))
5297 fprintf (vect_dump, "transform store. ncopies = %d",ncopies);
5299 dr_chain = VEC_alloc (tree, heap, group_size);
5300 oprnds = VEC_alloc (tree, heap, group_size);
5302 alignment_support_scheme = vect_supportable_dr_alignment (first_dr);
5303 gcc_assert (alignment_support_scheme);
5304 gcc_assert (alignment_support_scheme == dr_aligned); /* FORNOW */
5306 /* In case the vectorization factor (VF) is bigger than the number
5307 of elements that we can fit in a vectype (nunits), we have to generate
5308 more than one vector stmt - i.e - we need to "unroll" the
5309 vector stmt by a factor VF/nunits. For more details see documentation in
5310 vect_get_vec_def_for_copy_stmt. */
5312 /* In case of interleaving (non-unit strided access):
5319 We create vectorized stores starting from base address (the access of the
5320 first stmt in the chain (S2 in the above example), when the last store stmt
5321 of the chain (S4) is reached:
5324 VS2: &base + vec_size*1 = vx0
5325 VS3: &base + vec_size*2 = vx1
5326 VS4: &base + vec_size*3 = vx3
5328 Then permutation statements are generated:
5330 VS5: vx5 = VEC_INTERLEAVE_HIGH_EXPR < vx0, vx3 >
5331 VS6: vx6 = VEC_INTERLEAVE_LOW_EXPR < vx0, vx3 >
5334 And they are put in STMT_VINFO_VEC_STMT of the corresponding scalar stmts
5335 (the order of the data-refs in the output of vect_permute_store_chain
5336 corresponds to the order of scalar stmts in the interleaving chain - see
5337 the documentation of vect_permute_store_chain()).
5339 In case of both multiple types and interleaving, above vector stores and
5340 permutation stmts are created for every copy. The result vector stmts are
5341 put in STMT_VINFO_VEC_STMT for the first copy and in the corresponding
5342 STMT_VINFO_RELATED_STMT for the next copies.
5345 prev_stmt_info = NULL;
5346 for (j = 0; j < ncopies; j++)
5355 /* Get vectorized arguments for SLP_NODE. */
5356 vect_get_slp_defs (slp_node, &vec_oprnds, NULL);
5358 vec_oprnd = VEC_index (tree, vec_oprnds, 0);
5362 /* For interleaved stores we collect vectorized defs for all the
5363 stores in the group in DR_CHAIN and OPRNDS. DR_CHAIN is then
5364 used as an input to vect_permute_store_chain(), and OPRNDS as
5365 an input to vect_get_vec_def_for_stmt_copy() for the next copy.
5367 If the store is not strided, GROUP_SIZE is 1, and DR_CHAIN and
5368 OPRNDS are of size 1. */
5369 next_stmt = first_stmt;
5370 for (i = 0; i < group_size; i++)
5372 /* Since gaps are not supported for interleaved stores,
5373 GROUP_SIZE is the exact number of stmts in the chain.
5374 Therefore, NEXT_STMT can't be NULL_TREE. In case that
5375 there is no interleaving, GROUP_SIZE is 1, and only one
5376 iteration of the loop will be executed. */
5377 gcc_assert (next_stmt
5378 && gimple_assign_single_p (next_stmt));
5379 op = gimple_assign_rhs1 (next_stmt);
5381 vec_oprnd = vect_get_vec_def_for_operand (op, next_stmt,
5383 VEC_quick_push(tree, dr_chain, vec_oprnd);
5384 VEC_quick_push(tree, oprnds, vec_oprnd);
5385 next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt));
5389 /* We should have catched mismatched types earlier. */
5390 gcc_assert (useless_type_conversion_p (vectype,
5391 TREE_TYPE (vec_oprnd)));
5392 dataref_ptr = vect_create_data_ref_ptr (first_stmt, NULL, NULL_TREE,
5393 &dummy, &ptr_incr, false,
5395 gcc_assert (!inv_p);
5399 /* For interleaved stores we created vectorized defs for all the
5400 defs stored in OPRNDS in the previous iteration (previous copy).
5401 DR_CHAIN is then used as an input to vect_permute_store_chain(),
5402 and OPRNDS as an input to vect_get_vec_def_for_stmt_copy() for the
5404 If the store is not strided, GROUP_SIZE is 1, and DR_CHAIN and
5405 OPRNDS are of size 1. */
5406 for (i = 0; i < group_size; i++)
5408 op = VEC_index (tree, oprnds, i);
5409 vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt);
5410 vec_oprnd = vect_get_vec_def_for_stmt_copy (dt, op);
5411 VEC_replace(tree, dr_chain, i, vec_oprnd);
5412 VEC_replace(tree, oprnds, i, vec_oprnd);
5415 bump_vector_ptr (dataref_ptr, ptr_incr, gsi, stmt, NULL_TREE);
5420 result_chain = VEC_alloc (tree, heap, group_size);
5422 if (!vect_permute_store_chain (dr_chain, group_size, stmt, gsi,
5427 next_stmt = first_stmt;
5428 for (i = 0; i < vec_num; i++)
5431 /* Bump the vector pointer. */
5432 dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, gsi, stmt,
5436 vec_oprnd = VEC_index (tree, vec_oprnds, i);
5437 else if (strided_store)
5438 /* For strided stores vectorized defs are interleaved in
5439 vect_permute_store_chain(). */
5440 vec_oprnd = VEC_index (tree, result_chain, i);
5442 data_ref = build_fold_indirect_ref (dataref_ptr);
5444 /* Arguments are ready. Create the new vector stmt. */
5445 new_stmt = gimple_build_assign (data_ref, vec_oprnd);
5446 vect_finish_stmt_generation (stmt, new_stmt, gsi);
5447 mark_symbols_for_renaming (new_stmt);
5453 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
5455 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
5457 prev_stmt_info = vinfo_for_stmt (new_stmt);
5458 next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt));
5464 VEC_free (tree, heap, dr_chain);
5465 VEC_free (tree, heap, oprnds);
5467 VEC_free (tree, heap, result_chain);
5473 /* Function vect_setup_realignment
5475 This function is called when vectorizing an unaligned load using
5476 the dr_explicit_realign[_optimized] scheme.
5477 This function generates the following code at the loop prolog:
5480 x msq_init = *(floor(p)); # prolog load
5481 realignment_token = call target_builtin;
5483 x msq = phi (msq_init, ---)
5485 The stmts marked with x are generated only for the case of
5486 dr_explicit_realign_optimized.
5488 The code above sets up a new (vector) pointer, pointing to the first
5489 location accessed by STMT, and a "floor-aligned" load using that pointer.
5490 It also generates code to compute the "realignment-token" (if the relevant
5491 target hook was defined), and creates a phi-node at the loop-header bb
5492 whose arguments are the result of the prolog-load (created by this
5493 function) and the result of a load that takes place in the loop (to be
5494 created by the caller to this function).
5496 For the case of dr_explicit_realign_optimized:
5497 The caller to this function uses the phi-result (msq) to create the
5498 realignment code inside the loop, and sets up the missing phi argument,
5501 msq = phi (msq_init, lsq)
5502 lsq = *(floor(p')); # load in loop
5503 result = realign_load (msq, lsq, realignment_token);
5505 For the case of dr_explicit_realign:
5507 msq = *(floor(p)); # load in loop
5509 lsq = *(floor(p')); # load in loop
5510 result = realign_load (msq, lsq, realignment_token);
5513 STMT - (scalar) load stmt to be vectorized. This load accesses
5514 a memory location that may be unaligned.
5515 BSI - place where new code is to be inserted.
5516 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
5520 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
5521 target hook, if defined.
5522 Return value - the result of the loop-header phi node. */
5525 vect_setup_realignment (gimple stmt, gimple_stmt_iterator *gsi,
5526 tree *realignment_token,
5527 enum dr_alignment_support alignment_support_scheme,
5529 struct loop **at_loop)
5531 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5532 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5533 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5534 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5536 tree scalar_dest = gimple_assign_lhs (stmt);
5543 tree msq_init = NULL_TREE;
5546 tree msq = NULL_TREE;
5547 gimple_seq stmts = NULL;
5549 bool compute_in_loop = false;
5550 bool nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
5551 struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
5552 struct loop *loop_for_initial_load;
5554 gcc_assert (alignment_support_scheme == dr_explicit_realign
5555 || alignment_support_scheme == dr_explicit_realign_optimized);
5557 /* We need to generate three things:
5558 1. the misalignment computation
5559 2. the extra vector load (for the optimized realignment scheme).
5560 3. the phi node for the two vectors from which the realignment is
5561 done (for the optimized realignment scheme).
5564 /* 1. Determine where to generate the misalignment computation.
5566 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
5567 calculation will be generated by this function, outside the loop (in the
5568 preheader). Otherwise, INIT_ADDR had already been computed for us by the
5569 caller, inside the loop.
5571 Background: If the misalignment remains fixed throughout the iterations of
5572 the loop, then both realignment schemes are applicable, and also the
5573 misalignment computation can be done outside LOOP. This is because we are
5574 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
5575 are a multiple of VS (the Vector Size), and therefore the misalignment in
5576 different vectorized LOOP iterations is always the same.
5577 The problem arises only if the memory access is in an inner-loop nested
5578 inside LOOP, which is now being vectorized using outer-loop vectorization.
5579 This is the only case when the misalignment of the memory access may not
5580 remain fixed throughout the iterations of the inner-loop (as explained in
5581 detail in vect_supportable_dr_alignment). In this case, not only is the
5582 optimized realignment scheme not applicable, but also the misalignment
5583 computation (and generation of the realignment token that is passed to
5584 REALIGN_LOAD) have to be done inside the loop.
5586 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
5587 or not, which in turn determines if the misalignment is computed inside
5588 the inner-loop, or outside LOOP. */
5590 if (init_addr != NULL_TREE)
5592 compute_in_loop = true;
5593 gcc_assert (alignment_support_scheme == dr_explicit_realign);
5597 /* 2. Determine where to generate the extra vector load.
5599 For the optimized realignment scheme, instead of generating two vector
5600 loads in each iteration, we generate a single extra vector load in the
5601 preheader of the loop, and in each iteration reuse the result of the
5602 vector load from the previous iteration. In case the memory access is in
5603 an inner-loop nested inside LOOP, which is now being vectorized using
5604 outer-loop vectorization, we need to determine whether this initial vector
5605 load should be generated at the preheader of the inner-loop, or can be
5606 generated at the preheader of LOOP. If the memory access has no evolution
5607 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
5608 to be generated inside LOOP (in the preheader of the inner-loop). */
5610 if (nested_in_vect_loop)
5612 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
5613 bool invariant_in_outerloop =
5614 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
5615 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
5618 loop_for_initial_load = loop;
5620 *at_loop = loop_for_initial_load;
5622 /* 3. For the case of the optimized realignment, create the first vector
5623 load at the loop preheader. */
5625 if (alignment_support_scheme == dr_explicit_realign_optimized)
5627 /* Create msq_init = *(floor(p1)) in the loop preheader */
5629 gcc_assert (!compute_in_loop);
5630 pe = loop_preheader_edge (loop_for_initial_load);
5631 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5632 ptr = vect_create_data_ref_ptr (stmt, loop_for_initial_load, NULL_TREE,
5633 &init_addr, &inc, true, &inv_p, NULL_TREE);
5634 data_ref = build1 (ALIGN_INDIRECT_REF, vectype, ptr);
5635 new_stmt = gimple_build_assign (vec_dest, data_ref);
5636 new_temp = make_ssa_name (vec_dest, new_stmt);
5637 gimple_assign_set_lhs (new_stmt, new_temp);
5638 mark_symbols_for_renaming (new_stmt);
5639 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5640 gcc_assert (!new_bb);
5641 msq_init = gimple_assign_lhs (new_stmt);
5644 /* 4. Create realignment token using a target builtin, if available.
5645 It is done either inside the containing loop, or before LOOP (as
5646 determined above). */
5648 if (targetm.vectorize.builtin_mask_for_load)
5652 /* Compute INIT_ADDR - the initial addressed accessed by this memref. */
5653 if (compute_in_loop)
5654 gcc_assert (init_addr); /* already computed by the caller. */
5657 /* Generate the INIT_ADDR computation outside LOOP. */
5658 init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
5660 pe = loop_preheader_edge (loop);
5661 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
5662 gcc_assert (!new_bb);
5665 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5666 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
5668 vect_create_destination_var (scalar_dest,
5669 gimple_call_return_type (new_stmt));
5670 new_temp = make_ssa_name (vec_dest, new_stmt);
5671 gimple_call_set_lhs (new_stmt, new_temp);
5673 if (compute_in_loop)
5674 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5677 /* Generate the misalignment computation outside LOOP. */
5678 pe = loop_preheader_edge (loop);
5679 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5680 gcc_assert (!new_bb);
5683 *realignment_token = gimple_call_lhs (new_stmt);
5685 /* The result of the CALL_EXPR to this builtin is determined from
5686 the value of the parameter and no global variables are touched
5687 which makes the builtin a "const" function. Requiring the
5688 builtin to have the "const" attribute makes it unnecessary
5689 to call mark_call_clobbered. */
5690 gcc_assert (TREE_READONLY (builtin_decl));
5693 if (alignment_support_scheme == dr_explicit_realign)
5696 gcc_assert (!compute_in_loop);
5697 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5700 /* 5. Create msq = phi <msq_init, lsq> in loop */
5702 pe = loop_preheader_edge (containing_loop);
5703 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5704 msq = make_ssa_name (vec_dest, NULL);
5705 phi_stmt = create_phi_node (msq, containing_loop->header);
5706 SSA_NAME_DEF_STMT (msq) = phi_stmt;
5707 add_phi_arg (phi_stmt, msq_init, pe);
5713 /* Function vect_strided_load_supported.
5715 Returns TRUE is EXTRACT_EVEN and EXTRACT_ODD operations are supported,
5716 and FALSE otherwise. */
5719 vect_strided_load_supported (tree vectype)
5721 optab perm_even_optab, perm_odd_optab;
5724 mode = (int) TYPE_MODE (vectype);
5726 perm_even_optab = optab_for_tree_code (VEC_EXTRACT_EVEN_EXPR, vectype,
5728 if (!perm_even_optab)
5730 if (vect_print_dump_info (REPORT_DETAILS))
5731 fprintf (vect_dump, "no optab for perm_even.");
5735 if (optab_handler (perm_even_optab, mode)->insn_code == CODE_FOR_nothing)
5737 if (vect_print_dump_info (REPORT_DETAILS))
5738 fprintf (vect_dump, "perm_even op not supported by target.");
5742 perm_odd_optab = optab_for_tree_code (VEC_EXTRACT_ODD_EXPR, vectype,
5744 if (!perm_odd_optab)
5746 if (vect_print_dump_info (REPORT_DETAILS))
5747 fprintf (vect_dump, "no optab for perm_odd.");
5751 if (optab_handler (perm_odd_optab, mode)->insn_code == CODE_FOR_nothing)
5753 if (vect_print_dump_info (REPORT_DETAILS))
5754 fprintf (vect_dump, "perm_odd op not supported by target.");
5761 /* Function vect_permute_load_chain.
5763 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5764 a power of 2, generate extract_even/odd stmts to reorder the input data
5765 correctly. Return the final references for loads in RESULT_CHAIN.
5767 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5768 The input is 4 vectors each containing 8 elements. We assign a number to each
5769 element, the input sequence is:
5771 1st vec: 0 1 2 3 4 5 6 7
5772 2nd vec: 8 9 10 11 12 13 14 15
5773 3rd vec: 16 17 18 19 20 21 22 23
5774 4th vec: 24 25 26 27 28 29 30 31
5776 The output sequence should be:
5778 1st vec: 0 4 8 12 16 20 24 28
5779 2nd vec: 1 5 9 13 17 21 25 29
5780 3rd vec: 2 6 10 14 18 22 26 30
5781 4th vec: 3 7 11 15 19 23 27 31
5783 i.e., the first output vector should contain the first elements of each
5784 interleaving group, etc.
5786 We use extract_even/odd instructions to create such output. The input of each
5787 extract_even/odd operation is two vectors
5791 and the output is the vector of extracted even/odd elements. The output of
5792 extract_even will be: 0 2 4 6
5793 and of extract_odd: 1 3 5 7
5796 The permutation is done in log LENGTH stages. In each stage extract_even and
5797 extract_odd stmts are created for each pair of vectors in DR_CHAIN in their
5798 order. In our example,
5800 E1: extract_even (1st vec, 2nd vec)
5801 E2: extract_odd (1st vec, 2nd vec)
5802 E3: extract_even (3rd vec, 4th vec)
5803 E4: extract_odd (3rd vec, 4th vec)
5805 The output for the first stage will be:
5807 E1: 0 2 4 6 8 10 12 14
5808 E2: 1 3 5 7 9 11 13 15
5809 E3: 16 18 20 22 24 26 28 30
5810 E4: 17 19 21 23 25 27 29 31
5812 In order to proceed and create the correct sequence for the next stage (or
5813 for the correct output, if the second stage is the last one, as in our
5814 example), we first put the output of extract_even operation and then the
5815 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5816 The input for the second stage is:
5818 1st vec (E1): 0 2 4 6 8 10 12 14
5819 2nd vec (E3): 16 18 20 22 24 26 28 30
5820 3rd vec (E2): 1 3 5 7 9 11 13 15
5821 4th vec (E4): 17 19 21 23 25 27 29 31
5823 The output of the second stage:
5825 E1: 0 4 8 12 16 20 24 28
5826 E2: 2 6 10 14 18 22 26 30
5827 E3: 1 5 9 13 17 21 25 29
5828 E4: 3 7 11 15 19 23 27 31
5830 And RESULT_CHAIN after reordering:
5832 1st vec (E1): 0 4 8 12 16 20 24 28
5833 2nd vec (E3): 1 5 9 13 17 21 25 29
5834 3rd vec (E2): 2 6 10 14 18 22 26 30
5835 4th vec (E4): 3 7 11 15 19 23 27 31. */
5838 vect_permute_load_chain (VEC(tree,heap) *dr_chain,
5839 unsigned int length,
5841 gimple_stmt_iterator *gsi,
5842 VEC(tree,heap) **result_chain)
5844 tree perm_dest, data_ref, first_vect, second_vect;
5846 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5850 /* Check that the operation is supported. */
5851 if (!vect_strided_load_supported (vectype))
5854 *result_chain = VEC_copy (tree, heap, dr_chain);
5855 for (i = 0; i < exact_log2 (length); i++)
5857 for (j = 0; j < length; j +=2)
5859 first_vect = VEC_index (tree, dr_chain, j);
5860 second_vect = VEC_index (tree, dr_chain, j+1);
5862 /* data_ref = permute_even (first_data_ref, second_data_ref); */
5863 perm_dest = create_tmp_var (vectype, "vect_perm_even");
5864 DECL_GIMPLE_REG_P (perm_dest) = 1;
5865 add_referenced_var (perm_dest);
5867 perm_stmt = gimple_build_assign_with_ops (VEC_EXTRACT_EVEN_EXPR,
5868 perm_dest, first_vect,
5871 data_ref = make_ssa_name (perm_dest, perm_stmt);
5872 gimple_assign_set_lhs (perm_stmt, data_ref);
5873 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5874 mark_symbols_for_renaming (perm_stmt);
5876 VEC_replace (tree, *result_chain, j/2, data_ref);
5878 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
5879 perm_dest = create_tmp_var (vectype, "vect_perm_odd");
5880 DECL_GIMPLE_REG_P (perm_dest) = 1;
5881 add_referenced_var (perm_dest);
5883 perm_stmt = gimple_build_assign_with_ops (VEC_EXTRACT_ODD_EXPR,
5884 perm_dest, first_vect,
5886 data_ref = make_ssa_name (perm_dest, perm_stmt);
5887 gimple_assign_set_lhs (perm_stmt, data_ref);
5888 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5889 mark_symbols_for_renaming (perm_stmt);
5891 VEC_replace (tree, *result_chain, j/2+length/2, data_ref);
5893 dr_chain = VEC_copy (tree, heap, *result_chain);
5899 /* Function vect_transform_strided_load.
5901 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
5902 to perform their permutation and ascribe the result vectorized statements to
5903 the scalar statements.
5907 vect_transform_strided_load (gimple stmt, VEC(tree,heap) *dr_chain, int size,
5908 gimple_stmt_iterator *gsi)
5910 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5911 gimple first_stmt = DR_GROUP_FIRST_DR (stmt_info);
5912 gimple next_stmt, new_stmt;
5913 VEC(tree,heap) *result_chain = NULL;
5914 unsigned int i, gap_count;
5917 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
5918 RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
5919 vectors, that are ready for vector computation. */
5920 result_chain = VEC_alloc (tree, heap, size);
5922 if (!vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain))
5925 /* Put a permuted data-ref in the VECTORIZED_STMT field.
5926 Since we scan the chain starting from it's first node, their order
5927 corresponds the order of data-refs in RESULT_CHAIN. */
5928 next_stmt = first_stmt;
5930 for (i = 0; VEC_iterate (tree, result_chain, i, tmp_data_ref); i++)
5935 /* Skip the gaps. Loads created for the gaps will be removed by dead
5936 code elimination pass later. No need to check for the first stmt in
5937 the group, since it always exists.
5938 DR_GROUP_GAP is the number of steps in elements from the previous
5939 access (if there is no gap DR_GROUP_GAP is 1). We skip loads that
5940 correspond to the gaps.
5942 if (next_stmt != first_stmt
5943 && gap_count < DR_GROUP_GAP (vinfo_for_stmt (next_stmt)))
5951 new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
5952 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
5953 copies, and we put the new vector statement in the first available
5955 if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
5956 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
5959 if (!DR_GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5962 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
5964 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
5967 prev_stmt = rel_stmt;
5969 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
5972 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) =
5977 next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt));
5979 /* If NEXT_STMT accesses the same DR as the previous statement,
5980 put the same TMP_DATA_REF as its vectorized statement; otherwise
5981 get the next data-ref from RESULT_CHAIN. */
5982 if (!next_stmt || !DR_GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5987 VEC_free (tree, heap, result_chain);
5992 /* Create NCOPIES permutation statements using the mask MASK_BYTES (by
5993 building a vector of type MASK_TYPE from it) and two input vectors placed in
5994 DR_CHAIN at FIRST_VEC_INDX and SECOND_VEC_INDX for the first copy and
5995 shifting by STRIDE elements of DR_CHAIN for every copy.
5996 (STRIDE is the number of vectorized stmts for NODE divided by the number of
5998 VECT_STMTS_COUNTER specifies the index in the vectorized stmts of NODE, where
5999 the created stmts must be inserted. */
6002 vect_create_mask_and_perm (gimple stmt, gimple next_scalar_stmt,
6003 int *mask_array, int mask_nunits,
6004 tree mask_element_type, tree mask_type,
6005 int first_vec_indx, int second_vec_indx,
6006 gimple_stmt_iterator *gsi, slp_tree node,
6007 tree builtin_decl, tree vectype,
6008 VEC(tree,heap) *dr_chain,
6009 int ncopies, int vect_stmts_counter)
6011 tree t = NULL_TREE, mask_vec, mask, perm_dest;
6012 gimple perm_stmt = NULL;
6013 stmt_vec_info next_stmt_info;
6014 int i, group_size, stride, dr_chain_size;
6015 tree first_vec, second_vec, data_ref;
6018 VEC (tree, heap) *params = NULL;
6020 /* Create a vector mask. */
6021 for (i = mask_nunits - 1; i >= 0; --i)
6022 t = tree_cons (NULL_TREE, build_int_cst (mask_element_type, mask_array[i]),
6024 mask_vec = build_vector (mask_type, t);
6025 mask = vect_init_vector (stmt, mask_vec, mask_type, NULL);
6027 group_size = VEC_length (gimple, SLP_TREE_SCALAR_STMTS (node));
6028 stride = SLP_TREE_NUMBER_OF_VEC_STMTS (node) / ncopies;
6029 dr_chain_size = VEC_length (tree, dr_chain);
6031 /* Initialize the vect stmts of NODE to properly insert the generated
6033 for (i = VEC_length (gimple, SLP_TREE_VEC_STMTS (node));
6034 i < (int) SLP_TREE_NUMBER_OF_VEC_STMTS (node); i++)
6035 VEC_quick_push (gimple, SLP_TREE_VEC_STMTS (node), NULL);
6037 perm_dest = vect_create_destination_var (gimple_assign_lhs (stmt), vectype);
6038 for (i = 0; i < ncopies; i++)
6040 first_vec = VEC_index (tree, dr_chain, first_vec_indx);
6041 second_vec = VEC_index (tree, dr_chain, second_vec_indx);
6043 /* Build argument list for the vectorized call. */
6044 VEC_free (tree, heap, params);
6045 params = VEC_alloc (tree, heap, 3);
6046 VEC_quick_push (tree, params, first_vec);
6047 VEC_quick_push (tree, params, second_vec);
6048 VEC_quick_push (tree, params, mask);
6050 /* Generate the permute statement. */
6051 perm_stmt = gimple_build_call_vec (builtin_decl, params);
6052 data_ref = make_ssa_name (perm_dest, perm_stmt);
6053 gimple_call_set_lhs (perm_stmt, data_ref);
6054 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6055 FOR_EACH_SSA_TREE_OPERAND (sym, perm_stmt, iter, SSA_OP_ALL_VIRTUALS)
6057 if (TREE_CODE (sym) == SSA_NAME)
6058 sym = SSA_NAME_VAR (sym);
6059 mark_sym_for_renaming (sym);
6062 /* Store the vector statement in NODE. */
6063 VEC_replace (gimple, SLP_TREE_VEC_STMTS (node),
6064 stride * i + vect_stmts_counter, perm_stmt);
6066 first_vec_indx += stride;
6067 second_vec_indx += stride;
6070 /* Mark the scalar stmt as vectorized. */
6071 next_stmt_info = vinfo_for_stmt (next_scalar_stmt);
6072 STMT_VINFO_VEC_STMT (next_stmt_info) = perm_stmt;
6076 /* Given FIRST_MASK_ELEMENT - the mask element in element representation,
6077 return in CURRENT_MASK_ELEMENT its equivalent in target specific
6078 representation. Check that the mask is valid and return FALSE if not.
6079 Return TRUE in NEED_NEXT_VECTOR if the permutation requires to move to
6080 the next vector, i.e., the current first vector is not needed. */
6083 vect_get_mask_element (gimple stmt, int first_mask_element, int m,
6084 int mask_nunits, bool only_one_vec, int index,
6085 int *mask, int *current_mask_element,
6086 bool *need_next_vector)
6089 static int number_of_mask_fixes = 1;
6090 static bool mask_fixed = false;
6091 static bool needs_first_vector = false;
6093 /* Convert to target specific representation. */
6094 *current_mask_element = first_mask_element + m;
6095 /* Adjust the value in case it's a mask for second and third vectors. */
6096 *current_mask_element -= mask_nunits * (number_of_mask_fixes - 1);
6098 if (*current_mask_element < mask_nunits)
6099 needs_first_vector = true;
6101 /* We have only one input vector to permute but the mask accesses values in
6102 the next vector as well. */
6103 if (only_one_vec && *current_mask_element >= mask_nunits)
6105 if (vect_print_dump_info (REPORT_DETAILS))
6107 fprintf (vect_dump, "permutation requires at least two vectors ");
6108 print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
6114 /* The mask requires the next vector. */
6115 if (*current_mask_element >= mask_nunits * 2)
6117 if (needs_first_vector || mask_fixed)
6119 /* We either need the first vector too or have already moved to the
6120 next vector. In both cases, this permutation needs three
6122 if (vect_print_dump_info (REPORT_DETAILS))
6124 fprintf (vect_dump, "permutation requires at "
6125 "least three vectors ");
6126 print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
6132 /* We move to the next vector, dropping the first one and working with
6133 the second and the third - we need to adjust the values of the mask
6135 *current_mask_element -= mask_nunits * number_of_mask_fixes;
6137 for (i = 0; i < index; i++)
6138 mask[i] -= mask_nunits * number_of_mask_fixes;
6140 (number_of_mask_fixes)++;
6144 *need_next_vector = mask_fixed;
6146 /* This was the last element of this mask. Start a new one. */
6147 if (index == mask_nunits - 1)
6149 number_of_mask_fixes = 1;
6151 needs_first_vector = false;
6158 /* Generate vector permute statements from a list of loads in DR_CHAIN.
6159 If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
6160 permute statements for SLP_NODE_INSTANCE. */
6162 vect_transform_slp_perm_load (gimple stmt, VEC (tree, heap) *dr_chain,
6163 gimple_stmt_iterator *gsi, int vf,
6164 slp_instance slp_node_instance, bool analyze_only)
6166 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6167 tree mask_element_type = NULL_TREE, mask_type;
6168 int i, j, k, m, scale, mask_nunits, nunits, vec_index = 0, scalar_index;
6170 tree vectype = STMT_VINFO_VECTYPE (stmt_info), builtin_decl;
6171 gimple next_scalar_stmt;
6172 int group_size = SLP_INSTANCE_GROUP_SIZE (slp_node_instance);
6173 int first_mask_element;
6174 int index, unroll_factor, *mask, current_mask_element, ncopies;
6175 bool only_one_vec = false, need_next_vector = false;
6176 int first_vec_index, second_vec_index, orig_vec_stmts_num, vect_stmts_counter;
6178 if (!targetm.vectorize.builtin_vec_perm)
6180 if (vect_print_dump_info (REPORT_DETAILS))
6182 fprintf (vect_dump, "no builtin for vect permute for ");
6183 print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
6189 builtin_decl = targetm.vectorize.builtin_vec_perm (vectype,
6190 &mask_element_type);
6191 if (!builtin_decl || !mask_element_type)
6193 if (vect_print_dump_info (REPORT_DETAILS))
6195 fprintf (vect_dump, "no builtin for vect permute for ");
6196 print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
6202 mask_type = get_vectype_for_scalar_type (mask_element_type);
6203 mask_nunits = TYPE_VECTOR_SUBPARTS (mask_type);
6204 mask = (int *) xmalloc (sizeof (int) * mask_nunits);
6205 nunits = TYPE_VECTOR_SUBPARTS (vectype);
6206 scale = mask_nunits / nunits;
6207 unroll_factor = SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance);
6209 /* The number of vector stmts to generate based only on SLP_NODE_INSTANCE
6210 unrolling factor. */
6211 orig_vec_stmts_num = group_size *
6212 SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance) / nunits;
6213 if (orig_vec_stmts_num == 1)
6214 only_one_vec = true;
6216 /* Number of copies is determined by the final vectorization factor
6217 relatively to SLP_NODE_INSTANCE unrolling factor. */
6218 ncopies = vf / SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance);
6220 /* Generate permutation masks for every NODE. Number of masks for each NODE
6221 is equal to GROUP_SIZE.
6222 E.g., we have a group of three nodes with three loads from the same
6223 location in each node, and the vector size is 4. I.e., we have a
6224 a0b0c0a1b1c1... sequence and we need to create the following vectors:
6225 for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
6226 for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
6229 The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9} (in target
6230 scpecific type, e.g., in bytes for Altivec.
6231 The last mask is illegal since we assume two operands for permute
6232 operation, and the mask element values can't be outside that range. Hence,
6233 the last mask must be converted into {2,5,5,5}.
6234 For the first two permutations we need the first and the second input
6235 vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
6236 we need the second and the third vectors: {b1,c1,a2,b2} and
6240 VEC_iterate (slp_tree, SLP_INSTANCE_LOADS (slp_node_instance),
6246 vect_stmts_counter = 0;
6248 first_vec_index = vec_index++;
6250 second_vec_index = first_vec_index;
6252 second_vec_index = vec_index++;
6254 for (j = 0; j < unroll_factor; j++)
6256 for (k = 0; k < group_size; k++)
6258 first_mask_element = (i + j * group_size) * scale;
6259 for (m = 0; m < scale; m++)
6261 if (!vect_get_mask_element (stmt, first_mask_element, m,
6262 mask_nunits, only_one_vec, index, mask,
6263 ¤t_mask_element, &need_next_vector))
6266 mask[index++] = current_mask_element;
6269 if (index == mask_nunits)
6274 if (need_next_vector)
6276 first_vec_index = second_vec_index;
6277 second_vec_index = vec_index;
6280 next_scalar_stmt = VEC_index (gimple,
6281 SLP_TREE_SCALAR_STMTS (node), scalar_index++);
6283 vect_create_mask_and_perm (stmt, next_scalar_stmt,
6284 mask, mask_nunits, mask_element_type, mask_type,
6285 first_vec_index, second_vec_index, gsi, node,
6286 builtin_decl, vectype, dr_chain, ncopies,
6287 vect_stmts_counter++);
6298 /* vectorizable_load.
6300 Check if STMT reads a non scalar data-ref (array/pointer/structure) that
6302 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
6303 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
6304 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
6307 vectorizable_load (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt,
6308 slp_tree slp_node, slp_instance slp_node_instance)
6311 tree vec_dest = NULL;
6312 tree data_ref = NULL;
6313 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6314 stmt_vec_info prev_stmt_info;
6315 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6316 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6317 struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
6318 bool nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
6319 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info), *first_dr;
6320 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6323 gimple new_stmt = NULL;
6325 enum dr_alignment_support alignment_support_scheme;
6326 tree dataref_ptr = NULL_TREE;
6328 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
6330 int i, j, group_size;
6331 tree msq = NULL_TREE, lsq;
6332 tree offset = NULL_TREE;
6333 tree realignment_token = NULL_TREE;
6335 VEC(tree,heap) *dr_chain = NULL;
6336 bool strided_load = false;
6340 bool compute_in_loop = false;
6341 struct loop *at_loop;
6343 bool slp = (slp_node != NULL);
6344 bool slp_perm = false;
6345 enum tree_code code;
6347 /* Multiple types in SLP are handled by creating the appropriate number of
6348 vectorized stmts for each SLP node. Hence, NCOPIES is always 1 in
6353 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
6355 gcc_assert (ncopies >= 1);
6357 /* FORNOW. This restriction should be relaxed. */
6358 if (nested_in_vect_loop && ncopies > 1)
6360 if (vect_print_dump_info (REPORT_DETAILS))
6361 fprintf (vect_dump, "multiple types in nested loop.");
6365 if (slp && SLP_INSTANCE_LOAD_PERMUTATION (slp_node_instance))
6368 if (!STMT_VINFO_RELEVANT_P (stmt_info))
6371 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
6374 /* Is vectorizable load? */
6375 if (!is_gimple_assign (stmt))
6378 scalar_dest = gimple_assign_lhs (stmt);
6379 if (TREE_CODE (scalar_dest) != SSA_NAME)
6382 code = gimple_assign_rhs_code (stmt);
6383 if (code != ARRAY_REF
6384 && code != INDIRECT_REF
6385 && !STMT_VINFO_STRIDED_ACCESS (stmt_info))
6388 if (!STMT_VINFO_DATA_REF (stmt_info))
6391 scalar_type = TREE_TYPE (DR_REF (dr));
6392 mode = (int) TYPE_MODE (vectype);
6394 /* FORNOW. In some cases can vectorize even if data-type not supported
6395 (e.g. - data copies). */
6396 if (optab_handler (mov_optab, mode)->insn_code == CODE_FOR_nothing)
6398 if (vect_print_dump_info (REPORT_DETAILS))
6399 fprintf (vect_dump, "Aligned load, but unsupported type.");
6403 /* The vector component type needs to be trivially convertible to the
6404 scalar lhs. This should always be the case. */
6405 if (!useless_type_conversion_p (TREE_TYPE (scalar_dest), TREE_TYPE (vectype)))
6407 if (vect_print_dump_info (REPORT_DETAILS))
6408 fprintf (vect_dump, "??? operands of different types");
6412 /* Check if the load is a part of an interleaving chain. */
6413 if (STMT_VINFO_STRIDED_ACCESS (stmt_info))
6415 strided_load = true;
6417 gcc_assert (! nested_in_vect_loop);
6419 /* Check if interleaving is supported. */
6420 if (!vect_strided_load_supported (vectype)
6421 && !PURE_SLP_STMT (stmt_info) && !slp)
6425 if (!vec_stmt) /* transformation not required. */
6427 STMT_VINFO_TYPE (stmt_info) = load_vec_info_type;
6428 vect_model_load_cost (stmt_info, ncopies, NULL);
6432 if (vect_print_dump_info (REPORT_DETAILS))
6433 fprintf (vect_dump, "transform load.");
6439 first_stmt = DR_GROUP_FIRST_DR (stmt_info);
6440 /* Check if the chain of loads is already vectorized. */
6441 if (STMT_VINFO_VEC_STMT (vinfo_for_stmt (first_stmt)))
6443 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
6446 first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
6447 group_size = DR_GROUP_SIZE (vinfo_for_stmt (first_stmt));
6449 /* VEC_NUM is the number of vect stmts to be created for this group. */
6452 strided_load = false;
6453 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
6456 vec_num = group_size;
6458 dr_chain = VEC_alloc (tree, heap, vec_num);
6464 group_size = vec_num = 1;
6467 alignment_support_scheme = vect_supportable_dr_alignment (first_dr);
6468 gcc_assert (alignment_support_scheme);
6470 /* In case the vectorization factor (VF) is bigger than the number
6471 of elements that we can fit in a vectype (nunits), we have to generate
6472 more than one vector stmt - i.e - we need to "unroll" the
6473 vector stmt by a factor VF/nunits. In doing so, we record a pointer
6474 from one copy of the vector stmt to the next, in the field
6475 STMT_VINFO_RELATED_STMT. This is necessary in order to allow following
6476 stages to find the correct vector defs to be used when vectorizing
6477 stmts that use the defs of the current stmt. The example below illustrates
6478 the vectorization process when VF=16 and nunits=4 (i.e - we need to create
6479 4 vectorized stmts):
6481 before vectorization:
6482 RELATED_STMT VEC_STMT
6486 step 1: vectorize stmt S1:
6487 We first create the vector stmt VS1_0, and, as usual, record a
6488 pointer to it in the STMT_VINFO_VEC_STMT of the scalar stmt S1.
6489 Next, we create the vector stmt VS1_1, and record a pointer to
6490 it in the STMT_VINFO_RELATED_STMT of the vector stmt VS1_0.
6491 Similarly, for VS1_2 and VS1_3. This is the resulting chain of
6493 RELATED_STMT VEC_STMT
6494 VS1_0: vx0 = memref0 VS1_1 -
6495 VS1_1: vx1 = memref1 VS1_2 -
6496 VS1_2: vx2 = memref2 VS1_3 -
6497 VS1_3: vx3 = memref3 - -
6498 S1: x = load - VS1_0
6501 See in documentation in vect_get_vec_def_for_stmt_copy for how the
6502 information we recorded in RELATED_STMT field is used to vectorize
6505 /* In case of interleaving (non-unit strided access):
6512 Vectorized loads are created in the order of memory accesses
6513 starting from the access of the first stmt of the chain:
6516 VS2: vx1 = &base + vec_size*1
6517 VS3: vx3 = &base + vec_size*2
6518 VS4: vx4 = &base + vec_size*3
6520 Then permutation statements are generated:
6522 VS5: vx5 = VEC_EXTRACT_EVEN_EXPR < vx0, vx1 >
6523 VS6: vx6 = VEC_EXTRACT_ODD_EXPR < vx0, vx1 >
6526 And they are put in STMT_VINFO_VEC_STMT of the corresponding scalar stmts
6527 (the order of the data-refs in the output of vect_permute_load_chain
6528 corresponds to the order of scalar stmts in the interleaving chain - see
6529 the documentation of vect_permute_load_chain()).
6530 The generation of permutation stmts and recording them in
6531 STMT_VINFO_VEC_STMT is done in vect_transform_strided_load().
6533 In case of both multiple types and interleaving, the vector loads and
6534 permutation stmts above are created for every copy. The result vector stmts
6535 are put in STMT_VINFO_VEC_STMT for the first copy and in the corresponding
6536 STMT_VINFO_RELATED_STMT for the next copies. */
6538 /* If the data reference is aligned (dr_aligned) or potentially unaligned
6539 on a target that supports unaligned accesses (dr_unaligned_supported)
6540 we generate the following code:
6544 p = p + indx * vectype_size;
6549 Otherwise, the data reference is potentially unaligned on a target that
6550 does not support unaligned accesses (dr_explicit_realign_optimized) -
6551 then generate the following code, in which the data in each iteration is
6552 obtained by two vector loads, one from the previous iteration, and one
6553 from the current iteration:
6555 msq_init = *(floor(p1))
6556 p2 = initial_addr + VS - 1;
6557 realignment_token = call target_builtin;
6560 p2 = p2 + indx * vectype_size
6562 vec_dest = realign_load (msq, lsq, realignment_token)
6567 /* If the misalignment remains the same throughout the execution of the
6568 loop, we can create the init_addr and permutation mask at the loop
6569 preheader. Otherwise, it needs to be created inside the loop.
6570 This can only occur when vectorizing memory accesses in the inner-loop
6571 nested within an outer-loop that is being vectorized. */
6573 if (nested_in_vect_loop_p (loop, stmt)
6574 && (TREE_INT_CST_LOW (DR_STEP (dr))
6575 % GET_MODE_SIZE (TYPE_MODE (vectype)) != 0))
6577 gcc_assert (alignment_support_scheme != dr_explicit_realign_optimized);
6578 compute_in_loop = true;
6581 if ((alignment_support_scheme == dr_explicit_realign_optimized
6582 || alignment_support_scheme == dr_explicit_realign)
6583 && !compute_in_loop)
6585 msq = vect_setup_realignment (first_stmt, gsi, &realignment_token,
6586 alignment_support_scheme, NULL_TREE,
6588 if (alignment_support_scheme == dr_explicit_realign_optimized)
6590 phi = SSA_NAME_DEF_STMT (msq);
6591 offset = size_int (TYPE_VECTOR_SUBPARTS (vectype) - 1);
6597 prev_stmt_info = NULL;
6598 for (j = 0; j < ncopies; j++)
6600 /* 1. Create the vector pointer update chain. */
6602 dataref_ptr = vect_create_data_ref_ptr (first_stmt,
6604 &dummy, &ptr_incr, false,
6608 bump_vector_ptr (dataref_ptr, ptr_incr, gsi, stmt, NULL_TREE);
6610 for (i = 0; i < vec_num; i++)
6613 dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, gsi, stmt,
6616 /* 2. Create the vector-load in the loop. */
6617 switch (alignment_support_scheme)
6620 gcc_assert (aligned_access_p (first_dr));
6621 data_ref = build_fold_indirect_ref (dataref_ptr);
6623 case dr_unaligned_supported:
6625 int mis = DR_MISALIGNMENT (first_dr);
6626 tree tmis = (mis == -1 ? size_zero_node : size_int (mis));
6628 tmis = size_binop (MULT_EXPR, tmis, size_int(BITS_PER_UNIT));
6630 build2 (MISALIGNED_INDIRECT_REF, vectype, dataref_ptr, tmis);
6633 case dr_explicit_realign:
6636 tree vs_minus_1 = size_int (TYPE_VECTOR_SUBPARTS (vectype) - 1);
6638 if (compute_in_loop)
6639 msq = vect_setup_realignment (first_stmt, gsi,
6641 dr_explicit_realign,
6644 data_ref = build1 (ALIGN_INDIRECT_REF, vectype, dataref_ptr);
6645 vec_dest = vect_create_destination_var (scalar_dest, vectype);
6646 new_stmt = gimple_build_assign (vec_dest, data_ref);
6647 new_temp = make_ssa_name (vec_dest, new_stmt);
6648 gimple_assign_set_lhs (new_stmt, new_temp);
6649 vect_finish_stmt_generation (stmt, new_stmt, gsi);
6650 copy_virtual_operands (new_stmt, stmt);
6651 mark_symbols_for_renaming (new_stmt);
6654 bump = size_binop (MULT_EXPR, vs_minus_1,
6655 TYPE_SIZE_UNIT (scalar_type));
6656 ptr = bump_vector_ptr (dataref_ptr, NULL, gsi, stmt, bump);
6657 data_ref = build1 (ALIGN_INDIRECT_REF, vectype, ptr);
6660 case dr_explicit_realign_optimized:
6661 data_ref = build1 (ALIGN_INDIRECT_REF, vectype, dataref_ptr);
6666 vec_dest = vect_create_destination_var (scalar_dest, vectype);
6667 new_stmt = gimple_build_assign (vec_dest, data_ref);
6668 new_temp = make_ssa_name (vec_dest, new_stmt);
6669 gimple_assign_set_lhs (new_stmt, new_temp);
6670 vect_finish_stmt_generation (stmt, new_stmt, gsi);
6671 mark_symbols_for_renaming (new_stmt);
6673 /* 3. Handle explicit realignment if necessary/supported. Create in
6674 loop: vec_dest = realign_load (msq, lsq, realignment_token) */
6675 if (alignment_support_scheme == dr_explicit_realign_optimized
6676 || alignment_support_scheme == dr_explicit_realign)
6680 lsq = gimple_assign_lhs (new_stmt);
6681 if (!realignment_token)
6682 realignment_token = dataref_ptr;
6683 vec_dest = vect_create_destination_var (scalar_dest, vectype);
6684 tmp = build3 (REALIGN_LOAD_EXPR, vectype, msq, lsq,
6686 new_stmt = gimple_build_assign (vec_dest, tmp);
6687 new_temp = make_ssa_name (vec_dest, new_stmt);
6688 gimple_assign_set_lhs (new_stmt, new_temp);
6689 vect_finish_stmt_generation (stmt, new_stmt, gsi);
6691 if (alignment_support_scheme == dr_explicit_realign_optimized)
6694 if (i == vec_num - 1 && j == ncopies - 1)
6695 add_phi_arg (phi, lsq, loop_latch_edge (containing_loop));
6700 /* 4. Handle invariant-load. */
6703 gcc_assert (!strided_load);
6704 gcc_assert (nested_in_vect_loop_p (loop, stmt));
6709 tree vec_inv, bitpos, bitsize = TYPE_SIZE (scalar_type);
6711 /* CHECKME: bitpos depends on endianess? */
6712 bitpos = bitsize_zero_node;
6713 vec_inv = build3 (BIT_FIELD_REF, scalar_type, new_temp,
6716 vect_create_destination_var (scalar_dest, NULL_TREE);
6717 new_stmt = gimple_build_assign (vec_dest, vec_inv);
6718 new_temp = make_ssa_name (vec_dest, new_stmt);
6719 gimple_assign_set_lhs (new_stmt, new_temp);
6720 vect_finish_stmt_generation (stmt, new_stmt, gsi);
6722 for (k = nunits - 1; k >= 0; --k)
6723 t = tree_cons (NULL_TREE, new_temp, t);
6724 /* FIXME: use build_constructor directly. */
6725 vec_inv = build_constructor_from_list (vectype, t);
6726 new_temp = vect_init_vector (stmt, vec_inv, vectype, gsi);
6727 new_stmt = SSA_NAME_DEF_STMT (new_temp);
6730 gcc_unreachable (); /* FORNOW. */
6733 /* Collect vector loads and later create their permutation in
6734 vect_transform_strided_load (). */
6735 if (strided_load || slp_perm)
6736 VEC_quick_push (tree, dr_chain, new_temp);
6738 /* Store vector loads in the corresponding SLP_NODE. */
6739 if (slp && !slp_perm)
6740 VEC_quick_push (gimple, SLP_TREE_VEC_STMTS (slp_node), new_stmt);
6743 if (slp && !slp_perm)
6748 if (!vect_transform_slp_perm_load (stmt, dr_chain, gsi,
6749 LOOP_VINFO_VECT_FACTOR (loop_vinfo),
6750 slp_node_instance, false))
6752 VEC_free (tree, heap, dr_chain);
6760 if (!vect_transform_strided_load (stmt, dr_chain, group_size, gsi))
6763 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
6764 VEC_free (tree, heap, dr_chain);
6765 dr_chain = VEC_alloc (tree, heap, group_size);
6770 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
6772 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
6773 prev_stmt_info = vinfo_for_stmt (new_stmt);
6779 VEC_free (tree, heap, dr_chain);
6785 /* Function vectorizable_live_operation.
6787 STMT computes a value that is used outside the loop. Check if
6788 it can be supported. */
6791 vectorizable_live_operation (gimple stmt,
6792 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
6793 gimple *vec_stmt ATTRIBUTE_UNUSED)
6795 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6796 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6797 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6803 enum vect_def_type dt;
6804 enum tree_code code;
6805 enum gimple_rhs_class rhs_class;
6807 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
6809 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
6812 if (!is_gimple_assign (stmt))
6815 if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
6818 /* FORNOW. CHECKME. */
6819 if (nested_in_vect_loop_p (loop, stmt))
6822 code = gimple_assign_rhs_code (stmt);
6823 op_type = TREE_CODE_LENGTH (code);
6824 rhs_class = get_gimple_rhs_class (code);
6825 gcc_assert (rhs_class != GIMPLE_UNARY_RHS || op_type == unary_op);
6826 gcc_assert (rhs_class != GIMPLE_BINARY_RHS || op_type == binary_op);
6828 /* FORNOW: support only if all uses are invariant. This means
6829 that the scalar operations can remain in place, unvectorized.
6830 The original last scalar value that they compute will be used. */
6832 for (i = 0; i < op_type; i++)
6834 if (rhs_class == GIMPLE_SINGLE_RHS)
6835 op = TREE_OPERAND (gimple_op (stmt, 1), i);
6837 op = gimple_op (stmt, i + 1);
6838 if (op && !vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt))
6840 if (vect_print_dump_info (REPORT_DETAILS))
6841 fprintf (vect_dump, "use not simple.");
6845 if (dt != vect_invariant_def && dt != vect_constant_def)
6849 /* No transformation is required for the cases we currently support. */
6854 /* Function vect_is_simple_cond.
6857 LOOP - the loop that is being vectorized.
6858 COND - Condition that is checked for simple use.
6860 Returns whether a COND can be vectorized. Checks whether
6861 condition operands are supportable using vec_is_simple_use. */
6864 vect_is_simple_cond (tree cond, loop_vec_info loop_vinfo)
6868 enum vect_def_type dt;
6870 if (!COMPARISON_CLASS_P (cond))
6873 lhs = TREE_OPERAND (cond, 0);
6874 rhs = TREE_OPERAND (cond, 1);
6876 if (TREE_CODE (lhs) == SSA_NAME)
6878 gimple lhs_def_stmt = SSA_NAME_DEF_STMT (lhs);
6879 if (!vect_is_simple_use (lhs, loop_vinfo, &lhs_def_stmt, &def, &dt))
6882 else if (TREE_CODE (lhs) != INTEGER_CST && TREE_CODE (lhs) != REAL_CST
6883 && TREE_CODE (lhs) != FIXED_CST)
6886 if (TREE_CODE (rhs) == SSA_NAME)
6888 gimple rhs_def_stmt = SSA_NAME_DEF_STMT (rhs);
6889 if (!vect_is_simple_use (rhs, loop_vinfo, &rhs_def_stmt, &def, &dt))
6892 else if (TREE_CODE (rhs) != INTEGER_CST && TREE_CODE (rhs) != REAL_CST
6893 && TREE_CODE (rhs) != FIXED_CST)
6899 /* vectorizable_condition.
6901 Check if STMT is conditional modify expression that can be vectorized.
6902 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
6903 stmt using VEC_COND_EXPR to replace it, put it in VEC_STMT, and insert it
6906 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
6909 vectorizable_condition (gimple stmt, gimple_stmt_iterator *gsi,
6912 tree scalar_dest = NULL_TREE;
6913 tree vec_dest = NULL_TREE;
6914 tree op = NULL_TREE;
6915 tree cond_expr, then_clause, else_clause;
6916 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6917 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6918 tree vec_cond_lhs, vec_cond_rhs, vec_then_clause, vec_else_clause;
6919 tree vec_compare, vec_cond_expr;
6921 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6922 enum machine_mode vec_mode;
6924 enum vect_def_type dt;
6925 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
6926 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
6927 enum tree_code code;
6929 gcc_assert (ncopies >= 1);
6931 return false; /* FORNOW */
6933 if (!STMT_VINFO_RELEVANT_P (stmt_info))
6936 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
6939 /* FORNOW: SLP not supported. */
6940 if (STMT_SLP_TYPE (stmt_info))
6943 /* FORNOW: not yet supported. */
6944 if (STMT_VINFO_LIVE_P (stmt_info))
6946 if (vect_print_dump_info (REPORT_DETAILS))
6947 fprintf (vect_dump, "value used after loop.");
6951 /* Is vectorizable conditional operation? */
6952 if (!is_gimple_assign (stmt))
6955 code = gimple_assign_rhs_code (stmt);
6957 if (code != COND_EXPR)
6960 gcc_assert (gimple_assign_single_p (stmt));
6961 op = gimple_assign_rhs1 (stmt);
6962 cond_expr = TREE_OPERAND (op, 0);
6963 then_clause = TREE_OPERAND (op, 1);
6964 else_clause = TREE_OPERAND (op, 2);
6966 if (!vect_is_simple_cond (cond_expr, loop_vinfo))
6969 /* We do not handle two different vector types for the condition
6971 if (TREE_TYPE (TREE_OPERAND (cond_expr, 0)) != TREE_TYPE (vectype))
6974 if (TREE_CODE (then_clause) == SSA_NAME)
6976 gimple then_def_stmt = SSA_NAME_DEF_STMT (then_clause);
6977 if (!vect_is_simple_use (then_clause, loop_vinfo,
6978 &then_def_stmt, &def, &dt))
6981 else if (TREE_CODE (then_clause) != INTEGER_CST
6982 && TREE_CODE (then_clause) != REAL_CST
6983 && TREE_CODE (then_clause) != FIXED_CST)
6986 if (TREE_CODE (else_clause) == SSA_NAME)
6988 gimple else_def_stmt = SSA_NAME_DEF_STMT (else_clause);
6989 if (!vect_is_simple_use (else_clause, loop_vinfo,
6990 &else_def_stmt, &def, &dt))
6993 else if (TREE_CODE (else_clause) != INTEGER_CST
6994 && TREE_CODE (else_clause) != REAL_CST
6995 && TREE_CODE (else_clause) != FIXED_CST)
6999 vec_mode = TYPE_MODE (vectype);
7003 STMT_VINFO_TYPE (stmt_info) = condition_vec_info_type;
7004 return expand_vec_cond_expr_p (op, vec_mode);
7010 scalar_dest = gimple_assign_lhs (stmt);
7011 vec_dest = vect_create_destination_var (scalar_dest, vectype);
7013 /* Handle cond expr. */
7015 vect_get_vec_def_for_operand (TREE_OPERAND (cond_expr, 0), stmt, NULL);
7017 vect_get_vec_def_for_operand (TREE_OPERAND (cond_expr, 1), stmt, NULL);
7018 vec_then_clause = vect_get_vec_def_for_operand (then_clause, stmt, NULL);
7019 vec_else_clause = vect_get_vec_def_for_operand (else_clause, stmt, NULL);
7021 /* Arguments are ready. Create the new vector stmt. */
7022 vec_compare = build2 (TREE_CODE (cond_expr), vectype,
7023 vec_cond_lhs, vec_cond_rhs);
7024 vec_cond_expr = build3 (VEC_COND_EXPR, vectype,
7025 vec_compare, vec_then_clause, vec_else_clause);
7027 *vec_stmt = gimple_build_assign (vec_dest, vec_cond_expr);
7028 new_temp = make_ssa_name (vec_dest, *vec_stmt);
7029 gimple_assign_set_lhs (*vec_stmt, new_temp);
7030 vect_finish_stmt_generation (stmt, *vec_stmt, gsi);
7036 /* Function vect_transform_stmt.
7038 Create a vectorized stmt to replace STMT, and insert it at BSI. */
7041 vect_transform_stmt (gimple stmt, gimple_stmt_iterator *gsi,
7042 bool *strided_store, slp_tree slp_node,
7043 slp_instance slp_node_instance)
7045 bool is_store = false;
7046 gimple vec_stmt = NULL;
7047 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
7048 gimple orig_stmt_in_pattern;
7050 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7051 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7053 switch (STMT_VINFO_TYPE (stmt_info))
7055 case type_demotion_vec_info_type:
7056 done = vectorizable_type_demotion (stmt, gsi, &vec_stmt, slp_node);
7060 case type_promotion_vec_info_type:
7061 done = vectorizable_type_promotion (stmt, gsi, &vec_stmt, slp_node);
7065 case type_conversion_vec_info_type:
7066 done = vectorizable_conversion (stmt, gsi, &vec_stmt, slp_node);
7070 case induc_vec_info_type:
7071 gcc_assert (!slp_node);
7072 done = vectorizable_induction (stmt, gsi, &vec_stmt);
7076 case op_vec_info_type:
7077 done = vectorizable_operation (stmt, gsi, &vec_stmt, slp_node);
7081 case assignment_vec_info_type:
7082 done = vectorizable_assignment (stmt, gsi, &vec_stmt, slp_node);
7086 case load_vec_info_type:
7087 done = vectorizable_load (stmt, gsi, &vec_stmt, slp_node,
7092 case store_vec_info_type:
7093 done = vectorizable_store (stmt, gsi, &vec_stmt, slp_node);
7095 if (STMT_VINFO_STRIDED_ACCESS (stmt_info) && !slp_node)
7097 /* In case of interleaving, the whole chain is vectorized when the
7098 last store in the chain is reached. Store stmts before the last
7099 one are skipped, and there vec_stmt_info shouldn't be freed
7101 *strided_store = true;
7102 if (STMT_VINFO_VEC_STMT (stmt_info))
7109 case condition_vec_info_type:
7110 gcc_assert (!slp_node);
7111 done = vectorizable_condition (stmt, gsi, &vec_stmt);
7115 case call_vec_info_type:
7116 gcc_assert (!slp_node);
7117 done = vectorizable_call (stmt, gsi, &vec_stmt);
7120 case reduc_vec_info_type:
7121 gcc_assert (!slp_node);
7122 done = vectorizable_reduction (stmt, gsi, &vec_stmt);
7127 if (!STMT_VINFO_LIVE_P (stmt_info))
7129 if (vect_print_dump_info (REPORT_DETAILS))
7130 fprintf (vect_dump, "stmt not supported.");
7135 /* Handle inner-loop stmts whose DEF is used in the loop-nest that
7136 is being vectorized, but outside the immediately enclosing loop. */
7138 && nested_in_vect_loop_p (loop, stmt)
7139 && STMT_VINFO_TYPE (stmt_info) != reduc_vec_info_type
7140 && (STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_outer
7141 || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_outer_by_reduction))
7143 struct loop *innerloop = loop->inner;
7144 imm_use_iterator imm_iter;
7145 use_operand_p use_p;
7149 if (vect_print_dump_info (REPORT_DETAILS))
7150 fprintf (vect_dump, "Record the vdef for outer-loop vectorization.");
7152 /* Find the relevant loop-exit phi-node, and reord the vec_stmt there
7153 (to be used when vectorizing outer-loop stmts that use the DEF of
7155 if (gimple_code (stmt) == GIMPLE_PHI)
7156 scalar_dest = PHI_RESULT (stmt);
7158 scalar_dest = gimple_assign_lhs (stmt);
7160 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
7162 if (!flow_bb_inside_loop_p (innerloop, gimple_bb (USE_STMT (use_p))))
7164 exit_phi = USE_STMT (use_p);
7165 STMT_VINFO_VEC_STMT (vinfo_for_stmt (exit_phi)) = vec_stmt;
7170 /* Handle stmts whose DEF is used outside the loop-nest that is
7171 being vectorized. */
7172 if (STMT_VINFO_LIVE_P (stmt_info)
7173 && STMT_VINFO_TYPE (stmt_info) != reduc_vec_info_type)
7175 done = vectorizable_live_operation (stmt, gsi, &vec_stmt);
7181 STMT_VINFO_VEC_STMT (stmt_info) = vec_stmt;
7182 orig_stmt_in_pattern = STMT_VINFO_RELATED_STMT (stmt_info);
7183 if (orig_stmt_in_pattern)
7185 stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt_in_pattern);
7186 /* STMT was inserted by the vectorizer to replace a computation idiom.
7187 ORIG_STMT_IN_PATTERN is a stmt in the original sequence that
7188 computed this idiom. We need to record a pointer to VEC_STMT in
7189 the stmt_info of ORIG_STMT_IN_PATTERN. See more details in the
7190 documentation of vect_pattern_recog. */
7191 if (STMT_VINFO_IN_PATTERN_P (stmt_vinfo))
7193 gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
7194 STMT_VINFO_VEC_STMT (stmt_vinfo) = vec_stmt;
7203 /* This function builds ni_name = number of iterations loop executes
7204 on the loop preheader. */
7207 vect_build_loop_niters (loop_vec_info loop_vinfo)
7210 gimple_seq stmts = NULL;
7212 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7213 tree ni = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo));
7215 var = create_tmp_var (TREE_TYPE (ni), "niters");
7216 add_referenced_var (var);
7217 ni_name = force_gimple_operand (ni, &stmts, false, var);
7219 pe = loop_preheader_edge (loop);
7222 basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7223 gcc_assert (!new_bb);
7230 /* This function generates the following statements:
7232 ni_name = number of iterations loop executes
7233 ratio = ni_name / vf
7234 ratio_mult_vf_name = ratio * vf
7236 and places them at the loop preheader edge. */
7239 vect_generate_tmps_on_preheader (loop_vec_info loop_vinfo,
7241 tree *ratio_mult_vf_name_ptr,
7242 tree *ratio_name_ptr)
7251 tree ratio_mult_vf_name;
7252 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7253 tree ni = LOOP_VINFO_NITERS (loop_vinfo);
7254 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
7257 pe = loop_preheader_edge (loop);
7259 /* Generate temporary variable that contains
7260 number of iterations loop executes. */
7262 ni_name = vect_build_loop_niters (loop_vinfo);
7263 log_vf = build_int_cst (TREE_TYPE (ni), exact_log2 (vf));
7265 /* Create: ratio = ni >> log2(vf) */
7267 ratio_name = fold_build2 (RSHIFT_EXPR, TREE_TYPE (ni_name), ni_name, log_vf);
7268 if (!is_gimple_val (ratio_name))
7270 var = create_tmp_var (TREE_TYPE (ni), "bnd");
7271 add_referenced_var (var);
7274 ratio_name = force_gimple_operand (ratio_name, &stmts, true, var);
7275 pe = loop_preheader_edge (loop);
7276 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7277 gcc_assert (!new_bb);
7280 /* Create: ratio_mult_vf = ratio << log2 (vf). */
7282 ratio_mult_vf_name = fold_build2 (LSHIFT_EXPR, TREE_TYPE (ratio_name),
7283 ratio_name, log_vf);
7284 if (!is_gimple_val (ratio_mult_vf_name))
7286 var = create_tmp_var (TREE_TYPE (ni), "ratio_mult_vf");
7287 add_referenced_var (var);
7290 ratio_mult_vf_name = force_gimple_operand (ratio_mult_vf_name, &stmts,
7292 pe = loop_preheader_edge (loop);
7293 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7294 gcc_assert (!new_bb);
7297 *ni_name_ptr = ni_name;
7298 *ratio_mult_vf_name_ptr = ratio_mult_vf_name;
7299 *ratio_name_ptr = ratio_name;
7305 /* Function vect_update_ivs_after_vectorizer.
7307 "Advance" the induction variables of LOOP to the value they should take
7308 after the execution of LOOP. This is currently necessary because the
7309 vectorizer does not handle induction variables that are used after the
7310 loop. Such a situation occurs when the last iterations of LOOP are
7312 1. We introduced new uses after LOOP for IVs that were not originally used
7313 after LOOP: the IVs of LOOP are now used by an epilog loop.
7314 2. LOOP is going to be vectorized; this means that it will iterate N/VF
7315 times, whereas the loop IVs should be bumped N times.
7318 - LOOP - a loop that is going to be vectorized. The last few iterations
7319 of LOOP were peeled.
7320 - NITERS - the number of iterations that LOOP executes (before it is
7321 vectorized). i.e, the number of times the ivs should be bumped.
7322 - UPDATE_E - a successor edge of LOOP->exit that is on the (only) path
7323 coming out from LOOP on which there are uses of the LOOP ivs
7324 (this is the path from LOOP->exit to epilog_loop->preheader).
7326 The new definitions of the ivs are placed in LOOP->exit.
7327 The phi args associated with the edge UPDATE_E in the bb
7328 UPDATE_E->dest are updated accordingly.
7330 Assumption 1: Like the rest of the vectorizer, this function assumes
7331 a single loop exit that has a single predecessor.
7333 Assumption 2: The phi nodes in the LOOP header and in update_bb are
7334 organized in the same order.
7336 Assumption 3: The access function of the ivs is simple enough (see
7337 vect_can_advance_ivs_p). This assumption will be relaxed in the future.
7339 Assumption 4: Exactly one of the successors of LOOP exit-bb is on a path
7340 coming out of LOOP on which the ivs of LOOP are used (this is the path
7341 that leads to the epilog loop; other paths skip the epilog loop). This
7342 path starts with the edge UPDATE_E, and its destination (denoted update_bb)
7343 needs to have its phis updated.
7347 vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo, tree niters,
7350 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7351 basic_block exit_bb = single_exit (loop)->dest;
7353 gimple_stmt_iterator gsi, gsi1;
7354 basic_block update_bb = update_e->dest;
7356 /* gcc_assert (vect_can_advance_ivs_p (loop_vinfo)); */
7358 /* Make sure there exists a single-predecessor exit bb: */
7359 gcc_assert (single_pred_p (exit_bb));
7361 for (gsi = gsi_start_phis (loop->header), gsi1 = gsi_start_phis (update_bb);
7362 !gsi_end_p (gsi) && !gsi_end_p (gsi1);
7363 gsi_next (&gsi), gsi_next (&gsi1))
7365 tree access_fn = NULL;
7366 tree evolution_part;
7369 tree var, ni, ni_name;
7370 gimple_stmt_iterator last_gsi;
7372 phi = gsi_stmt (gsi);
7373 phi1 = gsi_stmt (gsi1);
7374 if (vect_print_dump_info (REPORT_DETAILS))
7376 fprintf (vect_dump, "vect_update_ivs_after_vectorizer: phi: ");
7377 print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
7380 /* Skip virtual phi's. */
7381 if (!is_gimple_reg (SSA_NAME_VAR (PHI_RESULT (phi))))
7383 if (vect_print_dump_info (REPORT_DETAILS))
7384 fprintf (vect_dump, "virtual phi. skip.");
7388 /* Skip reduction phis. */
7389 if (STMT_VINFO_DEF_TYPE (vinfo_for_stmt (phi)) == vect_reduction_def)
7391 if (vect_print_dump_info (REPORT_DETAILS))
7392 fprintf (vect_dump, "reduc phi. skip.");
7396 access_fn = analyze_scalar_evolution (loop, PHI_RESULT (phi));
7397 gcc_assert (access_fn);
7398 STRIP_NOPS (access_fn);
7400 unshare_expr (evolution_part_in_loop_num (access_fn, loop->num));
7401 gcc_assert (evolution_part != NULL_TREE);
7403 /* FORNOW: We do not support IVs whose evolution function is a polynomial
7404 of degree >= 2 or exponential. */
7405 gcc_assert (!tree_is_chrec (evolution_part));
7407 step_expr = evolution_part;
7408 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn,
7411 if (POINTER_TYPE_P (TREE_TYPE (init_expr)))
7412 ni = fold_build2 (POINTER_PLUS_EXPR, TREE_TYPE (init_expr),
7414 fold_convert (sizetype,
7415 fold_build2 (MULT_EXPR, TREE_TYPE (niters),
7416 niters, step_expr)));
7418 ni = fold_build2 (PLUS_EXPR, TREE_TYPE (init_expr),
7419 fold_build2 (MULT_EXPR, TREE_TYPE (init_expr),
7420 fold_convert (TREE_TYPE (init_expr),
7427 var = create_tmp_var (TREE_TYPE (init_expr), "tmp");
7428 add_referenced_var (var);
7430 last_gsi = gsi_last_bb (exit_bb);
7431 ni_name = force_gimple_operand_gsi (&last_gsi, ni, false, var,
7432 true, GSI_SAME_STMT);
7434 /* Fix phi expressions in the successor bb. */
7435 SET_PHI_ARG_DEF (phi1, update_e->dest_idx, ni_name);
7439 /* Return the more conservative threshold between the
7440 min_profitable_iters returned by the cost model and the user
7441 specified threshold, if provided. */
7444 conservative_cost_threshold (loop_vec_info loop_vinfo,
7445 int min_profitable_iters)
7448 int min_scalar_loop_bound;
7450 min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
7451 * LOOP_VINFO_VECT_FACTOR (loop_vinfo)) - 1);
7453 /* Use the cost model only if it is more conservative than user specified
7455 th = (unsigned) min_scalar_loop_bound;
7456 if (min_profitable_iters
7457 && (!min_scalar_loop_bound
7458 || min_profitable_iters > min_scalar_loop_bound))
7459 th = (unsigned) min_profitable_iters;
7461 if (th && vect_print_dump_info (REPORT_COST))
7462 fprintf (vect_dump, "Vectorization may not be profitable.");
7467 /* Function vect_do_peeling_for_loop_bound
7469 Peel the last iterations of the loop represented by LOOP_VINFO.
7470 The peeled iterations form a new epilog loop. Given that the loop now
7471 iterates NITERS times, the new epilog loop iterates
7472 NITERS % VECTORIZATION_FACTOR times.
7474 The original loop will later be made to iterate
7475 NITERS / VECTORIZATION_FACTOR times (this value is placed into RATIO). */
7478 vect_do_peeling_for_loop_bound (loop_vec_info loop_vinfo, tree *ratio)
7480 tree ni_name, ratio_mult_vf_name;
7481 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7482 struct loop *new_loop;
7484 basic_block preheader;
7486 bool check_profitability = false;
7487 unsigned int th = 0;
7488 int min_profitable_iters;
7490 if (vect_print_dump_info (REPORT_DETAILS))
7491 fprintf (vect_dump, "=== vect_do_peeling_for_loop_bound ===");
7493 initialize_original_copy_tables ();
7495 /* Generate the following variables on the preheader of original loop:
7497 ni_name = number of iteration the original loop executes
7498 ratio = ni_name / vf
7499 ratio_mult_vf_name = ratio * vf */
7500 vect_generate_tmps_on_preheader (loop_vinfo, &ni_name,
7501 &ratio_mult_vf_name, ratio);
7503 loop_num = loop->num;
7505 /* If cost model check not done during versioning and
7506 peeling for alignment. */
7507 if (!VEC_length (gimple, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo))
7508 && !VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo))
7509 && !LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
7511 check_profitability = true;
7513 /* Get profitability threshold for vectorized loop. */
7514 min_profitable_iters = LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo);
7516 th = conservative_cost_threshold (loop_vinfo,
7517 min_profitable_iters);
7520 new_loop = slpeel_tree_peel_loop_to_edge (loop, single_exit (loop),
7521 ratio_mult_vf_name, ni_name, false,
7522 th, check_profitability);
7523 gcc_assert (new_loop);
7524 gcc_assert (loop_num == loop->num);
7525 #ifdef ENABLE_CHECKING
7526 slpeel_verify_cfg_after_peeling (loop, new_loop);
7529 /* A guard that controls whether the new_loop is to be executed or skipped
7530 is placed in LOOP->exit. LOOP->exit therefore has two successors - one
7531 is the preheader of NEW_LOOP, where the IVs from LOOP are used. The other
7532 is a bb after NEW_LOOP, where these IVs are not used. Find the edge that
7533 is on the path where the LOOP IVs are used and need to be updated. */
7535 preheader = loop_preheader_edge (new_loop)->src;
7536 if (EDGE_PRED (preheader, 0)->src == single_exit (loop)->dest)
7537 update_e = EDGE_PRED (preheader, 0);
7539 update_e = EDGE_PRED (preheader, 1);
7541 /* Update IVs of original loop as if they were advanced
7542 by ratio_mult_vf_name steps. */
7543 vect_update_ivs_after_vectorizer (loop_vinfo, ratio_mult_vf_name, update_e);
7545 /* After peeling we have to reset scalar evolution analyzer. */
7548 free_original_copy_tables ();
7552 /* Function vect_gen_niters_for_prolog_loop
7554 Set the number of iterations for the loop represented by LOOP_VINFO
7555 to the minimum between LOOP_NITERS (the original iteration count of the loop)
7556 and the misalignment of DR - the data reference recorded in
7557 LOOP_VINFO_UNALIGNED_DR (LOOP_VINFO). As a result, after the execution of
7558 this loop, the data reference DR will refer to an aligned location.
7560 The following computation is generated:
7562 If the misalignment of DR is known at compile time:
7563 addr_mis = int mis = DR_MISALIGNMENT (dr);
7564 Else, compute address misalignment in bytes:
7565 addr_mis = addr & (vectype_size - 1)
7567 prolog_niters = min (LOOP_NITERS, ((VF - addr_mis/elem_size)&(VF-1))/step)
7569 (elem_size = element type size; an element is the scalar element whose type
7570 is the inner type of the vectype)
7572 When the step of the data-ref in the loop is not 1 (as in interleaved data
7573 and SLP), the number of iterations of the prolog must be divided by the step
7574 (which is equal to the size of interleaved group).
7576 The above formulas assume that VF == number of elements in the vector. This
7577 may not hold when there are multiple-types in the loop.
7578 In this case, for some data-references in the loop the VF does not represent
7579 the number of elements that fit in the vector. Therefore, instead of VF we
7580 use TYPE_VECTOR_SUBPARTS. */
7583 vect_gen_niters_for_prolog_loop (loop_vec_info loop_vinfo, tree loop_niters)
7585 struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
7586 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7589 tree iters, iters_name;
7592 gimple dr_stmt = DR_STMT (dr);
7593 stmt_vec_info stmt_info = vinfo_for_stmt (dr_stmt);
7594 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7595 int vectype_align = TYPE_ALIGN (vectype) / BITS_PER_UNIT;
7596 tree niters_type = TREE_TYPE (loop_niters);
7598 int element_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
7599 int nelements = TYPE_VECTOR_SUBPARTS (vectype);
7601 if (STMT_VINFO_STRIDED_ACCESS (stmt_info))
7602 step = DR_GROUP_SIZE (vinfo_for_stmt (DR_GROUP_FIRST_DR (stmt_info)));
7604 pe = loop_preheader_edge (loop);
7606 if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
7608 int byte_misalign = LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo);
7609 int elem_misalign = byte_misalign / element_size;
7611 if (vect_print_dump_info (REPORT_DETAILS))
7612 fprintf (vect_dump, "known alignment = %d.", byte_misalign);
7614 iters = build_int_cst (niters_type,
7615 (((nelements - elem_misalign) & (nelements - 1)) / step));
7619 gimple_seq new_stmts = NULL;
7620 tree start_addr = vect_create_addr_base_for_vector_ref (dr_stmt,
7621 &new_stmts, NULL_TREE, loop);
7622 tree ptr_type = TREE_TYPE (start_addr);
7623 tree size = TYPE_SIZE (ptr_type);
7624 tree type = lang_hooks.types.type_for_size (tree_low_cst (size, 1), 1);
7625 tree vectype_size_minus_1 = build_int_cst (type, vectype_align - 1);
7626 tree elem_size_log =
7627 build_int_cst (type, exact_log2 (vectype_align/nelements));
7628 tree nelements_minus_1 = build_int_cst (type, nelements - 1);
7629 tree nelements_tree = build_int_cst (type, nelements);
7633 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmts);
7634 gcc_assert (!new_bb);
7636 /* Create: byte_misalign = addr & (vectype_size - 1) */
7638 fold_build2 (BIT_AND_EXPR, type, fold_convert (type, start_addr), vectype_size_minus_1);
7640 /* Create: elem_misalign = byte_misalign / element_size */
7642 fold_build2 (RSHIFT_EXPR, type, byte_misalign, elem_size_log);
7644 /* Create: (niters_type) (nelements - elem_misalign)&(nelements - 1) */
7645 iters = fold_build2 (MINUS_EXPR, type, nelements_tree, elem_misalign);
7646 iters = fold_build2 (BIT_AND_EXPR, type, iters, nelements_minus_1);
7647 iters = fold_convert (niters_type, iters);
7650 /* Create: prolog_loop_niters = min (iters, loop_niters) */
7651 /* If the loop bound is known at compile time we already verified that it is
7652 greater than vf; since the misalignment ('iters') is at most vf, there's
7653 no need to generate the MIN_EXPR in this case. */
7654 if (TREE_CODE (loop_niters) != INTEGER_CST)
7655 iters = fold_build2 (MIN_EXPR, niters_type, iters, loop_niters);
7657 if (vect_print_dump_info (REPORT_DETAILS))
7659 fprintf (vect_dump, "niters for prolog loop: ");
7660 print_generic_expr (vect_dump, iters, TDF_SLIM);
7663 var = create_tmp_var (niters_type, "prolog_loop_niters");
7664 add_referenced_var (var);
7666 iters_name = force_gimple_operand (iters, &stmts, false, var);
7668 /* Insert stmt on loop preheader edge. */
7671 basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7672 gcc_assert (!new_bb);
7679 /* Function vect_update_init_of_dr
7681 NITERS iterations were peeled from LOOP. DR represents a data reference
7682 in LOOP. This function updates the information recorded in DR to
7683 account for the fact that the first NITERS iterations had already been
7684 executed. Specifically, it updates the OFFSET field of DR. */
7687 vect_update_init_of_dr (struct data_reference *dr, tree niters)
7689 tree offset = DR_OFFSET (dr);
7691 niters = fold_build2 (MULT_EXPR, sizetype,
7692 fold_convert (sizetype, niters),
7693 fold_convert (sizetype, DR_STEP (dr)));
7694 offset = fold_build2 (PLUS_EXPR, sizetype, offset, niters);
7695 DR_OFFSET (dr) = offset;
7699 /* Function vect_update_inits_of_drs
7701 NITERS iterations were peeled from the loop represented by LOOP_VINFO.
7702 This function updates the information recorded for the data references in
7703 the loop to account for the fact that the first NITERS iterations had
7704 already been executed. Specifically, it updates the initial_condition of
7705 the access_function of all the data_references in the loop. */
7708 vect_update_inits_of_drs (loop_vec_info loop_vinfo, tree niters)
7711 VEC (data_reference_p, heap) *datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
7712 struct data_reference *dr;
7714 if (vect_print_dump_info (REPORT_DETAILS))
7715 fprintf (vect_dump, "=== vect_update_inits_of_dr ===");
7717 for (i = 0; VEC_iterate (data_reference_p, datarefs, i, dr); i++)
7718 vect_update_init_of_dr (dr, niters);
7722 /* Function vect_do_peeling_for_alignment
7724 Peel the first 'niters' iterations of the loop represented by LOOP_VINFO.
7725 'niters' is set to the misalignment of one of the data references in the
7726 loop, thereby forcing it to refer to an aligned location at the beginning
7727 of the execution of this loop. The data reference for which we are
7728 peeling is recorded in LOOP_VINFO_UNALIGNED_DR. */
7731 vect_do_peeling_for_alignment (loop_vec_info loop_vinfo)
7733 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7734 tree niters_of_prolog_loop, ni_name;
7736 struct loop *new_loop;
7737 bool check_profitability = false;
7738 unsigned int th = 0;
7739 int min_profitable_iters;
7741 if (vect_print_dump_info (REPORT_DETAILS))
7742 fprintf (vect_dump, "=== vect_do_peeling_for_alignment ===");
7744 initialize_original_copy_tables ();
7746 ni_name = vect_build_loop_niters (loop_vinfo);
7747 niters_of_prolog_loop = vect_gen_niters_for_prolog_loop (loop_vinfo, ni_name);
7750 /* If cost model check not done during versioning. */
7751 if (!VEC_length (gimple, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo))
7752 && !VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
7754 check_profitability = true;
7756 /* Get profitability threshold for vectorized loop. */
7757 min_profitable_iters = LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo);
7759 th = conservative_cost_threshold (loop_vinfo,
7760 min_profitable_iters);
7763 /* Peel the prolog loop and iterate it niters_of_prolog_loop. */
7765 slpeel_tree_peel_loop_to_edge (loop, loop_preheader_edge (loop),
7766 niters_of_prolog_loop, ni_name, true,
7767 th, check_profitability);
7769 gcc_assert (new_loop);
7770 #ifdef ENABLE_CHECKING
7771 slpeel_verify_cfg_after_peeling (new_loop, loop);
7774 /* Update number of times loop executes. */
7775 n_iters = LOOP_VINFO_NITERS (loop_vinfo);
7776 LOOP_VINFO_NITERS (loop_vinfo) = fold_build2 (MINUS_EXPR,
7777 TREE_TYPE (n_iters), n_iters, niters_of_prolog_loop);
7779 /* Update the init conditions of the access functions of all data refs. */
7780 vect_update_inits_of_drs (loop_vinfo, niters_of_prolog_loop);
7782 /* After peeling we have to reset scalar evolution analyzer. */
7785 free_original_copy_tables ();
7789 /* Function vect_create_cond_for_align_checks.
7791 Create a conditional expression that represents the alignment checks for
7792 all of data references (array element references) whose alignment must be
7796 COND_EXPR - input conditional expression. New conditions will be chained
7797 with logical AND operation.
7798 LOOP_VINFO - two fields of the loop information are used.
7799 LOOP_VINFO_PTR_MASK is the mask used to check the alignment.
7800 LOOP_VINFO_MAY_MISALIGN_STMTS contains the refs to be checked.
7803 COND_EXPR_STMT_LIST - statements needed to construct the conditional
7805 The returned value is the conditional expression to be used in the if
7806 statement that controls which version of the loop gets executed at runtime.
7808 The algorithm makes two assumptions:
7809 1) The number of bytes "n" in a vector is a power of 2.
7810 2) An address "a" is aligned if a%n is zero and that this
7811 test can be done as a&(n-1) == 0. For example, for 16
7812 byte vectors the test is a&0xf == 0. */
7815 vect_create_cond_for_align_checks (loop_vec_info loop_vinfo,
7817 gimple_seq *cond_expr_stmt_list)
7819 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7820 VEC(gimple,heap) *may_misalign_stmts
7821 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
7823 int mask = LOOP_VINFO_PTR_MASK (loop_vinfo);
7827 tree int_ptrsize_type;
7829 tree or_tmp_name = NULL_TREE;
7830 tree and_tmp, and_tmp_name;
7833 tree part_cond_expr;
7835 /* Check that mask is one less than a power of 2, i.e., mask is
7836 all zeros followed by all ones. */
7837 gcc_assert ((mask != 0) && ((mask & (mask+1)) == 0));
7839 /* CHECKME: what is the best integer or unsigned type to use to hold a
7840 cast from a pointer value? */
7841 psize = TYPE_SIZE (ptr_type_node);
7843 = lang_hooks.types.type_for_size (tree_low_cst (psize, 1), 0);
7845 /* Create expression (mask & (dr_1 || ... || dr_n)) where dr_i is the address
7846 of the first vector of the i'th data reference. */
7848 for (i = 0; VEC_iterate (gimple, may_misalign_stmts, i, ref_stmt); i++)
7850 gimple_seq new_stmt_list = NULL;
7852 tree addr_tmp, addr_tmp_name;
7853 tree or_tmp, new_or_tmp_name;
7854 gimple addr_stmt, or_stmt;
7856 /* create: addr_tmp = (int)(address_of_first_vector) */
7858 vect_create_addr_base_for_vector_ref (ref_stmt, &new_stmt_list,
7860 if (new_stmt_list != NULL)
7861 gimple_seq_add_seq (cond_expr_stmt_list, new_stmt_list);
7863 sprintf (tmp_name, "%s%d", "addr2int", i);
7864 addr_tmp = create_tmp_var (int_ptrsize_type, tmp_name);
7865 add_referenced_var (addr_tmp);
7866 addr_tmp_name = make_ssa_name (addr_tmp, NULL);
7867 addr_stmt = gimple_build_assign_with_ops (NOP_EXPR, addr_tmp_name,
7868 addr_base, NULL_TREE);
7869 SSA_NAME_DEF_STMT (addr_tmp_name) = addr_stmt;
7870 gimple_seq_add_stmt (cond_expr_stmt_list, addr_stmt);
7872 /* The addresses are OR together. */
7874 if (or_tmp_name != NULL_TREE)
7876 /* create: or_tmp = or_tmp | addr_tmp */
7877 sprintf (tmp_name, "%s%d", "orptrs", i);
7878 or_tmp = create_tmp_var (int_ptrsize_type, tmp_name);
7879 add_referenced_var (or_tmp);
7880 new_or_tmp_name = make_ssa_name (or_tmp, NULL);
7881 or_stmt = gimple_build_assign_with_ops (BIT_IOR_EXPR,
7883 or_tmp_name, addr_tmp_name);
7884 SSA_NAME_DEF_STMT (new_or_tmp_name) = or_stmt;
7885 gimple_seq_add_stmt (cond_expr_stmt_list, or_stmt);
7886 or_tmp_name = new_or_tmp_name;
7889 or_tmp_name = addr_tmp_name;
7893 mask_cst = build_int_cst (int_ptrsize_type, mask);
7895 /* create: and_tmp = or_tmp & mask */
7896 and_tmp = create_tmp_var (int_ptrsize_type, "andmask" );
7897 add_referenced_var (and_tmp);
7898 and_tmp_name = make_ssa_name (and_tmp, NULL);
7900 and_stmt = gimple_build_assign_with_ops (BIT_AND_EXPR, and_tmp_name,
7901 or_tmp_name, mask_cst);
7902 SSA_NAME_DEF_STMT (and_tmp_name) = and_stmt;
7903 gimple_seq_add_stmt (cond_expr_stmt_list, and_stmt);
7905 /* Make and_tmp the left operand of the conditional test against zero.
7906 if and_tmp has a nonzero bit then some address is unaligned. */
7907 ptrsize_zero = build_int_cst (int_ptrsize_type, 0);
7908 part_cond_expr = fold_build2 (EQ_EXPR, boolean_type_node,
7909 and_tmp_name, ptrsize_zero);
7911 *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
7912 *cond_expr, part_cond_expr);
7914 *cond_expr = part_cond_expr;
7917 /* Function vect_vfa_segment_size.
7919 Create an expression that computes the size of segment
7920 that will be accessed for a data reference. The functions takes into
7921 account that realignment loads may access one more vector.
7924 DR: The data reference.
7925 VECT_FACTOR: vectorization factor.
7927 Return an expression whose value is the size of segment which will be
7931 vect_vfa_segment_size (struct data_reference *dr, tree vect_factor)
7933 tree segment_length = fold_build2 (MULT_EXPR, integer_type_node,
7934 DR_STEP (dr), vect_factor);
7936 if (vect_supportable_dr_alignment (dr) == dr_explicit_realign_optimized)
7938 tree vector_size = TYPE_SIZE_UNIT
7939 (STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr))));
7941 segment_length = fold_build2 (PLUS_EXPR, integer_type_node,
7942 segment_length, vector_size);
7944 return fold_convert (sizetype, segment_length);
7947 /* Function vect_create_cond_for_alias_checks.
7949 Create a conditional expression that represents the run-time checks for
7950 overlapping of address ranges represented by a list of data references
7951 relations passed as input.
7954 COND_EXPR - input conditional expression. New conditions will be chained
7955 with logical AND operation.
7956 LOOP_VINFO - field LOOP_VINFO_MAY_ALIAS_STMTS contains the list of ddrs
7960 COND_EXPR - conditional expression.
7961 COND_EXPR_STMT_LIST - statements needed to construct the conditional
7965 The returned value is the conditional expression to be used in the if
7966 statement that controls which version of the loop gets executed at runtime.
7970 vect_create_cond_for_alias_checks (loop_vec_info loop_vinfo,
7972 gimple_seq * cond_expr_stmt_list)
7974 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7975 VEC (ddr_p, heap) * may_alias_ddrs =
7976 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
7978 build_int_cst (integer_type_node, LOOP_VINFO_VECT_FACTOR (loop_vinfo));
7982 tree part_cond_expr;
7984 /* Create expression
7985 ((store_ptr_0 + store_segment_length_0) < load_ptr_0)
7986 || (load_ptr_0 + load_segment_length_0) < store_ptr_0))
7990 ((store_ptr_n + store_segment_length_n) < load_ptr_n)
7991 || (load_ptr_n + load_segment_length_n) < store_ptr_n)) */
7993 if (VEC_empty (ddr_p, may_alias_ddrs))
7996 for (i = 0; VEC_iterate (ddr_p, may_alias_ddrs, i, ddr); i++)
7998 struct data_reference *dr_a, *dr_b;
7999 gimple dr_group_first_a, dr_group_first_b;
8000 tree addr_base_a, addr_base_b;
8001 tree segment_length_a, segment_length_b;
8002 gimple stmt_a, stmt_b;
8005 stmt_a = DR_STMT (DDR_A (ddr));
8006 dr_group_first_a = DR_GROUP_FIRST_DR (vinfo_for_stmt (stmt_a));
8007 if (dr_group_first_a)
8009 stmt_a = dr_group_first_a;
8010 dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
8014 stmt_b = DR_STMT (DDR_B (ddr));
8015 dr_group_first_b = DR_GROUP_FIRST_DR (vinfo_for_stmt (stmt_b));
8016 if (dr_group_first_b)
8018 stmt_b = dr_group_first_b;
8019 dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
8023 vect_create_addr_base_for_vector_ref (stmt_a, cond_expr_stmt_list,
8026 vect_create_addr_base_for_vector_ref (stmt_b, cond_expr_stmt_list,
8029 segment_length_a = vect_vfa_segment_size (dr_a, vect_factor);
8030 segment_length_b = vect_vfa_segment_size (dr_b, vect_factor);
8032 if (vect_print_dump_info (REPORT_DR_DETAILS))
8035 "create runtime check for data references ");
8036 print_generic_expr (vect_dump, DR_REF (dr_a), TDF_SLIM);
8037 fprintf (vect_dump, " and ");
8038 print_generic_expr (vect_dump, DR_REF (dr_b), TDF_SLIM);
8043 fold_build2 (TRUTH_OR_EXPR, boolean_type_node,
8044 fold_build2 (LT_EXPR, boolean_type_node,
8045 fold_build2 (POINTER_PLUS_EXPR, TREE_TYPE (addr_base_a),
8049 fold_build2 (LT_EXPR, boolean_type_node,
8050 fold_build2 (POINTER_PLUS_EXPR, TREE_TYPE (addr_base_b),
8056 *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
8057 *cond_expr, part_cond_expr);
8059 *cond_expr = part_cond_expr;
8061 if (vect_print_dump_info (REPORT_VECTORIZED_LOOPS))
8062 fprintf (vect_dump, "created %u versioning for alias checks.\n",
8063 VEC_length (ddr_p, may_alias_ddrs));
8067 /* Function vect_loop_versioning.
8069 If the loop has data references that may or may not be aligned or/and
8070 has data reference relations whose independence was not proven then
8071 two versions of the loop need to be generated, one which is vectorized
8072 and one which isn't. A test is then generated to control which of the
8073 loops is executed. The test checks for the alignment of all of the
8074 data references that may or may not be aligned. An additional
8075 sequence of runtime tests is generated for each pairs of DDRs whose
8076 independence was not proven. The vectorized version of loop is
8077 executed only if both alias and alignment tests are passed.
8079 The test generated to check which version of loop is executed
8080 is modified to also check for profitability as indicated by the
8081 cost model initially. */
8084 vect_loop_versioning (loop_vec_info loop_vinfo)
8086 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8088 tree cond_expr = NULL_TREE;
8089 gimple_seq cond_expr_stmt_list = NULL;
8090 basic_block condition_bb;
8091 gimple_stmt_iterator gsi, cond_exp_gsi;
8092 basic_block merge_bb;
8093 basic_block new_exit_bb;
8095 gimple orig_phi, new_phi;
8097 unsigned prob = 4 * REG_BR_PROB_BASE / 5;
8098 gimple_seq gimplify_stmt_list = NULL;
8099 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
8100 int min_profitable_iters = 0;
8103 /* Get profitability threshold for vectorized loop. */
8104 min_profitable_iters = LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo);
8106 th = conservative_cost_threshold (loop_vinfo,
8107 min_profitable_iters);
8110 build2 (GT_EXPR, boolean_type_node, scalar_loop_iters,
8111 build_int_cst (TREE_TYPE (scalar_loop_iters), th));
8113 cond_expr = force_gimple_operand (cond_expr, &cond_expr_stmt_list,
8116 if (VEC_length (gimple, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo)))
8117 vect_create_cond_for_align_checks (loop_vinfo, &cond_expr,
8118 &cond_expr_stmt_list);
8120 if (VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
8121 vect_create_cond_for_alias_checks (loop_vinfo, &cond_expr,
8122 &cond_expr_stmt_list);
8125 fold_build2 (NE_EXPR, boolean_type_node, cond_expr, integer_zero_node);
8127 force_gimple_operand (cond_expr, &gimplify_stmt_list, true, NULL_TREE);
8128 gimple_seq_add_seq (&cond_expr_stmt_list, gimplify_stmt_list);
8130 initialize_original_copy_tables ();
8131 nloop = loop_version (loop, cond_expr, &condition_bb,
8132 prob, prob, REG_BR_PROB_BASE - prob, true);
8133 free_original_copy_tables();
8135 /* Loop versioning violates an assumption we try to maintain during
8136 vectorization - that the loop exit block has a single predecessor.
8137 After versioning, the exit block of both loop versions is the same
8138 basic block (i.e. it has two predecessors). Just in order to simplify
8139 following transformations in the vectorizer, we fix this situation
8140 here by adding a new (empty) block on the exit-edge of the loop,
8141 with the proper loop-exit phis to maintain loop-closed-form. */
8143 merge_bb = single_exit (loop)->dest;
8144 gcc_assert (EDGE_COUNT (merge_bb->preds) == 2);
8145 new_exit_bb = split_edge (single_exit (loop));
8146 new_exit_e = single_exit (loop);
8147 e = EDGE_SUCC (new_exit_bb, 0);
8149 for (gsi = gsi_start_phis (merge_bb); !gsi_end_p (gsi); gsi_next (&gsi))
8151 orig_phi = gsi_stmt (gsi);
8152 new_phi = create_phi_node (SSA_NAME_VAR (PHI_RESULT (orig_phi)),
8154 arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, e);
8155 add_phi_arg (new_phi, arg, new_exit_e);
8156 SET_PHI_ARG_DEF (orig_phi, e->dest_idx, PHI_RESULT (new_phi));
8159 /* End loop-exit-fixes after versioning. */
8161 update_ssa (TODO_update_ssa);
8162 if (cond_expr_stmt_list)
8164 cond_exp_gsi = gsi_last_bb (condition_bb);
8165 gsi_insert_seq_before (&cond_exp_gsi, cond_expr_stmt_list, GSI_SAME_STMT);
8169 /* Remove a group of stores (for SLP or interleaving), free their
8173 vect_remove_stores (gimple first_stmt)
8175 gimple next = first_stmt;
8177 gimple_stmt_iterator next_si;
8181 /* Free the attached stmt_vec_info and remove the stmt. */
8182 next_si = gsi_for_stmt (next);
8183 gsi_remove (&next_si, true);
8184 tmp = DR_GROUP_NEXT_DR (vinfo_for_stmt (next));
8185 free_stmt_vec_info (next);
8191 /* Vectorize SLP instance tree in postorder. */
8194 vect_schedule_slp_instance (slp_tree node, slp_instance instance,
8195 unsigned int vectorization_factor)
8198 bool strided_store, is_store;
8199 gimple_stmt_iterator si;
8200 stmt_vec_info stmt_info;
8201 unsigned int vec_stmts_size, nunits, group_size;
8204 slp_tree loads_node;
8209 vect_schedule_slp_instance (SLP_TREE_LEFT (node), instance,
8210 vectorization_factor);
8211 vect_schedule_slp_instance (SLP_TREE_RIGHT (node), instance,
8212 vectorization_factor);
8214 stmt = VEC_index (gimple, SLP_TREE_SCALAR_STMTS (node), 0);
8215 stmt_info = vinfo_for_stmt (stmt);
8217 /* VECTYPE is the type of the destination. */
8218 vectype = get_vectype_for_scalar_type (TREE_TYPE (gimple_assign_lhs (stmt)));
8219 nunits = (unsigned int) TYPE_VECTOR_SUBPARTS (vectype);
8220 group_size = SLP_INSTANCE_GROUP_SIZE (instance);
8222 /* For each SLP instance calculate number of vector stmts to be created
8223 for the scalar stmts in each node of the SLP tree. Number of vector
8224 elements in one vector iteration is the number of scalar elements in
8225 one scalar iteration (GROUP_SIZE) multiplied by VF divided by vector
8227 vec_stmts_size = (vectorization_factor * group_size) / nunits;
8229 /* In case of load permutation we have to allocate vectorized statements for
8230 all the nodes that participate in that permutation. */
8231 if (SLP_INSTANCE_LOAD_PERMUTATION (instance))
8234 VEC_iterate (slp_tree, SLP_INSTANCE_LOADS (instance), i, loads_node);
8237 if (!SLP_TREE_VEC_STMTS (loads_node))
8239 SLP_TREE_VEC_STMTS (loads_node) = VEC_alloc (gimple, heap,
8241 SLP_TREE_NUMBER_OF_VEC_STMTS (loads_node) = vec_stmts_size;
8246 if (!SLP_TREE_VEC_STMTS (node))
8248 SLP_TREE_VEC_STMTS (node) = VEC_alloc (gimple, heap, vec_stmts_size);
8249 SLP_TREE_NUMBER_OF_VEC_STMTS (node) = vec_stmts_size;
8252 if (vect_print_dump_info (REPORT_DETAILS))
8254 fprintf (vect_dump, "------>vectorizing SLP node starting from: ");
8255 print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
8258 /* Loads should be inserted before the first load. */
8259 if (SLP_INSTANCE_FIRST_LOAD_STMT (instance)
8260 && STMT_VINFO_STRIDED_ACCESS (stmt_info)
8261 && !REFERENCE_CLASS_P (gimple_get_lhs (stmt)))
8262 si = gsi_for_stmt (SLP_INSTANCE_FIRST_LOAD_STMT (instance));
8264 si = gsi_for_stmt (stmt);
8266 is_store = vect_transform_stmt (stmt, &si, &strided_store, node, instance);
8269 if (DR_GROUP_FIRST_DR (stmt_info))
8270 /* If IS_STORE is TRUE, the vectorization of the
8271 interleaving chain was completed - free all the stores in
8273 vect_remove_stores (DR_GROUP_FIRST_DR (stmt_info));
8275 /* FORNOW: SLP originates only from strided stores. */
8281 /* FORNOW: SLP originates only from strided stores. */
8287 vect_schedule_slp (loop_vec_info loop_vinfo)
8289 VEC (slp_instance, heap) *slp_instances =
8290 LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
8291 slp_instance instance;
8293 bool is_store = false;
8295 for (i = 0; VEC_iterate (slp_instance, slp_instances, i, instance); i++)
8297 /* Schedule the tree of INSTANCE. */
8298 is_store = vect_schedule_slp_instance (SLP_INSTANCE_TREE (instance),
8299 instance, LOOP_VINFO_VECT_FACTOR (loop_vinfo));
8301 if (vect_print_dump_info (REPORT_VECTORIZED_LOOPS)
8302 || vect_print_dump_info (REPORT_UNVECTORIZED_LOOPS))
8303 fprintf (vect_dump, "vectorizing stmts using SLP.");
8309 /* Function vect_transform_loop.
8311 The analysis phase has determined that the loop is vectorizable.
8312 Vectorize the loop - created vectorized stmts to replace the scalar
8313 stmts in the loop, and update the loop exit condition. */
8316 vect_transform_loop (loop_vec_info loop_vinfo)
8318 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8319 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
8320 int nbbs = loop->num_nodes;
8321 gimple_stmt_iterator si;
8324 int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8326 bool slp_scheduled = false;
8327 unsigned int nunits;
8329 if (vect_print_dump_info (REPORT_DETAILS))
8330 fprintf (vect_dump, "=== vec_transform_loop ===");
8332 if (VEC_length (gimple, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo))
8333 || VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
8334 vect_loop_versioning (loop_vinfo);
8336 /* CHECKME: we wouldn't need this if we called update_ssa once
8338 bitmap_zero (vect_memsyms_to_rename);
8340 /* Peel the loop if there are data refs with unknown alignment.
8341 Only one data ref with unknown store is allowed. */
8343 if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
8344 vect_do_peeling_for_alignment (loop_vinfo);
8346 /* If the loop has a symbolic number of iterations 'n' (i.e. it's not a
8347 compile time constant), or it is a constant that doesn't divide by the
8348 vectorization factor, then an epilog loop needs to be created.
8349 We therefore duplicate the loop: the original loop will be vectorized,
8350 and will compute the first (n/VF) iterations. The second copy of the loop
8351 will remain scalar and will compute the remaining (n%VF) iterations.
8352 (VF is the vectorization factor). */
8354 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
8355 || (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
8356 && LOOP_VINFO_INT_NITERS (loop_vinfo) % vectorization_factor != 0))
8357 vect_do_peeling_for_loop_bound (loop_vinfo, &ratio);
8359 ratio = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
8360 LOOP_VINFO_INT_NITERS (loop_vinfo) / vectorization_factor);
8362 /* 1) Make sure the loop header has exactly two entries
8363 2) Make sure we have a preheader basic block. */
8365 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
8367 split_edge (loop_preheader_edge (loop));
8369 /* FORNOW: the vectorizer supports only loops which body consist
8370 of one basic block (header + empty latch). When the vectorizer will
8371 support more involved loop forms, the order by which the BBs are
8372 traversed need to be reconsidered. */
8374 for (i = 0; i < nbbs; i++)
8376 basic_block bb = bbs[i];
8377 stmt_vec_info stmt_info;
8380 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
8382 phi = gsi_stmt (si);
8383 if (vect_print_dump_info (REPORT_DETAILS))
8385 fprintf (vect_dump, "------>vectorizing phi: ");
8386 print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
8388 stmt_info = vinfo_for_stmt (phi);
8392 if (!STMT_VINFO_RELEVANT_P (stmt_info)
8393 && !STMT_VINFO_LIVE_P (stmt_info))
8396 if ((TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
8397 != (unsigned HOST_WIDE_INT) vectorization_factor)
8398 && vect_print_dump_info (REPORT_DETAILS))
8399 fprintf (vect_dump, "multiple-types.");
8401 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
8403 if (vect_print_dump_info (REPORT_DETAILS))
8404 fprintf (vect_dump, "transform phi.");
8405 vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
8409 for (si = gsi_start_bb (bb); !gsi_end_p (si);)
8411 gimple stmt = gsi_stmt (si);
8414 if (vect_print_dump_info (REPORT_DETAILS))
8416 fprintf (vect_dump, "------>vectorizing statement: ");
8417 print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
8420 stmt_info = vinfo_for_stmt (stmt);
8422 /* vector stmts created in the outer-loop during vectorization of
8423 stmts in an inner-loop may not have a stmt_info, and do not
8424 need to be vectorized. */
8431 if (!STMT_VINFO_RELEVANT_P (stmt_info)
8432 && !STMT_VINFO_LIVE_P (stmt_info))
8438 gcc_assert (STMT_VINFO_VECTYPE (stmt_info));
8440 (unsigned int) TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
8441 if (!STMT_SLP_TYPE (stmt_info)
8442 && nunits != (unsigned int) vectorization_factor
8443 && vect_print_dump_info (REPORT_DETAILS))
8444 /* For SLP VF is set according to unrolling factor, and not to
8445 vector size, hence for SLP this print is not valid. */
8446 fprintf (vect_dump, "multiple-types.");
8448 /* SLP. Schedule all the SLP instances when the first SLP stmt is
8450 if (STMT_SLP_TYPE (stmt_info))
8454 slp_scheduled = true;
8456 if (vect_print_dump_info (REPORT_DETAILS))
8457 fprintf (vect_dump, "=== scheduling SLP instances ===");
8459 is_store = vect_schedule_slp (loop_vinfo);
8461 /* IS_STORE is true if STMT is a store. Stores cannot be of
8462 hybrid SLP type. They are removed in
8463 vect_schedule_slp_instance and their vinfo is destroyed. */
8471 /* Hybrid SLP stmts must be vectorized in addition to SLP. */
8472 if (PURE_SLP_STMT (stmt_info))
8479 /* -------- vectorize statement ------------ */
8480 if (vect_print_dump_info (REPORT_DETAILS))
8481 fprintf (vect_dump, "transform statement.");
8483 strided_store = false;
8484 is_store = vect_transform_stmt (stmt, &si, &strided_store, NULL, NULL);
8487 if (STMT_VINFO_STRIDED_ACCESS (stmt_info))
8489 /* Interleaving. If IS_STORE is TRUE, the vectorization of the
8490 interleaving chain was completed - free all the stores in
8492 vect_remove_stores (DR_GROUP_FIRST_DR (stmt_info));
8493 gsi_remove (&si, true);
8498 /* Free the attached stmt_vec_info and remove the stmt. */
8499 free_stmt_vec_info (stmt);
8500 gsi_remove (&si, true);
8508 slpeel_make_loop_iterate_ntimes (loop, ratio);
8510 mark_set_for_renaming (vect_memsyms_to_rename);
8512 /* The memory tags and pointers in vectorized statements need to
8513 have their SSA forms updated. FIXME, why can't this be delayed
8514 until all the loops have been transformed? */
8515 update_ssa (TODO_update_ssa);
8517 if (vect_print_dump_info (REPORT_VECTORIZED_LOOPS))
8518 fprintf (vect_dump, "LOOP VECTORIZED.");
8519 if (loop->inner && vect_print_dump_info (REPORT_VECTORIZED_LOOPS))
8520 fprintf (vect_dump, "OUTER LOOP VECTORIZED.");