/* Transformation Utilities for Loop Vectorization.
Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008 Free Software Foundation, Inc.
Contributed by Dorit Naishlos <dorit@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3. If not see
<http://www.gnu.org/licenses/>. */
#include "coretypes.h"
#include "basic-block.h"
#include "diagnostic.h"
#include "tree-flow.h"
#include "tree-dump.h"
#include "tree-data-ref.h"
#include "tree-chrec.h"
#include "tree-scalar-evolution.h"
#include "tree-vectorizer.h"
#include "langhooks.h"
#include "tree-pass.h"
/* Utility functions for the code transformation. */
static bool vect_transform_stmt (gimple, gimple_stmt_iterator *, bool *,
slp_tree, slp_instance);
static tree vect_create_destination_var (tree, tree);
static tree vect_create_data_ref_ptr
(gimple, struct loop*, tree, tree *, gimple *, bool, bool *, tree);
static tree vect_create_addr_base_for_vector_ref
(gimple, gimple_seq *, tree, struct loop *);
static tree vect_get_new_vect_var (tree, enum vect_var_kind, const char *);
static tree vect_get_vec_def_for_operand (tree, gimple, tree *);
static tree vect_init_vector (gimple, tree, tree, gimple_stmt_iterator *);
static void vect_finish_stmt_generation
(gimple stmt, gimple vec_stmt, gimple_stmt_iterator *);
static bool vect_is_simple_cond (tree, loop_vec_info);
static void vect_create_epilog_for_reduction
(tree, gimple, int, enum tree_code, gimple);
static tree get_initial_def_for_reduction (gimple, tree, tree *);

/* Utility function dealing with loop peeling (not peeling itself). */
static void vect_generate_tmps_on_preheader
(loop_vec_info, tree *, tree *, tree *);
static tree vect_build_loop_niters (loop_vec_info);
static void vect_update_ivs_after_vectorizer (loop_vec_info, tree, edge);
static tree vect_gen_niters_for_prolog_loop (loop_vec_info, tree);
static void vect_update_init_of_dr (struct data_reference *, tree niters);
static void vect_update_inits_of_drs (loop_vec_info, tree);
static int vect_min_worthwhile_factor (enum tree_code);
static int
cost_for_stmt (gimple stmt)
stmt_vec_info stmt_info = vinfo_for_stmt (stmt);

switch (STMT_VINFO_TYPE (stmt_info))
case load_vec_info_type:
return TARG_SCALAR_LOAD_COST;
case store_vec_info_type:
return TARG_SCALAR_STORE_COST;
case op_vec_info_type:
case condition_vec_info_type:
case assignment_vec_info_type:
case reduc_vec_info_type:
case induc_vec_info_type:
case type_promotion_vec_info_type:
case type_demotion_vec_info_type:
case type_conversion_vec_info_type:
case call_vec_info_type:
return TARG_SCALAR_STMT_COST;
case undef_vec_info_type:
/* Function vect_estimate_min_profitable_iters

Return the number of iterations required for the vector version of the
loop to be profitable relative to the cost of the scalar version of the
loop.

TODO: Take profile info into account before making vectorization
decisions, if available. */

int
vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo)
int min_profitable_iters;
int peel_iters_prologue;
int peel_iters_epilogue;
int vec_inside_cost = 0;
int vec_outside_cost = 0;
int scalar_single_iter_cost = 0;
int scalar_outside_cost = 0;
int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
int nbbs = loop->num_nodes;
int byte_misalign = LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo);
int peel_guard_costs = 0;
int innerloop_iters = 0, factor;
VEC (slp_instance, heap) *slp_instances;
slp_instance instance;

/* Cost model disabled. */
if (!flag_vect_cost_model)
if (vect_print_dump_info (REPORT_COST))
fprintf (vect_dump, "cost model disabled.");
return 0;
/* Requires loop versioning tests to handle misalignment. */
if (VEC_length (gimple, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo)))
/* FIXME: Make cost depend on complexity of individual check. */
vec_outside_cost +=
VEC_length (gimple, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));
if (vect_print_dump_info (REPORT_COST))
fprintf (vect_dump, "cost model: Adding cost of checks for loop "
"versioning to treat misalignment.\n");

if (VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
/* FIXME: Make cost depend on complexity of individual check. */
vec_outside_cost +=
VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo));
if (vect_print_dump_info (REPORT_COST))
fprintf (vect_dump, "cost model: Adding cost of checks for loop "
"versioning aliasing.\n");

if (VEC_length (gimple, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo))
|| VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
vec_outside_cost += TARG_COND_TAKEN_BRANCH_COST;
/* Count statements in scalar loop. Using this as scalar cost for a single
iteration for now.

TODO: Add outer loop support.

TODO: Consider assigning different costs to different scalar
statements. */

if (loop->inner)
innerloop_iters = 50; /* FIXME */

for (i = 0; i < nbbs; i++)
gimple_stmt_iterator si;
basic_block bb = bbs[i];

if (bb->loop_father == loop->inner)
factor = innerloop_iters;
else
factor = 1;

for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
gimple stmt = gsi_stmt (si);
stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
/* Skip stmts that are not vectorized inside the loop. */
if (!STMT_VINFO_RELEVANT_P (stmt_info)
&& (!STMT_VINFO_LIVE_P (stmt_info)
|| STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def))
continue;
scalar_single_iter_cost += cost_for_stmt (stmt) * factor;
vec_inside_cost += STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) * factor;
/* FIXME: for stmts in the inner-loop in outer-loop vectorization,
some of the "outside" costs are generated inside the outer-loop. */
vec_outside_cost += STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info);
/* Add additional cost for the peeled instructions in prologue and epilogue
loop.

FORNOW: If we don't know the value of peel_iters for prologue or epilogue
at compile-time - we assume it's vf/2 (the worst would be vf-1).

TODO: Build an expression that represents peel_iters for prologue and
epilogue to be used in a run-time test. */

if (byte_misalign < 0)
peel_iters_prologue = vf/2;
if (vect_print_dump_info (REPORT_COST))
fprintf (vect_dump, "cost model: "
"prologue peel iters set to vf/2.");

/* If peeling for alignment is unknown, loop bound of main loop becomes
unknown. */
peel_iters_epilogue = vf/2;
if (vect_print_dump_info (REPORT_COST))
fprintf (vect_dump, "cost model: "
"epilogue peel iters set to vf/2 because "
"peeling for alignment is unknown.");

/* If peeled iterations are unknown, count a taken branch and a not taken
branch per peeled loop. Even if scalar loop iterations are known,
vector iterations are not known since peeled prologue iterations are
not known. Hence guards remain the same. */
peel_guard_costs += 2 * (TARG_COND_TAKEN_BRANCH_COST
+ TARG_COND_NOT_TAKEN_BRANCH_COST);
struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
int element_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr)));
int nelements = TYPE_VECTOR_SUBPARTS (vectype);

peel_iters_prologue = nelements - (byte_misalign / element_size);
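/* Worked example (illustrative): for a V4SI-like vector of four 4-byte
   elements (nelements == 4, element_size == 4) with byte_misalign == 8,
   the prologue peels 4 - (8 / 4) == 2 scalar iterations before the
   access becomes aligned. */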
else
peel_iters_prologue = 0;

if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
peel_iters_epilogue = vf/2;
if (vect_print_dump_info (REPORT_COST))
fprintf (vect_dump, "cost model: "
"epilogue peel iters set to vf/2 because "
"loop iterations are unknown.");

/* If peeled iterations are known but the number of scalar loop
iterations is unknown, count a taken branch per peeled loop. */
peel_guard_costs += 2 * TARG_COND_TAKEN_BRANCH_COST;

else
int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
peel_iters_prologue = niters < peel_iters_prologue ?
niters : peel_iters_prologue;
peel_iters_epilogue = (niters - peel_iters_prologue) % vf;
vec_outside_cost += (peel_iters_prologue * scalar_single_iter_cost)
+ (peel_iters_epilogue * scalar_single_iter_cost)
+ peel_guard_costs;

/* FORNOW: The scalar outside cost is incremented in one of the
following ways:

1. The vectorizer checks for alignment and aliasing and generates
a condition that allows dynamic vectorization. A cost model
check is ANDED with the versioning condition. Hence scalar code
path now has the added cost of the versioning check.

if (cost > th & versioning_check)
jmp to vector code

Hence run-time scalar is incremented by not-taken branch cost.

2. The vectorizer then checks if a prologue is required. If the
cost model check was not done before during versioning, it has to
be done before the prologue check.

prologue = scalar_iters

if (prologue == num_iters)

Hence the run-time scalar cost is incremented by a taken branch,
plus a not-taken branch, plus a taken branch cost.

3. The vectorizer then checks if an epilogue is required. If the
cost model check was not done before during prologue check, it
has to be done with the epilogue check.

if (prologue == num_iters)

if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))

Hence the run-time scalar cost should be incremented by 2 taken
branches.

TODO: The back end may reorder the BBS's differently and reverse
conditions/branch directions. Change the estimates below to
something more reasonable. */
/* If the number of iterations is known and we do not do versioning, we can
decide whether to vectorize at compile time. Hence the scalar version
does not carry cost model guard costs. */
if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
|| VEC_length (gimple, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo))
|| VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
/* Cost model check occurs at versioning. */
if (VEC_length (gimple, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo))
|| VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
scalar_outside_cost += TARG_COND_NOT_TAKEN_BRANCH_COST;

/* Cost model check occurs at prologue generation. */
if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
scalar_outside_cost += 2 * TARG_COND_TAKEN_BRANCH_COST
+ TARG_COND_NOT_TAKEN_BRANCH_COST;
/* Cost model check occurs at epilogue generation. */
else
scalar_outside_cost += 2 * TARG_COND_TAKEN_BRANCH_COST;
slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
for (i = 0; VEC_iterate (slp_instance, slp_instances, i, instance); i++)
vec_outside_cost += SLP_INSTANCE_OUTSIDE_OF_LOOP_COST (instance);
vec_inside_cost += SLP_INSTANCE_INSIDE_OF_LOOP_COST (instance);
/* Calculate number of iterations required to make the vector version
profitable, relative to the loop bodies only. The following condition
must hold true:
SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
where
SIC = scalar iteration cost, VIC = vector iteration cost,
VOC = vector outside cost, VF = vectorization factor,
PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
SOC = scalar outside cost for run time cost model check. */
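/* Worked example (illustrative, with made-up costs): with SIC == 4,
   VIC == 6, VOC == 14, SOC == 0, VF == 4 and no peeling, the division
   below yields (14 * 4) / (4 * 4 - 6) == 5, and since
   4 * 4 * 5 <= 6 * 5 + 14 * 4 (80 <= 86) it is bumped to 6: at six
   iterations the vector cost 6 * (6/4) + 14 == 23 first drops below
   the scalar cost 4 * 6 == 24. */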
if ((scalar_single_iter_cost * vf) > vec_inside_cost)
if (vec_outside_cost <= 0)
min_profitable_iters = 1;
else
min_profitable_iters = ((vec_outside_cost - scalar_outside_cost) * vf
- vec_inside_cost * peel_iters_prologue
- vec_inside_cost * peel_iters_epilogue)
/ ((scalar_single_iter_cost * vf)
- vec_inside_cost);

if ((scalar_single_iter_cost * vf * min_profitable_iters)
<= ((vec_inside_cost * min_profitable_iters)
+ ((vec_outside_cost - scalar_outside_cost) * vf)))
min_profitable_iters++;

/* vector version will never be profitable. */
else
if (vect_print_dump_info (REPORT_COST))
fprintf (vect_dump, "cost model: vector iteration cost = %d "
"is divisible by scalar iteration cost = %d by a factor "
"greater than or equal to the vectorization factor = %d.",
vec_inside_cost, scalar_single_iter_cost, vf);
return -1;
if (vect_print_dump_info (REPORT_COST))
fprintf (vect_dump, "Cost model analysis: \n");
fprintf (vect_dump, " Vector inside of loop cost: %d\n",
vec_inside_cost);
fprintf (vect_dump, " Vector outside of loop cost: %d\n",
vec_outside_cost);
fprintf (vect_dump, " Scalar iteration cost: %d\n",
scalar_single_iter_cost);
fprintf (vect_dump, " Scalar outside cost: %d\n", scalar_outside_cost);
fprintf (vect_dump, " prologue iterations: %d\n",
peel_iters_prologue);
fprintf (vect_dump, " epilogue iterations: %d\n",
peel_iters_epilogue);
fprintf (vect_dump, " Calculated minimum iters for profitability: %d\n",
min_profitable_iters);

min_profitable_iters =
min_profitable_iters < vf ? vf : min_profitable_iters;

/* Because the condition we create is:
if (niters <= min_profitable_iters)
then skip the vectorized loop. */
min_profitable_iters--;

if (vect_print_dump_info (REPORT_COST))
fprintf (vect_dump, " Profitability threshold = %d\n",
min_profitable_iters);

return min_profitable_iters;
/* TODO: Close dependency between vect_model_*_cost and vectorizable_*
functions. Design better to avoid maintenance issues. */

/* Function vect_model_reduction_cost.

Models cost for a reduction operation, including the vector ops
generated within the strip-mine loop, the initial definition before
the loop, and the epilogue code that must be generated. */
static void
vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
int ncopies)
gimple stmt, orig_stmt;
enum machine_mode mode;
loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);

/* Cost of reduction op inside loop. */
STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) += ncopies * TARG_VEC_STMT_COST;

stmt = STMT_VINFO_STMT (stmt_info);

switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
case GIMPLE_SINGLE_RHS:
gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt)) == ternary_op);
reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), 2);
break;
case GIMPLE_UNARY_RHS:
reduction_op = gimple_assign_rhs1 (stmt);
break;
case GIMPLE_BINARY_RHS:
reduction_op = gimple_assign_rhs2 (stmt);
break;
vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
if (!vectype)
if (vect_print_dump_info (REPORT_COST))
fprintf (vect_dump, "unsupported data-type ");
print_generic_expr (vect_dump, TREE_TYPE (reduction_op), TDF_SLIM);

mode = TYPE_MODE (vectype);
orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
if (!orig_stmt)
orig_stmt = STMT_VINFO_STMT (stmt_info);

code = gimple_assign_rhs_code (orig_stmt);

/* Add in cost for initial definition. */
outer_cost += TARG_SCALAR_TO_VEC_COST;

/* Determine cost of epilogue code.

We have a reduction operator that will reduce the vector in one statement.
Also requires scalar extract. */

if (!nested_in_vect_loop_p (loop, orig_stmt))
if (reduc_code < NUM_TREE_CODES)
outer_cost += TARG_VEC_STMT_COST + TARG_VEC_TO_SCALAR_COST;
else
int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
tree bitsize =
TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
int element_bitsize = tree_low_cst (bitsize, 1);
int nelements = vec_size_in_bits / element_bitsize;

optab = optab_for_tree_code (code, vectype, optab_default);

/* We have a whole vector shift available. */
if (VECTOR_MODE_P (mode)
&& optab_handler (optab, mode)->insn_code != CODE_FOR_nothing
&& optab_handler (vec_shr_optab, mode)->insn_code != CODE_FOR_nothing)
/* Final reduction via vector shifts and the reduction operator. Also
requires scalar extract. */
outer_cost += ((exact_log2(nelements) * 2) * TARG_VEC_STMT_COST
+ TARG_VEC_TO_SCALAR_COST);
else
/* Use extracts and reduction op for final reduction. For N elements,
we have N extracts and N-1 reduction ops. */
outer_cost += ((nelements + nelements - 1) * TARG_VEC_STMT_COST);
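/* Worked example (illustrative): for nelements == 4, the whole-vector
   shift strategy above costs exact_log2 (4) * 2 == 4 vector stmts plus
   one extract, while the fallback costs 4 + 4 - 1 == 7 vector stmts
   (four extracts and three scalar reduction ops). */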
STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info) = outer_cost;

if (vect_print_dump_info (REPORT_COST))
fprintf (vect_dump, "vect_model_reduction_cost: inside_cost = %d, "
"outside_cost = %d.", STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info),
STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info));
/* Function vect_model_induction_cost.

Models cost for induction operations. */

static void
vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
/* loop cost for vec_loop. */
STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) = ncopies * TARG_VEC_STMT_COST;
/* prologue cost for vec_init and vec_step. */
STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info) = 2 * TARG_SCALAR_TO_VEC_COST;

if (vect_print_dump_info (REPORT_COST))
fprintf (vect_dump, "vect_model_induction_cost: inside_cost = %d, "
"outside_cost = %d.", STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info),
STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info));
/* Function vect_model_simple_cost.

Models cost for simple operations, i.e. those that only emit ncopies of a
single op. Right now, this does not account for multiple insns that could
be generated for the single vector op. We will handle that shortly. */

static void
vect_model_simple_cost (stmt_vec_info stmt_info, int ncopies,
enum vect_def_type *dt, slp_tree slp_node)
int inside_cost = 0, outside_cost = 0;

/* The SLP costs were already calculated during SLP tree build. */
if (PURE_SLP_STMT (stmt_info))
return;

inside_cost = ncopies * TARG_VEC_STMT_COST;

/* FORNOW: Assuming maximum 2 args per stmts. */
for (i = 0; i < 2; i++)
if (dt[i] == vect_constant_def || dt[i] == vect_invariant_def)
outside_cost += TARG_SCALAR_TO_VEC_COST;

if (vect_print_dump_info (REPORT_COST))
fprintf (vect_dump, "vect_model_simple_cost: inside_cost = %d, "
"outside_cost = %d.", inside_cost, outside_cost);

/* Set the costs either in STMT_INFO or SLP_NODE (if exists). */
stmt_vinfo_set_inside_of_loop_cost (stmt_info, slp_node, inside_cost);
stmt_vinfo_set_outside_of_loop_cost (stmt_info, slp_node, outside_cost);
/* Function vect_cost_strided_group_size

For strided load or store, return the group_size only if it is the first
load or store of a group, else return 1. This ensures that group size is
only returned once per group. */
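/* For instance (illustrative): in a group of four interleaved stores,
   this returns 4 for the first store of the group and 1 for the other
   three, so the group-wide permute overhead is charged exactly once. */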
static int
vect_cost_strided_group_size (stmt_vec_info stmt_info)
gimple first_stmt = DR_GROUP_FIRST_DR (stmt_info);

if (first_stmt == STMT_VINFO_STMT (stmt_info))
return DR_GROUP_SIZE (stmt_info);

return 1;
/* Function vect_model_store_cost

Models cost for stores. In the case of strided accesses, one access
has the overhead of the strided access attributed to it. */

static void
vect_model_store_cost (stmt_vec_info stmt_info, int ncopies,
enum vect_def_type dt, slp_tree slp_node)
int inside_cost = 0, outside_cost = 0;

/* The SLP costs were already calculated during SLP tree build. */
if (PURE_SLP_STMT (stmt_info))
return;

if (dt == vect_constant_def || dt == vect_invariant_def)
outside_cost = TARG_SCALAR_TO_VEC_COST;

/* Strided access? */
if (DR_GROUP_FIRST_DR (stmt_info) && !slp_node)
group_size = vect_cost_strided_group_size (stmt_info);
/* Not a strided access. */
else
group_size = 1;

/* Is this an access in a group of stores, which provide strided access?
If so, add in the cost of the permutes. */
if (group_size > 1)
/* Uses a high and low interleave operation for each needed permute. */
inside_cost = ncopies * exact_log2(group_size) * group_size
* TARG_VEC_STMT_COST;
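/* Worked example (illustrative): for group_size == 4 and ncopies == 1,
   this charges exact_log2 (4) * 4 == 8 vector stmts, i.e. two rounds
   of high/low interleaving over the four vectors being stored. */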
if (vect_print_dump_info (REPORT_COST))
fprintf (vect_dump, "vect_model_store_cost: strided group_size = %d.",
group_size);

/* Costs of the stores. */
inside_cost += ncopies * TARG_VEC_STORE_COST;

if (vect_print_dump_info (REPORT_COST))
fprintf (vect_dump, "vect_model_store_cost: inside_cost = %d, "
"outside_cost = %d.", inside_cost, outside_cost);

/* Set the costs either in STMT_INFO or SLP_NODE (if exists). */
stmt_vinfo_set_inside_of_loop_cost (stmt_info, slp_node, inside_cost);
stmt_vinfo_set_outside_of_loop_cost (stmt_info, slp_node, outside_cost);

/* Function vect_model_load_cost

Models cost for loads. In the case of strided accesses, the last access
has the overhead of the strided access attributed to it. Since unaligned
accesses are supported for loads, we also account for the costs of the
access scheme chosen. */
static void
vect_model_load_cost (stmt_vec_info stmt_info, int ncopies, slp_tree slp_node)
int alignment_support_scheme;
struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info), *first_dr;
int inside_cost = 0, outside_cost = 0;

/* The SLP costs were already calculated during SLP tree build. */
if (PURE_SLP_STMT (stmt_info))
return;

/* Strided accesses? */
first_stmt = DR_GROUP_FIRST_DR (stmt_info);
if (first_stmt && !slp_node)
group_size = vect_cost_strided_group_size (stmt_info);
first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
/* Not a strided access. */
else
group_size = 1;
first_dr = dr;

alignment_support_scheme = vect_supportable_dr_alignment (first_dr);

/* Is this an access in a group of loads providing strided access?
If so, add in the cost of the permutes. */
if (group_size > 1)
/* Uses even and odd extract operations for each needed permute. */
inside_cost = ncopies * exact_log2(group_size) * group_size
* TARG_VEC_STMT_COST;
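/* Worked example (illustrative): for group_size == 4 and ncopies == 1,
   this charges exact_log2 (4) * 4 == 8 vector stmts, i.e. two rounds
   of even/odd extraction over the four loaded vectors. */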
if (vect_print_dump_info (REPORT_COST))
fprintf (vect_dump, "vect_model_load_cost: strided group_size = %d.",
group_size);

/* The loads themselves. */
switch (alignment_support_scheme)
case dr_aligned:
inside_cost += ncopies * TARG_VEC_LOAD_COST;

if (vect_print_dump_info (REPORT_COST))
fprintf (vect_dump, "vect_model_load_cost: aligned.");

break;

case dr_unaligned_supported:
/* Here, we assign an additional cost for the unaligned load. */
inside_cost += ncopies * TARG_VEC_UNALIGNED_LOAD_COST;

if (vect_print_dump_info (REPORT_COST))
fprintf (vect_dump, "vect_model_load_cost: unaligned supported by "
"hardware.");

break;

case dr_explicit_realign:
inside_cost += ncopies * (2*TARG_VEC_LOAD_COST + TARG_VEC_STMT_COST);

/* FIXME: If the misalignment remains fixed across the iterations of
the containing loop, the following cost should be added to the
outside costs. */
if (targetm.vectorize.builtin_mask_for_load)
inside_cost += TARG_VEC_STMT_COST;

break;
case dr_explicit_realign_optimized:
if (vect_print_dump_info (REPORT_COST))
fprintf (vect_dump, "vect_model_load_cost: unaligned software "
"pipelined.");

/* Unaligned software pipeline has a load of an address, an initial
load, and possibly a mask operation to "prime" the loop. However,
if this is an access in a group of loads, which provide strided
access, then the above cost should only be considered for one
access in the group. Inside the loop, there is a load op
and a realignment op. */

if ((!DR_GROUP_FIRST_DR (stmt_info)) || group_size > 1 || slp_node)
outside_cost = 2*TARG_VEC_STMT_COST;
if (targetm.vectorize.builtin_mask_for_load)
outside_cost += TARG_VEC_STMT_COST;

inside_cost += ncopies * (TARG_VEC_LOAD_COST + TARG_VEC_STMT_COST);

break;

default:
gcc_unreachable ();
if (vect_print_dump_info (REPORT_COST))
fprintf (vect_dump, "vect_model_load_cost: inside_cost = %d, "
"outside_cost = %d.", inside_cost, outside_cost);

/* Set the costs either in STMT_INFO or SLP_NODE (if exists). */
stmt_vinfo_set_inside_of_loop_cost (stmt_info, slp_node, inside_cost);
stmt_vinfo_set_outside_of_loop_cost (stmt_info, slp_node, outside_cost);
/* Function vect_get_new_vect_var.

Returns a name for a new variable. The current naming scheme appends the
prefix "vect_" or "vect_p" (depending on the value of VAR_KIND) to
the name of vectorizer generated variables, and appends that to NAME if
provided. */

static tree
vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
case vect_simple_var:
case vect_scalar_var:
case vect_pointer_var:

if (name)
char* tmp = concat (prefix, name, NULL);
new_vect_var = create_tmp_var (type, tmp);
else
new_vect_var = create_tmp_var (type, prefix);

/* Mark vector typed variable as a gimple register variable. */
if (TREE_CODE (type) == VECTOR_TYPE)
DECL_GIMPLE_REG_P (new_vect_var) = true;

return new_vect_var;
/* Function vect_create_addr_base_for_vector_ref.

Create an expression that computes the address of the first memory location
that will be accessed for a data reference.

Input:
STMT: The statement containing the data reference.
NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
OFFSET: Optional. If supplied, it is added to the initial address.
LOOP: Specify relative to which loop-nest the address should be computed.
For example, when the dataref is in an inner-loop nested in an
outer-loop that is now being vectorized, LOOP can be either the
outer-loop, or the inner-loop. The first memory location accessed
by the following dataref ('in' points to short):

for (i=0; i<N; i++)
for (j=0; j<M; j++)
s += in[i+j]

is as follows:
if LOOP=i_loop: &in (relative to i_loop)
if LOOP=j_loop: &in+i*2B (relative to j_loop)

Output:
1. Return an SSA_NAME whose value is the address of the memory location of
the first vector of the data reference.
2. If new_stmt_list is not NULL_TREE after return then the caller must insert
these statement(s) which define the returned SSA_NAME.

FORNOW: We are only handling array accesses with step 1. */
static tree
vect_create_addr_base_for_vector_ref (gimple stmt,
gimple_seq *new_stmt_list,
tree offset,
struct loop *loop)
stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
tree data_ref_base = unshare_expr (DR_BASE_ADDRESS (dr));
tree data_ref_base_var;
tree addr_base, addr_expr;
gimple_seq seq = NULL;
tree base_offset = unshare_expr (DR_OFFSET (dr));
tree init = unshare_expr (DR_INIT (dr));
tree vect_ptr_type, addr_expr2;
tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));

if (loop != containing_loop)
loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);

gcc_assert (nested_in_vect_loop_p (loop, stmt));

data_ref_base = unshare_expr (STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
base_offset = unshare_expr (STMT_VINFO_DR_OFFSET (stmt_info));
init = unshare_expr (STMT_VINFO_DR_INIT (stmt_info));
/* Create data_ref_base. */
base_name = build_fold_indirect_ref (data_ref_base);
data_ref_base_var = create_tmp_var (TREE_TYPE (data_ref_base), "batmp");
add_referenced_var (data_ref_base_var);
data_ref_base = force_gimple_operand (data_ref_base, &seq, true,
data_ref_base_var);
gimple_seq_add_seq (new_stmt_list, seq);

/* Create base_offset. */
base_offset = size_binop (PLUS_EXPR,
fold_convert (sizetype, base_offset),
fold_convert (sizetype, init));
dest = create_tmp_var (sizetype, "base_off");
add_referenced_var (dest);
base_offset = force_gimple_operand (base_offset, &seq, true, dest);
gimple_seq_add_seq (new_stmt_list, seq);

if (offset)
tree tmp = create_tmp_var (sizetype, "offset");

add_referenced_var (tmp);
offset = fold_build2 (MULT_EXPR, sizetype,
fold_convert (sizetype, offset), step);
base_offset = fold_build2 (PLUS_EXPR, sizetype,
base_offset, offset);
base_offset = force_gimple_operand (base_offset, &seq, false, tmp);
gimple_seq_add_seq (new_stmt_list, seq);
/* base + base_offset */
addr_base = fold_build2 (POINTER_PLUS_EXPR, TREE_TYPE (data_ref_base),
data_ref_base, base_offset);

vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));

/* addr_expr = addr_base */
addr_expr = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var,
get_name (base_name));
add_referenced_var (addr_expr);
vec_stmt = fold_convert (vect_ptr_type, addr_base);
addr_expr2 = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var,
get_name (base_name));
add_referenced_var (addr_expr2);
vec_stmt = force_gimple_operand (vec_stmt, &seq, false, addr_expr2);
gimple_seq_add_seq (new_stmt_list, seq);

if (vect_print_dump_info (REPORT_DETAILS))
fprintf (vect_dump, "created ");
print_generic_expr (vect_dump, vec_stmt, TDF_SLIM);

return vec_stmt;
/* Function vect_create_data_ref_ptr.

Create a new pointer to vector type (vp), that points to the first location
accessed in the loop by STMT, along with the def-use update chain to
appropriately advance the pointer through the loop iterations. Also set
aliasing information for the pointer. This vector pointer is used by the
callers to this function to create a memory reference expression for vector
load/store access.

Input:
1. STMT: a stmt that references memory. Expected to be of the form
GIMPLE_ASSIGN <name, data-ref> or
GIMPLE_ASSIGN <data-ref, name>.
2. AT_LOOP: the loop where the vector memref is to be created.
3. OFFSET (optional): an offset to be added to the initial address accessed
by the data-ref in STMT.
4. ONLY_INIT: indicate if vp is to be updated in the loop, or remain
pointing to the initial address.
5. TYPE: if not NULL indicates the required type of the data-ref.

Output:
1. Declare a new ptr to vector_type, and have it point to the base of the
data reference (initial address accessed by the data reference).
For example, for vector of type V8HI, the following code is generated:

vp = (v8hi *)initial_address;

if OFFSET is not supplied:
initial_address = &a[init];
if OFFSET is supplied:
initial_address = &a[init + OFFSET];

Return the initial_address in INITIAL_ADDRESS.

2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
update the pointer in each iteration of the loop.

Return the increment stmt that updates the pointer in PTR_INCR.

3. Set INV_P to true if the access pattern of the data reference in the
vectorized loop is invariant. Set it to false otherwise.

4. Return the pointer. */
static tree
vect_create_data_ref_ptr (gimple stmt, struct loop *at_loop,
tree offset, tree *initial_address, gimple *ptr_incr,
bool only_init, bool *inv_p, tree type)
stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
bool nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
tree vectype = STMT_VINFO_VECTYPE (stmt_info);
gimple_seq new_stmt_list = NULL;
struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
gimple_stmt_iterator incr_gsi;
tree indx_before_incr, indx_after_incr;

/* Check the step (evolution) of the load in LOOP, and record
whether it's invariant. */
if (nested_in_vect_loop)
step = STMT_VINFO_DR_STEP (stmt_info);
else
step = DR_STEP (STMT_VINFO_DATA_REF (stmt_info));

if (tree_int_cst_compare (step, size_zero_node) == 0)
*inv_p = true;
else
*inv_p = false;
/* Create an expression for the first address accessed by this load
in LOOP. */
base_name = build_fold_indirect_ref (unshare_expr (DR_BASE_ADDRESS (dr)));

if (vect_print_dump_info (REPORT_DETAILS))
tree data_ref_base = base_name;
fprintf (vect_dump, "create vector-pointer variable to type: ");
print_generic_expr (vect_dump, vectype, TDF_SLIM);
if (TREE_CODE (data_ref_base) == VAR_DECL)
fprintf (vect_dump, " vectorizing a one dimensional array ref: ");
else if (TREE_CODE (data_ref_base) == ARRAY_REF)
fprintf (vect_dump, " vectorizing a multidimensional array ref: ");
else if (TREE_CODE (data_ref_base) == COMPONENT_REF)
fprintf (vect_dump, " vectorizing a record based array ref: ");
else if (TREE_CODE (data_ref_base) == SSA_NAME)
fprintf (vect_dump, " vectorizing a pointer ref: ");
print_generic_expr (vect_dump, base_name, TDF_SLIM);
/** (1) Create the new vector-pointer variable: **/
if (type)
vect_ptr_type = build_pointer_type (type);
else
vect_ptr_type = build_pointer_type (vectype);

if (TREE_CODE (DR_BASE_ADDRESS (dr)) == SSA_NAME
&& TYPE_RESTRICT (TREE_TYPE (DR_BASE_ADDRESS (dr))))
vect_ptr_type = build_qualified_type (vect_ptr_type, TYPE_QUAL_RESTRICT);
vect_ptr = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var,
get_name (base_name));
if (TREE_CODE (DR_BASE_ADDRESS (dr)) == SSA_NAME
&& TYPE_RESTRICT (TREE_TYPE (DR_BASE_ADDRESS (dr))))
get_alias_set (base_name);
DECL_POINTER_ALIAS_SET (vect_ptr)
= DECL_POINTER_ALIAS_SET (SSA_NAME_VAR (DR_BASE_ADDRESS (dr)));

add_referenced_var (vect_ptr);

/** (2) Add aliasing information to the new vector-pointer:
(The points-to info (DR_PTR_INFO) may be defined later.) **/

tag = DR_SYMBOL_TAG (dr);

/* If tag is a variable (and NOT_A_TAG) then a new symbol memory
tag must be created with tag added to its may alias list. */
if (!MTAG_P (tag))
new_type_alias (vect_ptr, tag, DR_REF (dr));
else
set_symbol_mem_tag (vect_ptr, tag);
/** Note: If the dataref is in an inner-loop nested in LOOP, and we are
vectorizing LOOP (i.e. outer-loop vectorization), we need to create two
def-use update cycles for the pointer: One relative to the outer-loop
(LOOP), which is what steps (3) and (4) below do. The other is relative
to the inner-loop (which is the inner-most loop containing the dataref),
and this is done by step (5) below.

When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
inner-most loop, and so steps (3),(4) work the same, and step (5) is
redundant. Steps (3),(4) create the following:

LOOP: vp1 = phi(vp0,vp2)

If there is an inner-loop nested in loop, then step (5) will also be
applied, and an additional update in the inner-loop will be created:

LOOP: vp1 = phi(vp0,vp2)

inner: vp3 = phi(vp1,vp4)
vp4 = vp3 + inner_step
/** (3) Calculate the initial address of the vector-pointer, and set
the vector-pointer to point to it before the loop: **/

/* Create: (&(base[init_val+offset]) in the loop preheader. */

new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
offset, loop);
pe = loop_preheader_edge (loop);
if (new_stmt_list)
new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
gcc_assert (!new_bb);

*initial_address = new_temp;

/* Create: p = (vectype *) initial_base */
vec_stmt = gimple_build_assign (vect_ptr,
fold_convert (vect_ptr_type, new_temp));
vect_ptr_init = make_ssa_name (vect_ptr, vec_stmt);
gimple_assign_set_lhs (vec_stmt, vect_ptr_init);
new_bb = gsi_insert_on_edge_immediate (pe, vec_stmt);
gcc_assert (!new_bb);
/** (4) Handle the updating of the vector-pointer inside the loop.
This is needed when ONLY_INIT is false, and also when AT_LOOP
is the inner-loop nested in LOOP (during outer-loop vectorization). **/

if (only_init && at_loop == loop) /* No update in loop is required. */
/* Copy the points-to information if it exists. */
if (DR_PTR_INFO (dr))
duplicate_ssa_name_ptr_info (vect_ptr_init, DR_PTR_INFO (dr));
vptr = vect_ptr_init;
/* The step of the vector pointer is the Vector Size. */
tree step = TYPE_SIZE_UNIT (vectype);
/* One exception to the above is when the scalar step of the load in
LOOP is zero. In this case the step here is also zero. */
if (*inv_p)
step = size_zero_node;

standard_iv_increment_position (loop, &incr_gsi, &insert_after);

create_iv (vect_ptr_init,
fold_convert (vect_ptr_type, step),
vect_ptr, loop, &incr_gsi, insert_after,
&indx_before_incr, &indx_after_incr);
incr = gsi_stmt (incr_gsi);
set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));

/* Copy the points-to information if it exists. */
if (DR_PTR_INFO (dr))
duplicate_ssa_name_ptr_info (indx_before_incr, DR_PTR_INFO (dr));
duplicate_ssa_name_ptr_info (indx_after_incr, DR_PTR_INFO (dr));

merge_alias_info (vect_ptr_init, indx_before_incr);
merge_alias_info (vect_ptr_init, indx_after_incr);

vptr = indx_before_incr;
if (!nested_in_vect_loop || only_init)
return vptr;

/** (5) Handle the updating of the vector-pointer inside the inner-loop
nested in LOOP, if exists: **/

gcc_assert (nested_in_vect_loop);

standard_iv_increment_position (containing_loop, &incr_gsi,
&insert_after);
create_iv (vptr, fold_convert (vect_ptr_type, DR_STEP (dr)), vect_ptr,
containing_loop, &incr_gsi, insert_after, &indx_before_incr,
&indx_after_incr);
incr = gsi_stmt (incr_gsi);
set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));

/* Copy the points-to information if it exists. */
if (DR_PTR_INFO (dr))
duplicate_ssa_name_ptr_info (indx_before_incr, DR_PTR_INFO (dr));
duplicate_ssa_name_ptr_info (indx_after_incr, DR_PTR_INFO (dr));

merge_alias_info (vect_ptr_init, indx_before_incr);
merge_alias_info (vect_ptr_init, indx_after_incr);

return indx_before_incr;
/* Function bump_vector_ptr

Increment a pointer (to a vector type) by vector-size. If requested,
i.e. if PTR-INCR is given, then also connect the new increment stmt
to the existing def-use update-chain of the pointer, by modifying
the PTR_INCR as illustrated below:

The pointer def-use update-chain before this function:
DATAREF_PTR = phi (p_0, p_2)

PTR_INCR: p_2 = DATAREF_PTR + step

The pointer def-use update-chain after this function:
DATAREF_PTR = phi (p_0, p_2)

NEW_DATAREF_PTR = DATAREF_PTR + BUMP

PTR_INCR: p_2 = NEW_DATAREF_PTR + step

Input:
DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
in the loop.
PTR_INCR - optional. The stmt that updates the pointer in each iteration of
the loop. The increment amount across iterations is expected
to be vector_size.
GSI - location where the new update stmt is to be placed.
STMT - the original scalar memory-access stmt that is being vectorized.
BUMP - optional. The offset by which to bump the pointer. If not given,
the offset is assumed to be vector_size.

Output: Return NEW_DATAREF_PTR as illustrated above. */
tree
bump_vector_ptr (tree dataref_ptr, gimple ptr_incr, gimple_stmt_iterator *gsi,
gimple stmt, tree bump)
stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
tree vectype = STMT_VINFO_VECTYPE (stmt_info);
tree ptr_var = SSA_NAME_VAR (dataref_ptr);
tree update = TYPE_SIZE_UNIT (vectype);
use_operand_p use_p;
tree new_dataref_ptr;

if (bump)
update = bump;

incr_stmt = gimple_build_assign_with_ops (POINTER_PLUS_EXPR, ptr_var,
dataref_ptr, update);
new_dataref_ptr = make_ssa_name (ptr_var, incr_stmt);
gimple_assign_set_lhs (incr_stmt, new_dataref_ptr);
vect_finish_stmt_generation (stmt, incr_stmt, gsi);

/* Copy the points-to information if it exists. */
if (DR_PTR_INFO (dr))
duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
merge_alias_info (new_dataref_ptr, dataref_ptr);

if (!ptr_incr)
return new_dataref_ptr;

/* Update the vector-pointer's cross-iteration increment. */
FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
tree use = USE_FROM_PTR (use_p);

if (use == dataref_ptr)
SET_USE (use_p, new_dataref_ptr);
else
gcc_assert (tree_int_cst_compare (use, update) == 0);

return new_dataref_ptr;
/* Function vect_create_destination_var.

Create a new temporary of type VECTYPE. */

static tree
vect_create_destination_var (tree scalar_dest, tree vectype)
const char *new_name;
enum vect_var_kind kind;

kind = vectype ? vect_simple_var : vect_scalar_var;
type = vectype ? vectype : TREE_TYPE (scalar_dest);

gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);

new_name = get_name (scalar_dest);

vec_dest = vect_get_new_vect_var (type, kind, new_name);
add_referenced_var (vec_dest);

return vec_dest;
/* Function vect_init_vector.

Insert a new stmt (INIT_STMT) that initializes a new vector variable with
the vector elements of VECTOR_VAR. Place the initialization at GSI if it
is not NULL. Otherwise, place the initialization at the loop preheader.
Return the DEF of INIT_STMT.
It will be used in the vectorization of STMT. */

static tree
vect_init_vector (gimple stmt, tree vector_var, tree vector_type,
gimple_stmt_iterator *gsi)
stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);

new_var = vect_get_new_vect_var (vector_type, vect_simple_var, "cst_");
add_referenced_var (new_var);
init_stmt = gimple_build_assign (new_var, vector_var);
new_temp = make_ssa_name (new_var, init_stmt);
gimple_assign_set_lhs (init_stmt, new_temp);

if (gsi)
vect_finish_stmt_generation (stmt, init_stmt, gsi);
else
loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);

if (nested_in_vect_loop_p (loop, stmt))
loop = loop->inner;
pe = loop_preheader_edge (loop);
new_bb = gsi_insert_on_edge_immediate (pe, init_stmt);
gcc_assert (!new_bb);

if (vect_print_dump_info (REPORT_DETAILS))
fprintf (vect_dump, "created new init_stmt: ");
print_gimple_stmt (vect_dump, init_stmt, 0, TDF_SLIM);

vec_oprnd = gimple_assign_lhs (init_stmt);
return vec_oprnd;
/* For constant and loop invariant defs of SLP_NODE this function returns
(vector) defs (VEC_OPRNDS) that will be used in the vectorized stmts.
OP_NUM determines if we gather defs for operand 0 or operand 1 of the scalar
stmts. NUMBER_OF_VECTORS is the number of vector defs to create. */
static void
vect_get_constant_vectors (slp_tree slp_node, VEC(tree,heap) **vec_oprnds,
unsigned int op_num, unsigned int number_of_vectors)
VEC (gimple, heap) *stmts = SLP_TREE_SCALAR_STMTS (slp_node);
gimple stmt = VEC_index (gimple, stmts, 0);
stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
int j, number_of_places_left_in_vector;
int group_size = VEC_length (gimple, stmts);
unsigned int vec_num, i;
int number_of_copies = 1;
VEC (tree, heap) *voprnds = VEC_alloc (tree, heap, number_of_vectors);
bool constant_p, is_store;

if (STMT_VINFO_DATA_REF (stmt_vinfo))
is_store = true;
op = gimple_assign_rhs1 (stmt);
else
is_store = false;
op = gimple_op (stmt, op_num + 1);

if (CONSTANT_CLASS_P (op))
vector_type = vectype;
constant_p = true;
else
vector_type = get_vectype_for_scalar_type (TREE_TYPE (op));
gcc_assert (vector_type);
constant_p = false;

nunits = TYPE_VECTOR_SUBPARTS (vector_type);
/* NUMBER_OF_COPIES is the number of times we need to use the same values in
created vectors. It is greater than 1 if unrolling is performed.

For example, we have two scalar operands, s1 and s2 (e.g., group of
strided accesses of size two), while NUNITS is four (i.e., four scalars
of this type can be packed in a vector). The output vector will contain
two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
will be 2).

If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
containing the operands.

For example, NUNITS is four as before, and the group size is 8
(s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
{s5, s6, s7, s8}. */
number_of_copies = least_common_multiple (nunits, group_size) / group_size;

number_of_places_left_in_vector = nunits;
for (j = 0; j < number_of_copies; j++)
for (i = group_size - 1; VEC_iterate (gimple, stmts, i, stmt); i--)
if (is_store)
op = gimple_assign_rhs1 (stmt);
else
op = gimple_op (stmt, op_num + 1);

/* Create 'vect_ = {op0,op1,...,opn}'. */
t = tree_cons (NULL_TREE, op, t);

number_of_places_left_in_vector--;

if (number_of_places_left_in_vector == 0)
number_of_places_left_in_vector = nunits;

if (constant_p)
vec_cst = build_vector (vector_type, t);
else
vec_cst = build_constructor_from_list (vector_type, t);
VEC_quick_push (tree, voprnds,
vect_init_vector (stmt, vec_cst, vector_type, NULL));
/* Since the vectors are created in the reverse order, we should invert
them. */
vec_num = VEC_length (tree, voprnds);
for (j = vec_num - 1; j >= 0; j--)
vop = VEC_index (tree, voprnds, j);
VEC_quick_push (tree, *vec_oprnds, vop);

VEC_free (tree, heap, voprnds);

/* In case that VF is greater than the unrolling factor needed for the SLP
group of stmts, NUMBER_OF_VECTORS to be created is greater than
NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
to replicate the vectors. */
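/* For instance (illustrative): with NUNITS == 4, a group of two scalars
   {s1, s2} packs into the single vector {s1, s2, s1, s2}; if the
   unrolling factor requires two vector defs, the same vector is pushed
   a second time by the loop below. */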
while (number_of_vectors > VEC_length (tree, *vec_oprnds))
for (i = 0; VEC_iterate (tree, *vec_oprnds, i, vop) && i < vec_num; i++)
VEC_quick_push (tree, *vec_oprnds, vop);

/* Get vectorized definitions from SLP_NODE that contains corresponding
vectorized def-stmts. */
static void
vect_get_slp_vect_defs (slp_tree slp_node, VEC (tree,heap) **vec_oprnds)
gimple vec_def_stmt;

gcc_assert (SLP_TREE_VEC_STMTS (slp_node));

for (i = 0;
VEC_iterate (gimple, SLP_TREE_VEC_STMTS (slp_node), i, vec_def_stmt);
i++)
gcc_assert (vec_def_stmt);
vec_oprnd = gimple_get_lhs (vec_def_stmt);
VEC_quick_push (tree, *vec_oprnds, vec_oprnd);

/* Get vectorized definitions for SLP_NODE.
If the scalar definitions are loop invariants or constants, collect them and
call vect_get_constant_vectors() to create vector stmts.
Otherwise, the def-stmts must be already vectorized and the vectorized stmts
must be stored in the LEFT/RIGHT node of SLP_NODE, and we call
vect_get_slp_vect_defs() to retrieve them.
If VEC_OPRNDS1 is NULL, don't get vector defs for the second operand (from
the right node). This is used when the second operand must remain scalar. */
vect_get_slp_defs (slp_tree slp_node, VEC (tree,heap) **vec_oprnds0,
VEC (tree,heap) **vec_oprnds1)
enum tree_code code;
int number_of_vects;
HOST_WIDE_INT lhs_size_unit, rhs_size_unit;

first_stmt = VEC_index (gimple, SLP_TREE_SCALAR_STMTS (slp_node), 0);
/* The number of vector defs is determined by the number of vector statements
in the node from which we get those statements. */
if (SLP_TREE_LEFT (slp_node))
number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_LEFT (slp_node));
else
number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
/* Number of vector stmts was calculated according to LHS in
vect_schedule_slp_instance(), fix it by replacing LHS with RHS, if
necessary. See vect_get_smallest_scalar_type() for details. */
vect_get_smallest_scalar_type (first_stmt, &lhs_size_unit,
&rhs_size_unit);
if (rhs_size_unit != lhs_size_unit)
number_of_vects *= rhs_size_unit;
number_of_vects /= lhs_size_unit;
/* Allocate memory for vectorized defs. */
*vec_oprnds0 = VEC_alloc (tree, heap, number_of_vects);

/* SLP_NODE corresponds either to a group of stores or to a group of
unary/binary operations. We don't call this function for loads. */
if (SLP_TREE_LEFT (slp_node))
/* The defs are already vectorized. */
vect_get_slp_vect_defs (SLP_TREE_LEFT (slp_node), vec_oprnds0);
else
/* Build vectors from scalar defs. */
vect_get_constant_vectors (slp_node, vec_oprnds0, 0, number_of_vects);

if (STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt)))
/* Since we don't call this function with loads, this is a group of
stores. */
return;

code = gimple_assign_rhs_code (first_stmt);
if (get_gimple_rhs_class (code) != GIMPLE_BINARY_RHS || !vec_oprnds1)
return;

/* The number of vector defs is determined by the number of vector statements
in the node from which we get those statements. */
if (SLP_TREE_RIGHT (slp_node))
number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_RIGHT (slp_node));
else
number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);

*vec_oprnds1 = VEC_alloc (tree, heap, number_of_vects);

if (SLP_TREE_RIGHT (slp_node))
/* The defs are already vectorized. */
vect_get_slp_vect_defs (SLP_TREE_RIGHT (slp_node), vec_oprnds1);
else
/* Build vectors from scalar defs. */
vect_get_constant_vectors (slp_node, vec_oprnds1, 1, number_of_vects);
/* Function get_initial_def_for_induction

Input:
STMT - a stmt that performs an induction operation in the loop.
IV_PHI - the initial value of the induction variable

Output:
Return a vector variable, initialized with the first VF values of
the induction variable. E.g., for an iv with IV_PHI='X' and
evolution S, for a vector of 4 units, we want to return:
[X, X + S, X + 2*S, X + 3*S]. */
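/* For instance (illustrative): with IV_PHI == 0, step S == 3 and four
   units per vector, the returned vector is [0, 3, 6, 9]. */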
static tree
get_initial_def_for_induction (gimple iv_phi)
stmt_vec_info stmt_vinfo = vinfo_for_stmt (iv_phi);
loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
tree scalar_type = TREE_TYPE (gimple_phi_result (iv_phi));
edge pe = loop_preheader_edge (loop);
struct loop *iv_loop;
tree vec, vec_init, vec_step, t;
gimple init_stmt, induction_phi, new_stmt;
tree induc_def, vec_def, vec_dest;
tree init_expr, step_expr;
int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
stmt_vec_info phi_info = vinfo_for_stmt (iv_phi);
bool nested_in_vect_loop = false;
gimple_seq stmts = NULL;
imm_use_iterator imm_iter;
use_operand_p use_p;
gimple_stmt_iterator si;
basic_block bb = gimple_bb (iv_phi);

vectype = get_vectype_for_scalar_type (scalar_type);
gcc_assert (vectype);
nunits = TYPE_VECTOR_SUBPARTS (vectype);
ncopies = vf / nunits;

gcc_assert (phi_info);
gcc_assert (ncopies >= 1);

/* Find the first insertion point in the BB. */
si = gsi_after_labels (bb);

if (INTEGRAL_TYPE_P (scalar_type) || POINTER_TYPE_P (scalar_type))
step_expr = build_int_cst (scalar_type, 0);
else
step_expr = build_real (scalar_type, dconst0);
/* Is phi in an inner-loop, while vectorizing an enclosing outer-loop? */
if (nested_in_vect_loop_p (loop, iv_phi))
nested_in_vect_loop = true;
iv_loop = loop->inner;
else
iv_loop = loop;
gcc_assert (iv_loop == (gimple_bb (iv_phi))->loop_father);

latch_e = loop_latch_edge (iv_loop);
loop_arg = PHI_ARG_DEF_FROM_EDGE (iv_phi, latch_e);

access_fn = analyze_scalar_evolution (iv_loop, PHI_RESULT (iv_phi));
gcc_assert (access_fn);
ok = vect_is_simple_iv_evolution (iv_loop->num, access_fn,
&init_expr, &step_expr);
gcc_assert (ok);
pe = loop_preheader_edge (iv_loop);

/* Create the vector that holds the initial_value of the induction. */
if (nested_in_vect_loop)
/* iv_loop is nested in the loop to be vectorized. init_expr had already
been created during vectorization of previous stmts; we obtain it from
the STMT_VINFO_VEC_STMT of the defining stmt. */
tree iv_def = PHI_ARG_DEF_FROM_EDGE (iv_phi, loop_preheader_edge (iv_loop));
vec_init = vect_get_vec_def_for_operand (iv_def, iv_phi, NULL);
else
/* iv_loop is the loop to be vectorized. Create:
vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
new_var = vect_get_new_vect_var (scalar_type, vect_scalar_var, "var_");
add_referenced_var (new_var);

new_name = force_gimple_operand (init_expr, &stmts, false, new_var);
if (stmts)
new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
gcc_assert (!new_bb);
t = NULL_TREE;
t = tree_cons (NULL_TREE, init_expr, t);
for (i = 1; i < nunits; i++)
/* Create: new_name_i = new_name + step_expr */
enum tree_code code = POINTER_TYPE_P (scalar_type)
? POINTER_PLUS_EXPR : PLUS_EXPR;
init_stmt = gimple_build_assign_with_ops (code, new_var,
new_name, step_expr);
new_name = make_ssa_name (new_var, init_stmt);
gimple_assign_set_lhs (init_stmt, new_name);

new_bb = gsi_insert_on_edge_immediate (pe, init_stmt);
gcc_assert (!new_bb);

if (vect_print_dump_info (REPORT_DETAILS))
fprintf (vect_dump, "created new init_stmt: ");
print_gimple_stmt (vect_dump, init_stmt, 0, TDF_SLIM);

t = tree_cons (NULL_TREE, new_name, t);

/* Create a vector from [new_name_0, new_name_1, ..., new_name_nunits-1] */
vec = build_constructor_from_list (vectype, nreverse (t));
vec_init = vect_init_vector (iv_phi, vec, vectype, NULL);

/* Create the vector that holds the step of the induction. */
if (nested_in_vect_loop)
/* iv_loop is nested in the loop to be vectorized. Generate:
vec_step = [S, S, S, S] */
new_name = step_expr;
else
/* iv_loop is the loop to be vectorized. Generate:
vec_step = [VF*S, VF*S, VF*S, VF*S] */
expr = build_int_cst (scalar_type, vf);
new_name = fold_build2 (MULT_EXPR, scalar_type, expr, step_expr);

t = NULL_TREE;
for (i = 0; i < nunits; i++)
t = tree_cons (NULL_TREE, unshare_expr (new_name), t);
gcc_assert (CONSTANT_CLASS_P (new_name));
vec = build_vector (vectype, t);
vec_step = vect_init_vector (iv_phi, vec, vectype, NULL);
/* Create the following def-use cycle:
vec_iv = PHI <vec_init, vec_loop>
vec_loop = vec_iv + vec_step; */

/* Create the induction-phi that defines the induction-operand. */
vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
add_referenced_var (vec_dest);
induction_phi = create_phi_node (vec_dest, iv_loop->header);
set_vinfo_for_stmt (induction_phi,
new_stmt_vec_info (induction_phi, loop_vinfo));
induc_def = PHI_RESULT (induction_phi);

/* Create the iv update inside the loop */
new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, vec_dest,
induc_def, vec_step);
vec_def = make_ssa_name (vec_dest, new_stmt);
gimple_assign_set_lhs (new_stmt, vec_def);
gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo));

/* Set the arguments of the phi node: */
add_phi_arg (induction_phi, vec_init, pe);
add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop));
/* In case that vectorization factor (VF) is bigger than the number
of elements that we can fit in a vectype (nunits), we have to generate
more than one vector stmt - i.e - we need to "unroll" the
vector stmt by a factor VF/nunits. For more details see documentation
in vectorizable_operation. */
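/* For instance (illustrative): with VF == 8 and nunits == 4 (so
   ncopies == 2), initial value X and step S, the first copy is
   [X, X+S, X+2*S, X+3*S] and the second is obtained by adding
   [4*S, 4*S, 4*S, 4*S]. */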
if (ncopies > 1)
stmt_vec_info prev_stmt_vinfo;
/* FORNOW. This restriction should be relaxed. */
gcc_assert (!nested_in_vect_loop);

/* Create the vector that holds the step of the induction. */
expr = build_int_cst (scalar_type, nunits);
new_name = fold_build2 (MULT_EXPR, scalar_type, expr, step_expr);
t = NULL_TREE;
for (i = 0; i < nunits; i++)
t = tree_cons (NULL_TREE, unshare_expr (new_name), t);
gcc_assert (CONSTANT_CLASS_P (new_name));
vec = build_vector (vectype, t);
vec_step = vect_init_vector (iv_phi, vec, vectype, NULL);

vec_def = induc_def;
prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
for (i = 1; i < ncopies; i++)
/* vec_i = vec_prev + vec_step */
new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, vec_dest,
vec_def, vec_step);
vec_def = make_ssa_name (vec_dest, new_stmt);
gimple_assign_set_lhs (new_stmt, vec_def);

gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
set_vinfo_for_stmt (new_stmt,
new_stmt_vec_info (new_stmt, loop_vinfo));
STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
1845 if (nested_in_vect_loop)
1847 /* Find the loop-closed exit-phi of the induction, and record
1848 the final vector of induction results: */
1850 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
1852 if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (USE_STMT (use_p))))
1854 exit_phi = USE_STMT (use_p);
1860 stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
1861 /* FORNOW. Currently not supporting the case that an inner-loop induction
1862 is not used in the outer-loop (i.e. only outside the outer-loop). */
1863 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
1864 && !STMT_VINFO_LIVE_P (stmt_vinfo));
1866 STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
1867 if (vect_print_dump_info (REPORT_DETAILS))
1869 fprintf (vect_dump, "vector of inductions after inner-loop:");
1870 print_gimple_stmt (vect_dump, new_stmt, 0, TDF_SLIM);
1876 if (vect_print_dump_info (REPORT_DETAILS))
1878 fprintf (vect_dump, "transform induction: created def-use cycle: ");
1879 print_gimple_stmt (vect_dump, induction_phi, 0, TDF_SLIM);
1880 fprintf (vect_dump, "\n");
1881 print_gimple_stmt (vect_dump, SSA_NAME_DEF_STMT (vec_def), 0, TDF_SLIM);
1884 STMT_VINFO_VEC_STMT (phi_info) = induction_phi;
1886 return induc_def;
1887 }
1889 /* Function vect_get_vec_def_for_operand.
1891 OP is an operand in STMT. This function returns a (vector) def that will be
1892 used in the vectorized stmt for STMT.
1894 In the case that OP is an SSA_NAME which is defined in the loop, then
1895 STMT_VINFO_VEC_STMT of the defining stmt holds the relevant def.
1897 In case OP is an invariant or constant, a new stmt that creates a vector def
1898 needs to be introduced. */
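/* Editorial example: when vectorizing 'x = y + 5' with a V4SI vectype,
   OP == 5 hits case 1 below and materializes 'vect_cst_ = {5,5,5,5}', while
   OP == y, if defined by a stmt S inside the loop, hits case 3 and simply
   returns the lhs of S's recorded vector stmt (STMT_VINFO_VEC_STMT).  */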
1900 static tree
1901 vect_get_vec_def_for_operand (tree op, gimple stmt, tree *scalar_def)
1902 {
1903 tree vec_oprnd;
1904 gimple vec_stmt;
1905 gimple def_stmt;
1906 stmt_vec_info def_stmt_info = NULL;
1907 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
1908 tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
1909 unsigned int nunits = TYPE_VECTOR_SUBPARTS (vectype);
1910 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
1911 tree vec_inv;
1912 tree vec_cst;
1913 tree t = NULL_TREE;
1914 tree def;
1915 int i;
1916 enum vect_def_type dt;
1917 bool is_simple_use;
1918 tree vector_type;
1920 if (vect_print_dump_info (REPORT_DETAILS))
1922 fprintf (vect_dump, "vect_get_vec_def_for_operand: ");
1923 print_generic_expr (vect_dump, op, TDF_SLIM);
1926 is_simple_use = vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt);
1927 gcc_assert (is_simple_use);
1928 if (vect_print_dump_info (REPORT_DETAILS))
1932 fprintf (vect_dump, "def = ");
1933 print_generic_expr (vect_dump, def, TDF_SLIM);
1937 fprintf (vect_dump, " def_stmt = ");
1938 print_gimple_stmt (vect_dump, def_stmt, 0, TDF_SLIM);
1941 switch (dt)
1942 {
1944 /* Case 1: operand is a constant. */
1945 case vect_constant_def:
1946 {
1947 if (scalar_def)
1948 *scalar_def = op;
1950 /* Create 'vect_cst_ = {cst,cst,...,cst}' */
1951 if (vect_print_dump_info (REPORT_DETAILS))
1952 fprintf (vect_dump, "Create vector_cst. nunits = %d", nunits);
1954 for (i = nunits - 1; i >= 0; --i)
1956 t = tree_cons (NULL_TREE, op, t);
1958 vec_cst = build_vector (vectype, t);
1959 return vect_init_vector (stmt, vec_cst, vectype, NULL);
1962 /* Case 2: operand is defined outside the loop - loop invariant. */
1963 case vect_invariant_def:
1965 vector_type = get_vectype_for_scalar_type (TREE_TYPE (def));
1966 gcc_assert (vector_type);
1967 nunits = TYPE_VECTOR_SUBPARTS (vector_type);
1969 if (scalar_def)
1970 *scalar_def = def;
1972 /* Create 'vec_inv = {inv,inv,..,inv}' */
1973 if (vect_print_dump_info (REPORT_DETAILS))
1974 fprintf (vect_dump, "Create vector_inv.");
1976 for (i = nunits - 1; i >= 0; --i)
1978 t = tree_cons (NULL_TREE, def, t);
1981 /* FIXME: use build_constructor directly. */
1982 vec_inv = build_constructor_from_list (vector_type, t);
1983 return vect_init_vector (stmt, vec_inv, vector_type, NULL);
1986 /* Case 3: operand is defined inside the loop. */
1987 case vect_loop_def:
1988 {
1989 if (scalar_def)
1990 *scalar_def = NULL/* FIXME tuples: def_stmt*/;
1992 /* Get the def from the vectorized stmt. */
1993 def_stmt_info = vinfo_for_stmt (def_stmt);
1994 vec_stmt = STMT_VINFO_VEC_STMT (def_stmt_info);
1995 gcc_assert (vec_stmt);
1996 if (gimple_code (vec_stmt) == GIMPLE_PHI)
1997 vec_oprnd = PHI_RESULT (vec_stmt);
1998 else if (is_gimple_call (vec_stmt))
1999 vec_oprnd = gimple_call_lhs (vec_stmt);
2000 else
2001 vec_oprnd = gimple_assign_lhs (vec_stmt);
2002 return vec_oprnd;
2003 }
2005 /* Case 4: operand is defined by a loop header phi - reduction */
2006 case vect_reduction_def:
2007 {
2008 struct loop *loop;
2010 gcc_assert (gimple_code (def_stmt) == GIMPLE_PHI);
2011 loop = (gimple_bb (def_stmt))->loop_father;
2013 /* Get the def before the loop */
2014 op = PHI_ARG_DEF_FROM_EDGE (def_stmt, loop_preheader_edge (loop));
2015 return get_initial_def_for_reduction (stmt, op, scalar_def);
2018 /* Case 5: operand is defined by loop-header phi - induction. */
2019 case vect_induction_def:
2021 gcc_assert (gimple_code (def_stmt) == GIMPLE_PHI);
2023 /* Get the def from the vectorized stmt. */
2024 def_stmt_info = vinfo_for_stmt (def_stmt);
2025 vec_stmt = STMT_VINFO_VEC_STMT (def_stmt_info);
2026 gcc_assert (vec_stmt && gimple_code (vec_stmt) == GIMPLE_PHI);
2027 vec_oprnd = PHI_RESULT (vec_stmt);
2028 return vec_oprnd;
2029 }
2031 default:
2032 gcc_unreachable ();
2033 }
2034 }
2037 /* Function vect_get_vec_def_for_stmt_copy
2039 Return a vector-def for an operand. This function is used when the
2040 vectorized stmt to be created (by the caller to this function) is a "copy"
2041 created in case the vectorized result cannot fit in one vector, and several
2042 copies of the vector-stmt are required. In this case the vector-def is
2043 retrieved from the vector stmt recorded in the STMT_VINFO_RELATED_STMT field
2044 of the stmt that defines VEC_OPRND.
2045 DT is the type of the vector def VEC_OPRND.
2048 In case the vectorization factor (VF) is bigger than the number
2049 of elements that can fit in a vectype (nunits), we have to generate
2050 more than one vector stmt to vectorize the scalar stmt. This situation
2051 arises when there are multiple data-types operated upon in the loop; the
2052 smallest data-type determines the VF, and as a result, when vectorizing
2053 stmts operating on wider types we need to create 'VF/nunits' "copies" of the
2054 vector stmt (each computing a vector of 'nunits' results, and together
2055 computing 'VF' results in each iteration). This function is called when
2056 vectorizing such a stmt (e.g. vectorizing S2 in the illustration below, in
2057 which VF=16 and nunits=4, so the number of copies required is 4):
2059 scalar stmt: vectorized into: STMT_VINFO_RELATED_STMT
2061 S1: x = load VS1.0: vx.0 = memref0 VS1.1
2062 VS1.1: vx.1 = memref1 VS1.2
2063 VS1.2: vx.2 = memref2 VS1.3
2064 VS1.3: vx.3 = memref3
2066 S2: z = x + ... VSnew.0: vz0 = vx.0 + ... VSnew.1
2067 VSnew.1: vz1 = vx.1 + ... VSnew.2
2068 VSnew.2: vz2 = vx.2 + ... VSnew.3
2069 VSnew.3: vz3 = vx.3 + ...
2071 The vectorization of S1 is explained in vectorizable_load.
2072 The vectorization of S2:
2073 To create the first vector-stmt out of the 4 copies - VSnew.0 -
2074 the function 'vect_get_vec_def_for_operand' is called to
2075 get the relevant vector-def for each operand of S2. For operand x it
2076 returns the vector-def 'vx.0'.
2078 To create the remaining copies of the vector-stmt (VSnew.j), this
2079 function is called to get the relevant vector-def for each operand. It is
2080 obtained from the respective VS1.j stmt, which is recorded in the
2081 STMT_VINFO_RELATED_STMT field of the stmt that defines VEC_OPRND.
2083 For example, to obtain the vector-def 'vx.1' in order to create the
2084 vector stmt 'VSnew.1', this function is called with VEC_OPRND='vx.0'.
2085 Given 'vx.0' we obtain the stmt that defines it ('VS1.0'); from the
2086 STMT_VINFO_RELATED_STMT field of 'VS1.0' we obtain the next copy - 'VS1.1',
2087 and return its def ('vx.1').
2088 Overall, to create the above sequence this function will be called 3 times:
2089 vx.1 = vect_get_vec_def_for_stmt_copy (dt, vx.0);
2090 vx.2 = vect_get_vec_def_for_stmt_copy (dt, vx.1);
2091 vx.3 = vect_get_vec_def_for_stmt_copy (dt, vx.2); */
2093 static tree
2094 vect_get_vec_def_for_stmt_copy (enum vect_def_type dt, tree vec_oprnd)
2095 {
2096 gimple vec_stmt_for_operand;
2097 stmt_vec_info def_stmt_info;
2099 /* Do nothing; can reuse same def. */
2100 if (dt == vect_invariant_def || dt == vect_constant_def)
2101 return vec_oprnd;
2103 vec_stmt_for_operand = SSA_NAME_DEF_STMT (vec_oprnd);
2104 def_stmt_info = vinfo_for_stmt (vec_stmt_for_operand);
2105 gcc_assert (def_stmt_info);
2106 vec_stmt_for_operand = STMT_VINFO_RELATED_STMT (def_stmt_info);
2107 gcc_assert (vec_stmt_for_operand);
2108 vec_oprnd = gimple_get_lhs (vec_stmt_for_operand);
2109 if (gimple_code (vec_stmt_for_operand) == GIMPLE_PHI)
2110 vec_oprnd = PHI_RESULT (vec_stmt_for_operand);
2111 else
2112 vec_oprnd = gimple_get_lhs (vec_stmt_for_operand);
2113 return vec_oprnd;
2114 }
2117 /* Get vectorized definitions for the operands to create a copy of an original
2118 stmt. See vect_get_vec_def_for_stmt_copy() for details. */
2120 static void
2121 vect_get_vec_defs_for_stmt_copy (enum vect_def_type *dt,
2122 VEC(tree,heap) **vec_oprnds0,
2123 VEC(tree,heap) **vec_oprnds1)
2125 tree vec_oprnd = VEC_pop (tree, *vec_oprnds0);
2127 vec_oprnd = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd);
2128 VEC_quick_push (tree, *vec_oprnds0, vec_oprnd);
2130 if (vec_oprnds1 && *vec_oprnds1)
2132 vec_oprnd = VEC_pop (tree, *vec_oprnds1);
2133 vec_oprnd = vect_get_vec_def_for_stmt_copy (dt[1], vec_oprnd);
2134 VEC_quick_push (tree, *vec_oprnds1, vec_oprnd);
2139 /* Get vectorized definitions for OP0 and OP1, or SLP_NODE if it is not NULL. */
2141 static void
2142 vect_get_vec_defs (tree op0, tree op1, gimple stmt,
2143 VEC(tree,heap) **vec_oprnds0, VEC(tree,heap) **vec_oprnds1,
2144 slp_tree slp_node)
2145 {
2146 if (slp_node)
2147 vect_get_slp_defs (slp_node, vec_oprnds0, vec_oprnds1);
2148 else
2149 {
2150 tree vec_oprnd;
2152 *vec_oprnds0 = VEC_alloc (tree, heap, 1);
2153 vec_oprnd = vect_get_vec_def_for_operand (op0, stmt, NULL);
2154 VEC_quick_push (tree, *vec_oprnds0, vec_oprnd);
2156 if (op1)
2157 {
2158 *vec_oprnds1 = VEC_alloc (tree, heap, 1);
2159 vec_oprnd = vect_get_vec_def_for_operand (op1, stmt, NULL);
2160 VEC_quick_push (tree, *vec_oprnds1, vec_oprnd);
2166 /* Function vect_finish_stmt_generation.
2168 Insert a new stmt. */
2170 static void
2171 vect_finish_stmt_generation (gimple stmt, gimple vec_stmt,
2172 gimple_stmt_iterator *gsi)
2174 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2175 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2177 gcc_assert (gimple_code (stmt) != GIMPLE_LABEL);
2179 gsi_insert_before (gsi, vec_stmt, GSI_SAME_STMT);
2181 set_vinfo_for_stmt (vec_stmt, new_stmt_vec_info (vec_stmt, loop_vinfo));
2183 if (vect_print_dump_info (REPORT_DETAILS))
2185 fprintf (vect_dump, "add new stmt: ");
2186 print_gimple_stmt (vect_dump, vec_stmt, 0, TDF_SLIM);
2189 gimple_set_location (vec_stmt, gimple_location (gsi_stmt (*gsi)));
2193 /* Function get_initial_def_for_reduction
2195 Input:
2196 STMT - a stmt that performs a reduction operation in the loop.
2197 INIT_VAL - the initial value of the reduction variable
2199 Output:
2200 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
2201 of the reduction (used for adjusting the epilog - see below).
2202 Return a vector variable, initialized according to the operation that STMT
2203 performs. This vector will be used as the initial value of the
2204 vector of partial results.
2206 Option1 (adjust in epilog): Initialize the vector as follows:
2207 add: [0,0,...,0,0]
2208 mult: [1,1,...,1,1]
2209 min/max: [init_val,init_val,..,init_val,init_val]
2210 bit and/or: [init_val,init_val,..,init_val,init_val]
2211 and when necessary (e.g. add/mult case) let the caller know
2212 that it needs to adjust the result by init_val.
2214 Option2: Initialize the vector as follows:
2215 add: [0,0,...,0,init_val]
2216 mult: [1,1,...,1,init_val]
2217 min/max: [init_val,init_val,...,init_val]
2218 bit and/or: [init_val,init_val,...,init_val]
2219 and no adjustments are needed.
2221 For example, for the following code:
2223 s = init_val;
2224 for (i=0;i<n;i++)
2225 s = s + a[i];
2227 STMT is 's = s + a[i]', and the reduction variable is 's'.
2228 For a vector of 4 units, we want to return either [0,0,0,init_val],
2229 or [0,0,0,0] and let the caller know that it needs to adjust
2230 the result at the end by 'init_val'.
2232 FORNOW, we are using the 'adjust in epilog' scheme, because this way the
2233 initialization vector is simpler (same element in all entries).
2234 A cost model should help decide between these two schemes. */
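/* Editorial worked instance: for 's = s + a[i]' with init_val == 7 and
   nunits == 4, the 'adjust in epilog' scheme implemented below returns the
   vector {0,0,0,0} and sets *adjustment_def to 7; the epilog code later adds
   the 7 back to the reduced scalar (see vect_create_epilog_for_reduction,
   step 2.5).  */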
2236 static tree
2237 get_initial_def_for_reduction (gimple stmt, tree init_val, tree *adjustment_def)
2238 {
2239 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
2240 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
2241 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2242 tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
2243 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
2244 tree scalar_type = TREE_TYPE (vectype);
2245 enum tree_code code = gimple_assign_rhs_code (stmt);
2246 tree type = TREE_TYPE (init_val);
2247 tree vecdef;
2248 tree def_for_init;
2249 tree init_def;
2250 tree t = NULL_TREE;
2251 int i;
2252 bool nested_in_vect_loop = false;
2254 gcc_assert (POINTER_TYPE_P (type) || INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type));
2255 if (nested_in_vect_loop_p (loop, stmt))
2256 nested_in_vect_loop = true;
2258 gcc_assert (loop == (gimple_bb (stmt))->loop_father);
2260 vecdef = vect_get_vec_def_for_operand (init_val, stmt, NULL);
2262 switch (code)
2263 {
2264 case WIDEN_SUM_EXPR:
2265 case DOT_PROD_EXPR:
2266 case PLUS_EXPR:
2267 if (nested_in_vect_loop)
2268 *adjustment_def = vecdef;
2269 else
2270 *adjustment_def = init_val;
2271 /* Create a vector of zeros for init_def. */
2272 if (SCALAR_FLOAT_TYPE_P (scalar_type))
2273 def_for_init = build_real (scalar_type, dconst0);
2274 else
2275 def_for_init = build_int_cst (scalar_type, 0);
2277 for (i = nunits - 1; i >= 0; --i)
2278 t = tree_cons (NULL_TREE, def_for_init, t);
2279 init_def = build_vector (vectype, t);
2280 break;
2282 case MIN_EXPR:
2283 case MAX_EXPR:
2284 *adjustment_def = NULL_TREE;
2285 init_def = vecdef;
2286 break;
2288 default:
2289 gcc_unreachable ();
2290 }
2292 return init_def;
2293 }
2296 /* Function vect_create_epilog_for_reduction
2298 Create code at the loop-epilog to finalize the result of a reduction
2299 computation.
2301 VECT_DEF is a vector of partial results.
2302 REDUC_CODE is the tree-code for the epilog reduction.
2303 NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
2304 number of elements that we can fit in a vectype (nunits). In this case
2305 we have to generate more than one vector stmt - i.e - we need to "unroll"
2306 the vector stmt by a factor VF/nunits. For more details see documentation
2307 in vectorizable_operation.
2308 STMT is the scalar reduction stmt that is being vectorized.
2309 REDUCTION_PHI is the phi-node that carries the reduction computation.
2311 This function:
2312 1. Creates the reduction def-use cycle: sets the arguments for
2313 REDUCTION_PHI:
2314 The loop-entry argument is the vectorized initial-value of the reduction.
2315 The loop-latch argument is VECT_DEF - the vector of partial sums.
2316 2. "Reduces" the vector of partial results VECT_DEF into a single result,
2317 by applying the operation specified by REDUC_CODE if available, or by
2318 other means (whole-vector shifts or a scalar loop).
2319 The function also creates a new phi node at the loop exit to preserve
2320 loop-closed form, as illustrated below.
2322 The flow at the entry to this function:
2324 loop:
2325 vec_def = phi <null, null> # REDUCTION_PHI
2326 VECT_DEF = vector_stmt # vectorized form of STMT
2327 s_loop = scalar_stmt # (scalar) STMT
2328 loop_exit:
2329 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
2330 use <s_out0>
2331 use <s_out0>
2333 The above is transformed by this function into:
2335 loop:
2336 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
2337 VECT_DEF = vector_stmt # vectorized form of STMT
2338 s_loop = scalar_stmt # (scalar) STMT
2339 loop_exit:
2340 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
2341 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
2342 v_out2 = reduce <v_out1>
2343 s_out3 = extract_field <v_out2, 0>
2344 s_out4 = adjust_result <s_out3>
2345 use <s_out4>
2346 use <s_out4>
2347 */
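/* Editorial note: for a 4-lane sum reduction, v_out2 = reduce <v_out1> in
   the diagram above stands for v_out1[0] + v_out1[1] + v_out1[2] + v_out1[3],
   s_out3 extracts lane 0 of v_out2, and s_out4 re-adds the initial value
   that get_initial_def_for_reduction left out of vec_init.  */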
2349 static void
2350 vect_create_epilog_for_reduction (tree vect_def, gimple stmt,
2351 int ncopies,
2352 enum tree_code reduc_code,
2353 gimple reduction_phi)
2354 {
2355 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2356 stmt_vec_info prev_phi_info;
2357 tree vectype;
2358 enum machine_mode mode;
2359 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2360 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2361 basic_block exit_bb;
2362 tree scalar_dest;
2363 tree scalar_type;
2364 gimple new_phi = NULL, phi;
2365 gimple_stmt_iterator exit_gsi;
2366 tree vec_dest;
2367 tree new_temp = NULL_TREE;
2368 tree new_name;
2369 gimple epilog_stmt = NULL;
2370 tree new_scalar_dest, new_dest;
2371 gimple exit_phi;
2372 tree bitsize, bitpos, bytesize;
2373 enum tree_code code = gimple_assign_rhs_code (stmt);
2374 tree adjustment_def;
2375 tree vec_initial_def, def;
2376 tree orig_name;
2377 imm_use_iterator imm_iter;
2378 use_operand_p use_p;
2379 bool extract_scalar_result = false;
2380 tree reduction_op, expr;
2381 gimple orig_stmt;
2382 gimple use_stmt;
2383 bool nested_in_vect_loop = false;
2384 VEC(gimple,heap) *phis = NULL;
2385 enum vect_def_type dt = vect_unknown_def_type;
2386 int j, i;
2388 if (nested_in_vect_loop_p (loop, stmt))
2389 {
2390 loop = loop->inner;
2391 nested_in_vect_loop = true;
2392 }
2394 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
2396 case GIMPLE_SINGLE_RHS:
2397 gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt)) == ternary_op);
2398 reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), 2);
2399 break;
2400 case GIMPLE_UNARY_RHS:
2401 reduction_op = gimple_assign_rhs1 (stmt);
2402 break;
2403 case GIMPLE_BINARY_RHS:
2404 reduction_op = gimple_assign_rhs2 (stmt);
2405 break;
2407 default:
2408 gcc_unreachable ();
2409 }
2410 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
2411 gcc_assert (vectype);
2412 mode = TYPE_MODE (vectype);
2414 /*** 1. Create the reduction def-use cycle ***/
2416 /* For the case of reduction, vect_get_vec_def_for_operand returns
2417 the scalar def before the loop, that defines the initial value
2418 of the reduction variable. */
2419 vec_initial_def = vect_get_vec_def_for_operand (reduction_op, stmt,
2420 &adjustment_def);
2422 phi = reduction_phi;
2423 def = vect_def;
2424 for (j = 0; j < ncopies; j++)
2426 /* 1.1 set the loop-entry arg of the reduction-phi: */
2427 add_phi_arg (phi, vec_initial_def, loop_preheader_edge (loop));
2429 /* 1.2 set the loop-latch arg for the reduction-phi: */
2430 if (j > 0)
2431 def = vect_get_vec_def_for_stmt_copy (dt, def);
2432 add_phi_arg (phi, def, loop_latch_edge (loop));
2434 if (vect_print_dump_info (REPORT_DETAILS))
2436 fprintf (vect_dump, "transform reduction: created def-use cycle: ");
2437 print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
2438 fprintf (vect_dump, "\n");
2439 print_gimple_stmt (vect_dump, SSA_NAME_DEF_STMT (def), 0, TDF_SLIM);
2442 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
2445 /*** 2. Create epilog code
2446 The reduction epilog code operates across the elements of the vector
2447 of partial results computed by the vectorized loop.
2448 The reduction epilog code consists of:
2449 step 1: compute the scalar result in a vector (v_out2)
2450 step 2: extract the scalar result (s_out3) from the vector (v_out2)
2451 step 3: adjust the scalar result (s_out3) if needed.
2453 Step 1 can be accomplished using one of the following three schemes:
2454 (scheme 1) using reduc_code, if available.
2455 (scheme 2) using whole-vector shifts, if available.
2456 (scheme 3) using a scalar loop. In this case steps 1+2 above are
2457 combined.
2459 The overall epilog code looks like this:
2461 s_out0 = phi <s_loop> # original EXIT_PHI
2462 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
2463 v_out2 = reduce <v_out1> # step 1
2464 s_out3 = extract_field <v_out2, 0> # step 2
2465 s_out4 = adjust_result <s_out3> # step 3
2467 (step 3 is optional, and steps 1 and 2 may be combined).
2468 Lastly, the uses of s_out0 are replaced by s_out4.
2469 */
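/* Editorial sketch of what the three schemes compute, in scalar C terms for
   a 4-element sum (names are illustrative only):

       int v[4];                              (the vector of partial results)
       scheme 1:  s = reduce (v);             (single target reduction op)
       scheme 2:  v[0] += v[2]; v[1] += v[3]; (fold halves by shifting)
                  v[0] += v[1]; s = v[0];
       scheme 3:  s = v[0] + v[1] + v[2] + v[3];  (extract lanes one by one)
*/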
2472 /* 2.1 Create new loop-exit-phi to preserve loop-closed form:
2473 v_out1 = phi <v_loop> */
2475 exit_bb = single_exit (loop)->dest;
2476 def = vect_def;
2477 prev_phi_info = NULL;
2478 for (j = 0; j < ncopies; j++)
2480 phi = create_phi_node (SSA_NAME_VAR (vect_def), exit_bb);
2481 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo));
2482 if (j == 0)
2483 new_phi = phi;
2484 else
2485 {
2486 def = vect_get_vec_def_for_stmt_copy (dt, def);
2487 STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
2489 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
2490 prev_phi_info = vinfo_for_stmt (phi);
2492 exit_gsi = gsi_after_labels (exit_bb);
2494 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
2495 (i.e. when reduc_code is not available) and in the final adjustment
2496 code (if needed). Also get the original scalar reduction variable as
2497 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
2498 represents a reduction pattern), the tree-code and scalar-def are
2499 taken from the original stmt that the pattern-stmt (STMT) replaces.
2500 Otherwise (it is a regular reduction) - the tree-code and scalar-def
2501 are taken from STMT. */
2503 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
2504 if (!orig_stmt)
2505 {
2506 /* Regular reduction */
2507 orig_stmt = stmt;
2508 }
2509 else
2510 {
2511 /* Reduction pattern */
2512 stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
2513 gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
2514 gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
2516 code = gimple_assign_rhs_code (orig_stmt);
2517 scalar_dest = gimple_assign_lhs (orig_stmt);
2518 scalar_type = TREE_TYPE (scalar_dest);
2519 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
2520 bitsize = TYPE_SIZE (scalar_type);
2521 bytesize = TYPE_SIZE_UNIT (scalar_type);
2524 /* In case this is a reduction in an inner-loop while vectorizing an outer
2525 loop - we don't need to extract a single scalar result at the end of the
2526 inner-loop. The final vector of partial results will be used in the
2527 vectorized outer-loop, or reduced to a scalar result at the end of the
2528 outer-loop. */
2529 if (nested_in_vect_loop)
2530 goto vect_finalize_reduction;
2533 gcc_assert (ncopies == 1);
2535 /* 2.3 Create the reduction code, using one of the three schemes described
2538 if (reduc_code < NUM_TREE_CODES)
2539 {
2540 tree tmp;
2542 /*** Case 1: Create:
2543 v_out2 = reduc_expr <v_out1> */
2545 if (vect_print_dump_info (REPORT_DETAILS))
2546 fprintf (vect_dump, "Reduce using direct vector reduction.");
2548 vec_dest = vect_create_destination_var (scalar_dest, vectype);
2549 tmp = build1 (reduc_code, vectype, PHI_RESULT (new_phi));
2550 epilog_stmt = gimple_build_assign (vec_dest, tmp);
2551 new_temp = make_ssa_name (vec_dest, epilog_stmt);
2552 gimple_assign_set_lhs (epilog_stmt, new_temp);
2553 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
2555 extract_scalar_result = true;
2559 enum tree_code shift_code = 0;
2560 bool have_whole_vector_shift = true;
2561 int bit_offset;
2562 int element_bitsize = tree_low_cst (bitsize, 1);
2563 int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
2564 tree vec_temp;
2566 if (optab_handler (vec_shr_optab, mode)->insn_code != CODE_FOR_nothing)
2567 shift_code = VEC_RSHIFT_EXPR;
2568 else
2569 have_whole_vector_shift = false;
2571 /* Regardless of whether we have a whole vector shift, if we're
2572 emulating the operation via tree-vect-generic, we don't want
2573 to use it. Only the first round of the reduction is likely
2574 to still be profitable via emulation. */
2575 /* ??? It might be better to emit a reduction tree code here, so that
2576 tree-vect-generic can expand the first round via bit tricks. */
2577 if (!VECTOR_MODE_P (mode))
2578 have_whole_vector_shift = false;
2581 optab optab = optab_for_tree_code (code, vectype, optab_default);
2582 if (optab_handler (optab, mode)->insn_code == CODE_FOR_nothing)
2583 have_whole_vector_shift = false;
2586 if (have_whole_vector_shift)
2588 /*** Case 2: Create:
2589 for (offset = VS/2; offset >= element_size; offset/=2)
2591 Create: va' = vec_shift <va, offset>
2592 Create: va = vop <va, va'>
2595 if (vect_print_dump_info (REPORT_DETAILS))
2596 fprintf (vect_dump, "Reduce using vector shifts");
2598 vec_dest = vect_create_destination_var (scalar_dest, vectype);
2599 new_temp = PHI_RESULT (new_phi);
2601 for (bit_offset = vec_size_in_bits/2;
2602 bit_offset >= element_bitsize;
2603 bit_offset /= 2)
2604 {
2605 tree bitpos = size_int (bit_offset);
2606 epilog_stmt = gimple_build_assign_with_ops (shift_code, vec_dest,
2607 new_temp, bitpos);
2608 new_name = make_ssa_name (vec_dest, epilog_stmt);
2609 gimple_assign_set_lhs (epilog_stmt, new_name);
2610 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
2612 epilog_stmt = gimple_build_assign_with_ops (code, vec_dest,
2613 new_name, new_temp);
2614 new_temp = make_ssa_name (vec_dest, epilog_stmt);
2615 gimple_assign_set_lhs (epilog_stmt, new_temp);
2616 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
2619 extract_scalar_result = true;
2620 }
2622 else
2623 {
2624 tree rhs;
2625 /*** Case 3: Create:
2626 s = extract_field <v_out2, 0>
2627 for (offset = element_size;
2628 offset < vector_size;
2629 offset += element_size;)
2631 Create: s' = extract_field <v_out2, offset>
2632 Create: s = op <s, s'>
2635 if (vect_print_dump_info (REPORT_DETAILS))
2636 fprintf (vect_dump, "Reduce using scalar code. ");
2638 vec_temp = PHI_RESULT (new_phi);
2639 vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
2640 rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
2641 bitsize_zero_node);
2642 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
2643 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
2644 gimple_assign_set_lhs (epilog_stmt, new_temp);
2645 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
2647 for (bit_offset = element_bitsize;
2648 bit_offset < vec_size_in_bits;
2649 bit_offset += element_bitsize)
2651 tree bitpos = bitsize_int (bit_offset);
2652 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
2653 bitpos);
2655 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
2656 new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
2657 gimple_assign_set_lhs (epilog_stmt, new_name);
2658 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
2660 epilog_stmt = gimple_build_assign_with_ops (code,
2661 new_scalar_dest,
2662 new_name, new_temp);
2663 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
2664 gimple_assign_set_lhs (epilog_stmt, new_temp);
2665 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
2668 extract_scalar_result = false;
2672 /* 2.4 Extract the final scalar result. Create:
2673 s_out3 = extract_field <v_out2, bitpos> */
2675 if (extract_scalar_result)
2679 gcc_assert (!nested_in_vect_loop);
2680 if (vect_print_dump_info (REPORT_DETAILS))
2681 fprintf (vect_dump, "extract scalar result");
2683 if (BYTES_BIG_ENDIAN)
2684 bitpos = size_binop (MULT_EXPR,
2685 bitsize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1),
2686 TYPE_SIZE (scalar_type));
2687 else
2688 bitpos = bitsize_zero_node;
2690 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp, bitsize, bitpos);
2691 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
2692 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
2693 gimple_assign_set_lhs (epilog_stmt, new_temp);
2694 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
2697 vect_finalize_reduction:
2699 /* 2.5 Adjust the final result by the initial value of the reduction
2700 variable. (When such adjustment is not needed, then
2701 'adjustment_def' is zero). For example, if code is PLUS we create:
2702 new_temp = loop_exit_def + adjustment_def */
2704 if (adjustment_def)
2705 {
2706 if (nested_in_vect_loop)
2707 {
2708 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
2709 expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
2710 new_dest = vect_create_destination_var (scalar_dest, vectype);
2711 }
2712 else
2713 {
2714 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
2715 expr = build2 (code, scalar_type, new_temp, adjustment_def);
2716 new_dest = vect_create_destination_var (scalar_dest, scalar_type);
2718 epilog_stmt = gimple_build_assign (new_dest, expr);
2719 new_temp = make_ssa_name (new_dest, epilog_stmt);
2720 gimple_assign_set_lhs (epilog_stmt, new_temp);
2721 SSA_NAME_DEF_STMT (new_temp) = epilog_stmt;
2722 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
2726 /* 2.6 Handle the loop-exit phi */
2728 /* Replace uses of s_out0 with uses of s_out3:
2729 Find the loop-closed-use at the loop exit of the original scalar result.
2730 (The reduction result is expected to have two immediate uses - one at the
2731 latch block, and one at the loop exit). */
2732 phis = VEC_alloc (gimple, heap, 10);
2733 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
2735 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
2737 exit_phi = USE_STMT (use_p);
2738 VEC_quick_push (gimple, phis, exit_phi);
2741 /* We expect to have found an exit_phi because of loop-closed-ssa form. */
2742 gcc_assert (!VEC_empty (gimple, phis));
2744 for (i = 0; VEC_iterate (gimple, phis, i, exit_phi); i++)
2746 if (nested_in_vect_loop)
2748 stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
2750 /* FORNOW. Currently not supporting the case that an inner-loop
2751 reduction is not used in the outer-loop (but only outside the
2752 outer-loop). */
2753 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
2754 && !STMT_VINFO_LIVE_P (stmt_vinfo));
2756 epilog_stmt = adjustment_def ? epilog_stmt : new_phi;
2757 STMT_VINFO_VEC_STMT (stmt_vinfo) = epilog_stmt;
2758 set_vinfo_for_stmt (epilog_stmt,
2759 new_stmt_vec_info (epilog_stmt, loop_vinfo));
2761 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
2762 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
2763 continue;
2764 }
2766 /* Replace the uses: */
2767 orig_name = PHI_RESULT (exit_phi);
2768 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
2769 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
2770 SET_USE (use_p, new_temp);
2772 VEC_free (gimple, heap, phis);
2773 }
2776 /* Function vectorizable_reduction.
2778 Check if STMT performs a reduction operation that can be vectorized.
2779 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
2780 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
2781 Return FALSE if not a vectorizable STMT, TRUE otherwise.
2783 This function also handles reduction idioms (patterns) that have been
2784 recognized in advance during vect_pattern_recog. In this case, STMT may be
2786 X = pattern_expr (arg0, arg1, ..., X)
2787 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
2788 sequence that had been detected and replaced by the pattern-stmt (STMT).
2790 In some cases of reduction patterns, the type of the reduction variable X is
2791 different than the type of the other arguments of STMT.
2792 In such cases, the vectype that is used when transforming STMT into a vector
2793 stmt is different than the vectype that is used to determine the
2794 vectorization factor, because it consists of a different number of elements
2795 than the actual number of elements that are being operated upon in parallel.
2797 For example, consider an accumulation of shorts into an int accumulator.
2798 On some targets it's possible to vectorize this pattern operating on 8
2799 shorts at a time (hence, the vectype for purposes of determining the
2800 vectorization factor should be V8HI); on the other hand, the vectype that
2801 is used to create the vector form is actually V4SI (the type of the result).
2803 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
2804 indicates the actual level of parallelism (V8HI in the example), so
2805 that the right vectorization factor is derived. This vectype
2806 corresponds to the type of arguments to the reduction stmt, and should *NOT*
2807 be used to create the vectorized stmt. The right vectype for the vectorized
2808 stmt is obtained from the type of the result X:
2809 get_vectype_for_scalar_type (TREE_TYPE (X))
2811 This means that, contrary to "regular" reductions (or "regular" stmts in
2812 general), the following equation:
2813 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
2814 does *NOT* necessarily hold for reduction patterns. */
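/* Editorial example of the pattern described above:

       short a[N]; int acc = 0;
       for (i = 0; i < N; i++)
         acc += a[i];              (S1: int_a = (int) short_a
                                    S2: acc = int_a + acc)

   Here STMT is the pattern stmt 'acc = widen_sum <a[i], acc>';
   STMT_VINFO_VECTYPE is V8HI (8 shorts decide the VF), whereas the stmt is
   transformed using get_vectype_for_scalar_type (int) == V4SI.  */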
2816 bool
2817 vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
2818 gimple *vec_stmt)
2819 {
2820 tree vec_dest;
2821 tree scalar_dest;
2822 tree loop_vec_def0 = NULL_TREE, loop_vec_def1 = NULL_TREE;
2823 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2824 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2825 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2826 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2827 enum tree_code code, orig_code, epilog_reduc_code = 0;
2828 enum machine_mode vec_mode;
2829 int op_type;
2830 optab optab, reduc_optab;
2831 tree new_temp = NULL_TREE;
2832 tree def;
2833 gimple def_stmt;
2834 enum vect_def_type dt;
2835 gimple new_phi = NULL;
2836 tree scalar_type;
2837 bool is_simple_use;
2838 gimple orig_stmt;
2839 stmt_vec_info orig_stmt_info;
2840 tree expr = NULL_TREE;
2841 int i;
2842 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
2843 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
2844 int epilog_copies;
2845 stmt_vec_info prev_stmt_info, prev_phi_info;
2846 gimple first_phi = NULL;
2847 bool single_defuse_cycle = false;
2848 tree reduc_def;
2849 gimple new_stmt = NULL;
2850 int j;
2851 tree ops[3];
2853 if (nested_in_vect_loop_p (loop, stmt))
2854 loop = loop->inner;
2856 gcc_assert (ncopies >= 1);
2858 /* FORNOW: SLP not supported. */
2859 if (STMT_SLP_TYPE (stmt_info))
2860 return false;
2862 /* 1. Is vectorizable reduction? */
2864 /* Not supportable if the reduction variable is used in the loop. */
2865 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer)
2866 return false;
2868 /* Reductions that are not used even in an enclosing outer-loop,
2869 are expected to be "live" (used out of the loop). */
2870 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_loop
2871 && !STMT_VINFO_LIVE_P (stmt_info))
2872 return false;
2874 /* Make sure it was already recognized as a reduction computation. */
2875 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def)
2876 return false;
2878 /* 2. Has this been recognized as a reduction pattern?
2880 Check if STMT represents a pattern that has been recognized
2881 in earlier analysis stages. For stmts that represent a pattern,
2882 the STMT_VINFO_RELATED_STMT field records the last stmt in
2883 the original sequence that constitutes the pattern. */
2885 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
2886 if (orig_stmt)
2887 {
2888 orig_stmt_info = vinfo_for_stmt (orig_stmt);
2889 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt);
2890 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
2891 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
2894 /* 3. Check the operands of the operation. The first operands are defined
2895 inside the loop body. The last operand is the reduction variable,
2896 which is defined by the loop-header-phi. */
2898 gcc_assert (is_gimple_assign (stmt));
2900 /* Flatten RHS */
2901 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
2902 {
2903 case GIMPLE_SINGLE_RHS:
2904 op_type = TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt));
2905 if (op_type == ternary_op)
2907 tree rhs = gimple_assign_rhs1 (stmt);
2908 ops[0] = TREE_OPERAND (rhs, 0);
2909 ops[1] = TREE_OPERAND (rhs, 1);
2910 ops[2] = TREE_OPERAND (rhs, 2);
2911 code = TREE_CODE (rhs);
2912 }
2913 else
2914 return false;
2915 break;
2917 case GIMPLE_BINARY_RHS:
2918 code = gimple_assign_rhs_code (stmt);
2919 op_type = TREE_CODE_LENGTH (code);
2920 gcc_assert (op_type == binary_op);
2921 ops[0] = gimple_assign_rhs1 (stmt);
2922 ops[1] = gimple_assign_rhs2 (stmt);
2923 break;
2925 case GIMPLE_UNARY_RHS:
2926 return false;
2928 default:
2929 gcc_unreachable ();
2930 }
2932 scalar_dest = gimple_assign_lhs (stmt);
2933 scalar_type = TREE_TYPE (scalar_dest);
2934 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
2935 && !SCALAR_FLOAT_TYPE_P (scalar_type))
2936 return false;
2938 /* All uses but the last are expected to be defined in the loop.
2939 The last use is the reduction variable. */
2940 for (i = 0; i < op_type-1; i++)
2942 is_simple_use = vect_is_simple_use (ops[i], loop_vinfo, &def_stmt,
2943 &def, &dt);
2944 gcc_assert (is_simple_use);
2945 if (dt != vect_loop_def
2946 && dt != vect_invariant_def
2947 && dt != vect_constant_def
2948 && dt != vect_induction_def)
2949 return false;
2952 is_simple_use = vect_is_simple_use (ops[i], loop_vinfo, &def_stmt, &def, &dt);
2953 gcc_assert (is_simple_use);
2954 gcc_assert (dt == vect_reduction_def);
2955 gcc_assert (gimple_code (def_stmt) == GIMPLE_PHI);
2956 if (orig_stmt)
2957 gcc_assert (orig_stmt == vect_is_simple_reduction (loop_vinfo, def_stmt));
2958 else
2959 gcc_assert (stmt == vect_is_simple_reduction (loop_vinfo, def_stmt));
2961 if (STMT_VINFO_LIVE_P (vinfo_for_stmt (def_stmt)))
2962 return false;
2964 /* 4. Supportable by target? */
2966 /* 4.1. check support for the operation in the loop */
2967 optab = optab_for_tree_code (code, vectype, optab_default);
2968 if (!optab)
2969 {
2970 if (vect_print_dump_info (REPORT_DETAILS))
2971 fprintf (vect_dump, "no optab.");
2972 return false;
2973 }
2974 vec_mode = TYPE_MODE (vectype);
2975 if (optab_handler (optab, vec_mode)->insn_code == CODE_FOR_nothing)
2977 if (vect_print_dump_info (REPORT_DETAILS))
2978 fprintf (vect_dump, "op not supported by target.");
2979 if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
2980 || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
2981 < vect_min_worthwhile_factor (code))
2982 return false;
2983 if (vect_print_dump_info (REPORT_DETAILS))
2984 fprintf (vect_dump, "proceeding using word mode.");
2987 /* Worthwhile without SIMD support? */
2988 if (!VECTOR_MODE_P (TYPE_MODE (vectype))
2989 && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
2990 < vect_min_worthwhile_factor (code))
2992 if (vect_print_dump_info (REPORT_DETAILS))
2993 fprintf (vect_dump, "not worthwhile without SIMD support.");
2994 return false;
2995 }
2997 /* 4.2. Check support for the epilog operation.
2999 If STMT represents a reduction pattern, then the type of the
3000 reduction variable may be different than the type of the rest
3001 of the arguments. For example, consider the case of accumulation
3002 of shorts into an int accumulator; The original code:
3003 S1: int_a = (int) short_a;
3004 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
3007 STMT: int_acc = widen_sum <short_a, int_acc>
3010 1. The tree-code that is used to create the vector operation in the
3011 epilog code (that reduces the partial results) is not the
3012 tree-code of STMT, but is rather the tree-code of the original
3013 stmt from the pattern that STMT is replacing. I.e, in the example
3014 above we want to use 'widen_sum' in the loop, but 'plus' in the
3016 2. The type (mode) we use to check available target support
3017 for the vector operation to be created in the *epilog*, is
3018 determined by the type of the reduction variable (in the example
3019 above we'd check this: plus_optab[vect_int_mode]).
3020 However the type (mode) we use to check available target support
3021 for the vector operation to be created *inside the loop*, is
3022 determined by the type of the other arguments to STMT (in the
3023 example we'd check this: widen_sum_optab[vect_short_mode]).
3025 This is contrary to "regular" reductions, in which the types of all
3026 the arguments are the same as the type of the reduction variable.
3027 For "regular" reductions we can therefore use the same vector type
3028 (and also the same tree-code) when generating the epilog code and
3029 when generating the code inside the loop. */
3031 if (orig_stmt)
3032 {
3033 /* This is a reduction pattern: get the vectype from the type of the
3034 reduction variable, and get the tree-code from orig_stmt. */
3035 orig_code = gimple_assign_rhs_code (orig_stmt);
3036 vectype = get_vectype_for_scalar_type (TREE_TYPE (def));
3037 if (!vectype)
3038 {
3039 if (vect_print_dump_info (REPORT_DETAILS))
3040 {
3041 fprintf (vect_dump, "unsupported data-type ");
3042 print_generic_expr (vect_dump, TREE_TYPE (def), TDF_SLIM);
3043 }
3044 return false;
3045 }
3047 vec_mode = TYPE_MODE (vectype);
3048 }
3049 else
3050 {
3051 /* Regular reduction: use the same vectype and tree-code as used for
3052 the vector code inside the loop; they can be used for the epilog code. */
3053 orig_code = code;
3054 }
3056 if (!reduction_code_for_scalar_code (orig_code, &epilog_reduc_code))
3057 epilog_reduc_code = NUM_TREE_CODES;
3058 reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype, optab_default);
3059 if (!reduc_optab)
3060 {
3061 if (vect_print_dump_info (REPORT_DETAILS))
3062 fprintf (vect_dump, "no optab for reduction.");
3063 epilog_reduc_code = NUM_TREE_CODES;
3065 if (optab_handler (reduc_optab, vec_mode)->insn_code == CODE_FOR_nothing)
3067 if (vect_print_dump_info (REPORT_DETAILS))
3068 fprintf (vect_dump, "reduc op not supported by target.");
3069 epilog_reduc_code = NUM_TREE_CODES;
3072 if (!vec_stmt) /* transformation not required. */
3074 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
3075 if (!vect_model_reduction_cost (stmt_info, epilog_reduc_code, ncopies))
3076 return false;
3077 return true;
3078 }
3080 /** Transform. **/
3082 if (vect_print_dump_info (REPORT_DETAILS))
3083 fprintf (vect_dump, "transform reduction.");
3085 /* Create the destination vector */
3086 vec_dest = vect_create_destination_var (scalar_dest, vectype);
3088 /* In case the vectorization factor (VF) is bigger than the number
3089 of elements that we can fit in a vectype (nunits), we have to generate
3090 more than one vector stmt - i.e - we need to "unroll" the
3091 vector stmt by a factor VF/nunits. For more details see documentation
3092 in vectorizable_operation. */
3094 /* If the reduction is used in an outer loop we need to generate
3095 VF intermediate results, like so (e.g. for ncopies=2):
3096 r0 = phi (init, r0)
3097 r1 = phi (init, r1)
3098 r0 = x0 + r0;
3099 r1 = x1 + r1;
3100 (i.e. we generate VF results in 2 registers).
3101 In this case we have a separate def-use cycle for each copy, and therefore
3102 for each copy we get the vector def for the reduction variable from the
3103 respective phi node created for this copy.
3105 Otherwise (the reduction is unused in the loop nest), we can combine
3106 together intermediate results, like so (e.g. for ncopies=2):
3107 r = phi (init, r)
3108 r = x0 + r;
3109 r = x1 + r;
3110 (i.e. we generate VF/2 results in a single register).
3111 In this case for each copy we get the vector def for the reduction variable
3112 from the vectorized reduction operation generated in the previous iteration.
3113 */
3115 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_loop)
3116 {
3117 single_defuse_cycle = true;
3118 epilog_copies = 1;
3119 }
3120 else
3121 epilog_copies = ncopies;
3123 prev_stmt_info = NULL;
3124 prev_phi_info = NULL;
3125 for (j = 0; j < ncopies; j++)
3127 if (j == 0 || !single_defuse_cycle)
3129 /* Create the reduction-phi that defines the reduction-operand. */
3130 new_phi = create_phi_node (vec_dest, loop->header);
3131 set_vinfo_for_stmt (new_phi, new_stmt_vec_info (new_phi, loop_vinfo));
3132 }
3134 /* Handle uses. */
3135 if (j == 0)
3136 {
3137 loop_vec_def0 = vect_get_vec_def_for_operand (ops[0], stmt, NULL);
3138 if (op_type == ternary_op)
3140 loop_vec_def1 = vect_get_vec_def_for_operand (ops[1], stmt, NULL);
3143 /* Get the vector def for the reduction variable from the phi node */
3144 reduc_def = PHI_RESULT (new_phi);
3145 first_phi = new_phi;
3146 }
3147 else
3148 {
3149 enum vect_def_type dt = vect_unknown_def_type; /* Dummy */
3150 loop_vec_def0 = vect_get_vec_def_for_stmt_copy (dt, loop_vec_def0);
3151 if (op_type == ternary_op)
3152 loop_vec_def1 = vect_get_vec_def_for_stmt_copy (dt, loop_vec_def1);
3154 if (single_defuse_cycle)
3155 reduc_def = gimple_assign_lhs (new_stmt);
3156 else
3157 reduc_def = PHI_RESULT (new_phi);
3159 STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
3162 /* Arguments are ready. create the new vector stmt. */
3163 if (op_type == binary_op)
3164 expr = build2 (code, vectype, loop_vec_def0, reduc_def);
3165 else
3166 expr = build3 (code, vectype, loop_vec_def0, loop_vec_def1,
3167 reduc_def);
3168 new_stmt = gimple_build_assign (vec_dest, expr);
3169 new_temp = make_ssa_name (vec_dest, new_stmt);
3170 gimple_assign_set_lhs (new_stmt, new_temp);
3171 vect_finish_stmt_generation (stmt, new_stmt, gsi);
3173 if (j == 0)
3174 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
3175 else
3176 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3177 prev_stmt_info = vinfo_for_stmt (new_stmt);
3178 prev_phi_info = vinfo_for_stmt (new_phi);
3181 /* Finalize the reduction-phi (set its arguments) and create the
3182 epilog reduction code. */
3183 if (!single_defuse_cycle)
3184 new_temp = gimple_assign_lhs (*vec_stmt);
3185 vect_create_epilog_for_reduction (new_temp, stmt, epilog_copies,
3186 epilog_reduc_code, first_phi);
3188 return true;
3189 }
3190 /* Checks if CALL can be vectorized in type VECTYPE. Returns
3191 a function declaration if the target has a vectorized version
3192 of the function, or NULL_TREE if the function cannot be vectorized. */
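/* Editorial example: for 'x = sqrt (y)' in a loop vectorized with V2DF,
   this asks the target hook whether a vector builtin exists for
   BUILT_IN_SQRT with result/argument type V2DF, and returns its decl (or
   NULL_TREE); vectorizable_call below then emits a call to that decl.  */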
3194 tree
3195 vectorizable_function (gimple call, tree vectype_out, tree vectype_in)
3196 {
3197 tree fndecl = gimple_call_fndecl (call);
3198 enum built_in_function code;
3200 /* We only handle functions that do not read or clobber memory -- i.e.
3201 const or novops ones. */
3202 if (!(gimple_call_flags (call) & (ECF_CONST | ECF_NOVOPS)))
3203 return NULL_TREE;
3205 if (!fndecl
3206 || TREE_CODE (fndecl) != FUNCTION_DECL
3207 || !DECL_BUILT_IN (fndecl))
3208 return NULL_TREE;
3210 code = DECL_FUNCTION_CODE (fndecl);
3211 return targetm.vectorize.builtin_vectorized_function (code, vectype_out,
3212 vectype_in);
3213 }
3215 /* Function vectorizable_call.
3217 Check if STMT performs a function call that can be vectorized.
3218 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
3219 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
3220 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
3222 bool
3223 vectorizable_call (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt)
3224 {
3225 tree vec_dest;
3226 tree scalar_dest;
3227 tree op, type;
3228 tree vec_oprnd0 = NULL_TREE, vec_oprnd1 = NULL_TREE;
3229 stmt_vec_info stmt_info = vinfo_for_stmt (stmt), prev_stmt_info;
3230 tree vectype_out, vectype_in;
3231 int nunits_in;
3232 int nunits_out;
3233 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3234 tree fndecl, new_temp, def, rhs_type, lhs_type;
3235 gimple def_stmt;
3236 enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
3237 gimple new_stmt;
3238 int ncopies, j;
3239 VEC(tree, heap) *vargs = NULL;
3240 enum { NARROW, NONE, WIDEN } modifier;
3241 size_t i, nargs;
3243 if (!STMT_VINFO_RELEVANT_P (stmt_info))
3244 return false;
3246 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
3247 return false;
3249 /* FORNOW: SLP not supported. */
3250 if (STMT_SLP_TYPE (stmt_info))
3251 return false;
3253 /* Is STMT a vectorizable call? */
3254 if (!is_gimple_call (stmt))
3255 return false;
3257 if (TREE_CODE (gimple_call_lhs (stmt)) != SSA_NAME)
3258 return false;
3260 /* Process function arguments. */
3261 rhs_type = NULL_TREE;
3262 nargs = gimple_call_num_args (stmt);
3264 /* Bail out if the function has more than two arguments; we do not have
3265 interesting builtin functions to vectorize with more than two
3266 arguments, and no arguments is also not good. */
3267 if (nargs == 0 || nargs > 2)
3268 return false;
3270 for (i = 0; i < nargs; i++)
3272 op = gimple_call_arg (stmt, i);
3274 /* We can only handle calls with arguments of the same type. */
3275 if (rhs_type
3276 && rhs_type != TREE_TYPE (op))
3278 if (vect_print_dump_info (REPORT_DETAILS))
3279 fprintf (vect_dump, "argument types differ.");
3280 return false;
3281 }
3282 rhs_type = TREE_TYPE (op);
3284 if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt[i]))
3286 if (vect_print_dump_info (REPORT_DETAILS))
3287 fprintf (vect_dump, "use not simple.");
3288 return false;
3289 }
3290 }
3292 vectype_in = get_vectype_for_scalar_type (rhs_type);
3293 if (!vectype_in)
3294 return false;
3295 nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
3297 lhs_type = TREE_TYPE (gimple_call_lhs (stmt));
3298 vectype_out = get_vectype_for_scalar_type (lhs_type);
3299 if (!vectype_out)
3300 return false;
3301 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
3304 if (nunits_in == nunits_out / 2)
3305 modifier = NARROW;
3306 else if (nunits_out == nunits_in)
3307 modifier = NONE;
3308 else if (nunits_out == nunits_in / 2)
3309 modifier = WIDEN;
3310 else
3311 return false;
3313 /* For now, we only vectorize functions if a target specific builtin
3314 is available. TODO -- in some cases, it might be profitable to
3315 insert the calls for pieces of the vector, in order to be able
3316 to vectorize other operations in the loop. */
3317 fndecl = vectorizable_function (stmt, vectype_out, vectype_in);
3318 if (fndecl == NULL_TREE)
3320 if (vect_print_dump_info (REPORT_DETAILS))
3321 fprintf (vect_dump, "function is not vectorizable.");
3322 return false;
3323 }
3326 gcc_assert (ZERO_SSA_OPERANDS (stmt, SSA_OP_ALL_VIRTUALS));
3328 if (modifier == NARROW)
3329 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_out;
3331 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in;
3333 /* Sanity check: make sure that at least one copy of the vectorized stmt
3334 needs to be generated. */
3335 gcc_assert (ncopies >= 1);
3337 if (!vec_stmt) /* transformation not required. */
3339 STMT_VINFO_TYPE (stmt_info) = call_vec_info_type;
3340 if (vect_print_dump_info (REPORT_DETAILS))
3341 fprintf (vect_dump, "=== vectorizable_call ===");
3342 vect_model_simple_cost (stmt_info, ncopies, dt, NULL);
3343 return true;
3344 }
3346 /** Transform. **/
3348 if (vect_print_dump_info (REPORT_DETAILS))
3349 fprintf (vect_dump, "transform operation.");
3352 scalar_dest = gimple_call_lhs (stmt);
3353 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
3355 prev_stmt_info = NULL;
3356 switch (modifier)
3357 {
3358 case NONE:
3359 for (j = 0; j < ncopies; ++j)
3361 /* Build argument list for the vectorized call. */
3362 if (j == 0)
3363 vargs = VEC_alloc (tree, heap, nargs);
3364 else
3365 VEC_truncate (tree, vargs, 0);
3367 for (i = 0; i < nargs; i++)
3369 op = gimple_call_arg (stmt, i);
3370 if (j == 0)
3371 vec_oprnd0
3372 = vect_get_vec_def_for_operand (op, stmt, NULL);
3373 else
3374 vec_oprnd0
3375 = vect_get_vec_def_for_stmt_copy (dt[nargs], vec_oprnd0);
3377 VEC_quick_push (tree, vargs, vec_oprnd0);
3380 new_stmt = gimple_build_call_vec (fndecl, vargs);
3381 new_temp = make_ssa_name (vec_dest, new_stmt);
3382 gimple_call_set_lhs (new_stmt, new_temp);
3384 vect_finish_stmt_generation (stmt, new_stmt, gsi);
3386 if (j == 0)
3387 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
3388 else
3389 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3391 prev_stmt_info = vinfo_for_stmt (new_stmt);
3392 }
3394 break;
3396 case NARROW:
3397 for (j = 0; j < ncopies; ++j)
3399 /* Build argument list for the vectorized call. */
3400 if (j == 0)
3401 vargs = VEC_alloc (tree, heap, nargs * 2);
3402 else
3403 VEC_truncate (tree, vargs, 0);
3405 for (i = 0; i < nargs; i++)
3407 op = gimple_call_arg (stmt, i);
3408 if (j == 0)
3409 {
3410 vec_oprnd0
3411 = vect_get_vec_def_for_operand (op, stmt, NULL);
3412 vec_oprnd1
3413 = vect_get_vec_def_for_stmt_copy (dt[nargs], vec_oprnd0);
3414 }
3415 else
3416 {
3417 vec_oprnd0
3418 = vect_get_vec_def_for_stmt_copy (dt[nargs], vec_oprnd1);
3419 vec_oprnd1
3420 = vect_get_vec_def_for_stmt_copy (dt[nargs], vec_oprnd0);
3421 }
3423 VEC_quick_push (tree, vargs, vec_oprnd0);
3424 VEC_quick_push (tree, vargs, vec_oprnd1);
3427 new_stmt = gimple_build_call_vec (fndecl, vargs);
3428 new_temp = make_ssa_name (vec_dest, new_stmt);
3429 gimple_call_set_lhs (new_stmt, new_temp);
3431 vect_finish_stmt_generation (stmt, new_stmt, gsi);
3433 if (j == 0)
3434 STMT_VINFO_VEC_STMT (stmt_info) = new_stmt;
3435 else
3436 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3438 prev_stmt_info = vinfo_for_stmt (new_stmt);
3441 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
3442 break;
3444 case WIDEN:
3446 /* No current target implements this case. */
3447 return false;
3448 }
3450 VEC_free (tree, heap, vargs);
3452 /* The call in STMT might prevent it from being removed in dce.
3453 We however cannot remove it here, due to the way the ssa name
3454 it defines is mapped to the new definition. So just replace
3455 rhs of the statement with something harmless. */
3457 type = TREE_TYPE (scalar_dest);
3458 new_stmt = gimple_build_assign (gimple_call_lhs (stmt),
3459 fold_convert (type, integer_zero_node));
3460 set_vinfo_for_stmt (new_stmt, stmt_info);
3461 set_vinfo_for_stmt (stmt, NULL);
3462 STMT_VINFO_STMT (stmt_info) = new_stmt;
3463 gsi_replace (gsi, new_stmt, false);
3464 SSA_NAME_DEF_STMT (gimple_assign_lhs (new_stmt)) = new_stmt;
3466 return true;
3467 }
3470 /* Function vect_gen_widened_results_half
3472 Create a vector stmt whose code, type, number of arguments, and result
3473 variable are CODE, OP_TYPE, and VEC_DEST, and its arguments are
3474 VEC_OPRND0 and VEC_OPRND1. The new vector stmt is to be inserted at BSI.
3475 In the case that CODE is a CALL_EXPR, this means that a call to DECL
3476 needs to be created (DECL is a function-decl of a target-builtin).
3477 STMT is the original scalar stmt that we are vectorizing. */
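/* Editorial note: a widening conversion of one input vector produces two
   output vectors (e.g. V8HI -> 2 x V4SI), so the WIDEN case in
   vectorizable_conversion calls this helper twice - with CODE1/DECL1 for
   the first half and CODE2/DECL2 for the second half - and chains the two
   results via STMT_VINFO_RELATED_STMT.  */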
3479 static gimple
3480 vect_gen_widened_results_half (enum tree_code code,
3481 tree decl,
3482 tree vec_oprnd0, tree vec_oprnd1, int op_type,
3483 tree vec_dest, gimple_stmt_iterator *gsi,
3484 gimple stmt)
3485 {
3486 gimple new_stmt;
3487 tree new_temp;
3488 tree sym;
3489 ssa_op_iter iter;
3491 /* Generate half of the widened result: */
3492 if (code == CALL_EXPR)
3494 /* Target specific support */
3495 if (op_type == binary_op)
3496 new_stmt = gimple_build_call (decl, 2, vec_oprnd0, vec_oprnd1);
3498 new_stmt = gimple_build_call (decl, 1, vec_oprnd0);
3499 new_temp = make_ssa_name (vec_dest, new_stmt);
3500 gimple_call_set_lhs (new_stmt, new_temp);
3504 /* Generic support */
3505 gcc_assert (op_type == TREE_CODE_LENGTH (code));
3506 if (op_type != binary_op)
3507 vec_oprnd1 = NULL;
3508 new_stmt = gimple_build_assign_with_ops (code, vec_dest, vec_oprnd0,
3509 vec_oprnd1);
3510 new_temp = make_ssa_name (vec_dest, new_stmt);
3511 gimple_assign_set_lhs (new_stmt, new_temp);
3513 vect_finish_stmt_generation (stmt, new_stmt, gsi);
3515 if (code == CALL_EXPR)
3517 FOR_EACH_SSA_TREE_OPERAND (sym, new_stmt, iter, SSA_OP_ALL_VIRTUALS)
3519 if (TREE_CODE (sym) == SSA_NAME)
3520 sym = SSA_NAME_VAR (sym);
3521 mark_sym_for_renaming (sym);
3522 }
3523 }
3525 return new_stmt;
3526 }
3528 /* Function vectorizable_conversion.
3529 Check if STMT performs a conversion operation that can be vectorized.
3530 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
3531 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
3532 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
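/* Editorial example: 'float_x = (float) int_y' on V4SI/V4SF is the
   modifier == NONE case below and is emitted through the target's
   builtin_conversion hook; converting int to double (V4SI -> two V2DF
   vectors) is modifier == WIDEN, and the reverse narrowing direction is
   modifier == NARROW.  */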
3534 static bool
3535 vectorizable_conversion (gimple stmt, gimple_stmt_iterator *gsi,
3536 gimple *vec_stmt, slp_tree slp_node)
3537 {
3538 tree vec_dest;
3539 tree scalar_dest;
3540 tree op0;
3541 tree vec_oprnd0 = NULL_TREE, vec_oprnd1 = NULL_TREE;
3542 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3543 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3544 enum tree_code code, code1 = ERROR_MARK, code2 = ERROR_MARK;
3545 tree decl1 = NULL_TREE, decl2 = NULL_TREE;
3546 tree new_temp;
3547 tree def;
3548 gimple def_stmt;
3549 enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
3550 gimple new_stmt = NULL;
3551 stmt_vec_info prev_stmt_info;
3552 int nunits_in;
3553 int nunits_out;
3554 tree vectype_out, vectype_in;
3555 int ncopies, j;
3556 tree expr;
3557 tree rhs_type, lhs_type;
3558 tree builtin_decl;
3559 enum { NARROW, NONE, WIDEN } modifier;
3560 int i;
3561 VEC(tree,heap) *vec_oprnds0 = NULL;
3562 tree vop0;
3563 tree integral_type;
3564 VEC(tree,heap) *dummy = NULL;
3565 int dummy_int;
3567 /* Is STMT a vectorizable conversion? */
3569 if (!STMT_VINFO_RELEVANT_P (stmt_info))
3570 return false;
3572 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
3573 return false;
3575 if (!is_gimple_assign (stmt))
3576 return false;
3578 if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
3579 return false;
3581 code = gimple_assign_rhs_code (stmt);
3582 if (code != FIX_TRUNC_EXPR && code != FLOAT_EXPR)
3583 return false;
3585 /* Check types of lhs and rhs. */
3586 op0 = gimple_assign_rhs1 (stmt);
3587 rhs_type = TREE_TYPE (op0);
3588 vectype_in = get_vectype_for_scalar_type (rhs_type);
3589 if (!vectype_in)
3590 return false;
3591 nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
3593 scalar_dest = gimple_assign_lhs (stmt);
3594 lhs_type = TREE_TYPE (scalar_dest);
3595 vectype_out = get_vectype_for_scalar_type (lhs_type);
3596 if (!vectype_out)
3597 return false;
3598 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
3601 if (nunits_in == nunits_out / 2)
3602 modifier = NARROW;
3603 else if (nunits_out == nunits_in)
3604 modifier = NONE;
3605 else if (nunits_out == nunits_in / 2)
3606 modifier = WIDEN;
3607 else
3608 return false;
3610 if (modifier == NONE)
3611 gcc_assert (STMT_VINFO_VECTYPE (stmt_info) == vectype_out);
3613 /* Bail out if the types are both integral or non-integral. */
3614 if ((INTEGRAL_TYPE_P (rhs_type) && INTEGRAL_TYPE_P (lhs_type))
3615 || (!INTEGRAL_TYPE_P (rhs_type) && !INTEGRAL_TYPE_P (lhs_type)))
3616 return false;
3618 integral_type = INTEGRAL_TYPE_P (rhs_type) ? vectype_in : vectype_out;
3620 if (modifier == NARROW)
3621 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_out;
3623 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in;
3625 /* FORNOW: SLP with multiple types is not supported. The SLP analysis verifies
3626 this, so we can safely override NCOPIES with 1 here. */
3627 if (slp_node)
3628 ncopies = 1;
3630 /* Sanity check: make sure that at least one copy of the vectorized stmt
3631 needs to be generated. */
3632 gcc_assert (ncopies >= 1);
3634 /* Check the operands of the operation. */
3635 if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt[0]))
3637 if (vect_print_dump_info (REPORT_DETAILS))
3638 fprintf (vect_dump, "use not simple.");
3639 return false;
3640 }
3642 /* Supportable by target? */
3643 if ((modifier == NONE
3644 && !targetm.vectorize.builtin_conversion (code, integral_type))
3645 || (modifier == WIDEN
3646 && !supportable_widening_operation (code, stmt, vectype_in,
3647 &decl1, &decl2,
3648 &code1, &code2,
3649 &dummy_int, &dummy))
3650 || (modifier == NARROW
3651 && !supportable_narrowing_operation (code, stmt, vectype_in,
3652 &code1, &dummy_int, &dummy)))
3654 if (vect_print_dump_info (REPORT_DETAILS))
3655 fprintf (vect_dump, "conversion not supported by target.");
3656 return false;
3657 }
3659 if (modifier != NONE)
3661 STMT_VINFO_VECTYPE (stmt_info) = vectype_in;
3662 /* FORNOW: SLP not supported. */
3663 if (STMT_SLP_TYPE (stmt_info))
3664 return false;
3667 if (!vec_stmt) /* transformation not required. */
3668 {
3669 STMT_VINFO_TYPE (stmt_info) = type_conversion_vec_info_type;
3670 return true;
3671 }
3673 /** Transform. **/
3674 if (vect_print_dump_info (REPORT_DETAILS))
3675 fprintf (vect_dump, "transform conversion.");
3678 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
3680 if (modifier == NONE && !slp_node)
3681 vec_oprnds0 = VEC_alloc (tree, heap, 1);
3683 prev_stmt_info = NULL;
3684 switch (modifier)
3685 {
3686 case NONE:
3687 for (j = 0; j < ncopies; j++)
3688 {
3690 /* Handle uses. */
3691 if (j == 0)
3693 vect_get_vec_defs (op0, NULL, stmt, &vec_oprnds0, NULL, slp_node);
3694 else
3695 vect_get_vec_defs_for_stmt_copy (dt, &vec_oprnds0, NULL);
3697 builtin_decl =
3698 targetm.vectorize.builtin_conversion (code, integral_type);
3699 for (i = 0; VEC_iterate (tree, vec_oprnds0, i, vop0); i++)
3701 /* Arguments are ready. create the new vector stmt. */
3702 new_stmt = gimple_build_call (builtin_decl, 1, vop0);
3703 new_temp = make_ssa_name (vec_dest, new_stmt);
3704 gimple_call_set_lhs (new_stmt, new_temp);
3705 vect_finish_stmt_generation (stmt, new_stmt, gsi);
3706 FOR_EACH_SSA_TREE_OPERAND (sym, new_stmt, iter,
3707 SSA_OP_ALL_VIRTUALS)
3709 if (TREE_CODE (sym) == SSA_NAME)
3710 sym = SSA_NAME_VAR (sym);
3711 mark_sym_for_renaming (sym);
3713 if (slp_node)
3714 VEC_quick_push (gimple, SLP_TREE_VEC_STMTS (slp_node), new_stmt);
3715 }
3717 if (j == 0)
3718 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
3719 else
3720 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3721 prev_stmt_info = vinfo_for_stmt (new_stmt);
3722 }
3724 break;
3725 case WIDEN:
3726 /* In case the vectorization factor (VF) is bigger than the number
3727 of elements that we can fit in a vectype (nunits), we have to
3728 generate more than one vector stmt - i.e - we need to "unroll"
3729 the vector stmt by a factor VF/nunits. */
3730 for (j = 0; j < ncopies; j++)
3731 {
3732 if (j == 0)
3733 vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
3734 else
3735 vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd0);
3737 STMT_VINFO_VECTYPE (stmt_info) = vectype_in;
3739 /* Generate first half of the widened result: */
3740 new_stmt
3741 = vect_gen_widened_results_half (code1, decl1,
3742 vec_oprnd0, vec_oprnd1,
3743 unary_op, vec_dest, gsi, stmt);
3744 if (j == 0)
3745 STMT_VINFO_VEC_STMT (stmt_info) = new_stmt;
3746 else
3747 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3748 prev_stmt_info = vinfo_for_stmt (new_stmt);
3750 /* Generate second half of the widened result: */
3751 new_stmt
3752 = vect_gen_widened_results_half (code2, decl2,
3753 vec_oprnd0, vec_oprnd1,
3754 unary_op, vec_dest, gsi, stmt);
3755 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3756 prev_stmt_info = vinfo_for_stmt (new_stmt);
3757 }
3758 break;
3760 case NARROW:
3761 /* In case the vectorization factor (VF) is bigger than the number
3762 of elements that we can fit in a vectype (nunits), we have to
3763 generate more than one vector stmt - i.e - we need to "unroll"
3764 the vector stmt by a factor VF/nunits. */
3765 for (j = 0; j < ncopies; j++)
3766 {
3767 /* Handle uses. */
3768 if (j == 0)
3769 {
3770 vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
3771 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd0);
3772 }
3773 else
3774 {
3775 vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd1);
3776 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd0);
3777 }
3779 /* Arguments are ready. Create the new vector stmt. */
3780 expr = build2 (code1, vectype_out, vec_oprnd0, vec_oprnd1);
3781 new_stmt = gimple_build_assign_with_ops (code1, vec_dest, vec_oprnd0,
3782 vec_oprnd1);
3783 new_temp = make_ssa_name (vec_dest, new_stmt);
3784 gimple_assign_set_lhs (new_stmt, new_temp);
3785 vect_finish_stmt_generation (stmt, new_stmt, gsi);
3788 STMT_VINFO_VEC_STMT (stmt_info) = new_stmt;
3790 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3792 prev_stmt_info = vinfo_for_stmt (new_stmt);
3795 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
3799 VEC_free (tree, heap, vec_oprnds0);
3805 /* Function vectorizable_assignment.
3807 Check if STMT performs an assignment (copy) that can be vectorized.
3808 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
3809 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
3810 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
3813 vectorizable_assignment (gimple stmt, gimple_stmt_iterator *gsi,
3814 gimple *vec_stmt, slp_tree slp_node)
3819 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3820 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3821 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3825 enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
3826 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
3829 VEC(tree,heap) *vec_oprnds = NULL;
3832 /* Multiple types in SLP are handled by creating the appropriate number of
3833 vectorized stmts for each SLP node. Hence, NCOPIES is always 1 in case of SLP. */
3838 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
3840 gcc_assert (ncopies >= 1);
3842 return false; /* FORNOW */
3844 if (!STMT_VINFO_RELEVANT_P (stmt_info))
3847 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
3850 /* Is vectorizable assignment? */
3851 if (!is_gimple_assign (stmt))
3854 scalar_dest = gimple_assign_lhs (stmt);
3855 if (TREE_CODE (scalar_dest) != SSA_NAME)
3858 if (gimple_assign_single_p (stmt)
3859 || gimple_assign_rhs_code (stmt) == PAREN_EXPR)
3860 op = gimple_assign_rhs1 (stmt);
3864 if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt[0]))
3866 if (vect_print_dump_info (REPORT_DETAILS))
3867 fprintf (vect_dump, "use not simple.");
3871 if (!vec_stmt) /* transformation not required. */
3873 STMT_VINFO_TYPE (stmt_info) = assignment_vec_info_type;
3874 if (vect_print_dump_info (REPORT_DETAILS))
3875 fprintf (vect_dump, "=== vectorizable_assignment ===");
3876 vect_model_simple_cost (stmt_info, ncopies, dt, NULL);
3881 if (vect_print_dump_info (REPORT_DETAILS))
3882 fprintf (vect_dump, "transform assignment.");
3885 vec_dest = vect_create_destination_var (scalar_dest, vectype);
3888 vect_get_vec_defs (op, NULL, stmt, &vec_oprnds, NULL, slp_node);
3890 /* Arguments are ready. Create the new vector stmt. */
3891 for (i = 0; VEC_iterate (tree, vec_oprnds, i, vop); i++)
3893 *vec_stmt = gimple_build_assign (vec_dest, vop);
3894 new_temp = make_ssa_name (vec_dest, *vec_stmt);
3895 gimple_assign_set_lhs (*vec_stmt, new_temp);
3896 vect_finish_stmt_generation (stmt, *vec_stmt, gsi);
3897 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt;
3900 VEC_quick_push (gimple, SLP_TREE_VEC_STMTS (slp_node), *vec_stmt);
3903 VEC_free (tree, heap, vec_oprnds);
3908 /* Function vect_min_worthwhile_factor.
3910 For a loop where we could vectorize the operation indicated by CODE,
3911 return the minimum vectorization factor that makes it worthwhile
3912 to use generic vectors. */
3914 vect_min_worthwhile_factor (enum tree_code code)
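/* A minimal sketch of such a threshold function (illustrative thresholds,
   not necessarily the ones used by this implementation): cheap bitwise
   operations pay off already at a small factor, additive arithmetic needs
   a larger one, and unrecognized codes are never considered worthwhile:

     switch (code)
       {
       case PLUS_EXPR:
       case MINUS_EXPR:
       case NEGATE_EXPR:
         return 4;
       case BIT_AND_EXPR:
       case BIT_IOR_EXPR:
       case BIT_XOR_EXPR:
       case BIT_NOT_EXPR:
         return 2;
       default:
         return INT_MAX;
       }  */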
3935 /* Function vectorizable_induction
3937 Check if PHI performs an induction computation that can be vectorized.
3938 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
3939 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
3940 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
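/* For illustration (an assumed example, not taken from this file): for an
   induction variable j with initial value j0 and step 3, vectorized with
   VF = 4 on 4-element vectors, the initial def built by
   get_initial_def_for_induction is the vector {j0, j0+3, j0+6, j0+9},
   and the vector induction advances by the step vector {12, 12, 12, 12},
   i.e. VF * step, in each iteration of the vectorized loop.  */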
3943 vectorizable_induction (gimple phi, gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
3946 stmt_vec_info stmt_info = vinfo_for_stmt (phi);
3947 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3948 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3949 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3950 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
3951 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
3954 gcc_assert (ncopies >= 1);
3955 /* FORNOW. This restriction should be relaxed. */
3956 if (nested_in_vect_loop_p (loop, phi) && ncopies > 1)
3958 if (vect_print_dump_info (REPORT_DETAILS))
3959 fprintf (vect_dump, "multiple types in nested loop.");
3963 if (!STMT_VINFO_RELEVANT_P (stmt_info))
3966 /* FORNOW: SLP not supported. */
3967 if (STMT_SLP_TYPE (stmt_info))
3970 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def);
3972 if (gimple_code (phi) != GIMPLE_PHI)
3975 if (!vec_stmt) /* transformation not required. */
3977 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
3978 if (vect_print_dump_info (REPORT_DETAILS))
3979 fprintf (vect_dump, "=== vectorizable_induction ===");
3980 vect_model_induction_cost (stmt_info, ncopies);
3986 if (vect_print_dump_info (REPORT_DETAILS))
3987 fprintf (vect_dump, "transform induction phi.");
3989 vec_def = get_initial_def_for_induction (phi);
3990 *vec_stmt = SSA_NAME_DEF_STMT (vec_def);
3995 /* Function vectorizable_operation.
3997 Check if STMT performs a binary or unary operation that can be vectorized.
3998 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
3999 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
4000 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
4003 vectorizable_operation (gimple stmt, gimple_stmt_iterator *gsi,
4004 gimple *vec_stmt, slp_tree slp_node)
4008 tree op0, op1 = NULL;
4009 tree vec_oprnd1 = NULL_TREE;
4010 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4011 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4012 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4013 enum tree_code code;
4014 enum machine_mode vec_mode;
4019 enum machine_mode optab_op2_mode;
4022 enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
4023 gimple new_stmt = NULL;
4024 stmt_vec_info prev_stmt_info;
4025 int nunits_in = TYPE_VECTOR_SUBPARTS (vectype);
4030 VEC(tree,heap) *vec_oprnds0 = NULL, *vec_oprnds1 = NULL;
4033 bool shift_p = false;
4034 bool scalar_shift_arg = false;
4036 /* Multiple types in SLP are handled by creating the appropriate number of
4037 vectorized stmts for each SLP node. Hence, NCOPIES is always 1 in case of SLP. */
4042 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in;
4044 gcc_assert (ncopies >= 1);
4046 if (!STMT_VINFO_RELEVANT_P (stmt_info))
4049 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
4052 /* Is STMT a vectorizable binary/unary operation? */
4053 if (!is_gimple_assign (stmt))
4056 if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
4059 scalar_dest = gimple_assign_lhs (stmt);
4060 vectype_out = get_vectype_for_scalar_type (TREE_TYPE (scalar_dest));
4063 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
4064 if (nunits_out != nunits_in)
4067 code = gimple_assign_rhs_code (stmt);
4069 /* For pointer addition, we should use the normal plus for
4070 the vector addition. */
4071 if (code == POINTER_PLUS_EXPR)
4074 /* Support only unary or binary operations. */
4075 op_type = TREE_CODE_LENGTH (code);
4076 if (op_type != unary_op && op_type != binary_op)
4078 if (vect_print_dump_info (REPORT_DETAILS))
4079 fprintf (vect_dump, "num. args = %d (not unary/binary op).", op_type);
4083 op0 = gimple_assign_rhs1 (stmt);
4084 if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt[0]))
4086 if (vect_print_dump_info (REPORT_DETAILS))
4087 fprintf (vect_dump, "use not simple.");
4091 if (op_type == binary_op)
4093 op1 = gimple_assign_rhs2 (stmt);
4094 if (!vect_is_simple_use (op1, loop_vinfo, &def_stmt, &def, &dt[1]))
4096 if (vect_print_dump_info (REPORT_DETAILS))
4097 fprintf (vect_dump, "use not simple.");
4102 /* If this is a shift/rotate, determine whether the shift amount is a vector,
4103 or scalar. If the shift/rotate amount is a vector, use the vector/vector shift optabs. */
4105 if (code == LSHIFT_EXPR || code == RSHIFT_EXPR || code == LROTATE_EXPR
4106 || code == RROTATE_EXPR)
4110 /* vector shifted by vector */
4111 if (dt[1] == vect_loop_def)
4113 optab = optab_for_tree_code (code, vectype, optab_vector);
4114 if (vect_print_dump_info (REPORT_DETAILS))
4115 fprintf (vect_dump, "vector/vector shift/rotate found.");
4118 /* See if the machine has a vector shifted by scalar insn, and if not,
4119 then see if it has a vector shifted by vector insn. */
4120 else if (dt[1] == vect_constant_def || dt[1] == vect_invariant_def)
4122 optab = optab_for_tree_code (code, vectype, optab_scalar);
4124 && (optab_handler (optab, TYPE_MODE (vectype))->insn_code
4125 != CODE_FOR_nothing))
4127 scalar_shift_arg = true;
4128 if (vect_print_dump_info (REPORT_DETAILS))
4129 fprintf (vect_dump, "vector/scalar shift/rotate found.");
4133 optab = optab_for_tree_code (code, vectype, optab_vector);
4134 if (vect_print_dump_info (REPORT_DETAILS)
4136 && (optab_handler (optab, TYPE_MODE (vectype))->insn_code
4137 != CODE_FOR_nothing))
4138 fprintf (vect_dump, "vector/vector shift/rotate found.");
4144 if (vect_print_dump_info (REPORT_DETAILS))
4145 fprintf (vect_dump, "operand mode requires invariant argument.");
4150 optab = optab_for_tree_code (code, vectype, optab_default);
4152 /* Supportable by target? */
4155 if (vect_print_dump_info (REPORT_DETAILS))
4156 fprintf (vect_dump, "no optab.");
4159 vec_mode = TYPE_MODE (vectype);
4160 icode = (int) optab_handler (optab, vec_mode)->insn_code;
4161 if (icode == CODE_FOR_nothing)
4163 if (vect_print_dump_info (REPORT_DETAILS))
4164 fprintf (vect_dump, "op not supported by target.");
4165 /* Check only during analysis. */
4166 if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
4167 || (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4168 < vect_min_worthwhile_factor (code)
4171 if (vect_print_dump_info (REPORT_DETAILS))
4172 fprintf (vect_dump, "proceeding using word mode.");
4175 /* Worthwhile without SIMD support? Check only during analysis. */
4176 if (!VECTOR_MODE_P (TYPE_MODE (vectype))
4177 && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4178 < vect_min_worthwhile_factor (code)
4181 if (vect_print_dump_info (REPORT_DETAILS))
4182 fprintf (vect_dump, "not worthwhile without SIMD support.");
4186 if (!vec_stmt) /* transformation not required. */
4188 STMT_VINFO_TYPE (stmt_info) = op_vec_info_type;
4189 if (vect_print_dump_info (REPORT_DETAILS))
4190 fprintf (vect_dump, "=== vectorizable_operation ===");
4191 vect_model_simple_cost (stmt_info, ncopies, dt, NULL);
4197 if (vect_print_dump_info (REPORT_DETAILS))
4198 fprintf (vect_dump, "transform binary/unary operation.");
4201 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4203 /* Allocate VECs for vector operands. In case of SLP, vector operands are
4204 created in the previous stages of the recursion, so no allocation is
4205 needed, except for the case of shift with scalar shift argument. In that
4206 case we store the scalar operand in VEC_OPRNDS1 for every vector stmt to
4207 be created to vectorize the SLP group, i.e., SLP_NODE->VEC_STMTS_SIZE.
4208 In case of loop-based vectorization we allocate VECs of size 1. We
4209 allocate VEC_OPRNDS1 only in case of binary operation. */
4212 vec_oprnds0 = VEC_alloc (tree, heap, 1);
4213 if (op_type == binary_op)
4214 vec_oprnds1 = VEC_alloc (tree, heap, 1);
4216 else if (scalar_shift_arg)
4217 vec_oprnds1 = VEC_alloc (tree, heap, slp_node->vec_stmts_size);
4219 /* In case the vectorization factor (VF) is bigger than the number
4220 of elements that we can fit in a vectype (nunits), we have to generate
4221 more than one vector stmt - i.e. - we need to "unroll" the
4222 vector stmt by a factor VF/nunits. In doing so, we record a pointer
4223 from one copy of the vector stmt to the next, in the field
4224 STMT_VINFO_RELATED_STMT. This is necessary in order to allow following
4225 stages to find the correct vector defs to be used when vectorizing
4226 stmts that use the defs of the current stmt. The example below illustrates
4227 the vectorization process when VF=16 and nunits=4 (i.e - we need to create
4228 4 vectorized stmts):
4230 before vectorization:
4231 RELATED_STMT VEC_STMT
        S1: x = load - -
        S2: z = x + 1 - -
4235 step 1: vectorize stmt S1 (done in vectorizable_load. See more details there):
4237 RELATED_STMT VEC_STMT
4238 VS1_0: vx0 = memref0 VS1_1 -
4239 VS1_1: vx1 = memref1 VS1_2 -
4240 VS1_2: vx2 = memref2 VS1_3 -
4241 VS1_3: vx3 = memref3 - -
4242 S1: x = load - VS1_0
4245 step 2: vectorize stmt S2 (done here):
4246 To vectorize stmt S2 we first need to find the relevant vector
4247 def for the first operand 'x'. This is, as usual, obtained from
4248 the vector stmt recorded in the STMT_VINFO_VEC_STMT of the stmt
4249 that defines 'x' (S1). This way we find the stmt VS1_0, and the
4250 relevant vector def 'vx0'. Having found 'vx0' we can generate
4251 the vector stmt VS2_0, and as usual, record it in the
4252 STMT_VINFO_VEC_STMT of stmt S2.
4253 When creating the second copy (VS2_1), we obtain the relevant vector
4254 def from the vector stmt recorded in the STMT_VINFO_RELATED_STMT of
4255 stmt VS1_0. This way we find the stmt VS1_1 and the relevant
4256 vector def 'vx1'. Using 'vx1' we create stmt VS2_1 and record a
4257 pointer to it in the STMT_VINFO_RELATED_STMT of the vector stmt VS2_0.
4258 Similarly when creating stmts VS2_2 and VS2_3. This is the resulting
4259 chain of stmts and pointers:
4260 RELATED_STMT VEC_STMT
4261 VS1_0: vx0 = memref0 VS1_1 -
4262 VS1_1: vx1 = memref1 VS1_2 -
4263 VS1_2: vx2 = memref2 VS1_3 -
4264 VS1_3: vx3 = memref3 - -
4265 S1: x = load - VS1_0
4266 VS2_0: vz0 = vx0 + v1 VS2_1 -
4267 VS2_1: vz1 = vx1 + v1 VS2_2 -
4268 VS2_2: vz2 = vx2 + v1 VS2_3 -
4269 VS2_3: vz3 = vx3 + v1 - -
4270 S2: z = x + 1 - VS2_0 */
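   /* As a source-level illustration (assumed example), the chain above is
      what results from vectorizing

          for (i = 0; i < N; i++)
            z[i] = x[i] + 1;

      with VF = 16 on 4-element vectors: four copies of the vector load
      (VS1_0..VS1_3) and four copies of the vector add (VS2_0..VS2_3) are
      emitted, linked through their RELATED_STMT fields.  */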
4272 prev_stmt_info = NULL;
4273 for (j = 0; j < ncopies; j++)
4278 if (op_type == binary_op && scalar_shift_arg)
4280 /* Vector shl and shr insn patterns can be defined with scalar
4281 operand 2 (shift operand). In this case, use constant or loop
4282 invariant op1 directly, without extending it to vector mode first. */
4284 optab_op2_mode = insn_data[icode].operand[2].mode;
4285 if (!VECTOR_MODE_P (optab_op2_mode))
4287 if (vect_print_dump_info (REPORT_DETAILS))
4288 fprintf (vect_dump, "operand 1 using scalar mode.");
4290 VEC_quick_push (tree, vec_oprnds1, vec_oprnd1);
4293 /* Store vec_oprnd1 for every vector stmt to be created
4294 for SLP_NODE. We check during the analysis that all the
4295 shift arguments are the same.
4296 TODO: Allow different constants for different vector
4297 stmts generated for an SLP instance. */
4298 for (k = 0; k < slp_node->vec_stmts_size - 1; k++)
4299 VEC_quick_push (tree, vec_oprnds1, vec_oprnd1);
4304 /* vec_oprnd1 is available if operand 1 should be of a scalar-type
4305 (a special case for certain kinds of vector shifts); otherwise,
4306 operand 1 should be of a vector type (the usual case). */
4307 if (op_type == binary_op && !vec_oprnd1)
4308 vect_get_vec_defs (op0, op1, stmt, &vec_oprnds0, &vec_oprnds1,
4311 vect_get_vec_defs (op0, NULL_TREE, stmt, &vec_oprnds0, NULL,
4315 vect_get_vec_defs_for_stmt_copy (dt, &vec_oprnds0, &vec_oprnds1);
4317 /* Arguments are ready. Create the new vector stmt. */
4318 for (i = 0; VEC_iterate (tree, vec_oprnds0, i, vop0); i++)
4320 vop1 = ((op_type == binary_op)
4321 ? VEC_index (tree, vec_oprnds1, i) : NULL);
4322 new_stmt = gimple_build_assign_with_ops (code, vec_dest, vop0, vop1);
4323 new_temp = make_ssa_name (vec_dest, new_stmt);
4324 gimple_assign_set_lhs (new_stmt, new_temp);
4325 vect_finish_stmt_generation (stmt, new_stmt, gsi);
4327 VEC_quick_push (gimple, SLP_TREE_VEC_STMTS (slp_node), new_stmt);
4334 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
4336 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
4337 prev_stmt_info = vinfo_for_stmt (new_stmt);
4340 VEC_free (tree, heap, vec_oprnds0);
4342 VEC_free (tree, heap, vec_oprnds1);
4348 /* Get vectorized definitions for loop-based vectorization. For the first
4349 operand we call vect_get_vec_def_for_operand() (with OPRND containing
4350 scalar operand), and for the rest we get a copy with
4351 vect_get_vec_def_for_stmt_copy() using the previous vector definition
4352 (stored in OPRND). See vect_get_vec_def_for_stmt_copy() for details.
4353 The vectors are collected into VEC_OPRNDS. */
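/* E.g. (illustrative): for a two-step conversion (multi_step_cvt == 1 in
   the caller below), the caller passes vect_pow2 (1) - 1 == 1 here, so two
   recursion levels each push two defs, collecting four vector defs in
   VEC_OPRNDS.  */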
4356 vect_get_loop_based_defs (tree *oprnd, gimple stmt, enum vect_def_type dt,
4357 VEC (tree, heap) **vec_oprnds, int multi_step_cvt)
4361 /* Get first vector operand. */
4362 /* All the vector operands except the very first one (that is scalar oprnd)
        are stmt copies. */
4364 if (TREE_CODE (TREE_TYPE (*oprnd)) != VECTOR_TYPE)
4365 vec_oprnd = vect_get_vec_def_for_operand (*oprnd, stmt, NULL);
4367 vec_oprnd = vect_get_vec_def_for_stmt_copy (dt, *oprnd);
4369 VEC_quick_push (tree, *vec_oprnds, vec_oprnd);
4371 /* Get second vector operand. */
4372 vec_oprnd = vect_get_vec_def_for_stmt_copy (dt, vec_oprnd);
4373 VEC_quick_push (tree, *vec_oprnds, vec_oprnd);
4377 /* For conversion in multiple steps, continue to get operands recursively. */
4380 vect_get_loop_based_defs (oprnd, stmt, dt, vec_oprnds, multi_step_cvt - 1);
4384 /* Create vectorized demotion statements for vector operands from VEC_OPRNDS.
4385 For multi-step conversions store the resulting vectors and call the function recursively. */
4389 vect_create_vectorized_demotion_stmts (VEC (tree, heap) **vec_oprnds,
4390 int multi_step_cvt, gimple stmt,
4391 VEC (tree, heap) *vec_dsts,
4392 gimple_stmt_iterator *gsi,
4393 slp_tree slp_node, enum tree_code code,
4394 stmt_vec_info *prev_stmt_info)
4397 tree vop0, vop1, new_tmp, vec_dest;
4399 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4401 vec_dest = VEC_pop (tree, vec_dsts);
4403 for (i = 0; i < VEC_length (tree, *vec_oprnds); i += 2)
4405 /* Create demotion operation. */
4406 vop0 = VEC_index (tree, *vec_oprnds, i);
4407 vop1 = VEC_index (tree, *vec_oprnds, i + 1);
4408 new_stmt = gimple_build_assign_with_ops (code, vec_dest, vop0, vop1);
4409 new_tmp = make_ssa_name (vec_dest, new_stmt);
4410 gimple_assign_set_lhs (new_stmt, new_tmp);
4411 vect_finish_stmt_generation (stmt, new_stmt, gsi);
4414 /* Store the resulting vector for the next recursive call. */
4415 VEC_replace (tree, *vec_oprnds, i/2, new_tmp);
4418 /* This is the last step of the conversion sequence. Store the
4419 vectors in SLP_NODE or in vector info of the scalar statement
4420 (or in STMT_VINFO_RELATED_STMT chain). */
4422 VEC_quick_push (gimple, SLP_TREE_VEC_STMTS (slp_node), new_stmt);
4425 if (!*prev_stmt_info)
4426 STMT_VINFO_VEC_STMT (stmt_info) = new_stmt;
4428 STMT_VINFO_RELATED_STMT (*prev_stmt_info) = new_stmt;
4430 *prev_stmt_info = vinfo_for_stmt (new_stmt);
4435 /* For multi-step demotion operations we first generate demotion operations
4436 from the source type to the intermediate types, and then combine the
4437 results (stored in VEC_OPRNDS) in a demotion operation to the destination
        type. */
4441 /* At each level of recursion we have half of the operands we had at the
        previous level. */
4443 VEC_truncate (tree, *vec_oprnds, (i+1)/2);
4444 vect_create_vectorized_demotion_stmts (vec_oprnds, multi_step_cvt - 1,
4445 stmt, vec_dsts, gsi, slp_node,
4446 code, prev_stmt_info);
4451 /* Function vectorizable_type_demotion
4453 Check if STMT performs a binary or unary operation that involves
4454 type demotion, and if it can be vectorized.
4455 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
4456 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
4457 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
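/* A typical candidate (assumed example) is a narrowing conversion such as

     short *a; char *b;
     for (i = 0; i < N; i++)
       b[i] = (char) a[i];

   where NUNITS_OUT is twice NUNITS_IN, so pairs of input vectors are packed
   into single narrower-element vectors using the codes chosen by
   supportable_narrowing_operation.  */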
4460 vectorizable_type_demotion (gimple stmt, gimple_stmt_iterator *gsi,
4461 gimple *vec_stmt, slp_tree slp_node)
4466 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4467 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4468 enum tree_code code, code1 = ERROR_MARK;
4471 enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
4472 stmt_vec_info prev_stmt_info;
4479 int multi_step_cvt = 0;
4480 VEC (tree, heap) *vec_oprnds0 = NULL;
4481 VEC (tree, heap) *vec_dsts = NULL, *interm_types = NULL, *tmp_vec_dsts = NULL;
4482 tree last_oprnd, intermediate_type;
4484 if (!STMT_VINFO_RELEVANT_P (stmt_info))
4487 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
4490 /* Is STMT a vectorizable type-demotion operation? */
4491 if (!is_gimple_assign (stmt))
4494 if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
4497 code = gimple_assign_rhs_code (stmt);
4498 if (!CONVERT_EXPR_CODE_P (code))
4501 op0 = gimple_assign_rhs1 (stmt);
4502 vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op0));
4505 nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
4507 scalar_dest = gimple_assign_lhs (stmt);
4508 vectype_out = get_vectype_for_scalar_type (TREE_TYPE (scalar_dest));
4511 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
4512 if (nunits_in >= nunits_out)
4515 /* Multiple types in SLP are handled by creating the appropriate number of
4516 vectorized stmts for each SLP node. Hence, NCOPIES is always 1 in case of SLP. */
4521 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_out;
4523 gcc_assert (ncopies >= 1);
4525 if (! ((INTEGRAL_TYPE_P (TREE_TYPE (scalar_dest))
4526 && INTEGRAL_TYPE_P (TREE_TYPE (op0)))
4527 || (SCALAR_FLOAT_TYPE_P (TREE_TYPE (scalar_dest))
4528 && SCALAR_FLOAT_TYPE_P (TREE_TYPE (op0))
4529 && CONVERT_EXPR_CODE_P (code))))
4532 /* Check the operands of the operation. */
4533 if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt[0]))
4535 if (vect_print_dump_info (REPORT_DETAILS))
4536 fprintf (vect_dump, "use not simple.");
4540 /* Supportable by target? */
4541 if (!supportable_narrowing_operation (code, stmt, vectype_in, &code1,
4542 &multi_step_cvt, &interm_types))
4545 STMT_VINFO_VECTYPE (stmt_info) = vectype_in;
4547 if (!vec_stmt) /* transformation not required. */
4549 STMT_VINFO_TYPE (stmt_info) = type_demotion_vec_info_type;
4550 if (vect_print_dump_info (REPORT_DETAILS))
4551 fprintf (vect_dump, "=== vectorizable_demotion ===");
4552 vect_model_simple_cost (stmt_info, ncopies, dt, NULL);
4557 if (vect_print_dump_info (REPORT_DETAILS))
4558 fprintf (vect_dump, "transform type demotion operation. ncopies = %d.",
4561 /* In case of multi-step demotion, we first generate demotion operations to
4562 the intermediate types, and then from those types to the final one.
4563 We create vector destinations for the intermediate type (TYPES) received
4564 from supportable_narrowing_operation, and store them in the correct order
4565 for future use in vect_create_vectorized_demotion_stmts(). */
4567 vec_dsts = VEC_alloc (tree, heap, multi_step_cvt + 1);
4569 vec_dsts = VEC_alloc (tree, heap, 1);
4571 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
4572 VEC_quick_push (tree, vec_dsts, vec_dest);
4576 for (i = VEC_length (tree, interm_types) - 1;
4577 VEC_iterate (tree, interm_types, i, intermediate_type); i--)
4579 vec_dest = vect_create_destination_var (scalar_dest,
4581 VEC_quick_push (tree, vec_dsts, vec_dest);
4585 /* In case the vectorization factor (VF) is bigger than the number
4586 of elements that we can fit in a vectype (nunits), we have to generate
4587 more than one vector stmt - i.e. - we need to "unroll" the
4588 vector stmt by a factor VF/nunits. */
4590 prev_stmt_info = NULL;
4591 for (j = 0; j < ncopies; j++)
4595 vect_get_slp_defs (slp_node, &vec_oprnds0, NULL);
4598 VEC_free (tree, heap, vec_oprnds0);
4599 vec_oprnds0 = VEC_alloc (tree, heap,
4600 (multi_step_cvt ? vect_pow2 (multi_step_cvt) * 2 : 2));
4601 vect_get_loop_based_defs (&last_oprnd, stmt, dt[0], &vec_oprnds0,
4602 vect_pow2 (multi_step_cvt) - 1);
4605 /* Arguments are ready. Create the new vector stmts. */
4606 tmp_vec_dsts = VEC_copy (tree, heap, vec_dsts);
4607 vect_create_vectorized_demotion_stmts (&vec_oprnds0,
4608 multi_step_cvt, stmt, tmp_vec_dsts,
4609 gsi, slp_node, code1,
4613 VEC_free (tree, heap, vec_oprnds0);
4614 VEC_free (tree, heap, vec_dsts);
4615 VEC_free (tree, heap, tmp_vec_dsts);
4616 VEC_free (tree, heap, interm_types);
4618 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
4623 /* Create vectorized promotion statements for vector operands from VEC_OPRNDS0
4624 and VEC_OPRNDS1 (for binary operations). For multi-step conversions store
4625 the resulting vectors and call the function recursively. */
4628 vect_create_vectorized_promotion_stmts (VEC (tree, heap) **vec_oprnds0,
4629 VEC (tree, heap) **vec_oprnds1,
4630 int multi_step_cvt, gimple stmt,
4631 VEC (tree, heap) *vec_dsts,
4632 gimple_stmt_iterator *gsi,
4633 slp_tree slp_node, enum tree_code code1,
4634 enum tree_code code2, tree decl1,
4635 tree decl2, int op_type,
4636 stmt_vec_info *prev_stmt_info)
4639 tree vop0, vop1, new_tmp1, new_tmp2, vec_dest;
4640 gimple new_stmt1, new_stmt2;
4641 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4642 VEC (tree, heap) *vec_tmp;
4644 vec_dest = VEC_pop (tree, vec_dsts);
4645 vec_tmp = VEC_alloc (tree, heap, VEC_length (tree, *vec_oprnds0) * 2);
4647 for (i = 0; VEC_iterate (tree, *vec_oprnds0, i, vop0); i++)
4649 if (op_type == binary_op)
4650 vop1 = VEC_index (tree, *vec_oprnds1, i);
4654 /* Generate the two halves of the promotion operation. */
4655 new_stmt1 = vect_gen_widened_results_half (code1, decl1, vop0, vop1,
4656 op_type, vec_dest, gsi, stmt);
4657 new_stmt2 = vect_gen_widened_results_half (code2, decl2, vop0, vop1,
4658 op_type, vec_dest, gsi, stmt);
4659 if (is_gimple_call (new_stmt1))
4661 new_tmp1 = gimple_call_lhs (new_stmt1);
4662 new_tmp2 = gimple_call_lhs (new_stmt2);
4666 new_tmp1 = gimple_assign_lhs (new_stmt1);
4667 new_tmp2 = gimple_assign_lhs (new_stmt2);
4672 /* Store the results for the recursive call. */
4673 VEC_quick_push (tree, vec_tmp, new_tmp1);
4674 VEC_quick_push (tree, vec_tmp, new_tmp2);
4678 /* Last step of the promotion sequence - store the results. */
4681 VEC_quick_push (gimple, SLP_TREE_VEC_STMTS (slp_node), new_stmt1);
4682 VEC_quick_push (gimple, SLP_TREE_VEC_STMTS (slp_node), new_stmt2);
4686 if (!*prev_stmt_info)
4687 STMT_VINFO_VEC_STMT (stmt_info) = new_stmt1;
4689 STMT_VINFO_RELATED_STMT (*prev_stmt_info) = new_stmt1;
4691 *prev_stmt_info = vinfo_for_stmt (new_stmt1);
4692 STMT_VINFO_RELATED_STMT (*prev_stmt_info) = new_stmt2;
4693 *prev_stmt_info = vinfo_for_stmt (new_stmt2);
4700 /* For a multi-step promotion operation we call the
4701 function recursively for every stage. We start from the input type,
4702 create promotion operations to the intermediate types, and then
4703 create promotions to the output type. */
4704 *vec_oprnds0 = VEC_copy (tree, heap, vec_tmp);
4705 VEC_free (tree, heap, vec_tmp);
4706 vect_create_vectorized_promotion_stmts (vec_oprnds0, vec_oprnds1,
4707 multi_step_cvt - 1, stmt,
4708 vec_dsts, gsi, slp_node, code1,
4709 code2, decl1, decl2, op_type,
4715 /* Function vectorizable_type_promotion
4717 Check if STMT performs a binary or unary operation that involves
4718 type promotion, and if it can be vectorized.
4719 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
4720 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
4721 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
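/* A typical candidate (assumed example) is a widening conversion such as

     short *a; int *b;
     for (i = 0; i < N; i++)
       b[i] = (int) a[i];

   where NUNITS_IN is twice NUNITS_OUT, so each input vector is unpacked into
   two wider-element vectors - the two "halves" produced by
   vect_gen_widened_results_half.  */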
4724 vectorizable_type_promotion (gimple stmt, gimple_stmt_iterator *gsi,
4725 gimple *vec_stmt, slp_tree slp_node)
4729 tree op0, op1 = NULL;
4730 tree vec_oprnd0=NULL, vec_oprnd1=NULL;
4731 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4732 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4733 enum tree_code code, code1 = ERROR_MARK, code2 = ERROR_MARK;
4734 tree decl1 = NULL_TREE, decl2 = NULL_TREE;
4738 enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
4739 stmt_vec_info prev_stmt_info;
4746 tree intermediate_type = NULL_TREE;
4747 int multi_step_cvt = 0;
4748 VEC (tree, heap) *vec_oprnds0 = NULL, *vec_oprnds1 = NULL;
4749 VEC (tree, heap) *vec_dsts = NULL, *interm_types = NULL, *tmp_vec_dsts = NULL;
4751 if (!STMT_VINFO_RELEVANT_P (stmt_info))
4754 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
4757 /* Is STMT a vectorizable type-promotion operation? */
4758 if (!is_gimple_assign (stmt))
4761 if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
4764 code = gimple_assign_rhs_code (stmt);
4765 if (!CONVERT_EXPR_CODE_P (code)
4766 && code != WIDEN_MULT_EXPR)
4769 op0 = gimple_assign_rhs1 (stmt);
4770 vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op0));
4773 nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
4775 scalar_dest = gimple_assign_lhs (stmt);
4776 vectype_out = get_vectype_for_scalar_type (TREE_TYPE (scalar_dest));
4779 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
4780 if (nunits_in <= nunits_out)
4783 /* Multiple types in SLP are handled by creating the appropriate number of
4784 vectorized stmts for each SLP node. Hence, NCOPIES is always 1 in case of SLP. */
4789 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in;
4791 gcc_assert (ncopies >= 1);
4793 if (! ((INTEGRAL_TYPE_P (TREE_TYPE (scalar_dest))
4794 && INTEGRAL_TYPE_P (TREE_TYPE (op0)))
4795 || (SCALAR_FLOAT_TYPE_P (TREE_TYPE (scalar_dest))
4796 && SCALAR_FLOAT_TYPE_P (TREE_TYPE (op0))
4797 && CONVERT_EXPR_CODE_P (code))))
4800 /* Check the operands of the operation. */
4801 if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt[0]))
4803 if (vect_print_dump_info (REPORT_DETAILS))
4804 fprintf (vect_dump, "use not simple.");
4808 op_type = TREE_CODE_LENGTH (code);
4809 if (op_type == binary_op)
4811 op1 = gimple_assign_rhs2 (stmt);
4812 if (!vect_is_simple_use (op1, loop_vinfo, &def_stmt, &def, &dt[1]))
4814 if (vect_print_dump_info (REPORT_DETAILS))
4815 fprintf (vect_dump, "use not simple.");
4820 /* Supportable by target? */
4821 if (!supportable_widening_operation (code, stmt, vectype_in,
4822 &decl1, &decl2, &code1, &code2,
4823 &multi_step_cvt, &interm_types))
4826 /* Binary widening operation can only be supported directly by the architecture. */
4828 gcc_assert (!(multi_step_cvt && op_type == binary_op));
4830 STMT_VINFO_VECTYPE (stmt_info) = vectype_in;
4832 if (!vec_stmt) /* transformation not required. */
4834 STMT_VINFO_TYPE (stmt_info) = type_promotion_vec_info_type;
4835 if (vect_print_dump_info (REPORT_DETAILS))
4836 fprintf (vect_dump, "=== vectorizable_promotion ===");
4837 vect_model_simple_cost (stmt_info, 2*ncopies, dt, NULL);
4843 if (vect_print_dump_info (REPORT_DETAILS))
4844 fprintf (vect_dump, "transform type promotion operation. ncopies = %d.",
4848 /* In case of multi-step promotion, we first generate promotion operations
4849 to the intermediate types, and then from those types to the final one.
4850 We store vector destination in VEC_DSTS in the correct order for
4851 recursive creation of promotion operations in
4852 vect_create_vectorized_promotion_stmts(). Vector destinations are created
4853 according to TYPES received from supportable_widening_operation(). */
4855 vec_dsts = VEC_alloc (tree, heap, multi_step_cvt + 1);
4857 vec_dsts = VEC_alloc (tree, heap, 1);
4859 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
4860 VEC_quick_push (tree, vec_dsts, vec_dest);
4864 for (i = VEC_length (tree, interm_types) - 1;
4865 VEC_iterate (tree, interm_types, i, intermediate_type); i--)
4867 vec_dest = vect_create_destination_var (scalar_dest,
4869 VEC_quick_push (tree, vec_dsts, vec_dest);
4875 vec_oprnds0 = VEC_alloc (tree, heap,
4876 (multi_step_cvt ? vect_pow2 (multi_step_cvt) : 1));
4877 if (op_type == binary_op)
4878 vec_oprnds1 = VEC_alloc (tree, heap, 1);
4881 /* In case the vectorization factor (VF) is bigger than the number
4882 of elements that we can fit in a vectype (nunits), we have to generate
4883 more than one vector stmt - i.e. - we need to "unroll" the
4884 vector stmt by a factor VF/nunits. */
4886 prev_stmt_info = NULL;
4887 for (j = 0; j < ncopies; j++)
4893 vect_get_slp_defs (slp_node, &vec_oprnds0, &vec_oprnds1);
4896 vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
4897 VEC_quick_push (tree, vec_oprnds0, vec_oprnd0);
4898 if (op_type == binary_op)
4900 vec_oprnd1 = vect_get_vec_def_for_operand (op1, stmt, NULL);
4901 VEC_quick_push (tree, vec_oprnds1, vec_oprnd1);
4907 vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd0);
4908 VEC_replace (tree, vec_oprnds0, 0, vec_oprnd0);
4909 if (op_type == binary_op)
4911 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt[1], vec_oprnd1);
4912 VEC_replace (tree, vec_oprnds1, 0, vec_oprnd1);
4916 /* Arguments are ready. Create the new vector stmts. */
4917 tmp_vec_dsts = VEC_copy (tree, heap, vec_dsts);
4918 vect_create_vectorized_promotion_stmts (&vec_oprnds0, &vec_oprnds1,
4919 multi_step_cvt, stmt,
4921 gsi, slp_node, code1, code2,
4922 decl1, decl2, op_type,
4926 VEC_free (tree, heap, vec_dsts);
4927 VEC_free (tree, heap, tmp_vec_dsts);
4928 VEC_free (tree, heap, interm_types);
4929 VEC_free (tree, heap, vec_oprnds0);
4930 VEC_free (tree, heap, vec_oprnds1);
4932 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
4937 /* Function vect_strided_store_supported.
4939 Returns TRUE if INTERLEAVE_HIGH and INTERLEAVE_LOW operations are supported,
4940 and FALSE otherwise. */
4943 vect_strided_store_supported (tree vectype)
4945 optab interleave_high_optab, interleave_low_optab;
4948 mode = (int) TYPE_MODE (vectype);
4950 /* Check that the operation is supported. */
4951 interleave_high_optab = optab_for_tree_code (VEC_INTERLEAVE_HIGH_EXPR,
4952 vectype, optab_default);
4953 interleave_low_optab = optab_for_tree_code (VEC_INTERLEAVE_LOW_EXPR,
4954 vectype, optab_default);
4955 if (!interleave_high_optab || !interleave_low_optab)
4957 if (vect_print_dump_info (REPORT_DETAILS))
4958 fprintf (vect_dump, "no optab for interleave.");
4962 if (optab_handler (interleave_high_optab, mode)->insn_code
4964 || optab_handler (interleave_low_optab, mode)->insn_code
4965 == CODE_FOR_nothing)
4967 if (vect_print_dump_info (REPORT_DETAILS))
4968 fprintf (vect_dump, "interleave op not supported by target.");
4976 /* Function vect_permute_store_chain.
4978 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
4979 a power of 2, generate interleave_high/low stmts to reorder the data
4980 correctly for the stores. Return the final references for stores in RESULT_CHAIN.
4983 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4984 The input is 4 vectors each containing 8 elements. We assign a number to each
4985 element, the input sequence is:
4987 1st vec: 0 1 2 3 4 5 6 7
4988 2nd vec: 8 9 10 11 12 13 14 15
4989 3rd vec: 16 17 18 19 20 21 22 23
4990 4th vec: 24 25 26 27 28 29 30 31
4992 The output sequence should be:
4994 1st vec: 0 8 16 24 1 9 17 25
4995 2nd vec: 2 10 18 26 3 11 19 27
4996 3rd vec: 4 12 20 28 5 13 21 29
4997 4th vec: 6 14 22 30 7 15 23 31
4999 i.e., we interleave the contents of the four vectors in their order.
5001 We use interleave_high/low instructions to create such output. The input of
5002 each interleave_high/low operation is two vectors:
        1st vec    2nd vec
        0 1 2 3    4 5 6 7
5005 the even elements of the result vector are obtained left-to-right from the
5006 high/low elements of the first vector. The odd elements of the result are
5007 obtained left-to-right from the high/low elements of the second vector.
5008 The output of interleave_high will be: 0 4 1 5
5009 and of interleave_low: 2 6 3 7
5012 The permutation is done in log LENGTH stages. In each stage interleave_high
5013 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
5014 where the first argument is taken from the first half of DR_CHAIN and the
5015 second argument from its second half.
5018 I1: interleave_high (1st vec, 3rd vec)
5019 I2: interleave_low (1st vec, 3rd vec)
5020 I3: interleave_high (2nd vec, 4th vec)
5021 I4: interleave_low (2nd vec, 4th vec)
5023 The output for the first stage is:
5025 I1: 0 16 1 17 2 18 3 19
5026 I2: 4 20 5 21 6 22 7 23
5027 I3: 8 24 9 25 10 26 11 27
5028 I4: 12 28 13 29 14 30 15 31
5030 The output of the second stage, i.e. the final result is:
5032 I1: 0 8 16 24 1 9 17 25
5033 I2: 2 10 18 26 3 11 19 27
5034 I3: 4 12 20 28 5 13 21 29
5035 I4: 6 14 22 30 7 15 23 31. */
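/* At the source level such a chain typically arises from an interleaved
   store group (assumed example, LENGTH == 2):

     for (i = 0; i < N; i++)
       {
         out[2 * i] = a[i];
         out[2 * i + 1] = b[i];
       }

   Here a single interleave_high/low stage turns the vectors of a[] and b[]
   values into the two vectors that are stored back-to-back.  */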
5038 vect_permute_store_chain (VEC(tree,heap) *dr_chain,
5039 unsigned int length,
5041 gimple_stmt_iterator *gsi,
5042 VEC(tree,heap) **result_chain)
5044 tree perm_dest, vect1, vect2, high, low;
5046 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5050 enum tree_code high_code, low_code;
5052 scalar_dest = gimple_assign_lhs (stmt);
5054 /* Check that the operation is supported. */
5055 if (!vect_strided_store_supported (vectype))
5058 *result_chain = VEC_copy (tree, heap, dr_chain);
5060 for (i = 0; i < exact_log2 (length); i++)
5062 for (j = 0; j < length/2; j++)
5064 vect1 = VEC_index (tree, dr_chain, j);
5065 vect2 = VEC_index (tree, dr_chain, j+length/2);
5067 /* Create interleaving stmt:
5068 in the case of big endian:
5069 high = interleave_high (vect1, vect2)
5070 and in the case of little endian:
5071 high = interleave_low (vect1, vect2). */
5072 perm_dest = create_tmp_var (vectype, "vect_inter_high");
5073 DECL_GIMPLE_REG_P (perm_dest) = 1;
5074 add_referenced_var (perm_dest);
5075 if (BYTES_BIG_ENDIAN)
5077 high_code = VEC_INTERLEAVE_HIGH_EXPR;
5078 low_code = VEC_INTERLEAVE_LOW_EXPR;
5082 low_code = VEC_INTERLEAVE_HIGH_EXPR;
5083 high_code = VEC_INTERLEAVE_LOW_EXPR;
5085 perm_stmt = gimple_build_assign_with_ops (high_code, perm_dest,
5087 high = make_ssa_name (perm_dest, perm_stmt);
5088 gimple_assign_set_lhs (perm_stmt, high);
5089 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5090 VEC_replace (tree, *result_chain, 2*j, high);
5092 /* Create interleaving stmt:
5093 in the case of big endian:
5094 low = interleave_low (vect1, vect2)
5095 and in the case of little endian:
5096 low = interleave_high (vect1, vect2). */
5097 perm_dest = create_tmp_var (vectype, "vect_inter_low");
5098 DECL_GIMPLE_REG_P (perm_dest) = 1;
5099 add_referenced_var (perm_dest);
5100 perm_stmt = gimple_build_assign_with_ops (low_code, perm_dest,
5102 low = make_ssa_name (perm_dest, perm_stmt);
5103 gimple_assign_set_lhs (perm_stmt, low);
5104 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5105 VEC_replace (tree, *result_chain, 2*j+1, low);
5107 dr_chain = VEC_copy (tree, heap, *result_chain);
5113 /* Function vectorizable_store.
5115 Check if STMT defines a non-scalar data-ref (array/pointer/structure) that can be vectorized.
5117 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
5118 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
5119 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
5122 vectorizable_store (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt,
5128 tree vec_oprnd = NULL_TREE;
5129 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5130 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info), *first_dr = NULL;
5131 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5132 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5133 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5134 enum machine_mode vec_mode;
5136 enum dr_alignment_support alignment_support_scheme;
5139 enum vect_def_type dt;
5140 stmt_vec_info prev_stmt_info = NULL;
5141 tree dataref_ptr = NULL_TREE;
5142 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
5145 gimple next_stmt, first_stmt = NULL;
5146 bool strided_store = false;
5147 unsigned int group_size, i;
5148 VEC(tree,heap) *dr_chain = NULL, *oprnds = NULL, *result_chain = NULL;
5150 VEC(tree,heap) *vec_oprnds = NULL;
5151 bool slp = (slp_node != NULL);
5152 stmt_vec_info first_stmt_vinfo;
5153 unsigned int vec_num;
5155 /* Multiple types in SLP are handled by creating the appropriate number of
5156 vectorized stmts for each SLP node. Hence, NCOPIES is always 1 in case of SLP. */
5161 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
5163 gcc_assert (ncopies >= 1);
5165 /* FORNOW. This restriction should be relaxed. */
5166 if (nested_in_vect_loop_p (loop, stmt) && ncopies > 1)
5168 if (vect_print_dump_info (REPORT_DETAILS))
5169 fprintf (vect_dump, "multiple types in nested loop.");
5173 if (!STMT_VINFO_RELEVANT_P (stmt_info))
5176 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
5179 /* Is vectorizable store? */
5181 if (!is_gimple_assign (stmt))
5184 scalar_dest = gimple_assign_lhs (stmt);
5185 if (TREE_CODE (scalar_dest) != ARRAY_REF
5186 && TREE_CODE (scalar_dest) != INDIRECT_REF
5187 && !STMT_VINFO_STRIDED_ACCESS (stmt_info))
5190 gcc_assert (gimple_assign_single_p (stmt));
5191 op = gimple_assign_rhs1 (stmt);
5192 if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt))
5194 if (vect_print_dump_info (REPORT_DETAILS))
5195 fprintf (vect_dump, "use not simple.");
5199 /* The scalar rhs type needs to be trivially convertible to the vector
5200 component type. This should always be the case. */
5201 if (!useless_type_conversion_p (TREE_TYPE (vectype), TREE_TYPE (op)))
5203 if (vect_print_dump_info (REPORT_DETAILS))
5204 fprintf (vect_dump, "??? operands of different types");
5208 vec_mode = TYPE_MODE (vectype);
5209 /* FORNOW. In some cases can vectorize even if data-type not supported
5210 (e.g. - array initialization with 0). */
5211 if (optab_handler (mov_optab, (int)vec_mode)->insn_code == CODE_FOR_nothing)
5214 if (!STMT_VINFO_DATA_REF (stmt_info))
5217 if (STMT_VINFO_STRIDED_ACCESS (stmt_info))
5219 strided_store = true;
5220 first_stmt = DR_GROUP_FIRST_DR (stmt_info);
5221 if (!vect_strided_store_supported (vectype)
5222 && !PURE_SLP_STMT (stmt_info) && !slp)
5225 if (first_stmt == stmt)
5227 /* STMT is the leader of the group. Check the operands of all the
5228 stmts of the group. */
5229 next_stmt = DR_GROUP_NEXT_DR (stmt_info);
5232 gcc_assert (gimple_assign_single_p (next_stmt));
5233 op = gimple_assign_rhs1 (next_stmt);
5234 if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt))
5236 if (vect_print_dump_info (REPORT_DETAILS))
5237 fprintf (vect_dump, "use not simple.");
5240 next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt));
5245 if (!vec_stmt) /* transformation not required. */
5247 STMT_VINFO_TYPE (stmt_info) = store_vec_info_type;
5248 vect_model_store_cost (stmt_info, ncopies, dt, NULL);
5256 first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
5257 group_size = DR_GROUP_SIZE (vinfo_for_stmt (first_stmt));
5259 DR_GROUP_STORE_COUNT (vinfo_for_stmt (first_stmt))++;
5262 gcc_assert (!nested_in_vect_loop_p (loop, stmt));
5264 /* We vectorize all the stmts of the interleaving group when we
5265 reach the last stmt in the group. */
5266 if (DR_GROUP_STORE_COUNT (vinfo_for_stmt (first_stmt))
5267 < DR_GROUP_SIZE (vinfo_for_stmt (first_stmt))
5275 strided_store = false;
5277 /* VEC_NUM is the number of vect stmts to be created for this group. */
5279 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
5281 vec_num = group_size;
5287 group_size = vec_num = 1;
5288 first_stmt_vinfo = stmt_info;
5291 if (vect_print_dump_info (REPORT_DETAILS))
5292 fprintf (vect_dump, "transform store. ncopies = %d",ncopies);
5294 dr_chain = VEC_alloc (tree, heap, group_size);
5295 oprnds = VEC_alloc (tree, heap, group_size);
5297 alignment_support_scheme = vect_supportable_dr_alignment (first_dr);
5298 gcc_assert (alignment_support_scheme);
5299 gcc_assert (alignment_support_scheme == dr_aligned); /* FORNOW */
5301 /* In case the vectorization factor (VF) is bigger than the number
5302 of elements that we can fit in a vectype (nunits), we have to generate
5303 more than one vector stmt - i.e. - we need to "unroll" the
5304 vector stmt by a factor VF/nunits. For more details see documentation in
5305 vect_get_vec_def_for_copy_stmt. */
5307 /* In case of interleaving (non-unit strided access):

        S1: &base + 2 = x2
        S2: &base = x0
        S3: &base + 1 = x1
        S4: &base + 3 = x3
5314 We create vectorized stores starting from base address (the access of the
5315 first stmt in the chain (S2 in the above example), when the last store stmt
5316 of the chain (S4) is reached:

        VS1: &base = vx2
5319 VS2: &base + vec_size*1 = vx0
5320 VS3: &base + vec_size*2 = vx1
5321 VS4: &base + vec_size*3 = vx3
5323 Then permutation statements are generated:
5325 VS5: vx5 = VEC_INTERLEAVE_HIGH_EXPR < vx0, vx3 >
5326 VS6: vx6 = VEC_INTERLEAVE_LOW_EXPR < vx0, vx3 >
5329 And they are put in STMT_VINFO_VEC_STMT of the corresponding scalar stmts
5330 (the order of the data-refs in the output of vect_permute_store_chain
5331 corresponds to the order of scalar stmts in the interleaving chain - see
5332 the documentation of vect_permute_store_chain()).
5334 In case of both multiple types and interleaving, above vector stores and
5335 permutation stmts are created for every copy. The result vector stmts are
5336 put in STMT_VINFO_VEC_STMT for the first copy and in the corresponding
5337 STMT_VINFO_RELATED_STMT for the next copies.
5340 prev_stmt_info = NULL;
5341 for (j = 0; j < ncopies; j++)
5350 /* Get vectorized arguments for SLP_NODE. */
5351 vect_get_slp_defs (slp_node, &vec_oprnds, NULL);
5353 vec_oprnd = VEC_index (tree, vec_oprnds, 0);
5357 /* For interleaved stores we collect vectorized defs for all the
5358 stores in the group in DR_CHAIN and OPRNDS. DR_CHAIN is then
5359 used as an input to vect_permute_store_chain(), and OPRNDS as
5360 an input to vect_get_vec_def_for_stmt_copy() for the next copy.
5362 If the store is not strided, GROUP_SIZE is 1, and DR_CHAIN and
5363 OPRNDS are of size 1. */
5364 next_stmt = first_stmt;
5365 for (i = 0; i < group_size; i++)
5367 /* Since gaps are not supported for interleaved stores,
5368 GROUP_SIZE is the exact number of stmts in the chain.
5369 Therefore, NEXT_STMT can't be NULL_TREE. In case that
5370 there is no interleaving, GROUP_SIZE is 1, and only one
5371 iteration of the loop will be executed. */
5372 gcc_assert (next_stmt
5373 && gimple_assign_single_p (next_stmt));
5374 op = gimple_assign_rhs1 (next_stmt);
5376 vec_oprnd = vect_get_vec_def_for_operand (op, next_stmt,
5378 VEC_quick_push(tree, dr_chain, vec_oprnd);
5379 VEC_quick_push(tree, oprnds, vec_oprnd);
5380 next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt));
5384 /* We should have caught mismatched types earlier. */
5385 gcc_assert (useless_type_conversion_p (vectype,
5386 TREE_TYPE (vec_oprnd)));
5387 dataref_ptr = vect_create_data_ref_ptr (first_stmt, NULL, NULL_TREE,
5388 &dummy, &ptr_incr, false,
5390 gcc_assert (!inv_p);
5394 /* For interleaved stores we created vectorized defs for all the
5395 defs stored in OPRNDS in the previous iteration (previous copy).
5396 DR_CHAIN is then used as an input to vect_permute_store_chain(),
5397 and OPRNDS as an input to vect_get_vec_def_for_stmt_copy() for the
5399 If the store is not strided, GROUP_SIZE is 1, and DR_CHAIN and
5400 OPRNDS are of size 1. */
5401 for (i = 0; i < group_size; i++)
5403 op = VEC_index (tree, oprnds, i);
5404 vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt);
5405 vec_oprnd = vect_get_vec_def_for_stmt_copy (dt, op);
5406 VEC_replace(tree, dr_chain, i, vec_oprnd);
5407 VEC_replace(tree, oprnds, i, vec_oprnd);
5410 bump_vector_ptr (dataref_ptr, ptr_incr, gsi, stmt, NULL_TREE);
5415 result_chain = VEC_alloc (tree, heap, group_size);
5417 if (!vect_permute_store_chain (dr_chain, group_size, stmt, gsi,
5422 next_stmt = first_stmt;
5423 for (i = 0; i < vec_num; i++)
5426 /* Bump the vector pointer. */
5427 dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, gsi, stmt,
5431 vec_oprnd = VEC_index (tree, vec_oprnds, i);
5432 else if (strided_store)
5433 /* For strided stores vectorized defs are interleaved in
5434 vect_permute_store_chain(). */
5435 vec_oprnd = VEC_index (tree, result_chain, i);
5437 data_ref = build_fold_indirect_ref (dataref_ptr);
5439 /* Arguments are ready. Create the new vector stmt. */
5440 new_stmt = gimple_build_assign (data_ref, vec_oprnd);
5441 vect_finish_stmt_generation (stmt, new_stmt, gsi);
5442 mark_symbols_for_renaming (new_stmt);
5448 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
5450 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
5452 prev_stmt_info = vinfo_for_stmt (new_stmt);
5453 next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt));
5459 VEC_free (tree, heap, dr_chain);
5460 VEC_free (tree, heap, oprnds);
5462 VEC_free (tree, heap, result_chain);
5468 /* Function vect_setup_realignment
5470 This function is called when vectorizing an unaligned load using
5471 the dr_explicit_realign[_optimized] scheme.
5472 This function generates the following code at the loop prolog:
5475 x msq_init = *(floor(p)); # prolog load
5476 realignment_token = call target_builtin;
5478 x msq = phi (msq_init, ---)
5480 The stmts marked with x are generated only for the case of
5481 dr_explicit_realign_optimized.
5483 The code above sets up a new (vector) pointer, pointing to the first
5484 location accessed by STMT, and a "floor-aligned" load using that pointer.
5485 It also generates code to compute the "realignment-token" (if the relevant
5486 target hook was defined), and creates a phi-node at the loop-header bb
5487 whose arguments are the result of the prolog-load (created by this
5488 function) and the result of a load that takes place in the loop (to be
5489 created by the caller to this function).
5491 For the case of dr_explicit_realign_optimized:
5492 The caller to this function uses the phi-result (msq) to create the
5493 realignment code inside the loop, and sets up the missing phi argument,
5496 msq = phi (msq_init, lsq)
5497 lsq = *(floor(p')); # load in loop
5498 result = realign_load (msq, lsq, realignment_token);
5500 For the case of dr_explicit_realign:
5502 msq = *(floor(p)); # load in loop
5504 lsq = *(floor(p')); # load in loop
5505 result = realign_load (msq, lsq, realignment_token);
5508 STMT - (scalar) load stmt to be vectorized. This load accesses
5509 a memory location that may be unaligned.
5510 BSI - place where new code is to be inserted.
5511 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes is used.
5515 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
5516 target hook, if defined.
5517 Return value - the result of the loop-header phi node. */
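/* Numeric illustration (assuming 16-byte vectors and p % 16 == 4):
   floor(p) loads bytes [p-4, p+11]; with p' = p + 15, floor(p') loads
   bytes [p+12, p+27]; and REALIGN_LOAD extracts the desired bytes
   [p, p+15] from the concatenation of the two, steered by the
   realignment token.  */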
5520 vect_setup_realignment (gimple stmt, gimple_stmt_iterator *gsi,
5521 tree *realignment_token,
5522 enum dr_alignment_support alignment_support_scheme,
5524 struct loop **at_loop)
5526 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5527 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5528 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5529 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5531 tree scalar_dest = gimple_assign_lhs (stmt);
5538 tree msq_init = NULL_TREE;
5541 tree msq = NULL_TREE;
5542 gimple_seq stmts = NULL;
5544 bool compute_in_loop = false;
5545 bool nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
5546 struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
5547 struct loop *loop_for_initial_load;
5549 gcc_assert (alignment_support_scheme == dr_explicit_realign
5550 || alignment_support_scheme == dr_explicit_realign_optimized);
5552 /* We need to generate three things:
5553 1. the misalignment computation
5554 2. the extra vector load (for the optimized realignment scheme).
5555 3. the phi node for the two vectors from which the realignment is
5556 done (for the optimized realignment scheme).
5559 /* 1. Determine where to generate the misalignment computation.
5561 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
5562 calculation will be generated by this function, outside the loop (in the
5563 preheader). Otherwise, INIT_ADDR had already been computed for us by the
5564 caller, inside the loop.
5566 Background: If the misalignment remains fixed throughout the iterations of
5567 the loop, then both realignment schemes are applicable, and also the
5568 misalignment computation can be done outside LOOP. This is because we are
5569 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
5570 are a multiple of VS (the Vector Size), and therefore the misalignment in
5571 different vectorized LOOP iterations is always the same.
5572 The problem arises only if the memory access is in an inner-loop nested
5573 inside LOOP, which is now being vectorized using outer-loop vectorization.
5574 This is the only case when the misalignment of the memory access may not
5575 remain fixed throughout the iterations of the inner-loop (as explained in
5576 detail in vect_supportable_dr_alignment). In this case, not only is the
5577 optimized realignment scheme not applicable, but also the misalignment
5578 computation (and generation of the realignment token that is passed to
5579 REALIGN_LOAD) has to be done inside the loop.
5581 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
5582 or not, which in turn determines if the misalignment is computed inside
5583 the inner-loop, or outside LOOP. */
5585 if (init_addr != NULL_TREE)
5587 compute_in_loop = true;
5588 gcc_assert (alignment_support_scheme == dr_explicit_realign);
5592 /* 2. Determine where to generate the extra vector load.
5594 For the optimized realignment scheme, instead of generating two vector
5595 loads in each iteration, we generate a single extra vector load in the
5596 preheader of the loop, and in each iteration reuse the result of the
5597 vector load from the previous iteration. In case the memory access is in
5598 an inner-loop nested inside LOOP, which is now being vectorized using
5599 outer-loop vectorization, we need to determine whether this initial vector
5600 load should be generated at the preheader of the inner-loop, or can be
5601 generated at the preheader of LOOP. If the memory access has no evolution
5602 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
5603 to be generated inside LOOP (in the preheader of the inner-loop). */
5605 if (nested_in_vect_loop)
5607 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
5608 bool invariant_in_outerloop =
5609 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
5610 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
5613 loop_for_initial_load = loop;
5615 *at_loop = loop_for_initial_load;
5617 /* 3. For the case of the optimized realignment, create the first vector
5618 load at the loop preheader. */
5620 if (alignment_support_scheme == dr_explicit_realign_optimized)
5622 /* Create msq_init = *(floor(p1)) in the loop preheader */
5624 gcc_assert (!compute_in_loop);
5625 pe = loop_preheader_edge (loop_for_initial_load);
5626 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5627 ptr = vect_create_data_ref_ptr (stmt, loop_for_initial_load, NULL_TREE,
5628 &init_addr, &inc, true, &inv_p, NULL_TREE);
5629 data_ref = build1 (ALIGN_INDIRECT_REF, vectype, ptr);
5630 new_stmt = gimple_build_assign (vec_dest, data_ref);
5631 new_temp = make_ssa_name (vec_dest, new_stmt);
5632 gimple_assign_set_lhs (new_stmt, new_temp);
5633 mark_symbols_for_renaming (new_stmt);
5634 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5635 gcc_assert (!new_bb);
5636 msq_init = gimple_assign_lhs (new_stmt);
5639 /* 4. Create realignment token using a target builtin, if available.
5640 It is done either inside the containing loop, or before LOOP (as
5641 determined above). */
5643 if (targetm.vectorize.builtin_mask_for_load)
5647 /* Compute INIT_ADDR - the initial address accessed by this memref. */
5648 if (compute_in_loop)
5649 gcc_assert (init_addr); /* already computed by the caller. */
5652 /* Generate the INIT_ADDR computation outside LOOP. */
5653 init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
5655 pe = loop_preheader_edge (loop);
5656 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
5657 gcc_assert (!new_bb);
5660 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5661 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
5663 vect_create_destination_var (scalar_dest,
5664 gimple_call_return_type (new_stmt));
5665 new_temp = make_ssa_name (vec_dest, new_stmt);
5666 gimple_call_set_lhs (new_stmt, new_temp);
5668 if (compute_in_loop)
5669 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5672 /* Generate the misalignment computation outside LOOP. */
5673 pe = loop_preheader_edge (loop);
5674 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5675 gcc_assert (!new_bb);
5678 *realignment_token = gimple_call_lhs (new_stmt);
5680 /* The result of the CALL_EXPR to this builtin is determined from
5681 the value of the parameter and no global variables are touched
5682 which makes the builtin a "const" function. Requiring the
5683 builtin to have the "const" attribute makes it unnecessary
5684 to call mark_call_clobbered. */
5685 gcc_assert (TREE_READONLY (builtin_decl));
5688 if (alignment_support_scheme == dr_explicit_realign)
5691 gcc_assert (!compute_in_loop);
5692 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5695 /* 5. Create msq = phi <msq_init, lsq> in loop */
5697 pe = loop_preheader_edge (containing_loop);
5698 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5699 msq = make_ssa_name (vec_dest, NULL);
5700 phi_stmt = create_phi_node (msq, containing_loop->header);
5701 SSA_NAME_DEF_STMT (msq) = phi_stmt;
5702 add_phi_arg (phi_stmt, msq_init, pe);
5708 /* Function vect_strided_load_supported.
5710 Returns TRUE if EXTRACT_EVEN and EXTRACT_ODD operations are supported,
5711 and FALSE otherwise. */
5713 bool
5714 vect_strided_load_supported (tree vectype)
5715 {
5716 optab perm_even_optab, perm_odd_optab;
5717 int mode;
5719 mode = (int) TYPE_MODE (vectype);
5721 perm_even_optab = optab_for_tree_code (VEC_EXTRACT_EVEN_EXPR, vectype,
5722 optab_default);
5723 if (!perm_even_optab)
5724 {
5725 if (vect_print_dump_info (REPORT_DETAILS))
5726 fprintf (vect_dump, "no optab for perm_even.");
5727 return false;
5728 }
5730 if (optab_handler (perm_even_optab, mode)->insn_code == CODE_FOR_nothing)
5731 {
5732 if (vect_print_dump_info (REPORT_DETAILS))
5733 fprintf (vect_dump, "perm_even op not supported by target.");
5734 return false;
5735 }
5737 perm_odd_optab = optab_for_tree_code (VEC_EXTRACT_ODD_EXPR, vectype,
5738 optab_default);
5739 if (!perm_odd_optab)
5740 {
5741 if (vect_print_dump_info (REPORT_DETAILS))
5742 fprintf (vect_dump, "no optab for perm_odd.");
5743 return false;
5744 }
5746 if (optab_handler (perm_odd_optab, mode)->insn_code == CODE_FOR_nothing)
5747 {
5748 if (vect_print_dump_info (REPORT_DETAILS))
5749 fprintf (vect_dump, "perm_odd op not supported by target.");
5750 return false;
5751 }
5753 return true;
5754 }
5756 /* Function vect_permute_load_chain.
5758 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5759 a power of 2, generate extract_even/odd stmts to reorder the input data
5760 correctly. Return the final references for loads in RESULT_CHAIN.
5762 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5763 The input is 4 vectors each containing 8 elements. We assign a number to each
5764 element, the input sequence is:
5766 1st vec: 0 1 2 3 4 5 6 7
5767 2nd vec: 8 9 10 11 12 13 14 15
5768 3rd vec: 16 17 18 19 20 21 22 23
5769 4th vec: 24 25 26 27 28 29 30 31
5771 The output sequence should be:
5773 1st vec: 0 4 8 12 16 20 24 28
5774 2nd vec: 1 5 9 13 17 21 25 29
5775 3rd vec: 2 6 10 14 18 22 26 30
5776 4th vec: 3 7 11 15 19 23 27 31
5778 i.e., the first output vector should contain the first elements of each
5779 interleaving group, etc.
5781 We use extract_even/odd instructions to create such output. The input of each
5782 extract_even/odd operation is two vectors
5786 and the output is the vector of extracted even/odd elements. The output of
5787 extract_even will be: 0 2 4 6
5788 and of extract_odd: 1 3 5 7
5791 The permutation is done in log2 (LENGTH) stages. In each stage extract_even
5792 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5793 their order. In our example,
5795 E1: extract_even (1st vec, 2nd vec)
5796 E2: extract_odd (1st vec, 2nd vec)
5797 E3: extract_even (3rd vec, 4th vec)
5798 E4: extract_odd (3rd vec, 4th vec)
5800 The output for the first stage will be:
5802 E1: 0 2 4 6 8 10 12 14
5803 E2: 1 3 5 7 9 11 13 15
5804 E3: 16 18 20 22 24 26 28 30
5805 E4: 17 19 21 23 25 27 29 31
5807 In order to proceed and create the correct sequence for the next stage (or
5808 for the correct output, if the second stage is the last one, as in our
5809 example), we first put the output of extract_even operation and then the
5810 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5811 The input for the second stage is:
5813 1st vec (E1): 0 2 4 6 8 10 12 14
5814 2nd vec (E3): 16 18 20 22 24 26 28 30
5815 3rd vec (E2): 1 3 5 7 9 11 13 15
5816 4th vec (E4): 17 19 21 23 25 27 29 31
5818 The output of the second stage:
5820 E1: 0 4 8 12 16 20 24 28
5821 E2: 2 6 10 14 18 22 26 30
5822 E3: 1 5 9 13 17 21 25 29
5823 E4: 3 7 11 15 19 23 27 31
5825 And RESULT_CHAIN after reordering:
5827 1st vec (E1): 0 4 8 12 16 20 24 28
5828 2nd vec (E3): 1 5 9 13 17 21 25 29
5829 3rd vec (E2): 2 6 10 14 18 22 26 30
5830 4th vec (E4): 3 7 11 15 19 23 27 31. */
5832 static bool
5833 vect_permute_load_chain (VEC(tree,heap) *dr_chain,
5834 unsigned int length,
5835 gimple stmt,
5836 gimple_stmt_iterator *gsi,
5837 VEC(tree,heap) **result_chain)
5838 {
5839 tree perm_dest, data_ref, first_vect, second_vect;
5840 gimple perm_stmt;
5841 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5842 int i;
5843 unsigned int j;
5845 /* Check that the operation is supported. */
5846 if (!vect_strided_load_supported (vectype))
5847 return false;
5849 *result_chain = VEC_copy (tree, heap, dr_chain);
5850 for (i = 0; i < exact_log2 (length); i++)
5851 {
5852 for (j = 0; j < length; j += 2)
5853 {
5854 first_vect = VEC_index (tree, dr_chain, j);
5855 second_vect = VEC_index (tree, dr_chain, j+1);
5857 /* data_ref = permute_even (first_data_ref, second_data_ref); */
5858 perm_dest = create_tmp_var (vectype, "vect_perm_even");
5859 DECL_GIMPLE_REG_P (perm_dest) = 1;
5860 add_referenced_var (perm_dest);
5862 perm_stmt = gimple_build_assign_with_ops (VEC_EXTRACT_EVEN_EXPR,
5863 perm_dest, first_vect,
5864 second_vect);
5866 data_ref = make_ssa_name (perm_dest, perm_stmt);
5867 gimple_assign_set_lhs (perm_stmt, data_ref);
5868 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5869 mark_symbols_for_renaming (perm_stmt);
5871 VEC_replace (tree, *result_chain, j/2, data_ref);
5873 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
5874 perm_dest = create_tmp_var (vectype, "vect_perm_odd");
5875 DECL_GIMPLE_REG_P (perm_dest) = 1;
5876 add_referenced_var (perm_dest);
5878 perm_stmt = gimple_build_assign_with_ops (VEC_EXTRACT_ODD_EXPR,
5879 perm_dest, first_vect,
5880 second_vect);
5881 data_ref = make_ssa_name (perm_dest, perm_stmt);
5882 gimple_assign_set_lhs (perm_stmt, data_ref);
5883 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5884 mark_symbols_for_renaming (perm_stmt);
5886 VEC_replace (tree, *result_chain, j/2+length/2, data_ref);
5887 }
5888 dr_chain = VEC_copy (tree, heap, *result_chain);
5889 }
5891 return true;
5892 }
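/* Editorial sketch (not part of GCC): the standalone program below, kept
   under "#if 0", simulates the extract_even/odd permutation performed by
   vect_permute_load_chain on plain int arrays, for the documented example
   of LENGTH == 4 vectors of 8 elements each. All names are illustrative. */
#if 0
#include <stdio.h>
#include <string.h>

#define NVECS 4
#define NELTS 8

/* OUT receives the even elements of A followed by the even elements of B. */
static void
extract_even (const int *a, const int *b, int *out)
{
  int k;
  for (k = 0; k < NELTS / 2; k++)
    {
      out[k] = a[2 * k];
      out[NELTS / 2 + k] = b[2 * k];
    }
}

/* Same as above, but for the odd elements. */
static void
extract_odd (const int *a, const int *b, int *out)
{
  int k;
  for (k = 0; k < NELTS / 2; k++)
    {
      out[k] = a[2 * k + 1];
      out[NELTS / 2 + k] = b[2 * k + 1];
    }
}

int
main (void)
{
  int chain[NVECS][NELTS], result[NVECS][NELTS];
  int i, j, stage;

  /* Interleaved input: vector v holds elements v*8 .. v*8+7. */
  for (i = 0; i < NVECS; i++)
    for (j = 0; j < NELTS; j++)
      chain[i][j] = i * NELTS + j;

  /* log2 (NVECS) == 2 stages; extracted evens land in the first half of
     the result chain, odds in the second half, as in the comment above. */
  for (stage = 0; stage < 2; stage++)
    {
      for (j = 0; j < NVECS; j += 2)
	{
	  extract_even (chain[j], chain[j + 1], result[j / 2]);
	  extract_odd (chain[j], chain[j + 1], result[j / 2 + NVECS / 2]);
	}
      memcpy (chain, result, sizeof (chain));
    }

  /* Prints the deinterleaved chain: 0 4 8 ... 28 / 1 5 ... 29 / etc. */
  for (i = 0; i < NVECS; i++)
    {
      for (j = 0; j < NELTS; j++)
	printf (" %2d", chain[i][j]);
      printf ("\n");
    }
  return 0;
}
#endif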
5894 /* Function vect_transform_strided_load.
5896 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
5897 to perform their permutation and ascribe the result vectorized statements to
5898 the scalar statements. */
5901 bool
5902 vect_transform_strided_load (gimple stmt, VEC(tree,heap) *dr_chain, int size,
5903 gimple_stmt_iterator *gsi)
5904 {
5905 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5906 gimple first_stmt = DR_GROUP_FIRST_DR (stmt_info);
5907 gimple next_stmt, new_stmt;
5908 VEC(tree,heap) *result_chain = NULL;
5909 unsigned int i, gap_count;
5910 tree tmp_data_ref;
5912 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
5913 RESULT_CHAIN is the output of vect_permute_load_chain; it contains permuted
5914 vectors that are ready for vector computation. */
5915 result_chain = VEC_alloc (tree, heap, size);
5917 if (!vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain))
5918 return false;
5920 /* Put a permuted data-ref in the VECTORIZED_STMT field.
5921 Since we scan the chain starting from its first node, their order
5922 corresponds to the order of data-refs in RESULT_CHAIN. */
5923 next_stmt = first_stmt;
5924 gap_count = 1;
5925 for (i = 0; VEC_iterate (tree, result_chain, i, tmp_data_ref); i++)
5926 {
5927 if (!next_stmt)
5928 break;
5930 /* Skip the gaps. Loads created for the gaps will be removed by dead
5931 code elimination pass later. No need to check for the first stmt in
5932 the group, since it always exists.
5933 DR_GROUP_GAP is the number of steps in elements from the previous
5934 access (if there is no gap DR_GROUP_GAP is 1). We skip loads that
5935 correspond to the gaps.
5936 */
5937 if (next_stmt != first_stmt
5938 && gap_count < DR_GROUP_GAP (vinfo_for_stmt (next_stmt)))
5939 {
5940 gap_count++;
5941 continue;
5942 }
5944 while (next_stmt)
5945 {
5946 new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
5947 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
5948 copies, and we put the new vector statement in the first available
5949 RELATED_STMT. */
5950 if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
5951 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
5952 else
5953 {
5954 if (!DR_GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5955 {
5956 gimple prev_stmt =
5957 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
5958 gimple rel_stmt =
5959 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
5960 while (rel_stmt)
5961 {
5962 prev_stmt = rel_stmt;
5963 rel_stmt =
5964 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
5965 }
5967 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) =
5968 new_stmt;
5969 }
5970 }
5972 next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt));
5973 gap_count = 1;
5974 /* If NEXT_STMT accesses the same DR as the previous statement,
5975 put the same TMP_DATA_REF as its vectorized statement; otherwise
5976 get the next data-ref from RESULT_CHAIN. */
5977 if (!next_stmt || !DR_GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5978 break;
5979 }
5980 }
5982 VEC_free (tree, heap, result_chain);
5984 return true;
5985 }
5987 /* Create NCOPIES permutation statements using the mask MASK_BYTES (by
5988 building a vector of type MASK_TYPE from it) and two input vectors placed in
5989 DR_CHAIN at FIRST_VEC_INDX and SECOND_VEC_INDX for the first copy and
5990 shifting by STRIDE elements of DR_CHAIN for every copy.
5991 (STRIDE is the number of vectorized stmts for NODE divided by the number of
5992 copies).
5993 VECT_STMTS_COUNTER specifies the index in the vectorized stmts of NODE, where
5994 the created stmts must be inserted. */
5996 static void
5997 vect_create_mask_and_perm (gimple stmt, gimple next_scalar_stmt,
5998 int *mask_array, int mask_nunits,
5999 tree mask_element_type, tree mask_type,
6000 int first_vec_indx, int second_vec_indx,
6001 gimple_stmt_iterator *gsi, slp_tree node,
6002 tree builtin_decl, tree vectype,
6003 VEC(tree,heap) *dr_chain,
6004 int ncopies, int vect_stmts_counter)
6005 {
6006 tree t = NULL_TREE, mask_vec, mask, perm_dest;
6007 gimple perm_stmt = NULL;
6008 stmt_vec_info next_stmt_info;
6009 int i, group_size, stride, dr_chain_size;
6010 tree first_vec, second_vec, data_ref;
6011 ssa_op_iter iter;
6012 tree sym;
6013 VEC (tree, heap) *params = NULL;
6015 /* Create a vector mask. */
6016 for (i = mask_nunits - 1; i >= 0; --i)
6017 t = tree_cons (NULL_TREE, build_int_cst (mask_element_type, mask_array[i]),
6018 t);
6019 mask_vec = build_vector (mask_type, t);
6020 mask = vect_init_vector (stmt, mask_vec, mask_type, NULL);
6022 group_size = VEC_length (gimple, SLP_TREE_SCALAR_STMTS (node));
6023 stride = SLP_TREE_NUMBER_OF_VEC_STMTS (node) / ncopies;
6024 dr_chain_size = VEC_length (tree, dr_chain);
6026 /* Initialize the vect stmts of NODE to properly insert the generated
6027 stmts later. */
6028 for (i = VEC_length (gimple, SLP_TREE_VEC_STMTS (node));
6029 i < (int) SLP_TREE_NUMBER_OF_VEC_STMTS (node); i++)
6030 VEC_quick_push (gimple, SLP_TREE_VEC_STMTS (node), NULL);
6032 perm_dest = vect_create_destination_var (gimple_assign_lhs (stmt), vectype);
6033 for (i = 0; i < ncopies; i++)
6034 {
6035 first_vec = VEC_index (tree, dr_chain, first_vec_indx);
6036 second_vec = VEC_index (tree, dr_chain, second_vec_indx);
6038 /* Build argument list for the vectorized call. */
6039 VEC_free (tree, heap, params);
6040 params = VEC_alloc (tree, heap, 3);
6041 VEC_quick_push (tree, params, first_vec);
6042 VEC_quick_push (tree, params, second_vec);
6043 VEC_quick_push (tree, params, mask);
6045 /* Generate the permute statement. */
6046 perm_stmt = gimple_build_call_vec (builtin_decl, params);
6047 data_ref = make_ssa_name (perm_dest, perm_stmt);
6048 gimple_call_set_lhs (perm_stmt, data_ref);
6049 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6050 FOR_EACH_SSA_TREE_OPERAND (sym, perm_stmt, iter, SSA_OP_ALL_VIRTUALS)
6051 {
6052 if (TREE_CODE (sym) == SSA_NAME)
6053 sym = SSA_NAME_VAR (sym);
6054 mark_sym_for_renaming (sym);
6055 }
6057 /* Store the vector statement in NODE. */
6058 VEC_replace (gimple, SLP_TREE_VEC_STMTS (node),
6059 stride * i + vect_stmts_counter, perm_stmt);
6061 first_vec_indx += stride;
6062 second_vec_indx += stride;
6063 }
6065 /* Mark the scalar stmt as vectorized. */
6066 next_stmt_info = vinfo_for_stmt (next_scalar_stmt);
6067 STMT_VINFO_VEC_STMT (next_stmt_info) = perm_stmt;
6068 }
6071 /* Given FIRST_MASK_ELEMENT - the mask element in element representation,
6072 return in CURRENT_MASK_ELEMENT its equivalent in target specific
6073 representation. Check that the mask is valid and return FALSE if not.
6074 Return TRUE in NEED_NEXT_VECTOR if the permutation requires to move to
6075 the next vector, i.e., the current first vector is not needed. */
6077 static bool
6078 vect_get_mask_element (gimple stmt, int first_mask_element, int m,
6079 int mask_nunits, bool only_one_vec, int index,
6080 int *mask, int *current_mask_element,
6081 bool *need_next_vector)
6082 {
6083 int i;
6084 static int number_of_mask_fixes = 1;
6085 static bool mask_fixed = false;
6086 static bool needs_first_vector = false;
6088 /* Convert to target specific representation. */
6089 *current_mask_element = first_mask_element + m;
6090 /* Adjust the value in case it's a mask for second and third vectors. */
6091 *current_mask_element -= mask_nunits * (number_of_mask_fixes - 1);
6093 if (*current_mask_element < mask_nunits)
6094 needs_first_vector = true;
6096 /* We have only one input vector to permute but the mask accesses values in
6097 the next vector as well. */
6098 if (only_one_vec && *current_mask_element >= mask_nunits)
6099 {
6100 if (vect_print_dump_info (REPORT_DETAILS))
6101 {
6102 fprintf (vect_dump, "permutation requires at least two vectors ");
6103 print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
6104 }
6106 return false;
6107 }
6109 /* The mask requires the next vector. */
6110 if (*current_mask_element >= mask_nunits * 2)
6111 {
6112 if (needs_first_vector || mask_fixed)
6113 {
6114 /* We either need the first vector too or have already moved to the
6115 next vector. In both cases, this permutation needs three
6116 vectors. */
6117 if (vect_print_dump_info (REPORT_DETAILS))
6118 {
6119 fprintf (vect_dump, "permutation requires at "
6120 "least three vectors ");
6121 print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
6122 }
6124 return false;
6125 }
6127 /* We move to the next vector, dropping the first one and working with
6128 the second and the third - we need to adjust the values of the mask
6129 accordingly. */
6130 *current_mask_element -= mask_nunits * number_of_mask_fixes;
6132 for (i = 0; i < index; i++)
6133 mask[i] -= mask_nunits * number_of_mask_fixes;
6135 (number_of_mask_fixes)++;
6136 mask_fixed = true;
6137 }
6139 *need_next_vector = mask_fixed;
6141 /* This was the last element of this mask. Start a new one. */
6142 if (index == mask_nunits - 1)
6143 {
6144 number_of_mask_fixes = 1;
6145 mask_fixed = false;
6146 needs_first_vector = false;
6147 }
6149 return true;
6150 }
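/* Editorial worked example (figures assumed, not from the sources): with
   mask_nunits == 4, legalizing the mask {6,9,9,9} mentioned in the comment
   of vect_transform_slp_perm_load below proceeds as follows. Element 6
   stays below mask_nunits * 2 and is accepted unchanged. The first 9
   reaches mask_nunits * 2; since neither NEEDS_FIRST_VECTOR nor MASK_FIXED
   is set yet, the two-vector window advances by one vector: 9 becomes
   9 - 4 == 5, the already-emitted 6 is rewritten to 2, and
   NUMBER_OF_MASK_FIXES becomes 2 with MASK_FIXED set. The remaining 9s are
   then reduced by mask_nunits * (2 - 1) up front and also become 5,
   yielding {2,5,5,5} and NEED_NEXT_VECTOR == true. */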
6153 /* Generate vector permute statements from a list of loads in DR_CHAIN.
6154 If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
6155 permute statements for SLP_NODE_INSTANCE. */
6156 static bool
6157 vect_transform_slp_perm_load (gimple stmt, VEC (tree, heap) *dr_chain,
6158 gimple_stmt_iterator *gsi, int vf,
6159 slp_instance slp_node_instance, bool analyze_only)
6160 {
6161 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6162 tree mask_element_type = NULL_TREE, mask_type;
6163 int i, j, k, m, scale, mask_nunits, nunits, vec_index = 0, scalar_index;
6164 slp_tree node;
6165 tree vectype = STMT_VINFO_VECTYPE (stmt_info), builtin_decl;
6166 gimple next_scalar_stmt;
6167 int group_size = SLP_INSTANCE_GROUP_SIZE (slp_node_instance);
6168 int first_mask_element;
6169 int index, unroll_factor, *mask, current_mask_element, ncopies;
6170 bool only_one_vec = false, need_next_vector = false;
6171 int first_vec_index, second_vec_index, orig_vec_stmts_num, vect_stmts_counter;
6173 if (!targetm.vectorize.builtin_vec_perm)
6174 {
6175 if (vect_print_dump_info (REPORT_DETAILS))
6176 {
6177 fprintf (vect_dump, "no builtin for vect permute for ");
6178 print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
6179 }
6180 return false;
6181 }
6184 builtin_decl = targetm.vectorize.builtin_vec_perm (vectype,
6185 &mask_element_type);
6186 if (!builtin_decl || !mask_element_type)
6187 {
6188 if (vect_print_dump_info (REPORT_DETAILS))
6189 {
6190 fprintf (vect_dump, "no builtin for vect permute for ");
6191 print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
6192 }
6193 return false;
6194 }
6197 mask_type = get_vectype_for_scalar_type (mask_element_type);
6198 mask_nunits = TYPE_VECTOR_SUBPARTS (mask_type);
6199 mask = (int *) xmalloc (sizeof (int) * mask_nunits);
6200 nunits = TYPE_VECTOR_SUBPARTS (vectype);
6201 scale = mask_nunits / nunits;
6202 unroll_factor = SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance);
6204 /* The number of vector stmts to generate based only on SLP_NODE_INSTANCE
6205 unrolling factor. */
6206 orig_vec_stmts_num = group_size *
6207 SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance) / nunits;
6208 if (orig_vec_stmts_num == 1)
6209 only_one_vec = true;
6211 /* Number of copies is determined by the final vectorization factor
6212 relatively to SLP_NODE_INSTANCE unrolling factor. */
6213 ncopies = vf / SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance);
6215 /* Generate permutation masks for every NODE. Number of masks for each NODE
6216 is equal to GROUP_SIZE.
6217 E.g., we have a group of three nodes with three loads from the same
6218 location in each node, and the vector size is 4. I.e., we have an
6219 a0b0c0a1b1c1... sequence and we need to create the following vectors:
6220 for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
6221 for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
6222 for c's: c0c0c0c1 c1c1c2c2 c2c3c3c3
6224 The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9} (in target
6225 specific type, e.g., in bytes for Altivec).
6226 The last mask is illegal since we assume two operands for permute
6227 operation, and the mask element values can't be outside that range. Hence,
6228 the last mask must be converted into {2,5,5,5}.
6229 For the first two permutations we need the first and the second input
6230 vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
6231 we need the second and the third vectors: {b1,c1,a2,b2} and
6232 {c2,a3,b3,c3}. */
6234 for (i = 0;
6235 VEC_iterate (slp_tree, SLP_INSTANCE_LOADS (slp_node_instance),
6236 i, node);
6237 i++)
6238 {
6239 scalar_index = 0;
6240 index = 0;
6241 vect_stmts_counter = 0;
6243 first_vec_index = vec_index++;
6244 if (only_one_vec)
6245 second_vec_index = first_vec_index;
6246 else
6247 second_vec_index = vec_index++;
6249 for (j = 0; j < unroll_factor; j++)
6250 {
6251 for (k = 0; k < group_size; k++)
6252 {
6253 first_mask_element = (i + j * group_size) * scale;
6254 for (m = 0; m < scale; m++)
6255 {
6256 if (!vect_get_mask_element (stmt, first_mask_element, m,
6257 mask_nunits, only_one_vec, index, mask,
6258 &current_mask_element, &need_next_vector))
6259 return false;
6261 mask[index++] = current_mask_element;
6264 if (index == mask_nunits)
6265 {
6266 index = 0;
6269 if (need_next_vector)
6270 {
6271 first_vec_index = second_vec_index;
6272 second_vec_index = vec_index;
6273 }
6275 next_scalar_stmt = VEC_index (gimple,
6276 SLP_TREE_SCALAR_STMTS (node), scalar_index++);
6278 vect_create_mask_and_perm (stmt, next_scalar_stmt,
6279 mask, mask_nunits, mask_element_type, mask_type,
6280 first_vec_index, second_vec_index, gsi, node,
6281 builtin_decl, vectype, dr_chain, ncopies,
6282 vect_stmts_counter++);
6283 }
6284 }
6285 }
6286 }
6287 }
6289 free (mask);
6291 return true;
6292 }
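/* Editorial sketch (not part of GCC): a standalone rendition of the mask
   generation above for the a0b0c0... example in the comment, assuming
   group_size == 3, nunits == 4, scale == 1 and unroll_factor == 4; all
   names are illustrative only. */
#if 0
#include <stdio.h>

int
main (void)
{
  int group_size = 3, nunits = 4, scale = 1, unroll_factor = 4;
  int i = 0; /* Node index: 0 selects the a's. */
  int mask[4], index = 0, j, k, m;

  for (j = 0; j < unroll_factor; j++)
    for (k = 0; k < group_size; k++)
      for (m = 0; m < scale; m++)
	{
	  mask[index++] = (i + j * group_size) * scale + m;
	  if (index == nunits)
	    {
	      printf ("{%d,%d,%d,%d}\n", mask[0], mask[1], mask[2], mask[3]);
	      index = 0;
	    }
	}
  /* Prints {0,0,0,3}, {3,3,6,6} and {6,9,9,9}; the last, illegal mask is
     what vect_get_mask_element rewrites into {2,5,5,5}. */
  return 0;
}
#endif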
6293 /* vectorizable_load.
6295 Check if STMT reads a non-scalar data-ref (array/pointer/structure) that
6296 can be vectorized.
6297 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
6298 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
6299 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
6301 bool
6302 vectorizable_load (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt,
6303 slp_tree slp_node, slp_instance slp_node_instance)
6304 {
6305 tree scalar_dest;
6306 tree vec_dest = NULL;
6307 tree data_ref = NULL;
6308 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6309 stmt_vec_info prev_stmt_info;
6310 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6311 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6312 struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
6313 bool nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
6314 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info), *first_dr;
6315 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6316 tree new_temp;
6317 int mode;
6318 gimple new_stmt = NULL;
6319 tree dummy;
6320 enum dr_alignment_support alignment_support_scheme;
6321 tree dataref_ptr = NULL_TREE;
6322 gimple ptr_incr;
6323 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
6324 int ncopies;
6325 int i, j, group_size;
6326 tree msq = NULL_TREE, lsq;
6327 tree offset = NULL_TREE;
6328 tree realignment_token = NULL_TREE;
6329 gimple phi = NULL;
6330 VEC(tree,heap) *dr_chain = NULL;
6331 bool strided_load = false;
6332 gimple first_stmt;
6333 tree scalar_type;
6334 bool inv_p;
6335 bool compute_in_loop = false;
6336 struct loop *at_loop;
6337 int vec_num;
6338 bool slp = (slp_node != NULL);
6339 bool slp_perm = false;
6340 enum tree_code code;
6342 /* Multiple types in SLP are handled by creating the appropriate number of
6343 vectorized stmts for each SLP node. Hence, NCOPIES is always 1 in
6344 case of SLP. */
6345 if (slp)
6346 ncopies = 1;
6347 else
6348 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
6350 gcc_assert (ncopies >= 1);
6352 /* FORNOW. This restriction should be relaxed. */
6353 if (nested_in_vect_loop && ncopies > 1)
6354 {
6355 if (vect_print_dump_info (REPORT_DETAILS))
6356 fprintf (vect_dump, "multiple types in nested loop.");
6357 return false;
6358 }
6360 if (slp && SLP_INSTANCE_LOAD_PERMUTATION (slp_node_instance))
6361 slp_perm = true;
6363 if (!STMT_VINFO_RELEVANT_P (stmt_info))
6364 return false;
6366 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
6367 return false;
6369 /* Is vectorizable load? */
6370 if (!is_gimple_assign (stmt))
6371 return false;
6373 scalar_dest = gimple_assign_lhs (stmt);
6374 if (TREE_CODE (scalar_dest) != SSA_NAME)
6375 return false;
6377 code = gimple_assign_rhs_code (stmt);
6378 if (code != ARRAY_REF
6379 && code != INDIRECT_REF
6380 && !STMT_VINFO_STRIDED_ACCESS (stmt_info))
6381 return false;
6383 if (!STMT_VINFO_DATA_REF (stmt_info))
6384 return false;
6386 scalar_type = TREE_TYPE (DR_REF (dr));
6387 mode = (int) TYPE_MODE (vectype);
6389 /* FORNOW. In some cases can vectorize even if data-type not supported
6390 (e.g. - data copies). */
6391 if (optab_handler (mov_optab, mode)->insn_code == CODE_FOR_nothing)
6392 {
6393 if (vect_print_dump_info (REPORT_DETAILS))
6394 fprintf (vect_dump, "Aligned load, but unsupported type.");
6395 return false;
6396 }
6398 /* The vector component type needs to be trivially convertible to the
6399 scalar lhs. This should always be the case. */
6400 if (!useless_type_conversion_p (TREE_TYPE (scalar_dest), TREE_TYPE (vectype)))
6401 {
6402 if (vect_print_dump_info (REPORT_DETAILS))
6403 fprintf (vect_dump, "??? operands of different types");
6404 return false;
6405 }
6407 /* Check if the load is a part of an interleaving chain. */
6408 if (STMT_VINFO_STRIDED_ACCESS (stmt_info))
6409 {
6410 strided_load = true;
6411 /* FORNOW */
6412 gcc_assert (! nested_in_vect_loop);
6414 /* Check if interleaving is supported. */
6415 if (!vect_strided_load_supported (vectype)
6416 && !PURE_SLP_STMT (stmt_info) && !slp)
6417 return false;
6418 }
6420 if (!vec_stmt) /* transformation not required. */
6421 {
6422 STMT_VINFO_TYPE (stmt_info) = load_vec_info_type;
6423 vect_model_load_cost (stmt_info, ncopies, NULL);
6424 return true;
6425 }
6427 if (vect_print_dump_info (REPORT_DETAILS))
6428 fprintf (vect_dump, "transform load.");
6434 first_stmt = DR_GROUP_FIRST_DR (stmt_info);
6435 /* Check if the chain of loads is already vectorized. */
6436 if (STMT_VINFO_VEC_STMT (vinfo_for_stmt (first_stmt)))
6437 {
6438 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
6439 return true;
6440 }
6441 first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
6442 group_size = DR_GROUP_SIZE (vinfo_for_stmt (first_stmt));
6444 /* VEC_NUM is the number of vect stmts to be created for this group. */
6445 if (slp)
6446 {
6447 strided_load = false;
6448 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
6449 }
6450 else
6451 vec_num = group_size;
6453 dr_chain = VEC_alloc (tree, heap, vec_num);
6454 }
6455 else
6456 {
6457 first_stmt = stmt;
6458 first_dr = dr;
6459 group_size = vec_num = 1;
6460 }
6462 alignment_support_scheme = vect_supportable_dr_alignment (first_dr);
6463 gcc_assert (alignment_support_scheme);
6465 /* In case the vectorization factor (VF) is bigger than the number
6466 of elements that we can fit in a vectype (nunits), we have to generate
6467 more than one vector stmt - i.e - we need to "unroll" the
6468 vector stmt by a factor VF/nunits. In doing so, we record a pointer
6469 from one copy of the vector stmt to the next, in the field
6470 STMT_VINFO_RELATED_STMT. This is necessary in order to allow following
6471 stages to find the correct vector defs to be used when vectorizing
6472 stmts that use the defs of the current stmt. The example below illustrates
6473 the vectorization process when VF=16 and nunits=4 (i.e - we need to create
6474 4 vectorized stmts):
6476 before vectorization:
6477 RELATED_STMT VEC_STMT
6478 S1: x = memref - -
6479 S2: z = x + 1 - -
6481 step 1: vectorize stmt S1:
6482 We first create the vector stmt VS1_0, and, as usual, record a
6483 pointer to it in the STMT_VINFO_VEC_STMT of the scalar stmt S1.
6484 Next, we create the vector stmt VS1_1, and record a pointer to
6485 it in the STMT_VINFO_RELATED_STMT of the vector stmt VS1_0.
6486 Similarly, for VS1_2 and VS1_3. This is the resulting chain of
6487 stmts and pointers:
6488 RELATED_STMT VEC_STMT
6489 VS1_0: vx0 = memref0 VS1_1 -
6490 VS1_1: vx1 = memref1 VS1_2 -
6491 VS1_2: vx2 = memref2 VS1_3 -
6492 VS1_3: vx3 = memref3 - -
6493 S1: x = load - VS1_0
6494 S2: z = x + 1 - -
6496 See in documentation in vect_get_vec_def_for_stmt_copy for how the
6497 information we recorded in RELATED_STMT field is used to vectorize
6498 stmt S2. */
6500 /* In case of interleaving (non-unit strided access):
6502 S1: x2 = &base + 2
6503 S2: x0 = &base
6504 S3: x1 = &base + 1
6505 S4: x3 = &base + 3
6507 Vectorized loads are created in the order of memory accesses
6508 starting from the access of the first stmt of the chain:
6510 VS1: vx0 = &base
6511 VS2: vx1 = &base + vec_size*1
6512 VS3: vx3 = &base + vec_size*2
6513 VS4: vx4 = &base + vec_size*3
6515 Then permutation statements are generated:
6517 VS5: vx5 = VEC_EXTRACT_EVEN_EXPR < vx0, vx1 >
6518 VS6: vx6 = VEC_EXTRACT_ODD_EXPR < vx0, vx1 >
6521 And they are put in STMT_VINFO_VEC_STMT of the corresponding scalar stmts
6522 (the order of the data-refs in the output of vect_permute_load_chain
6523 corresponds to the order of scalar stmts in the interleaving chain - see
6524 the documentation of vect_permute_load_chain()).
6525 The generation of permutation stmts and recording them in
6526 STMT_VINFO_VEC_STMT is done in vect_transform_strided_load().
6528 In case of both multiple types and interleaving, the vector loads and
6529 permutation stmts above are created for every copy. The result vector stmts
6530 are put in STMT_VINFO_VEC_STMT for the first copy and in the corresponding
6531 STMT_VINFO_RELATED_STMT for the next copies. */
6533 /* If the data reference is aligned (dr_aligned) or potentially unaligned
6534 on a target that supports unaligned accesses (dr_unaligned_supported)
6535 we generate the following code:
6536 p = initial_addr;
6537 indx = 0;
6538 loop {
6539 p = p + indx * vectype_size;
6540 vec_dest = *(p);
6541 indx = indx + 1;
6542 }
6544 Otherwise, the data reference is potentially unaligned on a target that
6545 does not support unaligned accesses (dr_explicit_realign_optimized) -
6546 then generate the following code, in which the data in each iteration is
6547 obtained by two vector loads, one from the previous iteration, and one
6548 from the current iteration:
6550 msq_init = *(floor(p1))
6551 p2 = initial_addr + VS - 1;
6552 realignment_token = call target_builtin;
6553 indx = 0;
6554 loop {
6555 p2 = p2 + indx * vectype_size
6556 lsq = *(floor(p2))
6557 vec_dest = realign_load (msq, lsq, realignment_token)
6558 indx = indx + 1;
6559 msq = lsq;
6560 } */
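/* Editorial illustration (figures assumed): with 16-byte vectors and P
   misaligned by 4 bytes, msq_init = *(floor(p1)) reads bytes 0..15 and
   lsq = *(floor(p2)) with p2 = p1 + 15 reads bytes 16..31; realign_load
   then uses the realignment token, derived from the low address bits, to
   select bytes 4..19 - the very vector an unaligned load from P would
   have produced. */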
6562 /* If the misalignment remains the same throughout the execution of the
6563 loop, we can create the init_addr and permutation mask at the loop
6564 preheader. Otherwise, it needs to be created inside the loop.
6565 This can only occur when vectorizing memory accesses in the inner-loop
6566 nested within an outer-loop that is being vectorized. */
6568 if (nested_in_vect_loop_p (loop, stmt)
6569 && (TREE_INT_CST_LOW (DR_STEP (dr))
6570 % GET_MODE_SIZE (TYPE_MODE (vectype)) != 0))
6571 {
6572 gcc_assert (alignment_support_scheme != dr_explicit_realign_optimized);
6573 compute_in_loop = true;
6574 }
6576 if ((alignment_support_scheme == dr_explicit_realign_optimized
6577 || alignment_support_scheme == dr_explicit_realign)
6578 && !compute_in_loop)
6579 {
6580 msq = vect_setup_realignment (first_stmt, gsi, &realignment_token,
6581 alignment_support_scheme, NULL_TREE,
6582 &at_loop);
6583 if (alignment_support_scheme == dr_explicit_realign_optimized)
6584 {
6585 phi = SSA_NAME_DEF_STMT (msq);
6586 offset = size_int (TYPE_VECTOR_SUBPARTS (vectype) - 1);
6587 }
6588 }
6590 else
6591 at_loop = loop;
6592 prev_stmt_info = NULL;
6593 for (j = 0; j < ncopies; j++)
6594 {
6595 /* 1. Create the vector pointer update chain. */
6596 if (j == 0)
6597 dataref_ptr = vect_create_data_ref_ptr (first_stmt,
6598 at_loop, offset,
6599 &dummy, &ptr_incr, false,
6600 &inv_p, NULL_TREE);
6601 else
6602 dataref_ptr =
6603 bump_vector_ptr (dataref_ptr, ptr_incr, gsi, stmt, NULL_TREE);
6605 for (i = 0; i < vec_num; i++)
6606 {
6607 if (i > 0)
6608 dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, gsi, stmt,
6609 NULL_TREE);
6611 /* 2. Create the vector-load in the loop. */
6612 switch (alignment_support_scheme)
6613 {
6614 case dr_aligned:
6615 gcc_assert (aligned_access_p (first_dr));
6616 data_ref = build_fold_indirect_ref (dataref_ptr);
6617 break;
6618 case dr_unaligned_supported:
6619 {
6620 int mis = DR_MISALIGNMENT (first_dr);
6621 tree tmis = (mis == -1 ? size_zero_node : size_int (mis));
6623 tmis = size_binop (MULT_EXPR, tmis, size_int (BITS_PER_UNIT));
6624 data_ref =
6625 build2 (MISALIGNED_INDIRECT_REF, vectype, dataref_ptr, tmis);
6626 break;
6627 }
6628 case dr_explicit_realign:
6629 {
6630 tree ptr, bump;
6631 tree vs_minus_1 = size_int (TYPE_VECTOR_SUBPARTS (vectype) - 1);
6633 if (compute_in_loop)
6634 msq = vect_setup_realignment (first_stmt, gsi,
6635 &realignment_token,
6636 dr_explicit_realign,
6637 dataref_ptr, NULL);
6639 data_ref = build1 (ALIGN_INDIRECT_REF, vectype, dataref_ptr);
6640 vec_dest = vect_create_destination_var (scalar_dest, vectype);
6641 new_stmt = gimple_build_assign (vec_dest, data_ref);
6642 new_temp = make_ssa_name (vec_dest, new_stmt);
6643 gimple_assign_set_lhs (new_stmt, new_temp);
6644 vect_finish_stmt_generation (stmt, new_stmt, gsi);
6645 copy_virtual_operands (new_stmt, stmt);
6646 mark_symbols_for_renaming (new_stmt);
6647 msq = new_temp;
6649 bump = size_binop (MULT_EXPR, vs_minus_1,
6650 TYPE_SIZE_UNIT (scalar_type));
6651 ptr = bump_vector_ptr (dataref_ptr, NULL, gsi, stmt, bump);
6652 data_ref = build1 (ALIGN_INDIRECT_REF, vectype, ptr);
6653 break;
6654 }
6655 case dr_explicit_realign_optimized:
6656 data_ref = build1 (ALIGN_INDIRECT_REF, vectype, dataref_ptr);
6657 break;
6658 default:
6659 gcc_unreachable ();
6660 }
6661 vec_dest = vect_create_destination_var (scalar_dest, vectype);
6662 new_stmt = gimple_build_assign (vec_dest, data_ref);
6663 new_temp = make_ssa_name (vec_dest, new_stmt);
6664 gimple_assign_set_lhs (new_stmt, new_temp);
6665 vect_finish_stmt_generation (stmt, new_stmt, gsi);
6666 mark_symbols_for_renaming (new_stmt);
6668 /* 3. Handle explicit realignment if necessary/supported. Create in
6669 loop: vec_dest = realign_load (msq, lsq, realignment_token) */
6670 if (alignment_support_scheme == dr_explicit_realign_optimized
6671 || alignment_support_scheme == dr_explicit_realign)
6672 {
6673 tree tmp;
6675 lsq = gimple_assign_lhs (new_stmt);
6676 if (!realignment_token)
6677 realignment_token = dataref_ptr;
6678 vec_dest = vect_create_destination_var (scalar_dest, vectype);
6679 tmp = build3 (REALIGN_LOAD_EXPR, vectype, msq, lsq,
6680 realignment_token);
6681 new_stmt = gimple_build_assign (vec_dest, tmp);
6682 new_temp = make_ssa_name (vec_dest, new_stmt);
6683 gimple_assign_set_lhs (new_stmt, new_temp);
6684 vect_finish_stmt_generation (stmt, new_stmt, gsi);
6686 if (alignment_support_scheme == dr_explicit_realign_optimized)
6687 {
6689 if (i == vec_num - 1 && j == ncopies - 1)
6690 add_phi_arg (phi, lsq, loop_latch_edge (containing_loop));
6691 msq = lsq;
6692 }
6693 }
6695 /* 4. Handle invariant-load. */
6696 if (inv_p)
6697 {
6698 gcc_assert (!strided_load);
6699 gcc_assert (nested_in_vect_loop_p (loop, stmt));
6700 if (j == 0)
6701 {
6702 int k;
6703 tree t = NULL_TREE;
6704 tree vec_inv, bitpos, bitsize = TYPE_SIZE (scalar_type);
6706 /* CHECKME: bitpos depends on endianness? */
6707 bitpos = bitsize_zero_node;
6708 vec_inv = build3 (BIT_FIELD_REF, scalar_type, new_temp,
6709 bitsize, bitpos);
6710 vec_dest =
6711 vect_create_destination_var (scalar_dest, NULL_TREE);
6712 new_stmt = gimple_build_assign (vec_dest, vec_inv);
6713 new_temp = make_ssa_name (vec_dest, new_stmt);
6714 gimple_assign_set_lhs (new_stmt, new_temp);
6715 vect_finish_stmt_generation (stmt, new_stmt, gsi);
6717 for (k = nunits - 1; k >= 0; --k)
6718 t = tree_cons (NULL_TREE, new_temp, t);
6719 /* FIXME: use build_constructor directly. */
6720 vec_inv = build_constructor_from_list (vectype, t);
6721 new_temp = vect_init_vector (stmt, vec_inv, vectype, gsi);
6722 new_stmt = SSA_NAME_DEF_STMT (new_temp);
6723 }
6724 else
6725 gcc_unreachable (); /* FORNOW. */
6726 }
6728 /* Collect vector loads and later create their permutation in
6729 vect_transform_strided_load (). */
6730 if (strided_load || slp_perm)
6731 VEC_quick_push (tree, dr_chain, new_temp);
6733 /* Store vector loads in the corresponding SLP_NODE. */
6734 if (slp && !slp_perm)
6735 VEC_quick_push (gimple, SLP_TREE_VEC_STMTS (slp_node), new_stmt);
6736 }
6738 if (slp && !slp_perm)
6739 continue;
6741 if (slp_perm)
6742 {
6743 if (!vect_transform_slp_perm_load (stmt, dr_chain, gsi,
6744 LOOP_VINFO_VECT_FACTOR (loop_vinfo),
6745 slp_node_instance, false))
6746 {
6747 VEC_free (tree, heap, dr_chain);
6748 return false;
6749 }
6750 }
6751 else
6752 {
6753 if (strided_load)
6754 {
6755 if (!vect_transform_strided_load (stmt, dr_chain, group_size, gsi))
6756 return false;
6758 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
6759 VEC_free (tree, heap, dr_chain);
6760 dr_chain = VEC_alloc (tree, heap, group_size);
6761 }
6762 else
6763 {
6764 if (j == 0)
6765 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
6766 else
6767 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
6768 prev_stmt_info = vinfo_for_stmt (new_stmt);
6769 }
6770 }
6771 }
6773 VEC_free (tree, heap, dr_chain);
6776 return true;
6777 }
6780 /* Function vectorizable_live_operation.
6782 STMT computes a value that is used outside the loop. Check if
6783 it can be supported. */
6785 bool
6786 vectorizable_live_operation (gimple stmt,
6787 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
6788 gimple *vec_stmt ATTRIBUTE_UNUSED)
6789 {
6790 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6791 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6792 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6793 int i;
6794 int op_type;
6795 tree op;
6796 tree def;
6797 gimple def_stmt;
6798 enum vect_def_type dt;
6799 enum tree_code code;
6800 enum gimple_rhs_class rhs_class;
6802 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
6804 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
6805 return true;
6807 if (!is_gimple_assign (stmt))
6808 return false;
6810 if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
6811 return false;
6813 /* FORNOW. CHECKME. */
6814 if (nested_in_vect_loop_p (loop, stmt))
6815 return false;
6817 code = gimple_assign_rhs_code (stmt);
6818 op_type = TREE_CODE_LENGTH (code);
6819 rhs_class = get_gimple_rhs_class (code);
6820 gcc_assert (rhs_class != GIMPLE_UNARY_RHS || op_type == unary_op);
6821 gcc_assert (rhs_class != GIMPLE_BINARY_RHS || op_type == binary_op);
6823 /* FORNOW: support only if all uses are invariant. This means
6824 that the scalar operations can remain in place, unvectorized.
6825 The original last scalar value that they compute will be used. */
6827 for (i = 0; i < op_type; i++)
6828 {
6829 if (rhs_class == GIMPLE_SINGLE_RHS)
6830 op = TREE_OPERAND (gimple_op (stmt, 1), i);
6831 else
6832 op = gimple_op (stmt, i + 1);
6833 if (op && !vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt))
6834 {
6835 if (vect_print_dump_info (REPORT_DETAILS))
6836 fprintf (vect_dump, "use not simple.");
6837 return false;
6838 }
6840 if (dt != vect_invariant_def && dt != vect_constant_def)
6841 return false;
6842 }
6844 /* No transformation is required for the cases we currently support. */
6845 return true;
6846 }
6849 /* Function vect_is_simple_cond.
6851 Input:
6852 LOOP - the loop that is being vectorized.
6853 COND - Condition that is checked for simple use.
6855 Returns whether a COND can be vectorized. Checks whether
6856 condition operands are supportable using vect_is_simple_use. */
6858 static bool
6859 vect_is_simple_cond (tree cond, loop_vec_info loop_vinfo)
6860 {
6861 tree lhs, rhs;
6862 tree def;
6863 enum vect_def_type dt;
6865 if (!COMPARISON_CLASS_P (cond))
6866 return false;
6868 lhs = TREE_OPERAND (cond, 0);
6869 rhs = TREE_OPERAND (cond, 1);
6871 if (TREE_CODE (lhs) == SSA_NAME)
6872 {
6873 gimple lhs_def_stmt = SSA_NAME_DEF_STMT (lhs);
6874 if (!vect_is_simple_use (lhs, loop_vinfo, &lhs_def_stmt, &def, &dt))
6875 return false;
6876 }
6877 else if (TREE_CODE (lhs) != INTEGER_CST && TREE_CODE (lhs) != REAL_CST
6878 && TREE_CODE (lhs) != FIXED_CST)
6879 return false;
6881 if (TREE_CODE (rhs) == SSA_NAME)
6882 {
6883 gimple rhs_def_stmt = SSA_NAME_DEF_STMT (rhs);
6884 if (!vect_is_simple_use (rhs, loop_vinfo, &rhs_def_stmt, &def, &dt))
6885 return false;
6886 }
6887 else if (TREE_CODE (rhs) != INTEGER_CST && TREE_CODE (rhs) != REAL_CST
6888 && TREE_CODE (rhs) != FIXED_CST)
6889 return false;
6891 return true;
6892 }
6894 /* vectorizable_condition.
6896 Check if STMT is a conditional modify expression that can be vectorized.
6897 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
6898 stmt using VEC_COND_EXPR to replace it, put it in VEC_STMT, and insert it
6899 at BSI.
6901 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
6903 bool
6904 vectorizable_condition (gimple stmt, gimple_stmt_iterator *gsi,
6905 gimple *vec_stmt)
6906 {
6907 tree scalar_dest = NULL_TREE;
6908 tree vec_dest = NULL_TREE;
6909 tree op = NULL_TREE;
6910 tree cond_expr, then_clause, else_clause;
6911 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6912 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6913 tree vec_cond_lhs, vec_cond_rhs, vec_then_clause, vec_else_clause;
6914 tree vec_compare, vec_cond_expr;
6915 tree new_temp;
6916 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6917 enum machine_mode vec_mode;
6918 tree def;
6919 enum vect_def_type dt;
6920 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
6921 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
6922 enum tree_code code;
6924 gcc_assert (ncopies >= 1);
6925 if (ncopies > 1)
6926 return false; /* FORNOW */
6928 if (!STMT_VINFO_RELEVANT_P (stmt_info))
6929 return false;
6931 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
6932 return false;
6934 /* FORNOW: SLP not supported. */
6935 if (STMT_SLP_TYPE (stmt_info))
6936 return false;
6938 /* FORNOW: not yet supported. */
6939 if (STMT_VINFO_LIVE_P (stmt_info))
6940 {
6941 if (vect_print_dump_info (REPORT_DETAILS))
6942 fprintf (vect_dump, "value used after loop.");
6943 return false;
6944 }
6946 /* Is vectorizable conditional operation? */
6947 if (!is_gimple_assign (stmt))
6948 return false;
6950 code = gimple_assign_rhs_code (stmt);
6952 if (code != COND_EXPR)
6953 return false;
6955 gcc_assert (gimple_assign_single_p (stmt));
6956 op = gimple_assign_rhs1 (stmt);
6957 cond_expr = TREE_OPERAND (op, 0);
6958 then_clause = TREE_OPERAND (op, 1);
6959 else_clause = TREE_OPERAND (op, 2);
6961 if (!vect_is_simple_cond (cond_expr, loop_vinfo))
6962 return false;
6964 /* We do not handle two different vector types for the condition
6965 and the values. */
6966 if (TREE_TYPE (TREE_OPERAND (cond_expr, 0)) != TREE_TYPE (vectype))
6967 return false;
6969 if (TREE_CODE (then_clause) == SSA_NAME)
6970 {
6971 gimple then_def_stmt = SSA_NAME_DEF_STMT (then_clause);
6972 if (!vect_is_simple_use (then_clause, loop_vinfo,
6973 &then_def_stmt, &def, &dt))
6974 return false;
6975 }
6976 else if (TREE_CODE (then_clause) != INTEGER_CST
6977 && TREE_CODE (then_clause) != REAL_CST
6978 && TREE_CODE (then_clause) != FIXED_CST)
6979 return false;
6981 if (TREE_CODE (else_clause) == SSA_NAME)
6982 {
6983 gimple else_def_stmt = SSA_NAME_DEF_STMT (else_clause);
6984 if (!vect_is_simple_use (else_clause, loop_vinfo,
6985 &else_def_stmt, &def, &dt))
6986 return false;
6987 }
6988 else if (TREE_CODE (else_clause) != INTEGER_CST
6989 && TREE_CODE (else_clause) != REAL_CST
6990 && TREE_CODE (else_clause) != FIXED_CST)
6991 return false;
6994 vec_mode = TYPE_MODE (vectype);
6996 if (!vec_stmt)
6997 {
6998 STMT_VINFO_TYPE (stmt_info) = condition_vec_info_type;
6999 return expand_vec_cond_expr_p (op, vec_mode);
7000 }
7002 /* Transform. */
7004 /* Handle def. */
7005 scalar_dest = gimple_assign_lhs (stmt);
7006 vec_dest = vect_create_destination_var (scalar_dest, vectype);
7008 /* Handle cond expr. */
7009 vec_cond_lhs =
7010 vect_get_vec_def_for_operand (TREE_OPERAND (cond_expr, 0), stmt, NULL);
7011 vec_cond_rhs =
7012 vect_get_vec_def_for_operand (TREE_OPERAND (cond_expr, 1), stmt, NULL);
7013 vec_then_clause = vect_get_vec_def_for_operand (then_clause, stmt, NULL);
7014 vec_else_clause = vect_get_vec_def_for_operand (else_clause, stmt, NULL);
7016 /* Arguments are ready. Create the new vector stmt. */
7017 vec_compare = build2 (TREE_CODE (cond_expr), vectype,
7018 vec_cond_lhs, vec_cond_rhs);
7019 vec_cond_expr = build3 (VEC_COND_EXPR, vectype,
7020 vec_compare, vec_then_clause, vec_else_clause);
7022 *vec_stmt = gimple_build_assign (vec_dest, vec_cond_expr);
7023 new_temp = make_ssa_name (vec_dest, *vec_stmt);
7024 gimple_assign_set_lhs (*vec_stmt, new_temp);
7025 vect_finish_stmt_generation (stmt, *vec_stmt, gsi);
7027 return true;
7028 }
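/* Editorial illustration: for a scalar statement
     x = a < b ? c : d
   the code above emits the single vector statement
     vx = VEC_COND_EXPR <va < vb, vc, vd>
   where va, vb, vc and vd are the vector defs obtained for the four
   scalar operands. */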
7031 /* Function vect_transform_stmt.
7033 Create a vectorized stmt to replace STMT, and insert it at BSI. */
7035 static bool
7036 vect_transform_stmt (gimple stmt, gimple_stmt_iterator *gsi,
7037 bool *strided_store, slp_tree slp_node,
7038 slp_instance slp_node_instance)
7039 {
7040 bool is_store = false;
7041 gimple vec_stmt = NULL;
7042 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
7043 gimple orig_stmt_in_pattern;
7044 bool done;
7045 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7046 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7048 switch (STMT_VINFO_TYPE (stmt_info))
7049 {
7050 case type_demotion_vec_info_type:
7051 done = vectorizable_type_demotion (stmt, gsi, &vec_stmt, slp_node);
7052 gcc_assert (done);
7053 break;
7055 case type_promotion_vec_info_type:
7056 done = vectorizable_type_promotion (stmt, gsi, &vec_stmt, slp_node);
7057 gcc_assert (done);
7058 break;
7060 case type_conversion_vec_info_type:
7061 done = vectorizable_conversion (stmt, gsi, &vec_stmt, slp_node);
7062 gcc_assert (done);
7063 break;
7065 case induc_vec_info_type:
7066 gcc_assert (!slp_node);
7067 done = vectorizable_induction (stmt, gsi, &vec_stmt);
7068 gcc_assert (done);
7069 break;
7071 case op_vec_info_type:
7072 done = vectorizable_operation (stmt, gsi, &vec_stmt, slp_node);
7073 gcc_assert (done);
7074 break;
7076 case assignment_vec_info_type:
7077 done = vectorizable_assignment (stmt, gsi, &vec_stmt, slp_node);
7078 gcc_assert (done);
7079 break;
7081 case load_vec_info_type:
7082 done = vectorizable_load (stmt, gsi, &vec_stmt, slp_node,
7083 slp_node_instance);
7084 gcc_assert (done);
7085 break;
7087 case store_vec_info_type:
7088 done = vectorizable_store (stmt, gsi, &vec_stmt, slp_node);
7089 gcc_assert (done);
7090 if (STMT_VINFO_STRIDED_ACCESS (stmt_info) && !slp_node)
7091 {
7092 /* In case of interleaving, the whole chain is vectorized when the
7093 last store in the chain is reached. Store stmts before the last
7094 one are skipped, and their vec_stmt_info shouldn't be freed
7095 meanwhile. */
7096 *strided_store = true;
7097 if (STMT_VINFO_VEC_STMT (stmt_info))
7098 is_store = true;
7099 }
7100 else
7101 is_store = true;
7102 break;
7104 case condition_vec_info_type:
7105 gcc_assert (!slp_node);
7106 done = vectorizable_condition (stmt, gsi, &vec_stmt);
7107 gcc_assert (done);
7108 break;
7110 case call_vec_info_type:
7111 gcc_assert (!slp_node);
7112 done = vectorizable_call (stmt, gsi, &vec_stmt);
7113 gcc_assert (done);
7114 break;
7115 case reduc_vec_info_type:
7116 gcc_assert (!slp_node);
7117 done = vectorizable_reduction (stmt, gsi, &vec_stmt);
7118 gcc_assert (done);
7119 break;
7121 default:
7122 if (!STMT_VINFO_LIVE_P (stmt_info))
7123 {
7124 if (vect_print_dump_info (REPORT_DETAILS))
7125 fprintf (vect_dump, "stmt not supported.");
7126 gcc_unreachable ();
7127 }
7128 }
7130 /* Handle inner-loop stmts whose DEF is used in the loop-nest that
7131 is being vectorized, but outside the immediately enclosing loop. */
7132 if (vec_stmt
7133 && nested_in_vect_loop_p (loop, stmt)
7134 && STMT_VINFO_TYPE (stmt_info) != reduc_vec_info_type
7135 && (STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_outer
7136 || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_outer_by_reduction))
7137 {
7138 struct loop *innerloop = loop->inner;
7139 imm_use_iterator imm_iter;
7140 use_operand_p use_p;
7141 tree scalar_dest;
7142 gimple exit_phi;
7144 if (vect_print_dump_info (REPORT_DETAILS))
7145 fprintf (vect_dump, "Record the vdef for outer-loop vectorization.");
7147 /* Find the relevant loop-exit phi-node, and record the vec_stmt there
7148 (to be used when vectorizing outer-loop stmts that use the DEF of
7149 STMT). */
7150 if (gimple_code (stmt) == GIMPLE_PHI)
7151 scalar_dest = PHI_RESULT (stmt);
7152 else
7153 scalar_dest = gimple_assign_lhs (stmt);
7155 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
7156 {
7157 if (!flow_bb_inside_loop_p (innerloop, gimple_bb (USE_STMT (use_p))))
7158 {
7159 exit_phi = USE_STMT (use_p);
7160 STMT_VINFO_VEC_STMT (vinfo_for_stmt (exit_phi)) = vec_stmt;
7161 }
7162 }
7163 }
7165 /* Handle stmts whose DEF is used outside the loop-nest that is
7166 being vectorized. */
7167 if (STMT_VINFO_LIVE_P (stmt_info)
7168 && STMT_VINFO_TYPE (stmt_info) != reduc_vec_info_type)
7170 done = vectorizable_live_operation (stmt, gsi, &vec_stmt);
7171 gcc_assert (done);
7172 }
7174 if (vec_stmt)
7175 {
7176 STMT_VINFO_VEC_STMT (stmt_info) = vec_stmt;
7177 orig_stmt_in_pattern = STMT_VINFO_RELATED_STMT (stmt_info);
7178 if (orig_stmt_in_pattern)
7180 stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt_in_pattern);
7181 /* STMT was inserted by the vectorizer to replace a computation idiom.
7182 ORIG_STMT_IN_PATTERN is a stmt in the original sequence that
7183 computed this idiom. We need to record a pointer to VEC_STMT in
7184 the stmt_info of ORIG_STMT_IN_PATTERN. See more details in the
7185 documentation of vect_pattern_recog. */
7186 if (STMT_VINFO_IN_PATTERN_P (stmt_vinfo))
7188 gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
7189 STMT_VINFO_VEC_STMT (stmt_vinfo) = vec_stmt;
7190 }
7191 }
7192 }
7194 return is_store;
7195 }
7198 /* This function builds ni_name = number of iterations the loop executes,
7199 on the loop preheader. */
7201 static tree
7202 vect_build_loop_niters (loop_vec_info loop_vinfo)
7203 {
7204 tree ni_name, var;
7205 gimple_seq stmts = NULL;
7206 edge pe;
7207 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7208 tree ni = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo));
7210 var = create_tmp_var (TREE_TYPE (ni), "niters");
7211 add_referenced_var (var);
7212 ni_name = force_gimple_operand (ni, &stmts, false, var);
7214 pe = loop_preheader_edge (loop);
7215 if (stmts)
7216 {
7217 basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7218 gcc_assert (!new_bb);
7219 }
7221 return ni_name;
7222 }
7225 /* This function generates the following statements:
7227 ni_name = number of iterations loop executes
7228 ratio = ni_name / vf
7229 ratio_mult_vf_name = ratio * vf
7231 and places them at the loop preheader edge. */
7233 static void
7234 vect_generate_tmps_on_preheader (loop_vec_info loop_vinfo,
7235 tree *ni_name_ptr,
7236 tree *ratio_mult_vf_name_ptr,
7237 tree *ratio_name_ptr)
7238 {
7239 edge pe;
7240 basic_block new_bb;
7241 gimple_seq stmts;
7242 tree ni_name;
7243 tree var;
7244 tree ratio_name;
7246 tree ratio_mult_vf_name;
7247 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7248 tree ni = LOOP_VINFO_NITERS (loop_vinfo);
7249 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
7250 tree log_vf;
7252 pe = loop_preheader_edge (loop);
7254 /* Generate temporary variable that contains
7255 number of iterations loop executes. */
7257 ni_name = vect_build_loop_niters (loop_vinfo);
7258 log_vf = build_int_cst (TREE_TYPE (ni), exact_log2 (vf));
7260 /* Create: ratio = ni >> log2(vf) */
7262 ratio_name = fold_build2 (RSHIFT_EXPR, TREE_TYPE (ni_name), ni_name, log_vf);
7263 if (!is_gimple_val (ratio_name))
7264 {
7265 var = create_tmp_var (TREE_TYPE (ni), "bnd");
7266 add_referenced_var (var);
7268 stmts = NULL;
7269 ratio_name = force_gimple_operand (ratio_name, &stmts, true, var);
7270 pe = loop_preheader_edge (loop);
7271 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7272 gcc_assert (!new_bb);
7273 }
7275 /* Create: ratio_mult_vf = ratio << log2 (vf). */
7277 ratio_mult_vf_name = fold_build2 (LSHIFT_EXPR, TREE_TYPE (ratio_name),
7278 ratio_name, log_vf);
7279 if (!is_gimple_val (ratio_mult_vf_name))
7280 {
7281 var = create_tmp_var (TREE_TYPE (ni), "ratio_mult_vf");
7282 add_referenced_var (var);
7284 stmts = NULL;
7285 ratio_mult_vf_name = force_gimple_operand (ratio_mult_vf_name, &stmts,
7286 true, var);
7287 pe = loop_preheader_edge (loop);
7288 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7289 gcc_assert (!new_bb);
7290 }
7292 *ni_name_ptr = ni_name;
7293 *ratio_mult_vf_name_ptr = ratio_mult_vf_name;
7294 *ratio_name_ptr = ratio_name;
7296 return;
7297 }
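/* Editorial example (figures assumed): for NI == 10 and VF == 4 we get
   log_vf == 2, hence ratio == 10 >> 2 == 2 and
   ratio_mult_vf == 2 << 2 == 8: the vectorized loop covers 8 of the 10
   iterations and the peeled epilog loop runs the remaining 2. */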
7300 /* Function vect_update_ivs_after_vectorizer.
7302 "Advance" the induction variables of LOOP to the value they should take
7303 after the execution of LOOP. This is currently necessary because the
7304 vectorizer does not handle induction variables that are used after the
7305 loop. Such a situation occurs when the last iterations of LOOP are
7306 peeled, because:
7307 1. We introduced new uses after LOOP for IVs that were not originally used
7308 after LOOP: the IVs of LOOP are now used by an epilog loop.
7309 2. LOOP is going to be vectorized; this means that it will iterate N/VF
7310 times, whereas the loop IVs should be bumped N times.
7312 Input:
7313 - LOOP - a loop that is going to be vectorized. The last few iterations
7314 of LOOP were peeled.
7315 - NITERS - the number of iterations that LOOP executes (before it is
7316 vectorized). i.e, the number of times the ivs should be bumped.
7317 - UPDATE_E - a successor edge of LOOP->exit that is on the (only) path
7318 coming out from LOOP on which there are uses of the LOOP ivs
7319 (this is the path from LOOP->exit to epilog_loop->preheader).
7321 The new definitions of the ivs are placed in LOOP->exit.
7322 The phi args associated with the edge UPDATE_E in the bb
7323 UPDATE_E->dest are updated accordingly.
7325 Assumption 1: Like the rest of the vectorizer, this function assumes
7326 a single loop exit that has a single predecessor.
7328 Assumption 2: The phi nodes in the LOOP header and in update_bb are
7329 organized in the same order.
7331 Assumption 3: The access function of the ivs is simple enough (see
7332 vect_can_advance_ivs_p). This assumption will be relaxed in the future.
7334 Assumption 4: Exactly one of the successors of LOOP exit-bb is on a path
7335 coming out of LOOP on which the ivs of LOOP are used (this is the path
7336 that leads to the epilog loop; other paths skip the epilog loop). This
7337 path starts with the edge UPDATE_E, and its destination (denoted update_bb)
7338 needs to have its phis updated.
7341 static void
7342 vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo, tree niters,
7343 edge update_e)
7344 {
7345 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7346 basic_block exit_bb = single_exit (loop)->dest;
7347 gimple phi, phi1;
7348 gimple_stmt_iterator gsi, gsi1;
7349 basic_block update_bb = update_e->dest;
7351 /* gcc_assert (vect_can_advance_ivs_p (loop_vinfo)); */
7353 /* Make sure there exists a single-predecessor exit bb: */
7354 gcc_assert (single_pred_p (exit_bb));
7356 for (gsi = gsi_start_phis (loop->header), gsi1 = gsi_start_phis (update_bb);
7357 !gsi_end_p (gsi) && !gsi_end_p (gsi1);
7358 gsi_next (&gsi), gsi_next (&gsi1))
7360 tree access_fn = NULL;
7361 tree evolution_part;
7362 tree init_expr;
7363 tree step_expr;
7364 tree var, ni, ni_name;
7365 gimple_stmt_iterator last_gsi;
7367 phi = gsi_stmt (gsi);
7368 phi1 = gsi_stmt (gsi1);
7369 if (vect_print_dump_info (REPORT_DETAILS))
7370 {
7371 fprintf (vect_dump, "vect_update_ivs_after_vectorizer: phi: ");
7372 print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
7373 }
7375 /* Skip virtual phi's. */
7376 if (!is_gimple_reg (SSA_NAME_VAR (PHI_RESULT (phi))))
7377 {
7378 if (vect_print_dump_info (REPORT_DETAILS))
7379 fprintf (vect_dump, "virtual phi. skip.");
7380 continue;
7381 }
7383 /* Skip reduction phis. */
7384 if (STMT_VINFO_DEF_TYPE (vinfo_for_stmt (phi)) == vect_reduction_def)
7385 {
7386 if (vect_print_dump_info (REPORT_DETAILS))
7387 fprintf (vect_dump, "reduc phi. skip.");
7388 continue;
7389 }
7391 access_fn = analyze_scalar_evolution (loop, PHI_RESULT (phi));
7392 gcc_assert (access_fn);
7393 STRIP_NOPS (access_fn);
7394 evolution_part =
7395 unshare_expr (evolution_part_in_loop_num (access_fn, loop->num));
7396 gcc_assert (evolution_part != NULL_TREE);
7398 /* FORNOW: We do not support IVs whose evolution function is a polynomial
7399 of degree >= 2 or exponential. */
7400 gcc_assert (!tree_is_chrec (evolution_part));
7402 step_expr = evolution_part;
7403 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn,
7404 loop->num));
7406 if (POINTER_TYPE_P (TREE_TYPE (init_expr)))
7407 ni = fold_build2 (POINTER_PLUS_EXPR, TREE_TYPE (init_expr),
7408 init_expr,
7409 fold_convert (sizetype,
7410 fold_build2 (MULT_EXPR, TREE_TYPE (niters),
7411 niters, step_expr)));
7412 else
7413 ni = fold_build2 (PLUS_EXPR, TREE_TYPE (init_expr),
7414 fold_build2 (MULT_EXPR, TREE_TYPE (init_expr),
7415 fold_convert (TREE_TYPE (init_expr),
7416 niters),
7417 step_expr),
7418 init_expr);
7422 var = create_tmp_var (TREE_TYPE (init_expr), "tmp");
7423 add_referenced_var (var);
7425 last_gsi = gsi_last_bb (exit_bb);
7426 ni_name = force_gimple_operand_gsi (&last_gsi, ni, false, var,
7427 true, GSI_SAME_STMT);
7429 /* Fix phi expressions in the successor bb. */
7430 SET_PHI_ARG_DEF (phi1, update_e->dest_idx, ni_name);
7431 }
7432 }
7434 /* Return the more conservative threshold between the
7435 min_profitable_iters returned by the cost model and the user
7436 specified threshold, if provided. */
7438 static unsigned int
7439 conservative_cost_threshold (loop_vec_info loop_vinfo,
7440 int min_profitable_iters)
7441 {
7442 unsigned int th;
7443 int min_scalar_loop_bound;
7445 min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
7446 * LOOP_VINFO_VECT_FACTOR (loop_vinfo)) - 1);
7448 /* Use the cost model only if it is more conservative than user specified
7449 threshold. */
7450 th = (unsigned) min_scalar_loop_bound;
7451 if (min_profitable_iters
7452 && (!min_scalar_loop_bound
7453 || min_profitable_iters > min_scalar_loop_bound))
7454 th = (unsigned) min_profitable_iters;
7456 if (th && vect_print_dump_info (REPORT_COST))
7457 fprintf (vect_dump, "Vectorization may not be profitable.");
7459 return th;
7460 }
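/* Editorial example (figures assumed): with PARAM_MIN_VECT_LOOP_BOUND == 2
   and VF == 4 the user bound is 2 * 4 - 1 == 7; a cost-model estimate of 9
   profitable iterations wins (TH == 9), while an estimate of 5 is
   overridden by the more conservative user threshold (TH == 7). */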
7462 /* Function vect_do_peeling_for_loop_bound
7464 Peel the last iterations of the loop represented by LOOP_VINFO.
7465 The peeled iterations form a new epilog loop. Given that the loop now
7466 iterates NITERS times, the new epilog loop iterates
7467 NITERS % VECTORIZATION_FACTOR times.
7469 The original loop will later be made to iterate
7470 NITERS / VECTORIZATION_FACTOR times (this value is placed into RATIO). */
7473 vect_do_peeling_for_loop_bound (loop_vec_info loop_vinfo, tree *ratio)
7474 {
7475 tree ni_name, ratio_mult_vf_name;
7476 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7477 struct loop *new_loop;
7478 edge update_e;
7479 basic_block preheader;
7480 int loop_num;
7481 bool check_profitability = false;
7482 unsigned int th = 0;
7483 int min_profitable_iters;
7485 if (vect_print_dump_info (REPORT_DETAILS))
7486 fprintf (vect_dump, "=== vect_do_peeling_for_loop_bound ===");
7488 initialize_original_copy_tables ();
7490 /* Generate the following variables on the preheader of original loop:
7492 ni_name = number of iteration the original loop executes
7493 ratio = ni_name / vf
7494 ratio_mult_vf_name = ratio * vf */
7495 vect_generate_tmps_on_preheader (loop_vinfo, &ni_name,
7496 &ratio_mult_vf_name, ratio);
7498 loop_num = loop->num;
7500 /* If cost model check not done during versioning and
7501 peeling for alignment. */
7502 if (!VEC_length (gimple, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo))
7503 && !VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo))
7504 && !LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
7505 {
7506 check_profitability = true;
7508 /* Get profitability threshold for vectorized loop. */
7509 min_profitable_iters = LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo);
7511 th = conservative_cost_threshold (loop_vinfo,
7512 min_profitable_iters);
7513 }
7515 new_loop = slpeel_tree_peel_loop_to_edge (loop, single_exit (loop),
7516 ratio_mult_vf_name, ni_name, false,
7517 th, check_profitability);
7518 gcc_assert (new_loop);
7519 gcc_assert (loop_num == loop->num);
7520 #ifdef ENABLE_CHECKING
7521 slpeel_verify_cfg_after_peeling (loop, new_loop);
7522 #endif
7524 /* A guard that controls whether the new_loop is to be executed or skipped
7525 is placed in LOOP->exit. LOOP->exit therefore has two successors - one
7526 is the preheader of NEW_LOOP, where the IVs from LOOP are used. The other
7527 is a bb after NEW_LOOP, where these IVs are not used. Find the edge that
7528 is on the path where the LOOP IVs are used and need to be updated. */
7530 preheader = loop_preheader_edge (new_loop)->src;
7531 if (EDGE_PRED (preheader, 0)->src == single_exit (loop)->dest)
7532 update_e = EDGE_PRED (preheader, 0);
7533 else
7534 update_e = EDGE_PRED (preheader, 1);
7536 /* Update IVs of original loop as if they were advanced
7537 by ratio_mult_vf_name steps. */
7538 vect_update_ivs_after_vectorizer (loop_vinfo, ratio_mult_vf_name, update_e);
7540 /* After peeling we have to reset scalar evolution analyzer. */
7541 scev_reset ();
7543 free_original_copy_tables ();
7544 }
7547 /* Function vect_gen_niters_for_prolog_loop
7549 Set the number of iterations for the loop represented by LOOP_VINFO
7550 to the minimum between LOOP_NITERS (the original iteration count of the loop)
7551 and the misalignment of DR - the data reference recorded in
7552 LOOP_VINFO_UNALIGNED_DR (LOOP_VINFO). As a result, after the execution of
7553 this loop, the data reference DR will refer to an aligned location.
7555 The following computation is generated:
7557 If the misalignment of DR is known at compile time:
7558 addr_mis = int mis = DR_MISALIGNMENT (dr);
7559 Else, compute address misalignment in bytes:
7560 addr_mis = addr & (vectype_size - 1)
7562 prolog_niters = min (LOOP_NITERS, ((VF - addr_mis/elem_size)&(VF-1))/step)
7564 (elem_size = element type size; an element is the scalar element whose type
7565 is the inner type of the vectype)
7567 When the step of the data-ref in the loop is not 1 (as in interleaved data
7568 and SLP), the number of iterations of the prolog must be divided by the step
7569 (which is equal to the size of interleaved group).
7571 The above formulas assume that VF == number of elements in the vector. This
7572 may not hold when there are multiple-types in the loop.
7573 In this case, for some data-references in the loop the VF does not represent
7574 the number of elements that fit in the vector. Therefore, instead of VF we
7575 use TYPE_VECTOR_SUBPARTS. */
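/* Editorial example (figures assumed): for V4SI accesses (nelements == 4,
   element_size == 4) with a known byte misalignment of 8, elem_misalign is
   8 / 4 == 2, so with step == 1 the prolog loop must execute
   ((4 - 2) & 3) / 1 == 2 iterations before DR becomes aligned. */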
7577 static tree
7578 vect_gen_niters_for_prolog_loop (loop_vec_info loop_vinfo, tree loop_niters)
7579 {
7580 struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
7581 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7582 tree var;
7583 gimple_seq stmts;
7584 tree iters, iters_name;
7585 edge pe;
7586 basic_block new_bb;
7587 gimple dr_stmt = DR_STMT (dr);
7588 stmt_vec_info stmt_info = vinfo_for_stmt (dr_stmt);
7589 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7590 int vectype_align = TYPE_ALIGN (vectype) / BITS_PER_UNIT;
7591 tree niters_type = TREE_TYPE (loop_niters);
7592 int step = 1;
7593 int element_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
7594 int nelements = TYPE_VECTOR_SUBPARTS (vectype);
7596 if (STMT_VINFO_STRIDED_ACCESS (stmt_info))
7597 step = DR_GROUP_SIZE (vinfo_for_stmt (DR_GROUP_FIRST_DR (stmt_info)));
7599 pe = loop_preheader_edge (loop);
7601 if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
7602 {
7603 int byte_misalign = LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo);
7604 int elem_misalign = byte_misalign / element_size;
7606 if (vect_print_dump_info (REPORT_DETAILS))
7607 fprintf (vect_dump, "known alignment = %d.", byte_misalign);
7609 iters = build_int_cst (niters_type,
7610 (((nelements - elem_misalign) & (nelements - 1)) / step));
7611 }
7612 else
7613 {
7614 gimple_seq new_stmts = NULL;
7615 tree start_addr = vect_create_addr_base_for_vector_ref (dr_stmt,
7616 &new_stmts, NULL_TREE, loop);
7617 tree ptr_type = TREE_TYPE (start_addr);
7618 tree size = TYPE_SIZE (ptr_type);
7619 tree type = lang_hooks.types.type_for_size (tree_low_cst (size, 1), 1);
7620 tree vectype_size_minus_1 = build_int_cst (type, vectype_align - 1);
7621 tree elem_size_log =
7622 build_int_cst (type, exact_log2 (vectype_align/nelements));
7623 tree nelements_minus_1 = build_int_cst (type, nelements - 1);
7624 tree nelements_tree = build_int_cst (type, nelements);
7625 tree byte_misalign;
7626 tree elem_misalign;
7628 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmts);
7629 gcc_assert (!new_bb);
7631 /* Create: byte_misalign = addr & (vectype_size - 1) */
7632 byte_misalign =
7633 fold_build2 (BIT_AND_EXPR, type, fold_convert (type, start_addr), vectype_size_minus_1);
7635 /* Create: elem_misalign = byte_misalign / element_size */
7636 elem_misalign =
7637 fold_build2 (RSHIFT_EXPR, type, byte_misalign, elem_size_log);
7639 /* Create: (niters_type) (nelements - elem_misalign)&(nelements - 1) */
7640 iters = fold_build2 (MINUS_EXPR, type, nelements_tree, elem_misalign);
7641 iters = fold_build2 (BIT_AND_EXPR, type, iters, nelements_minus_1);
7642 iters = fold_convert (niters_type, iters);
7643 }
  /* Create:  prolog_loop_niters = min (iters, loop_niters)  */
  /* If the loop bound is known at compile time we already verified that it is
     greater than vf; since the misalignment ('iters') is at most vf, there's
     no need to generate the MIN_EXPR in this case.  */
  if (TREE_CODE (loop_niters) != INTEGER_CST)
    iters = fold_build2 (MIN_EXPR, niters_type, iters, loop_niters);

  if (vect_print_dump_info (REPORT_DETAILS))
    {
      fprintf (vect_dump, "niters for prolog loop: ");
      print_generic_expr (vect_dump, iters, TDF_SLIM);
    }

  var = create_tmp_var (niters_type, "prolog_loop_niters");
  add_referenced_var (var);
  stmts = NULL;
  iters_name = force_gimple_operand (iters, &stmts, false, var);

  /* Insert stmt on loop preheader edge.  */
  if (stmts)
    {
      basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
      gcc_assert (!new_bb);
    }

  return iters_name;
}
/* Function vect_update_init_of_dr

   NITERS iterations were peeled from LOOP.  DR represents a data reference
   in LOOP.  This function updates the information recorded in DR to
   account for the fact that the first NITERS iterations had already been
   executed.  Specifically, it updates the OFFSET field of DR.  */

static void
vect_update_init_of_dr (struct data_reference *dr, tree niters)
{
  tree offset = DR_OFFSET (dr);

  niters = fold_build2 (MULT_EXPR, sizetype,
			fold_convert (sizetype, niters),
			fold_convert (sizetype, DR_STEP (dr)));
  offset = fold_build2 (PLUS_EXPR, sizetype, offset, niters);
  DR_OFFSET (dr) = offset;
}
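
/* Worked example (illustrative only): for an access a[i] with 4-byte
   elements, DR_STEP is 4.  If NITERS = 3 iterations were peeled, the
   OFFSET recorded in DR grows by 3 * 4 = 12 bytes, so the vectorized
   loop starts its accesses at a[3].  */
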
/* Function vect_update_inits_of_drs

   NITERS iterations were peeled from the loop represented by LOOP_VINFO.
   This function updates the information recorded for the data references in
   the loop to account for the fact that the first NITERS iterations had
   already been executed.  Specifically, it updates the initial_condition of
   the access_function of all the data_references in the loop.  */

static void
vect_update_inits_of_drs (loop_vec_info loop_vinfo, tree niters)
{
  unsigned int i;
  VEC (data_reference_p, heap) *datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
  struct data_reference *dr;

  if (vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump, "=== vect_update_inits_of_dr ===");

  for (i = 0; VEC_iterate (data_reference_p, datarefs, i, dr); i++)
    vect_update_init_of_dr (dr, niters);
}
/* Function vect_do_peeling_for_alignment

   Peel the first 'niters' iterations of the loop represented by LOOP_VINFO.
   'niters' is set to the misalignment of one of the data references in the
   loop, thereby forcing it to refer to an aligned location at the beginning
   of the execution of this loop.  The data reference for which we are
   peeling is recorded in LOOP_VINFO_UNALIGNED_DR.  */

static void
vect_do_peeling_for_alignment (loop_vec_info loop_vinfo)
{
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  tree niters_of_prolog_loop, ni_name;
  tree n_iters;
  struct loop *new_loop;
  bool check_profitability = false;
  unsigned int th = 0;
  int min_profitable_iters;

  if (vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump, "=== vect_do_peeling_for_alignment ===");

  initialize_original_copy_tables ();

  ni_name = vect_build_loop_niters (loop_vinfo);
  niters_of_prolog_loop = vect_gen_niters_for_prolog_loop (loop_vinfo, ni_name);

  /* If the cost model check was not done during versioning, do it now.  */
  if (!VEC_length (gimple, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo))
      && !VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
    {
      check_profitability = true;

      /* Get profitability threshold for vectorized loop.  */
      min_profitable_iters = LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo);

      th = conservative_cost_threshold (loop_vinfo,
					min_profitable_iters);
    }

  /* Peel the prolog loop and iterate it niters_of_prolog_loop times.  */
  new_loop =
    slpeel_tree_peel_loop_to_edge (loop, loop_preheader_edge (loop),
				   niters_of_prolog_loop, ni_name, true,
				   th, check_profitability);

  gcc_assert (new_loop);
#ifdef ENABLE_CHECKING
  slpeel_verify_cfg_after_peeling (new_loop, loop);
#endif

  /* Update the number of times the loop executes.  */
  n_iters = LOOP_VINFO_NITERS (loop_vinfo);
  LOOP_VINFO_NITERS (loop_vinfo) = fold_build2 (MINUS_EXPR,
	TREE_TYPE (n_iters), n_iters, niters_of_prolog_loop);

  /* Update the init conditions of the access functions of all data refs.  */
  vect_update_inits_of_drs (loop_vinfo, niters_of_prolog_loop);

  /* After peeling we have to reset the scalar evolution analyzer.  */
  scev_reset ();

  free_original_copy_tables ();
}
/* Function vect_create_cond_for_align_checks.

   Create a conditional expression that represents the alignment checks for
   all of the data references (array element references) whose alignment must
   be checked at runtime.

   Input:
   COND_EXPR  - input conditional expression.  New conditions will be chained
                with logical AND operation.
   LOOP_VINFO - two fields of the loop information are used.
                LOOP_VINFO_PTR_MASK is the mask used to check the alignment.
                LOOP_VINFO_MAY_MISALIGN_STMTS contains the refs to be checked.

   Output:
   COND_EXPR_STMT_LIST - statements needed to construct the conditional
                         expression.
   The returned value is the conditional expression to be used in the if
   statement that controls which version of the loop gets executed at runtime.

   The algorithm makes two assumptions:
     1) The number of bytes "n" in a vector is a power of 2.
     2) An address "a" is aligned if a%n is zero and that this
        test can be done as a&(n-1) == 0.  For example, for 16
        byte vectors the test is a&0xf == 0.  */
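
/* Illustrative sketch (not from the original sources) of the sequence this
   function emits for two pointers p and q with a 16-byte alignment mask
   (mask = 0xf), written as C rather than GIMPLE:

     addr2int0 = (intptr_t) p;
     addr2int1 = (intptr_t) q;
     orptrs1 = addr2int0 | addr2int1;
     andmask = orptrs1 & 0xf;
     cond = (andmask == 0);   // true iff both p and q are 16-byte aligned

   ORing all the addresses first means that a single AND and compare
   suffice no matter how many data references are checked.  */
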
static void
vect_create_cond_for_align_checks (loop_vec_info loop_vinfo,
				   tree *cond_expr,
				   gimple_seq *cond_expr_stmt_list)
{
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  VEC(gimple,heap) *may_misalign_stmts
    = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
  gimple ref_stmt;
  int mask = LOOP_VINFO_PTR_MASK (loop_vinfo);
  tree mask_cst;
  unsigned int i;
  tree psize;
  tree int_ptrsize_type;
  char tmp_name[20];
  tree or_tmp_name = NULL_TREE;
  tree and_tmp, and_tmp_name;
  gimple and_stmt;
  tree ptrsize_zero;
  tree part_cond_expr;

  /* Check that mask is one less than a power of 2, i.e., mask is
     all zeros followed by all ones.  */
  gcc_assert ((mask != 0) && ((mask & (mask + 1)) == 0));

  /* CHECKME: what is the best integer or unsigned type to use to hold a
     cast from a pointer value?  */
  psize = TYPE_SIZE (ptr_type_node);
  int_ptrsize_type
    = lang_hooks.types.type_for_size (tree_low_cst (psize, 1), 0);
  /* Create expression (mask & (dr_1 || ... || dr_n)) where dr_i is the address
     of the first vector of the i'th data reference.  */

  for (i = 0; VEC_iterate (gimple, may_misalign_stmts, i, ref_stmt); i++)
    {
      gimple_seq new_stmt_list = NULL;
      tree addr_base;
      tree addr_tmp, addr_tmp_name;
      tree or_tmp, new_or_tmp_name;
      gimple addr_stmt, or_stmt;

      /* create: addr_tmp = (int)(address_of_first_vector) */
      addr_base
	= vect_create_addr_base_for_vector_ref (ref_stmt, &new_stmt_list,
						NULL_TREE, loop);
      if (new_stmt_list != NULL)
	gimple_seq_add_seq (cond_expr_stmt_list, new_stmt_list);

      sprintf (tmp_name, "%s%d", "addr2int", i);
      addr_tmp = create_tmp_var (int_ptrsize_type, tmp_name);
      add_referenced_var (addr_tmp);
      addr_tmp_name = make_ssa_name (addr_tmp, NULL);
      addr_stmt = gimple_build_assign_with_ops (NOP_EXPR, addr_tmp_name,
						addr_base, NULL_TREE);
      SSA_NAME_DEF_STMT (addr_tmp_name) = addr_stmt;
      gimple_seq_add_stmt (cond_expr_stmt_list, addr_stmt);

      /* The addresses are ORed together.  */
      if (or_tmp_name != NULL_TREE)
	{
	  /* create: or_tmp = or_tmp | addr_tmp */
	  sprintf (tmp_name, "%s%d", "orptrs", i);
	  or_tmp = create_tmp_var (int_ptrsize_type, tmp_name);
	  add_referenced_var (or_tmp);
	  new_or_tmp_name = make_ssa_name (or_tmp, NULL);
	  or_stmt = gimple_build_assign_with_ops (BIT_IOR_EXPR,
						  new_or_tmp_name,
						  or_tmp_name, addr_tmp_name);
	  SSA_NAME_DEF_STMT (new_or_tmp_name) = or_stmt;
	  gimple_seq_add_stmt (cond_expr_stmt_list, or_stmt);
	  or_tmp_name = new_or_tmp_name;
	}
      else
	or_tmp_name = addr_tmp_name;
    } /* end for i */

  mask_cst = build_int_cst (int_ptrsize_type, mask);

  /* create: and_tmp = or_tmp & mask */
  and_tmp = create_tmp_var (int_ptrsize_type, "andmask");
  add_referenced_var (and_tmp);
  and_tmp_name = make_ssa_name (and_tmp, NULL);

  and_stmt = gimple_build_assign_with_ops (BIT_AND_EXPR, and_tmp_name,
					   or_tmp_name, mask_cst);
  SSA_NAME_DEF_STMT (and_tmp_name) = and_stmt;
  gimple_seq_add_stmt (cond_expr_stmt_list, and_stmt);

  /* Make and_tmp the left operand of the conditional test against zero.
     If and_tmp has a nonzero bit then some address is unaligned.  */
  ptrsize_zero = build_int_cst (int_ptrsize_type, 0);
  part_cond_expr = fold_build2 (EQ_EXPR, boolean_type_node,
				and_tmp_name, ptrsize_zero);
  if (*cond_expr)
    *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
			      *cond_expr, part_cond_expr);
  else
    *cond_expr = part_cond_expr;
}
/* Function vect_vfa_segment_size.

   Create an expression that computes the size of the segment
   that will be accessed for a data reference.  The function takes into
   account that realignment loads may access one more vector.

   Input:
     DR: The data reference.
     VECT_FACTOR: vectorization factor.

   Return an expression whose value is the size of the segment which will
   be accessed by DR.  */
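
/* Worked example (illustrative only): for a data-ref with DR_STEP = 4
   bytes and VECT_FACTOR = 4, one vector iteration accesses a segment of
   4 * 4 = 16 bytes; if the ref uses the optimized explicit-realignment
   scheme (dr_explicit_realign_optimized), one extra vector may be read,
   giving 16 + 16 = 32 bytes.  */
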
static tree
vect_vfa_segment_size (struct data_reference *dr, tree vect_factor)
{
  tree segment_length = fold_build2 (MULT_EXPR, integer_type_node,
				     DR_STEP (dr), vect_factor);

  if (vect_supportable_dr_alignment (dr) == dr_explicit_realign_optimized)
    {
      tree vector_size = TYPE_SIZE_UNIT
	(STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr))));

      segment_length = fold_build2 (PLUS_EXPR, integer_type_node,
				    segment_length, vector_size);
    }

  return fold_convert (sizetype, segment_length);
}
/* Function vect_create_cond_for_alias_checks.

   Create a conditional expression that represents the run-time checks for
   overlapping of address ranges represented by a list of data reference
   relations passed as input.

   Input:
   COND_EXPR  - input conditional expression.  New conditions will be chained
                with logical AND operation.
   LOOP_VINFO - field LOOP_VINFO_MAY_ALIAS_DDRS contains the list of ddrs
                to be checked.

   Output:
   COND_EXPR - conditional expression.
   COND_EXPR_STMT_LIST - statements needed to construct the conditional
                         expression.

   The returned value is the conditional expression to be used in the if
   statement that controls which version of the loop gets executed at
   runtime.  */

static void
vect_create_cond_for_alias_checks (loop_vec_info loop_vinfo,
				   tree *cond_expr,
				   gimple_seq *cond_expr_stmt_list)
{
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  VEC (ddr_p, heap) *may_alias_ddrs =
    LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
  tree vect_factor =
    build_int_cst (integer_type_node, LOOP_VINFO_VECT_FACTOR (loop_vinfo));
  ddr_p ddr;
  unsigned int i;
  tree part_cond_expr;

  /* Create expression
     ((store_ptr_0 + store_segment_length_0) < load_ptr_0)
     || (load_ptr_0 + load_segment_length_0) < store_ptr_0))
     &&
     ...
     &&
     ((store_ptr_n + store_segment_length_n) < load_ptr_n)
     || (load_ptr_n + load_segment_length_n) < store_ptr_n))  */
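
  /* For example (illustrative only): for the loop
       for (i=0; i<n; i++) a[i] = b[i] + 1;
     where the independence of a and b could not be proven, with int
     elements and VF = 4 the segment length of each ref is 4 * 4 = 16
     bytes, and the generated test is roughly
       (&a[0] + 16 < &b[0]) || (&b[0] + 16 < &a[0])
     i.e. the two 16-byte segments must not overlap for the vectorized
     version to be executed.  */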
  if (VEC_empty (ddr_p, may_alias_ddrs))
    return;

  for (i = 0; VEC_iterate (ddr_p, may_alias_ddrs, i, ddr); i++)
    {
      struct data_reference *dr_a, *dr_b;
      gimple dr_group_first_a, dr_group_first_b;
      tree addr_base_a, addr_base_b;
      tree segment_length_a, segment_length_b;
      gimple stmt_a, stmt_b;

      dr_a = DDR_A (ddr);
      stmt_a = DR_STMT (DDR_A (ddr));
      dr_group_first_a = DR_GROUP_FIRST_DR (vinfo_for_stmt (stmt_a));
      if (dr_group_first_a)
	{
	  stmt_a = dr_group_first_a;
	  dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
	}

      dr_b = DDR_B (ddr);
      stmt_b = DR_STMT (DDR_B (ddr));
      dr_group_first_b = DR_GROUP_FIRST_DR (vinfo_for_stmt (stmt_b));
      if (dr_group_first_b)
	{
	  stmt_b = dr_group_first_b;
	  dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
	}

      addr_base_a
	= vect_create_addr_base_for_vector_ref (stmt_a, cond_expr_stmt_list,
						NULL_TREE, loop);
      addr_base_b
	= vect_create_addr_base_for_vector_ref (stmt_b, cond_expr_stmt_list,
						NULL_TREE, loop);

      segment_length_a = vect_vfa_segment_size (dr_a, vect_factor);
      segment_length_b = vect_vfa_segment_size (dr_b, vect_factor);

      if (vect_print_dump_info (REPORT_DR_DETAILS))
	{
	  fprintf (vect_dump,
		   "create runtime check for data references ");
	  print_generic_expr (vect_dump, DR_REF (dr_a), TDF_SLIM);
	  fprintf (vect_dump, " and ");
	  print_generic_expr (vect_dump, DR_REF (dr_b), TDF_SLIM);
	}

      part_cond_expr =
	fold_build2 (TRUTH_OR_EXPR, boolean_type_node,
	  fold_build2 (LT_EXPR, boolean_type_node,
	    fold_build2 (POINTER_PLUS_EXPR, TREE_TYPE (addr_base_a),
	      addr_base_a,
	      segment_length_a),
	    addr_base_b),
	  fold_build2 (LT_EXPR, boolean_type_node,
	    fold_build2 (POINTER_PLUS_EXPR, TREE_TYPE (addr_base_b),
	      addr_base_b,
	      segment_length_b),
	    addr_base_a));

      if (*cond_expr)
	*cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
				  *cond_expr, part_cond_expr);
      else
	*cond_expr = part_cond_expr;
    }

  if (vect_print_dump_info (REPORT_VECTORIZED_LOOPS))
    fprintf (vect_dump, "created %u versioning for alias checks.\n",
	     VEC_length (ddr_p, may_alias_ddrs));
}
/* Function vect_loop_versioning.

   If the loop has data references that may or may not be aligned and/or
   has data reference relations whose independence was not proven, then
   two versions of the loop need to be generated, one which is vectorized
   and one which isn't.  A test is then generated to control which of the
   loops is executed.  The test checks for the alignment of all of the
   data references that may or may not be aligned.  An additional
   sequence of runtime tests is generated for each pair of DDRs whose
   independence was not proven.  The vectorized version of the loop is
   executed only if both the alias and the alignment tests pass.

   The test generated to check which version of the loop is executed
   is modified to also check for profitability as indicated by the
   cost model initially.  */
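
/* Illustrative shape of the control flow this function produces (a C-level
   sketch with placeholder names, not the actual GIMPLE):

     if (scalar_loop_iters > th && all_refs_aligned && no_ranges_overlap)
       vectorized_loop;
     else
       scalar_loop;

   where all_refs_aligned and no_ranges_overlap stand for the conditions
   built by vect_create_cond_for_align_checks and
   vect_create_cond_for_alias_checks, respectively.  */
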
static void
vect_loop_versioning (loop_vec_info loop_vinfo)
{
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  struct loop *nloop;
  tree cond_expr = NULL_TREE;
  gimple_seq cond_expr_stmt_list = NULL;
  basic_block condition_bb;
  gimple_stmt_iterator gsi, cond_exp_gsi;
  basic_block merge_bb;
  basic_block new_exit_bb;
  edge new_exit_e, e;
  gimple orig_phi, new_phi;
  tree arg;
  unsigned prob = 4 * REG_BR_PROB_BASE / 5;
  gimple_seq gimplify_stmt_list = NULL;
  tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
  int min_profitable_iters = 0;
  unsigned int th;

  /* Get profitability threshold for vectorized loop.  */
  min_profitable_iters = LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo);

  th = conservative_cost_threshold (loop_vinfo,
				    min_profitable_iters);

  cond_expr =
    build2 (GT_EXPR, boolean_type_node, scalar_loop_iters,
	    build_int_cst (TREE_TYPE (scalar_loop_iters), th));

  cond_expr = force_gimple_operand (cond_expr, &cond_expr_stmt_list,
				    false, NULL_TREE);

  if (VEC_length (gimple, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo)))
    vect_create_cond_for_align_checks (loop_vinfo, &cond_expr,
				       &cond_expr_stmt_list);

  if (VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
    vect_create_cond_for_alias_checks (loop_vinfo, &cond_expr,
				       &cond_expr_stmt_list);

  cond_expr =
    fold_build2 (NE_EXPR, boolean_type_node, cond_expr, integer_zero_node);
  cond_expr =
    force_gimple_operand (cond_expr, &gimplify_stmt_list, true, NULL_TREE);
  gimple_seq_add_seq (&cond_expr_stmt_list, gimplify_stmt_list);

  initialize_original_copy_tables ();
  nloop = loop_version (loop, cond_expr, &condition_bb,
			prob, prob, REG_BR_PROB_BASE - prob, true);
  free_original_copy_tables ();

  /* Loop versioning violates an assumption we try to maintain during
     vectorization - that the loop exit block has a single predecessor.
     After versioning, the exit block of both loop versions is the same
     basic block (i.e. it has two predecessors).  Just in order to simplify
     following transformations in the vectorizer, we fix this situation
     here by adding a new (empty) block on the exit-edge of the loop,
     with the proper loop-exit phis to maintain loop-closed-form.  */

  merge_bb = single_exit (loop)->dest;
  gcc_assert (EDGE_COUNT (merge_bb->preds) == 2);
  new_exit_bb = split_edge (single_exit (loop));
  new_exit_e = single_exit (loop);
  e = EDGE_SUCC (new_exit_bb, 0);

  for (gsi = gsi_start_phis (merge_bb); !gsi_end_p (gsi); gsi_next (&gsi))
    {
      orig_phi = gsi_stmt (gsi);
      new_phi = create_phi_node (SSA_NAME_VAR (PHI_RESULT (orig_phi)),
				 new_exit_bb);
      arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, e);
      add_phi_arg (new_phi, arg, new_exit_e);
      SET_PHI_ARG_DEF (orig_phi, e->dest_idx, PHI_RESULT (new_phi));
    }

  /* End loop-exit-fixes after versioning.  */

  update_ssa (TODO_update_ssa);
  if (cond_expr_stmt_list)
    {
      cond_exp_gsi = gsi_last_bb (condition_bb);
      gsi_insert_seq_before (&cond_exp_gsi, cond_expr_stmt_list, GSI_SAME_STMT);
    }
}
/* Remove a group of stores (for SLP or interleaving), free their
   stmt_vec_info.  */

static void
vect_remove_stores (gimple first_stmt)
{
  gimple next = first_stmt;
  gimple tmp;
  gimple_stmt_iterator next_si;

  while (next)
    {
      /* Free the attached stmt_vec_info and remove the stmt.  */
      next_si = gsi_for_stmt (next);
      gsi_remove (&next_si, true);
      tmp = DR_GROUP_NEXT_DR (vinfo_for_stmt (next));
      free_stmt_vec_info (next);
      next = tmp;
    }
}
/* Vectorize SLP instance tree in postorder.  */

static bool
vect_schedule_slp_instance (slp_tree node, slp_instance instance,
			    unsigned int vectorization_factor)
{
  gimple stmt;
  bool strided_store, is_store;
  gimple_stmt_iterator si;
  stmt_vec_info stmt_info;
  unsigned int vec_stmts_size, nunits, group_size;
  tree vectype;
  int i;
  slp_tree loads_node;

  if (!node)
    return false;

  vect_schedule_slp_instance (SLP_TREE_LEFT (node), instance,
			      vectorization_factor);
  vect_schedule_slp_instance (SLP_TREE_RIGHT (node), instance,
			      vectorization_factor);
  stmt = VEC_index (gimple, SLP_TREE_SCALAR_STMTS (node), 0);
  stmt_info = vinfo_for_stmt (stmt);

  /* VECTYPE is the type of the destination.  */
  vectype = get_vectype_for_scalar_type (TREE_TYPE (gimple_assign_lhs (stmt)));
  nunits = (unsigned int) TYPE_VECTOR_SUBPARTS (vectype);
  group_size = SLP_INSTANCE_GROUP_SIZE (instance);

  /* For each SLP instance calculate the number of vector stmts to be created
     for the scalar stmts in each node of the SLP tree.  The number of vector
     elements in one vector iteration is the number of scalar elements in
     one scalar iteration (GROUP_SIZE) multiplied by VF divided by the vector
     size.  */
  vec_stmts_size = (vectorization_factor * group_size) / nunits;
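
  /* For example (illustrative only): with GROUP_SIZE = 4 scalar stmts per
     scalar iteration, VF = 8 and NUNITS = 4 elements per vector, the node
     needs (8 * 4) / 4 = 8 vector stmts.  */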
  /* In case of load permutation we have to allocate vectorized statements for
     all the nodes that participate in that permutation.  */
  if (SLP_INSTANCE_LOAD_PERMUTATION (instance))
    {
      for (i = 0;
	   VEC_iterate (slp_tree, SLP_INSTANCE_LOADS (instance), i, loads_node);
	   i++)
	{
	  if (!SLP_TREE_VEC_STMTS (loads_node))
	    {
	      SLP_TREE_VEC_STMTS (loads_node) = VEC_alloc (gimple, heap,
							   vec_stmts_size);
	      SLP_TREE_NUMBER_OF_VEC_STMTS (loads_node) = vec_stmts_size;
	    }
	}
    }

  if (!SLP_TREE_VEC_STMTS (node))
    {
      SLP_TREE_VEC_STMTS (node) = VEC_alloc (gimple, heap, vec_stmts_size);
      SLP_TREE_NUMBER_OF_VEC_STMTS (node) = vec_stmts_size;
    }

  if (vect_print_dump_info (REPORT_DETAILS))
    {
      fprintf (vect_dump, "------>vectorizing SLP node starting from: ");
      print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
    }

  /* Loads should be inserted before the first load.  */
  if (SLP_INSTANCE_FIRST_LOAD_STMT (instance)
      && STMT_VINFO_STRIDED_ACCESS (stmt_info)
      && !REFERENCE_CLASS_P (gimple_get_lhs (stmt)))
    si = gsi_for_stmt (SLP_INSTANCE_FIRST_LOAD_STMT (instance));
  else
    si = gsi_for_stmt (stmt);

  is_store = vect_transform_stmt (stmt, &si, &strided_store, node, instance);
  if (is_store)
    {
      if (DR_GROUP_FIRST_DR (stmt_info))
	/* If IS_STORE is TRUE, the vectorization of the
	   interleaving chain was completed - free all the stores in
	   the chain.  */
	vect_remove_stores (DR_GROUP_FIRST_DR (stmt_info));
      else
	/* FORNOW: SLP originates only from strided stores.  */
	gcc_unreachable ();

      return true;
    }

  /* FORNOW: SLP originates only from strided stores.  */
  return false;
}
static bool
vect_schedule_slp (loop_vec_info loop_vinfo)
{
  VEC (slp_instance, heap) *slp_instances =
    LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
  slp_instance instance;
  unsigned int i;
  bool is_store = false;

  for (i = 0; VEC_iterate (slp_instance, slp_instances, i, instance); i++)
    {
      /* Schedule the tree of INSTANCE.  */
      is_store = vect_schedule_slp_instance (SLP_INSTANCE_TREE (instance),
			  instance, LOOP_VINFO_VECT_FACTOR (loop_vinfo));

      if (vect_print_dump_info (REPORT_VECTORIZED_LOOPS)
	  || vect_print_dump_info (REPORT_UNVECTORIZED_LOOPS))
	fprintf (vect_dump, "vectorizing stmts using SLP.");
    }

  return is_store;
}
/* Function vect_transform_loop.

   The analysis phase has determined that the loop is vectorizable.
   Vectorize the loop - create vectorized stmts to replace the scalar
   stmts in the loop, and update the loop exit condition.  */
void
vect_transform_loop (loop_vec_info loop_vinfo)
{
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
  int nbbs = loop->num_nodes;
  gimple_stmt_iterator si;
  int i;
  tree ratio = NULL;
  int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
  bool strided_store;
  bool slp_scheduled = false;
  unsigned int nunits;

  if (vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump, "=== vec_transform_loop ===");

  if (VEC_length (gimple, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo))
      || VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
    vect_loop_versioning (loop_vinfo);

  /* CHECKME: we wouldn't need this if we called update_ssa once
     for all loops.  */
  bitmap_zero (vect_memsyms_to_rename);

  /* Peel the loop if there are data refs with unknown alignment.
     Only one data ref with unknown alignment is allowed.  */

  if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
    vect_do_peeling_for_alignment (loop_vinfo);
  /* If the loop has a symbolic number of iterations 'n' (i.e. it's not a
     compile time constant), or it is a constant that doesn't divide by the
     vectorization factor, then an epilog loop needs to be created.
     We therefore duplicate the loop: the original loop will be vectorized,
     and will compute the first (n/VF) iterations.  The second copy of the loop
     will remain scalar and will compute the remaining (n%VF) iterations.
     (VF is the vectorization factor).  */
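
  /* For example (illustrative only): if n = 10 and VF = 4, the vectorized
     loop executes floor (10/4) = 2 iterations, covering 8 elements, and
     the scalar epilog loop executes the remaining 10 % 4 = 2 iterations.  */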
  if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
      || (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
	  && LOOP_VINFO_INT_NITERS (loop_vinfo) % vectorization_factor != 0))
    vect_do_peeling_for_loop_bound (loop_vinfo, &ratio);
  else
    ratio = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
		LOOP_VINFO_INT_NITERS (loop_vinfo) / vectorization_factor);

  /* 1) Make sure the loop header has exactly two entries
     2) Make sure we have a preheader basic block.  */

  gcc_assert (EDGE_COUNT (loop->header->preds) == 2);

  split_edge (loop_preheader_edge (loop));

  /* FORNOW: the vectorizer supports only loops whose body consists
     of one basic block (header + empty latch).  When the vectorizer
     supports more involved loop forms, the order by which the BBs are
     traversed needs to be reconsidered.  */
  for (i = 0; i < nbbs; i++)
    {
      basic_block bb = bbs[i];
      stmt_vec_info stmt_info;
      gimple phi;

      for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
	{
	  phi = gsi_stmt (si);
	  if (vect_print_dump_info (REPORT_DETAILS))
	    {
	      fprintf (vect_dump, "------>vectorizing phi: ");
	      print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
	    }
	  stmt_info = vinfo_for_stmt (phi);
	  if (!stmt_info)
	    continue;

	  if (!STMT_VINFO_RELEVANT_P (stmt_info)
	      && !STMT_VINFO_LIVE_P (stmt_info))
	    continue;

	  if ((TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
	       != (unsigned HOST_WIDE_INT) vectorization_factor)
	      && vect_print_dump_info (REPORT_DETAILS))
	    fprintf (vect_dump, "multiple-types.");

	  if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
	    {
	      if (vect_print_dump_info (REPORT_DETAILS))
		fprintf (vect_dump, "transform phi.");
	      vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
	    }
	}
      for (si = gsi_start_bb (bb); !gsi_end_p (si);)
	{
	  gimple stmt = gsi_stmt (si);
	  bool is_store;

	  if (vect_print_dump_info (REPORT_DETAILS))
	    {
	      fprintf (vect_dump, "------>vectorizing statement: ");
	      print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
	    }

	  stmt_info = vinfo_for_stmt (stmt);

	  /* vector stmts created in the outer-loop during vectorization of
	     stmts in an inner-loop may not have a stmt_info, and do not
	     need to be vectorized.  */
	  if (!stmt_info)
	    {
	      gsi_next (&si);
	      continue;
	    }

	  if (!STMT_VINFO_RELEVANT_P (stmt_info)
	      && !STMT_VINFO_LIVE_P (stmt_info))
	    {
	      gsi_next (&si);
	      continue;
	    }

	  gcc_assert (STMT_VINFO_VECTYPE (stmt_info));
	  nunits =
	    (unsigned int) TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
	  if (!STMT_SLP_TYPE (stmt_info)
	      && nunits != (unsigned int) vectorization_factor
	      && vect_print_dump_info (REPORT_DETAILS))
	    /* For SLP VF is set according to the unrolling factor, and not to
	       the vector size, hence for SLP this print is not valid.  */
	    fprintf (vect_dump, "multiple-types.");
	  /* SLP.  Schedule all the SLP instances when the first SLP stmt is
	     reached.  */
	  if (STMT_SLP_TYPE (stmt_info))
	    {
	      if (!slp_scheduled)
		{
		  slp_scheduled = true;

		  if (vect_print_dump_info (REPORT_DETAILS))
		    fprintf (vect_dump, "=== scheduling SLP instances ===");

		  is_store = vect_schedule_slp (loop_vinfo);

		  /* IS_STORE is true if STMT is a store.  Stores cannot be of
		     hybrid SLP type.  They are removed in
		     vect_schedule_slp_instance and their vinfo is destroyed.  */
		  if (is_store)
		    {
		      gsi_next (&si);
		      continue;
		    }
		}

	      /* Hybrid SLP stmts must be vectorized in addition to SLP.  */
	      if (PURE_SLP_STMT (stmt_info))
		{
		  gsi_next (&si);
		  continue;
		}
	    }
	  /* -------- vectorize statement ------------ */
	  if (vect_print_dump_info (REPORT_DETAILS))
	    fprintf (vect_dump, "transform statement.");

	  strided_store = false;
	  is_store = vect_transform_stmt (stmt, &si, &strided_store, NULL, NULL);
	  if (is_store)
	    {
	      if (STMT_VINFO_STRIDED_ACCESS (stmt_info))
		{
		  /* Interleaving.  If IS_STORE is TRUE, the vectorization of
		     the interleaving chain was completed - free all the
		     stores in the chain.  */
		  vect_remove_stores (DR_GROUP_FIRST_DR (stmt_info));
		  gsi_remove (&si, true);
		  continue;
		}
	      else
		{
		  /* Free the attached stmt_vec_info and remove the stmt.  */
		  free_stmt_vec_info (stmt);
		  gsi_remove (&si, true);
		  continue;
		}
	    }
	  gsi_next (&si);
	}			/* stmts in BB */
    }				/* BBs in loop */
  slpeel_make_loop_iterate_ntimes (loop, ratio);

  mark_set_for_renaming (vect_memsyms_to_rename);

  /* The memory tags and pointers in vectorized statements need to
     have their SSA forms updated.  FIXME, why can't this be delayed
     until all the loops have been transformed?  */
  update_ssa (TODO_update_ssa);

  if (vect_print_dump_info (REPORT_VECTORIZED_LOOPS))
    fprintf (vect_dump, "LOOP VECTORIZED.");
  if (loop->inner && vect_print_dump_info (REPORT_VECTORIZED_LOOPS))
    fprintf (vect_dump, "OUTER LOOP VECTORIZED.");
}