/* Transformation Utilities for Loop Vectorization.
   Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009
   Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */
#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "tm.h"
#include "basic-block.h"
#include "diagnostic.h"
#include "tree-flow.h"
#include "tree-dump.h"
#include "tree-data-ref.h"
#include "tree-chrec.h"
#include "tree-scalar-evolution.h"
#include "tree-vectorizer.h"
#include "langhooks.h"
#include "tree-pass.h"
/* Utility functions for the code transformation.  */
static bool vect_transform_stmt (gimple, gimple_stmt_iterator *, bool *,
                                 slp_tree, slp_instance);
static tree vect_create_destination_var (tree, tree);
static tree vect_create_data_ref_ptr
  (gimple, struct loop*, tree, tree *, gimple *, bool, bool *, tree);
static tree vect_create_addr_base_for_vector_ref
  (gimple, gimple_seq *, tree, struct loop *);
static tree vect_get_new_vect_var (tree, enum vect_var_kind, const char *);
static tree vect_get_vec_def_for_operand (tree, gimple, tree *);
static tree vect_init_vector (gimple, tree, tree, gimple_stmt_iterator *);
static void vect_finish_stmt_generation
  (gimple stmt, gimple vec_stmt, gimple_stmt_iterator *);
static bool vect_is_simple_cond (tree, loop_vec_info);
static void vect_create_epilog_for_reduction
  (tree, gimple, int, enum tree_code, gimple);
static tree get_initial_def_for_reduction (gimple, tree, tree *);
/* Utility function dealing with loop peeling (not peeling itself).  */
static void vect_generate_tmps_on_preheader
  (loop_vec_info, tree *, tree *, tree *);
static tree vect_build_loop_niters (loop_vec_info);
static void vect_update_ivs_after_vectorizer (loop_vec_info, tree, edge);
static tree vect_gen_niters_for_prolog_loop (loop_vec_info, tree);
static void vect_update_init_of_dr (struct data_reference *, tree niters);
static void vect_update_inits_of_drs (loop_vec_info, tree);
static int vect_min_worthwhile_factor (enum tree_code);
/* Return the scalar cost of STMT for use in the cost model.  */

static int
cost_for_stmt (gimple stmt)
{
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);

  switch (STMT_VINFO_TYPE (stmt_info))
    {
    case load_vec_info_type:
      return TARG_SCALAR_LOAD_COST;
    case store_vec_info_type:
      return TARG_SCALAR_STORE_COST;
    case op_vec_info_type:
    case condition_vec_info_type:
    case assignment_vec_info_type:
    case reduc_vec_info_type:
    case induc_vec_info_type:
    case type_promotion_vec_info_type:
    case type_demotion_vec_info_type:
    case type_conversion_vec_info_type:
    case call_vec_info_type:
      return TARG_SCALAR_STMT_COST;
    case undef_vec_info_type:
    default:
      gcc_unreachable ();
    }
}
/* Function vect_estimate_min_profitable_iters

   Return the number of iterations required for the vector version of the
   loop to be profitable relative to the cost of the scalar version of the
   loop.

   TODO: Take profile info into account before making vectorization
   decisions, if available.  */

static int
vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo)
{
  int i;
  int min_profitable_iters;
  int peel_iters_prologue;
  int peel_iters_epilogue;
  int vec_inside_cost = 0;
  int vec_outside_cost = 0;
  int scalar_single_iter_cost = 0;
  int scalar_outside_cost = 0;
  int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
  int nbbs = loop->num_nodes;
  int byte_misalign = LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo);
  int peel_guard_costs = 0;
  int innerloop_iters = 0, factor;
  VEC (slp_instance, heap) *slp_instances;
  slp_instance instance;
  /* Cost model disabled.  */
  if (!flag_vect_cost_model)
    {
      if (vect_print_dump_info (REPORT_COST))
        fprintf (vect_dump, "cost model disabled.");
      return 0;
    }
  /* Requires loop versioning tests to handle misalignment.  */
  if (VEC_length (gimple, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo)))
    {
      /* FIXME: Make cost depend on complexity of individual check.  */
      vec_outside_cost +=
        VEC_length (gimple, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));
      if (vect_print_dump_info (REPORT_COST))
        fprintf (vect_dump, "cost model: Adding cost of checks for loop "
                 "versioning to treat misalignment.\n");
    }

  /* Requires loop versioning with alias checks.  */
  if (VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
    {
      /* FIXME: Make cost depend on complexity of individual check.  */
      vec_outside_cost +=
        VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo));
      if (vect_print_dump_info (REPORT_COST))
        fprintf (vect_dump, "cost model: Adding cost of checks for loop "
                 "versioning aliasing.\n");
    }

  if (VEC_length (gimple, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo))
      || VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
    vec_outside_cost += TARG_COND_TAKEN_BRANCH_COST;
  /* Count statements in scalar loop.  Using this as scalar cost for a single
     iteration for now.

     TODO: Add outer loop support.

     TODO: Consider assigning different costs to different scalar
     statements.  */

  /* FORNOW.  */
  if (loop->inner)
    innerloop_iters = 50; /* FIXME */
  for (i = 0; i < nbbs; i++)
    {
      gimple_stmt_iterator si;
      basic_block bb = bbs[i];

      if (bb->loop_father == loop->inner)
        factor = innerloop_iters;
      else
        factor = 1;

      for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
        {
          gimple stmt = gsi_stmt (si);
          stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
          /* Skip stmts that are not vectorized inside the loop.  */
          if (!STMT_VINFO_RELEVANT_P (stmt_info)
              && (!STMT_VINFO_LIVE_P (stmt_info)
                  || STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def))
            continue;
          scalar_single_iter_cost += cost_for_stmt (stmt) * factor;
          vec_inside_cost += STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info)
                             * factor;
          /* FIXME: for stmts in the inner-loop in outer-loop vectorization,
             some of the "outside" costs are generated inside the
             outer-loop.  */
          vec_outside_cost += STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info);
        }
    }
  /* Add additional cost for the peeled instructions in prologue and epilogue
     loop.

     FORNOW: If we don't know the value of peel_iters for prologue or epilogue
     at compile-time - we assume it's vf/2 (the worst would be vf-1).

     TODO: Build an expression that represents peel_iters for prologue and
     epilogue to be used in a run-time test.  */
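  /* For example (hypothetical): with vf == 4 and unknown alignment, both
     peel counts below default to 4/2 == 2, contributing four scalar
     iterations plus two guards (each a taken plus a not-taken branch) to
     vec_outside_cost.  */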
  if (byte_misalign < 0)
    {
      peel_iters_prologue = vf/2;
      if (vect_print_dump_info (REPORT_COST))
        fprintf (vect_dump, "cost model: "
                 "prologue peel iters set to vf/2.");

      /* If peeling for alignment is unknown, loop bound of main loop becomes
         unknown.  */
      peel_iters_epilogue = vf/2;
      if (vect_print_dump_info (REPORT_COST))
        fprintf (vect_dump, "cost model: "
                 "epilogue peel iters set to vf/2 because "
                 "peeling for alignment is unknown.");

      /* If peeled iterations are unknown, count a taken branch and a not
         taken branch per peeled loop.  Even if scalar loop iterations are
         known, vector iterations are not known since peeled prologue
         iterations are not known.  Hence guards remain the same.  */
      peel_guard_costs += 2 * (TARG_COND_TAKEN_BRANCH_COST
                               + TARG_COND_NOT_TAKEN_BRANCH_COST);
    }
  else
    {
      if (byte_misalign)
        {
          struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
          int element_size
            = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
          tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr)));
          int nelements = TYPE_VECTOR_SUBPARTS (vectype);

          peel_iters_prologue = nelements - (byte_misalign / element_size);
        }
      else
        peel_iters_prologue = 0;

      if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
        {
          peel_iters_epilogue = vf/2;
          if (vect_print_dump_info (REPORT_COST))
            fprintf (vect_dump, "cost model: "
                     "epilogue peel iters set to vf/2 because "
                     "loop iterations are unknown.");

          /* If peeled iterations are known but number of scalar loop
             iterations are unknown, count a taken branch per peeled loop.  */
          peel_guard_costs += 2 * TARG_COND_TAKEN_BRANCH_COST;
        }
      else
        {
          int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
          peel_iters_prologue = niters < peel_iters_prologue ?
                                niters : peel_iters_prologue;
          peel_iters_epilogue = (niters - peel_iters_prologue) % vf;
        }
    }
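  /* For example (hypothetical values): a V4SI access (nelements == 4,
     element_size == 4) known to be misaligned by 8 bytes needs
     4 - 8/4 == 2 prologue iterations; with niters == 100 and vf == 4 the
     epilogue then gets (100 - 2) % 4 == 2 iterations.  */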
  vec_outside_cost += (peel_iters_prologue * scalar_single_iter_cost)
                      + (peel_iters_epilogue * scalar_single_iter_cost)
                      + peel_guard_costs;
  /* FORNOW: The scalar outside cost is incremented in one of the
     following ways:

     1. The vectorizer checks for alignment and aliasing and generates
     a condition that allows dynamic vectorization.  A cost model
     check is ANDED with the versioning condition.  Hence scalar code
     path now has the added cost of the versioning check.

       if (cost > th & versioning_check)
         jmp to vector code

     Hence run-time scalar is incremented by not-taken branch cost.

     2. The vectorizer then checks if a prologue is required.  If the
     cost model check was not done before during versioning, it has to
     be done before the prologue check.

       if (cost <= th)
         prologue = scalar_iters
       if (prologue == 0)
         jmp to vector code
       else
         execute prologue
       if (prologue == num_iters)
         go to exit

     Hence the run-time scalar cost is incremented by a taken branch,
     plus a not-taken branch, plus a taken branch cost.

     3. The vectorizer then checks if an epilogue is required.  If the
     cost model check was not done before during prologue check, it
     has to be done with the epilogue check.

       if (prologue == 0)
         jmp to vector code
       else
         execute prologue
       if (prologue == num_iters)
         go to exit
       vector code:
         if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
           jmp to epilogue

     Hence the run-time scalar cost should be incremented by 2 taken
     branches.

     TODO: The back end may reorder the BBs differently and reverse
     conditions/branch directions.  Change the estimates below to
     something more reasonable.  */
  /* If the number of iterations is known and we do not do versioning, we can
     decide whether to vectorize at compile time.  Hence the scalar version
     does not carry cost model guard costs.  */
  if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
      || VEC_length (gimple, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo))
      || VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
    {
      /* Cost model check occurs at versioning.  */
      if (VEC_length (gimple, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo))
          || VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
        scalar_outside_cost += TARG_COND_NOT_TAKEN_BRANCH_COST;
      else
        {
          /* Cost model check occurs at prologue generation.  */
          if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
            scalar_outside_cost += 2 * TARG_COND_TAKEN_BRANCH_COST
                                   + TARG_COND_NOT_TAKEN_BRANCH_COST;
          /* Cost model check occurs at epilogue generation.  */
          else
            scalar_outside_cost += 2 * TARG_COND_TAKEN_BRANCH_COST;
        }
    }
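  /* For example (hypothetical branch costs): with
     TARG_COND_TAKEN_BRANCH_COST == 3 and TARG_COND_NOT_TAKEN_BRANCH_COST
     == 1, the scalar path pays 1 when the check folds into the versioning
     test, 2*3 + 1 == 7 when it guards the prologue, and 2*3 == 6 when it
     guards the epilogue.  */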
  /* Add SLP costs.  */
  slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
  for (i = 0; VEC_iterate (slp_instance, slp_instances, i, instance); i++)
    {
      vec_outside_cost += SLP_INSTANCE_OUTSIDE_OF_LOOP_COST (instance);
      vec_inside_cost += SLP_INSTANCE_INSIDE_OF_LOOP_COST (instance);
    }
  /* Calculate number of iterations required to make the vector version
     profitable, relative to the loop bodies only.  The following condition
     must hold true:
     SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
     where
     SIC = scalar iteration cost, VIC = vector iteration cost,
     VOC = vector outside cost, VF = vectorization factor,
     PL_ITERS = prologue iterations, EP_ITERS = epilogue iterations,
     SOC = scalar outside cost for run time cost model check.  */
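  /* For example (hypothetical costs): with SIC == 4, VIC == 6, VF == 4,
     VOC == 20, SOC == 0 and no peeling, the division below yields
     (20 * 4) / (4 * 4 - 6) == 8, and the correction that follows bumps the
     result to 9 because 4 * 4 * 8 <= 6 * 8 + 20 * 4.  */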
  if ((scalar_single_iter_cost * vf) > vec_inside_cost)
    {
      if (vec_outside_cost <= 0)
        min_profitable_iters = 1;
      else
        {
          min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
                                    * vf
                                  - vec_inside_cost * peel_iters_prologue
                                  - vec_inside_cost * peel_iters_epilogue)
                                 / ((scalar_single_iter_cost * vf)
                                    - vec_inside_cost);

          if ((scalar_single_iter_cost * vf * min_profitable_iters)
              <= ((vec_inside_cost * min_profitable_iters)
                  + ((vec_outside_cost - scalar_outside_cost) * vf)))
            min_profitable_iters++;
        }
    }
  /* vector version will never be profitable.  */
  else
    {
      if (vect_print_dump_info (REPORT_COST))
        fprintf (vect_dump, "cost model: vector iteration cost = %d "
                 "is divisible by scalar iteration cost = %d by a factor "
                 "greater than or equal to the vectorization factor = %d .",
                 vec_inside_cost, scalar_single_iter_cost, vf);
      return -1;
    }
  if (vect_print_dump_info (REPORT_COST))
    {
      fprintf (vect_dump, "Cost model analysis: \n");
      fprintf (vect_dump, "  Vector inside of loop cost: %d\n",
               vec_inside_cost);
      fprintf (vect_dump, "  Vector outside of loop cost: %d\n",
               vec_outside_cost);
      fprintf (vect_dump, "  Scalar iteration cost: %d\n",
               scalar_single_iter_cost);
      fprintf (vect_dump, "  Scalar outside cost: %d\n", scalar_outside_cost);
      fprintf (vect_dump, "  prologue iterations: %d\n",
               peel_iters_prologue);
      fprintf (vect_dump, "  epilogue iterations: %d\n",
               peel_iters_epilogue);
      fprintf (vect_dump, "  Calculated minimum iters for profitability: %d\n",
               min_profitable_iters);
    }
  min_profitable_iters =
        min_profitable_iters < vf ? vf : min_profitable_iters;

  /* Because the condition we create is:
     if (niters <= min_profitable_iters)
       then skip the vectorized loop.  */
  min_profitable_iters--;

  if (vect_print_dump_info (REPORT_COST))
    fprintf (vect_dump, "  Profitability threshold = %d\n",
             min_profitable_iters);

  return min_profitable_iters;
}
/* TODO: Close dependency between vect_model_*_cost and vectorizable_*
   functions.  Design better to avoid maintenance issues.  */

/* Function vect_model_reduction_cost.

   Models cost for a reduction operation, including the vector ops
   generated within the strip-mine loop, the initial definition before
   the loop, and the epilogue code that must be generated.  */
static void
vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
                           int ncopies)
{
  int outer_cost = 0;
  enum tree_code code;
  optab optab;
  tree vectype;
  gimple stmt, orig_stmt;
  tree reduction_op;
  enum machine_mode mode;
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);

  /* Cost of reduction op inside loop.  */
  STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) += ncopies * TARG_VEC_STMT_COST;

  stmt = STMT_VINFO_STMT (stmt_info);
  switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
    {
    case GIMPLE_SINGLE_RHS:
      gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt))
                  == ternary_op);
      reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), 2);
      break;
    case GIMPLE_UNARY_RHS:
      reduction_op = gimple_assign_rhs1 (stmt);
      break;
    case GIMPLE_BINARY_RHS:
      reduction_op = gimple_assign_rhs2 (stmt);
      break;
    default:
      gcc_unreachable ();
    }
  vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
  if (!vectype)
    {
      if (vect_print_dump_info (REPORT_COST))
        {
          fprintf (vect_dump, "unsupported data-type ");
          print_generic_expr (vect_dump, TREE_TYPE (reduction_op), TDF_SLIM);
        }
      return;
    }
  mode = TYPE_MODE (vectype);
  orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);

  if (!orig_stmt)
    orig_stmt = STMT_VINFO_STMT (stmt_info);

  code = gimple_assign_rhs_code (orig_stmt);

  /* Add in cost for initial definition.  */
  outer_cost += TARG_SCALAR_TO_VEC_COST;
  /* Determine cost of epilogue code.

     We have a reduction operator that will reduce the vector in one
     statement.  Also requires scalar extract.  */

  if (!nested_in_vect_loop_p (loop, orig_stmt))
    {
      if (reduc_code < NUM_TREE_CODES)
        outer_cost += TARG_VEC_STMT_COST + TARG_VEC_TO_SCALAR_COST;
      else
        {
          int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
          tree bitsize =
            TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
          int element_bitsize = tree_low_cst (bitsize, 1);
          int nelements = vec_size_in_bits / element_bitsize;

          optab = optab_for_tree_code (code, vectype, optab_default);

          /* We have a whole vector shift available.  */
          if (VECTOR_MODE_P (mode)
              && optab_handler (optab, mode)->insn_code != CODE_FOR_nothing
              && optab_handler (vec_shr_optab, mode)->insn_code
                 != CODE_FOR_nothing)
            /* Final reduction via vector shifts and the reduction operator.
               Also requires scalar extract.  */
            outer_cost += ((exact_log2 (nelements) * 2) * TARG_VEC_STMT_COST
                           + TARG_VEC_TO_SCALAR_COST);
          else
            /* Use extracts and reduction op for final reduction.  For N
               elements, we have N extracts and N-1 reduction ops.  */
            outer_cost += ((nelements + nelements - 1) * TARG_VEC_STMT_COST);
        }
    }
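  /* For example (hypothetical): reducing a 4-element vector with the
     shift-based scheme costs 2 * log2 (4) == 4 vector stmts plus one
     extract, versus 4 + 3 == 7 vector stmts for the extract-based
     fallback.  */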
  STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info) = outer_cost;

  if (vect_print_dump_info (REPORT_COST))
    fprintf (vect_dump, "vect_model_reduction_cost: inside_cost = %d, "
             "outside_cost = %d .", STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info),
             STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info));
}
/* Function vect_model_induction_cost.

   Models cost for induction operations.  */

static void
vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
{
  /* loop cost for vec_loop.  */
  STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) = ncopies * TARG_VEC_STMT_COST;
  /* prologue cost for vec_init and vec_step.  */
  STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info) = 2 * TARG_SCALAR_TO_VEC_COST;

  if (vect_print_dump_info (REPORT_COST))
    fprintf (vect_dump, "vect_model_induction_cost: inside_cost = %d, "
             "outside_cost = %d .", STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info),
             STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info));
}
/* Function vect_model_simple_cost.

   Models cost for simple operations, i.e. those that only emit ncopies of a
   single op.  Right now, this does not account for multiple insns that could
   be generated for the single vector op.  We will handle that shortly.  */

void
vect_model_simple_cost (stmt_vec_info stmt_info, int ncopies,
                        enum vect_def_type *dt, slp_tree slp_node)
{
  int i;
  int inside_cost = 0, outside_cost = 0;

  /* The SLP costs were already calculated during SLP tree build.  */
  if (PURE_SLP_STMT (stmt_info))
    return;

  inside_cost = ncopies * TARG_VEC_STMT_COST;
  /* FORNOW: Assuming maximum 2 args per stmts.  */
  for (i = 0; i < 2; i++)
    {
      if (dt[i] == vect_constant_def || dt[i] == vect_invariant_def)
        outside_cost += TARG_SCALAR_TO_VEC_COST;
    }
  if (vect_print_dump_info (REPORT_COST))
    fprintf (vect_dump, "vect_model_simple_cost: inside_cost = %d, "
             "outside_cost = %d .", inside_cost, outside_cost);

  /* Set the costs either in STMT_INFO or SLP_NODE (if exists).  */
  stmt_vinfo_set_inside_of_loop_cost (stmt_info, slp_node, inside_cost);
  stmt_vinfo_set_outside_of_loop_cost (stmt_info, slp_node, outside_cost);
}
/* Function vect_cost_strided_group_size

   For strided load or store, return the group_size only if it is the first
   load or store of a group, else return 1.  This ensures that group size is
   only returned once per group.  */

static int
vect_cost_strided_group_size (stmt_vec_info stmt_info)
{
  gimple first_stmt = DR_GROUP_FIRST_DR (stmt_info);

  if (first_stmt == STMT_VINFO_STMT (stmt_info))
    return DR_GROUP_SIZE (stmt_info);

  return 1;
}
/* Function vect_model_store_cost.

   Models cost for stores.  In the case of strided accesses, one access
   has the overhead of the strided access attributed to it.  */

void
vect_model_store_cost (stmt_vec_info stmt_info, int ncopies,
                       enum vect_def_type dt, slp_tree slp_node)
{
  int group_size;
  int inside_cost = 0, outside_cost = 0;

  /* The SLP costs were already calculated during SLP tree build.  */
  if (PURE_SLP_STMT (stmt_info))
    return;

  if (dt == vect_constant_def || dt == vect_invariant_def)
    outside_cost = TARG_SCALAR_TO_VEC_COST;
  /* Strided access?  */
  if (DR_GROUP_FIRST_DR (stmt_info) && !slp_node)
    group_size = vect_cost_strided_group_size (stmt_info);
  /* Not a strided access.  */
  else
    group_size = 1;

  /* Is this an access in a group of stores, which provide strided access?
     If so, add in the cost of the permutes.  */
  if (group_size > 1)
    {
      /* Uses a high and low interleave operation for each needed permute.  */
      inside_cost = ncopies * exact_log2 (group_size) * group_size
                    * TARG_VEC_STMT_COST;

      if (vect_print_dump_info (REPORT_COST))
        fprintf (vect_dump, "vect_model_store_cost: strided group_size = %d .",
                 group_size);
    }
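  /* For example (hypothetical): interleaving a group of 4 stores with
     ncopies == 1 takes log2 (4) * 4 == 8 high/low interleave stmts.  */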
  /* Costs of the stores.  */
  inside_cost += ncopies * TARG_VEC_STORE_COST;

  if (vect_print_dump_info (REPORT_COST))
    fprintf (vect_dump, "vect_model_store_cost: inside_cost = %d, "
             "outside_cost = %d .", inside_cost, outside_cost);

  /* Set the costs either in STMT_INFO or SLP_NODE (if exists).  */
  stmt_vinfo_set_inside_of_loop_cost (stmt_info, slp_node, inside_cost);
  stmt_vinfo_set_outside_of_loop_cost (stmt_info, slp_node, outside_cost);
}
/* Function vect_model_load_cost.

   Models cost for loads.  In the case of strided accesses, the last access
   has the overhead of the strided access attributed to it.  Since unaligned
   accesses are supported for loads, we also account for the costs of the
   access scheme chosen.  */

void
vect_model_load_cost (stmt_vec_info stmt_info, int ncopies, slp_tree slp_node)
{
  int group_size;
  int alignment_support_scheme;
  gimple first_stmt;
  struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info), *first_dr;
  int inside_cost = 0, outside_cost = 0;

  /* The SLP costs were already calculated during SLP tree build.  */
  if (PURE_SLP_STMT (stmt_info))
    return;
  /* Strided accesses?  */
  first_stmt = DR_GROUP_FIRST_DR (stmt_info);
  if (first_stmt && !slp_node)
    {
      group_size = vect_cost_strided_group_size (stmt_info);
      first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
    }
  /* Not a strided access.  */
  else
    {
      group_size = 1;
      first_dr = dr;
    }
  alignment_support_scheme = vect_supportable_dr_alignment (first_dr);

  /* Is this an access in a group of loads providing strided access?
     If so, add in the cost of the permutes.  */
  if (group_size > 1)
    {
      /* Uses even and odd extract operations for each needed permute.  */
      inside_cost = ncopies * exact_log2 (group_size) * group_size
                    * TARG_VEC_STMT_COST;

      if (vect_print_dump_info (REPORT_COST))
        fprintf (vect_dump, "vect_model_load_cost: strided group_size = %d .",
                 group_size);
    }
  /* The loads themselves.  */
  switch (alignment_support_scheme)
    {
    case dr_aligned:
      {
        inside_cost += ncopies * TARG_VEC_LOAD_COST;

        if (vect_print_dump_info (REPORT_COST))
          fprintf (vect_dump, "vect_model_load_cost: aligned.");

        break;
      }
    case dr_unaligned_supported:
      {
        /* Here, we assign an additional cost for the unaligned load.  */
        inside_cost += ncopies * TARG_VEC_UNALIGNED_LOAD_COST;

        if (vect_print_dump_info (REPORT_COST))
          fprintf (vect_dump, "vect_model_load_cost: unaligned supported by "
                   "hardware.");

        break;
      }
    case dr_explicit_realign:
      {
        inside_cost += ncopies * (2 * TARG_VEC_LOAD_COST
                                  + TARG_VEC_STMT_COST);

        /* FIXME: If the misalignment remains fixed across the iterations of
           the containing loop, the following cost should be added to the
           outside costs.  */
        if (targetm.vectorize.builtin_mask_for_load)
          inside_cost += TARG_VEC_STMT_COST;

        break;
      }
    case dr_explicit_realign_optimized:
      {
        if (vect_print_dump_info (REPORT_COST))
          fprintf (vect_dump, "vect_model_load_cost: unaligned software "
                   "pipelined.");

        /* Unaligned software pipeline has a load of an address, an initial
           load, and possibly a mask operation to "prime" the loop.  However,
           if this is an access in a group of loads, which provide strided
           access, then the above cost should only be considered for one
           access in the group.  Inside the loop, there is a load op
           and a realignment op.  */

        if ((!DR_GROUP_FIRST_DR (stmt_info)) || group_size > 1 || slp_node)
          {
            outside_cost = 2 * TARG_VEC_STMT_COST;
            if (targetm.vectorize.builtin_mask_for_load)
              outside_cost += TARG_VEC_STMT_COST;
          }

        inside_cost += ncopies * (TARG_VEC_LOAD_COST + TARG_VEC_STMT_COST);

        break;
      }

    default:
      gcc_unreachable ();
    }
  if (vect_print_dump_info (REPORT_COST))
    fprintf (vect_dump, "vect_model_load_cost: inside_cost = %d, "
             "outside_cost = %d .", inside_cost, outside_cost);

  /* Set the costs either in STMT_INFO or SLP_NODE (if exists).  */
  stmt_vinfo_set_inside_of_loop_cost (stmt_info, slp_node, inside_cost);
  stmt_vinfo_set_outside_of_loop_cost (stmt_info, slp_node, outside_cost);
}
/* Function vect_get_new_vect_var.

   Returns a name for a new variable.  The current naming scheme appends the
   prefix "vect_" or "vect_p" (depending on the value of VAR_KIND) to
   the name of vectorizer generated variables, and appends that to NAME if
   provided.  */

static tree
vect_get_new_vect_var (tree type, enum vect_var_kind var_kind,
                       const char *name)
{
  const char *prefix;
  tree new_vect_var;

  switch (var_kind)
    {
    case vect_simple_var:
      prefix = "vect_";
      break;
    case vect_scalar_var:
      prefix = "stmp_";
      break;
    case vect_pointer_var:
      prefix = "vect_p";
      break;
    default:
      gcc_unreachable ();
    }

  if (name)
    {
      char* tmp = concat (prefix, name, NULL);
      new_vect_var = create_tmp_var (type, tmp);
      free (tmp);
    }
  else
    new_vect_var = create_tmp_var (type, prefix);

  /* Mark vector typed variable as a gimple register variable.  */
  if (TREE_CODE (type) == VECTOR_TYPE)
    DECL_GIMPLE_REG_P (new_vect_var) = true;

  return new_vect_var;
}
/* Function vect_create_addr_base_for_vector_ref.

   Create an expression that computes the address of the first memory location
   that will be accessed for a data reference.

   Input:
   STMT: The statement containing the data reference.
   NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
   OFFSET: Optional.  If supplied, it is added to the initial address.
   LOOP: Specify relative to which loop-nest should the address be computed.
         For example, when the dataref is in an inner-loop nested in an
         outer-loop that is now being vectorized, LOOP can be either the
         outer-loop, or the inner-loop.  The first memory location accessed
         by the following dataref ('in' points to short):

           for (i = 0; i < N; i++)
             for (j = 0; j < M; j++)
               s += in[i+j]

         is as follows:
         if LOOP=i_loop: &in        (relative to i_loop)
         if LOOP=j_loop: &in+i*2B   (relative to j_loop)

   Output:
   1. Return an SSA_NAME whose value is the address of the memory location of
      the first vector of the data reference.
   2. If new_stmt_list is not NULL_TREE after return then the caller must
      insert these statement(s) which define the returned SSA_NAME.

   FORNOW: We are only handling array accesses with step 1.  */
static tree
vect_create_addr_base_for_vector_ref (gimple stmt,
                                      gimple_seq *new_stmt_list,
                                      tree offset,
                                      struct loop *loop)
{
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
  struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
  tree data_ref_base = unshare_expr (DR_BASE_ADDRESS (dr));
  tree base_name;
  tree data_ref_base_var;
  tree vec_stmt;
  tree addr_base, addr_expr;
  tree dest;
  gimple_seq seq = NULL;
  tree base_offset = unshare_expr (DR_OFFSET (dr));
  tree init = unshare_expr (DR_INIT (dr));
  tree vect_ptr_type, addr_expr2;
  tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));

  gcc_assert (loop);
  if (loop != containing_loop)
    {
      loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
      struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);

      gcc_assert (nested_in_vect_loop_p (loop, stmt));

      data_ref_base = unshare_expr (STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
      base_offset = unshare_expr (STMT_VINFO_DR_OFFSET (stmt_info));
      init = unshare_expr (STMT_VINFO_DR_INIT (stmt_info));
    }
  /* Create data_ref_base.  */
  base_name = build_fold_indirect_ref (data_ref_base);
  data_ref_base_var = create_tmp_var (TREE_TYPE (data_ref_base), "batmp");
  add_referenced_var (data_ref_base_var);
  data_ref_base = force_gimple_operand (data_ref_base, &seq, true,
                                        data_ref_base_var);
  gimple_seq_add_seq (new_stmt_list, seq);

  /* Create base_offset.  */
  base_offset = size_binop (PLUS_EXPR,
                            fold_convert (sizetype, base_offset),
                            fold_convert (sizetype, init));
  dest = create_tmp_var (sizetype, "base_off");
  add_referenced_var (dest);
  base_offset = force_gimple_operand (base_offset, &seq, true, dest);
  gimple_seq_add_seq (new_stmt_list, seq);
  if (offset)
    {
      tree tmp = create_tmp_var (sizetype, "offset");

      add_referenced_var (tmp);
      offset = fold_build2 (MULT_EXPR, sizetype,
                            fold_convert (sizetype, offset), step);
      base_offset = fold_build2 (PLUS_EXPR, sizetype,
                                 base_offset, offset);
      base_offset = force_gimple_operand (base_offset, &seq, false, tmp);
      gimple_seq_add_seq (new_stmt_list, seq);
    }
  /* base + base_offset.  */
  addr_base = fold_build2 (POINTER_PLUS_EXPR, TREE_TYPE (data_ref_base),
                           data_ref_base, base_offset);

  vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));

  /* addr_expr = addr_base.  */
  addr_expr = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var,
                                     get_name (base_name));
  add_referenced_var (addr_expr);
  vec_stmt = fold_convert (vect_ptr_type, addr_base);
  addr_expr2 = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var,
                                      get_name (base_name));
  add_referenced_var (addr_expr2);
  vec_stmt = force_gimple_operand (vec_stmt, &seq, false, addr_expr2);
  gimple_seq_add_seq (new_stmt_list, seq);

  if (vect_print_dump_info (REPORT_DETAILS))
    {
      fprintf (vect_dump, "created ");
      print_generic_expr (vect_dump, vec_stmt, TDF_SLIM);
    }

  return vec_stmt;
}
/* Function vect_create_data_ref_ptr.

   Create a new pointer to vector type (vp), that points to the first location
   accessed in the loop by STMT, along with the def-use update chain to
   appropriately advance the pointer through the loop iterations.  Also set
   aliasing information for the pointer.  This vector pointer is used by the
   callers to this function to create a memory reference expression for vector
   load/store access.

   Input:
   1. STMT: a stmt that references memory.  Expected to be of the form
         GIMPLE_ASSIGN <name, data-ref> or
         GIMPLE_ASSIGN <data-ref, name>.
   2. AT_LOOP: the loop where the vector memref is to be created.
   3. OFFSET (optional): an offset to be added to the initial address accessed
        by the data-ref in STMT.
   4. ONLY_INIT: indicate if vp is to be updated in the loop, or remain
        pointing to the initial address.
   5. TYPE: if not NULL indicates the required type of the data-ref.

   Output:
   1. Declare a new ptr to vector_type, and have it point to the base of the
      data reference (initial address accessed by the data reference).
      For example, for vector of type V8HI, the following code is generated:

        v8hi *vp;
        vp = (v8hi *)initial_address;

      if OFFSET is not supplied:
         initial_address = &a[init];
      if OFFSET is supplied:
         initial_address = &a[init + OFFSET];

      Return the initial_address in INITIAL_ADDRESS.

   2. If ONLY_INIT is true, just return the initial pointer.  Otherwise, also
      update the pointer in each iteration of the loop.

      Return the increment stmt that updates the pointer in PTR_INCR.

   3. Set INV_P to true if the access pattern of the data reference in the
      vectorized loop is invariant.  Set it to false otherwise.

   4. Return the pointer.  */
static tree
vect_create_data_ref_ptr (gimple stmt, struct loop *at_loop,
                          tree offset, tree *initial_address, gimple *ptr_incr,
                          bool only_init, bool *inv_p, tree type)
{
  tree base_name;
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  bool nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
  struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
  tree vect_ptr_type;
  tree vect_ptr;
  tree tag;
  tree new_temp;
  gimple vec_stmt;
  gimple_seq new_stmt_list = NULL;
  edge pe;
  basic_block new_bb;
  tree vect_ptr_init;
  struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
  tree vptr;
  gimple_stmt_iterator incr_gsi;
  bool insert_after;
  tree indx_before_incr, indx_after_incr;
  gimple incr;
  tree step;
  /* Check the step (evolution) of the load in LOOP, and record
     whether it's invariant.  */
  if (nested_in_vect_loop)
    step = STMT_VINFO_DR_STEP (stmt_info);
  else
    step = DR_STEP (STMT_VINFO_DATA_REF (stmt_info));

  if (tree_int_cst_compare (step, size_zero_node) == 0)
    *inv_p = true;
  else
    *inv_p = false;

  /* Create an expression for the first address accessed by this load
     in LOOP.  */
  base_name = build_fold_indirect_ref (unshare_expr (DR_BASE_ADDRESS (dr)));
  if (vect_print_dump_info (REPORT_DETAILS))
    {
      tree data_ref_base = base_name;
      fprintf (vect_dump, "create vector-pointer variable to type: ");
      print_generic_expr (vect_dump, vectype, TDF_SLIM);
      if (TREE_CODE (data_ref_base) == VAR_DECL)
        fprintf (vect_dump, " vectorizing a one dimensional array ref: ");
      else if (TREE_CODE (data_ref_base) == ARRAY_REF)
        fprintf (vect_dump, " vectorizing a multidimensional array ref: ");
      else if (TREE_CODE (data_ref_base) == COMPONENT_REF)
        fprintf (vect_dump, " vectorizing a record based array ref: ");
      else if (TREE_CODE (data_ref_base) == SSA_NAME)
        fprintf (vect_dump, " vectorizing a pointer ref: ");
      print_generic_expr (vect_dump, base_name, TDF_SLIM);
    }
  /** (1) Create the new vector-pointer variable:  **/
  if (type)
    vect_ptr_type = build_pointer_type (type);
  else
    vect_ptr_type = build_pointer_type (vectype);

  if (TREE_CODE (DR_BASE_ADDRESS (dr)) == SSA_NAME
      && TYPE_RESTRICT (TREE_TYPE (DR_BASE_ADDRESS (dr))))
    vect_ptr_type = build_qualified_type (vect_ptr_type, TYPE_QUAL_RESTRICT);
  vect_ptr = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var,
                                    get_name (base_name));
  if (TREE_CODE (DR_BASE_ADDRESS (dr)) == SSA_NAME
      && TYPE_RESTRICT (TREE_TYPE (DR_BASE_ADDRESS (dr))))
    {
      get_alias_set (base_name);
      DECL_POINTER_ALIAS_SET (vect_ptr)
        = DECL_POINTER_ALIAS_SET (SSA_NAME_VAR (DR_BASE_ADDRESS (dr)));
    }
  add_referenced_var (vect_ptr);

  /** (2) Add aliasing information to the new vector-pointer:
          (The points-to info (DR_PTR_INFO) may be defined later.)  **/

  tag = DR_SYMBOL_TAG (dr);
  gcc_assert (tag);

  /* If tag is a variable (and NOT_A_TAG) then a new symbol memory
     tag must be created with tag added to its may alias list.  */
  if (!MTAG_P (tag))
    new_type_alias (vect_ptr, tag, DR_REF (dr));
  else
    set_symbol_mem_tag (vect_ptr, tag);
  /** Note: If the dataref is in an inner-loop nested in LOOP, and we are
      vectorizing LOOP (i.e. outer-loop vectorization), we need to create two
      def-use update cycles for the pointer: One relative to the outer-loop
      (LOOP), which is what steps (3) and (4) below do.  The other is relative
      to the inner-loop (which is the inner-most loop containing the dataref),
      and this is done by step (5) below.

      When vectorizing inner-most loops, the vectorized loop (LOOP) is also
      the inner-most loop, and so steps (3),(4) work the same, and step (5)
      is redundant.  Steps (3),(4) create the following:

        vp0 = &base_addr;
        LOOP:   vp1 = phi(vp0,vp2)
                ...
                ...
                vp2 = vp1 + step
                goto LOOP

      If there is an inner-loop nested in loop, then step (5) will also be
      applied, and an additional update in the inner-loop will be created:

        vp0 = &base_addr;
        LOOP:   vp1 = phi(vp0,vp2)
                ...
        inner:     vp3 = phi(vp1,vp4)
                   vp4 = vp3 + inner_step
                   if () goto inner
                ...
                vp2 = vp1 + step
                if () goto LOOP   */
  /** (3) Calculate the initial address of the vector-pointer, and set
          the vector-pointer to point to it before the loop:  **/

  /* Create: (&(base[init_val+offset]) in the loop preheader.  */

  new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
                                                   offset, loop);
  pe = loop_preheader_edge (loop);
  if (new_stmt_list)
    {
      new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
      gcc_assert (!new_bb);
    }
  *initial_address = new_temp;

  /* Create: p = (vectype *) initial_base.  */
  vec_stmt = gimple_build_assign (vect_ptr,
                                  fold_convert (vect_ptr_type, new_temp));
  vect_ptr_init = make_ssa_name (vect_ptr, vec_stmt);
  gimple_assign_set_lhs (vec_stmt, vect_ptr_init);
  new_bb = gsi_insert_on_edge_immediate (pe, vec_stmt);
  gcc_assert (!new_bb);
  /** (4) Handle the updating of the vector-pointer inside the loop.
          This is needed when ONLY_INIT is false, and also when AT_LOOP
          is the inner-loop nested in LOOP (during outer-loop
          vectorization).  **/

  if (only_init && at_loop == loop) /* No update in loop is required.  */
    {
      /* Copy the points-to information if it exists.  */
      if (DR_PTR_INFO (dr))
        duplicate_ssa_name_ptr_info (vect_ptr_init, DR_PTR_INFO (dr));
      vptr = vect_ptr_init;
    }
  else
    {
      /* The step of the vector pointer is the Vector Size.  */
      tree step = TYPE_SIZE_UNIT (vectype);
      /* One exception to the above is when the scalar step of the load in
         LOOP is zero.  In this case the step here is also zero.  */
      if (*inv_p)
        step = size_zero_node;

      standard_iv_increment_position (loop, &incr_gsi, &insert_after);

      create_iv (vect_ptr_init,
                 fold_convert (vect_ptr_type, step),
                 vect_ptr, loop, &incr_gsi, insert_after,
                 &indx_before_incr, &indx_after_incr);
      incr = gsi_stmt (incr_gsi);
      set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));

      /* Copy the points-to information if it exists.  */
      if (DR_PTR_INFO (dr))
        {
          duplicate_ssa_name_ptr_info (indx_before_incr, DR_PTR_INFO (dr));
          duplicate_ssa_name_ptr_info (indx_after_incr, DR_PTR_INFO (dr));
        }
      merge_alias_info (vect_ptr_init, indx_before_incr);
      merge_alias_info (vect_ptr_init, indx_after_incr);
      if (ptr_incr)
        *ptr_incr = incr;

      vptr = indx_before_incr;
    }
  if (!nested_in_vect_loop || only_init)
    return vptr;

  /** (5) Handle the updating of the vector-pointer inside the inner-loop
          nested in LOOP, if exists:  **/

  gcc_assert (nested_in_vect_loop);

  standard_iv_increment_position (containing_loop, &incr_gsi,
                                  &insert_after);
  create_iv (vptr, fold_convert (vect_ptr_type, DR_STEP (dr)), vect_ptr,
             containing_loop, &incr_gsi, insert_after, &indx_before_incr,
             &indx_after_incr);
  incr = gsi_stmt (incr_gsi);
  set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));

  /* Copy the points-to information if it exists.  */
  if (DR_PTR_INFO (dr))
    {
      duplicate_ssa_name_ptr_info (indx_before_incr, DR_PTR_INFO (dr));
      duplicate_ssa_name_ptr_info (indx_after_incr, DR_PTR_INFO (dr));
    }
  merge_alias_info (vect_ptr_init, indx_before_incr);
  merge_alias_info (vect_ptr_init, indx_after_incr);
  if (ptr_incr)
    *ptr_incr = incr;

  return indx_before_incr;
}
/* Function bump_vector_ptr

   Increment a pointer (to a vector type) by vector-size.  If requested,
   i.e. if PTR-INCR is given, then also connect the new increment stmt
   to the existing def-use update-chain of the pointer, by modifying
   the PTR_INCR as illustrated below:

   The pointer def-use update-chain before this function:
                        DATAREF_PTR = phi (p_0, p_2)
                        ....
        PTR_INCR:       p_2 = DATAREF_PTR + step

   The pointer def-use update-chain after this function:
                        DATAREF_PTR = phi (p_0, p_2)
                        ....
                        NEW_DATAREF_PTR = DATAREF_PTR + BUMP
                        ....
        PTR_INCR:       p_2 = NEW_DATAREF_PTR + step

   Input:
   DATAREF_PTR - ssa_name of a pointer (to vector type) that is being
                 updated in the loop.
   PTR_INCR - optional.  The stmt that updates the pointer in each iteration
              of the loop.  The increment amount across iterations is
              expected to be vector_size.
   BSI - location where the new update stmt is to be placed.
   STMT - the original scalar memory-access stmt that is being vectorized.
   BUMP - optional.  The offset by which to bump the pointer.  If not given,
          the offset is assumed to be vector_size.

   Output: Return NEW_DATAREF_PTR as illustrated above.  */
static tree
bump_vector_ptr (tree dataref_ptr, gimple ptr_incr, gimple_stmt_iterator *gsi,
                 gimple stmt, tree bump)
{
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
  tree ptr_var = SSA_NAME_VAR (dataref_ptr);
  tree update = TYPE_SIZE_UNIT (vectype);
  gimple incr_stmt;
  ssa_op_iter iter;
  use_operand_p use_p;
  tree new_dataref_ptr;

  if (bump)
    update = bump;

  incr_stmt = gimple_build_assign_with_ops (POINTER_PLUS_EXPR, ptr_var,
                                            dataref_ptr, update);
  new_dataref_ptr = make_ssa_name (ptr_var, incr_stmt);
  gimple_assign_set_lhs (incr_stmt, new_dataref_ptr);
  vect_finish_stmt_generation (stmt, incr_stmt, gsi);
  /* Copy the points-to information if it exists.  */
  if (DR_PTR_INFO (dr))
    duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
  merge_alias_info (new_dataref_ptr, dataref_ptr);

  if (!ptr_incr)
    return new_dataref_ptr;
  /* Update the vector-pointer's cross-iteration increment.  */
  FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
    {
      tree use = USE_FROM_PTR (use_p);

      if (use == dataref_ptr)
        SET_USE (use_p, new_dataref_ptr);
      else
        gcc_assert (tree_int_cst_compare (use, update) == 0);
    }

  return new_dataref_ptr;
}
/* Function vect_create_destination_var.

   Create a new temporary of type VECTYPE.  */

static tree
vect_create_destination_var (tree scalar_dest, tree vectype)
{
  tree vec_dest;
  const char *new_name;
  tree type;
  enum vect_var_kind kind;

  kind = vectype ? vect_simple_var : vect_scalar_var;
  type = vectype ? vectype : TREE_TYPE (scalar_dest);

  gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);

  new_name = get_name (scalar_dest);
  if (!new_name)
    new_name = "var_";
  vec_dest = vect_get_new_vect_var (type, kind, new_name);
  add_referenced_var (vec_dest);

  return vec_dest;
}
/* Function vect_init_vector.

   Insert a new stmt (INIT_STMT) that initializes a new vector variable with
   the vector elements of VECTOR_VAR.  Place the initialization at BSI if it
   is not NULL.  Otherwise, place the initialization at the loop preheader.
   Return the DEF of INIT_STMT.
   It will be used in the vectorization of STMT.  */

static tree
vect_init_vector (gimple stmt, tree vector_var, tree vector_type,
                  gimple_stmt_iterator *gsi)
{
  stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
  tree new_var;
  gimple init_stmt;
  tree vec_oprnd;
  edge pe;
  tree new_temp;
  basic_block new_bb;

  new_var = vect_get_new_vect_var (vector_type, vect_simple_var, "cst_");
  add_referenced_var (new_var);
  init_stmt = gimple_build_assign (new_var, vector_var);
  new_temp = make_ssa_name (new_var, init_stmt);
  gimple_assign_set_lhs (init_stmt, new_temp);

  if (gsi)
    vect_finish_stmt_generation (stmt, init_stmt, gsi);
  else
    {
      loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
      struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);

      if (nested_in_vect_loop_p (loop, stmt))
        loop = loop->inner;
      pe = loop_preheader_edge (loop);
      new_bb = gsi_insert_on_edge_immediate (pe, init_stmt);
      gcc_assert (!new_bb);
    }

  if (vect_print_dump_info (REPORT_DETAILS))
    {
      fprintf (vect_dump, "created new init_stmt: ");
      print_gimple_stmt (vect_dump, init_stmt, 0, TDF_SLIM);
    }

  vec_oprnd = gimple_assign_lhs (init_stmt);
  return vec_oprnd;
}
/* For constant and loop invariant defs of SLP_NODE this function returns
   (vector) defs (VEC_OPRNDS) that will be used in the vectorized stmts.
   OP_NUM determines if we gather defs for operand 0 or operand 1 of the
   scalar stmts.  NUMBER_OF_VECTORS is the number of vector defs to create.  */

static void
vect_get_constant_vectors (slp_tree slp_node, VEC(tree,heap) **vec_oprnds,
                           unsigned int op_num, unsigned int number_of_vectors)
{
  VEC (gimple, heap) *stmts = SLP_TREE_SCALAR_STMTS (slp_node);
  gimple stmt = VEC_index (gimple, stmts, 0);
  stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
  tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
  int nunits;
  tree vec_cst;
  tree t = NULL_TREE;
  int j, number_of_places_left_in_vector;
  tree vector_type;
  tree op, vop;
  int group_size = VEC_length (gimple, stmts);
  unsigned int vec_num, i;
  int number_of_copies = 1;
  VEC (tree, heap) *voprnds = VEC_alloc (tree, heap, number_of_vectors);
  bool constant_p, is_store;

  if (STMT_VINFO_DATA_REF (stmt_vinfo))
    {
      is_store = true;
      op = gimple_assign_rhs1 (stmt);
    }
  else
    {
      is_store = false;
      op = gimple_op (stmt, op_num + 1);
    }

  if (CONSTANT_CLASS_P (op))
    {
      vector_type = vectype;
      constant_p = true;
    }
  else
    {
      vector_type = get_vectype_for_scalar_type (TREE_TYPE (op));
      gcc_assert (vector_type);
      constant_p = false;
    }

  nunits = TYPE_VECTOR_SUBPARTS (vector_type);
  /* NUMBER_OF_COPIES is the number of times we need to use the same values in
     created vectors.  It is greater than 1 if unrolling is performed.

     For example, we have two scalar operands, s1 and s2 (e.g., group of
     strided accesses of size two), while NUNITS is four (i.e., four scalars
     of this type can be packed in a vector).  The output vector will contain
     two copies of each scalar operand: {s1, s2, s1, s2}.  (NUMBER_OF_COPIES
     is 2.)

     If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
     containing the operands.

     For example, NUNITS is four as before, and the group size is 8
     (s1, s2, ..., s8).  We will create two vectors {s1, s2, s3, s4} and
     {s5, s6, s7, s8}.  */

  number_of_copies = least_common_multiple (nunits, group_size) / group_size;

  number_of_places_left_in_vector = nunits;
  for (j = 0; j < number_of_copies; j++)
    {
      for (i = group_size - 1; VEC_iterate (gimple, stmts, i, stmt); i--)
        {
          if (is_store)
            op = gimple_assign_rhs1 (stmt);
          else
            op = gimple_op (stmt, op_num + 1);

          /* Create 'vect_ = {op0,op1,...,opn}'.  */
          t = tree_cons (NULL_TREE, op, t);

          number_of_places_left_in_vector--;

          if (number_of_places_left_in_vector == 0)
            {
              number_of_places_left_in_vector = nunits;

              if (constant_p)
                vec_cst = build_vector (vector_type, t);
              else
                vec_cst = build_constructor_from_list (vector_type, t);
              VEC_quick_push (tree, voprnds,
                              vect_init_vector (stmt, vec_cst,
                                                vector_type, NULL));
              t = NULL_TREE;
            }
        }
    }
  /* Since the vectors are created in the reverse order, we should invert
     them.  */
  vec_num = VEC_length (tree, voprnds);
  for (j = vec_num - 1; j >= 0; j--)
    {
      vop = VEC_index (tree, voprnds, j);
      VEC_quick_push (tree, *vec_oprnds, vop);
    }

  VEC_free (tree, heap, voprnds);
  /* In case that VF is greater than the unrolling factor needed for the SLP
     group of stmts, NUMBER_OF_VECTORS to be created is greater than
     NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
     to replicate the vectors.  */
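  /* For example (hypothetical): with nunits == 4 and group_size == 2, one
     full pass above builds a single vector {s1, s2, s1, s2}; if
     NUMBER_OF_VECTORS is 2 (e.g., VF == 8 with 4-element vectors), the loop
     below pushes that vector again.  */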
  while (number_of_vectors > VEC_length (tree, *vec_oprnds))
    {
      for (i = 0; VEC_iterate (tree, *vec_oprnds, i, vop) && i < vec_num; i++)
        VEC_quick_push (tree, *vec_oprnds, vop);
    }
}
/* Get vectorized definitions from SLP_NODE that contains corresponding
   vectorized def-stmts.  */

static void
vect_get_slp_vect_defs (slp_tree slp_node, VEC (tree,heap) **vec_oprnds)
{
  tree vec_oprnd;
  gimple vec_def_stmt;
  unsigned int i;

  gcc_assert (SLP_TREE_VEC_STMTS (slp_node));

  for (i = 0;
       VEC_iterate (gimple, SLP_TREE_VEC_STMTS (slp_node), i, vec_def_stmt);
       i++)
    {
      gcc_assert (vec_def_stmt);
      vec_oprnd = gimple_get_lhs (vec_def_stmt);
      VEC_quick_push (tree, *vec_oprnds, vec_oprnd);
    }
}
/* Get vectorized definitions for SLP_NODE.
   If the scalar definitions are loop invariants or constants, collect them
   and call vect_get_constant_vectors() to create vector stmts.
   Otherwise, the def-stmts must be already vectorized and the vectorized
   stmts must be stored in the LEFT/RIGHT node of SLP_NODE, and we call
   vect_get_slp_vect_defs() to retrieve them.
   If VEC_OPRNDS1 is NULL, don't get vector defs for the second operand (from
   the right node).  This is used when the second operand must remain
   scalar.  */
void
vect_get_slp_defs (slp_tree slp_node, VEC (tree,heap) **vec_oprnds0,
                   VEC (tree,heap) **vec_oprnds1)
{
  gimple first_stmt;
  enum tree_code code;
  int number_of_vects;
  HOST_WIDE_INT lhs_size_unit, rhs_size_unit;

  first_stmt = VEC_index (gimple, SLP_TREE_SCALAR_STMTS (slp_node), 0);
  /* The number of vector defs is determined by the number of vector
     statements in the node from which we get those statements.  */
  if (SLP_TREE_LEFT (slp_node))
    number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_LEFT (slp_node));
  else
    {
      number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
      /* Number of vector stmts was calculated according to LHS in
         vect_schedule_slp_instance(), fix it by replacing LHS with RHS, if
         necessary.  See vect_get_smallest_scalar_type() for details.  */
      vect_get_smallest_scalar_type (first_stmt, &lhs_size_unit,
                                     &rhs_size_unit);
      if (rhs_size_unit != lhs_size_unit)
        {
          number_of_vects *= rhs_size_unit;
          number_of_vects /= lhs_size_unit;
        }
    }
  /* Allocate memory for vectorized defs.  */
  *vec_oprnds0 = VEC_alloc (tree, heap, number_of_vects);

  /* SLP_NODE corresponds either to a group of stores or to a group of
     unary/binary operations.  We don't call this function for loads.  */
  if (SLP_TREE_LEFT (slp_node))
    /* The defs are already vectorized.  */
    vect_get_slp_vect_defs (SLP_TREE_LEFT (slp_node), vec_oprnds0);
  else
    /* Build vectors from scalar defs.  */
    vect_get_constant_vectors (slp_node, vec_oprnds0, 0, number_of_vects);

  if (STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt)))
    /* Since we don't call this function with loads, this is a group of
       stores.  */
    return;

  code = gimple_assign_rhs_code (first_stmt);
  if (get_gimple_rhs_class (code) != GIMPLE_BINARY_RHS || !vec_oprnds1)
    return;
  /* The number of vector defs is determined by the number of vector
     statements in the node from which we get those statements.  */
  if (SLP_TREE_RIGHT (slp_node))
    number_of_vects
      = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_RIGHT (slp_node));
  else
    number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);

  *vec_oprnds1 = VEC_alloc (tree, heap, number_of_vects);

  if (SLP_TREE_RIGHT (slp_node))
    /* The defs are already vectorized.  */
    vect_get_slp_vect_defs (SLP_TREE_RIGHT (slp_node), vec_oprnds1);
  else
    /* Build vectors from scalar defs.  */
    vect_get_constant_vectors (slp_node, vec_oprnds1, 1, number_of_vects);
}
/* Function get_initial_def_for_induction

   Input:
   STMT - a stmt that performs an induction operation in the loop.
   IV_PHI - the initial value of the induction variable

   Output:
   Return a vector variable, initialized with the first VF values of
   the induction variable.  E.g., for an iv with IV_PHI='X' and
   evolution S, for a vector of 4 units, we want to return:
   [X, X + S, X + 2*S, X + 3*S].  */
static tree
get_initial_def_for_induction (gimple iv_phi)
{
  stmt_vec_info stmt_vinfo = vinfo_for_stmt (iv_phi);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  tree scalar_type = TREE_TYPE (gimple_phi_result (iv_phi));
  tree vectype;
  int nunits;
  edge pe = loop_preheader_edge (loop);
  struct loop *iv_loop;
  basic_block new_bb;
  tree vec, vec_init, vec_step, t;
  tree access_fn;
  tree new_var;
  tree new_name;
  gimple init_stmt, induction_phi, new_stmt;
  tree induc_def, vec_def, vec_dest;
  tree init_expr, step_expr;
  int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
  int i;
  bool ok;
  int ncopies;
  tree expr;
  stmt_vec_info phi_info = vinfo_for_stmt (iv_phi);
  bool nested_in_vect_loop = false;
  gimple_seq stmts = NULL;
  imm_use_iterator imm_iter;
  use_operand_p use_p;
  gimple exit_phi;
  edge latch_e;
  tree loop_arg;
  gimple_stmt_iterator si;
  basic_block bb = gimple_bb (iv_phi);

  vectype = get_vectype_for_scalar_type (scalar_type);
  gcc_assert (vectype);
  nunits = TYPE_VECTOR_SUBPARTS (vectype);
  ncopies = vf / nunits;

  gcc_assert (phi_info);
  gcc_assert (ncopies >= 1);

  /* Find the first insertion point in the BB.  */
  si = gsi_after_labels (bb);
  if (INTEGRAL_TYPE_P (scalar_type) || POINTER_TYPE_P (scalar_type))
    step_expr = build_int_cst (scalar_type, 0);
  else
    step_expr = build_real (scalar_type, dconst0);

  /* Is phi in an inner-loop, while vectorizing an enclosing outer-loop?  */
  if (nested_in_vect_loop_p (loop, iv_phi))
    {
      nested_in_vect_loop = true;
      iv_loop = loop->inner;
    }
  else
    iv_loop = loop;
  gcc_assert (iv_loop == (gimple_bb (iv_phi))->loop_father);

  latch_e = loop_latch_edge (iv_loop);
  loop_arg = PHI_ARG_DEF_FROM_EDGE (iv_phi, latch_e);

  access_fn = analyze_scalar_evolution (iv_loop, PHI_RESULT (iv_phi));
  gcc_assert (access_fn);
  ok = vect_is_simple_iv_evolution (iv_loop->num, access_fn,
                                    &init_expr, &step_expr);
  gcc_assert (ok);
  pe = loop_preheader_edge (iv_loop);
  /* Create the vector that holds the initial_value of the induction.  */
  if (nested_in_vect_loop)
    {
      /* iv_loop is nested in the loop to be vectorized.  init_expr had
         already been created during vectorization of previous stmts; We
         obtain it from the STMT_VINFO_VEC_STMT of the defining stmt.  */
      tree iv_def = PHI_ARG_DEF_FROM_EDGE (iv_phi,
                                           loop_preheader_edge (iv_loop));
      vec_init = vect_get_vec_def_for_operand (iv_def, iv_phi, NULL);
    }
  else
    {
      /* iv_loop is the loop to be vectorized.  Create:
         vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr).  */
      new_var = vect_get_new_vect_var (scalar_type, vect_scalar_var, "var_");
      add_referenced_var (new_var);

      new_name = force_gimple_operand (init_expr, &stmts, false, new_var);
      if (stmts)
        {
          new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
          gcc_assert (!new_bb);
        }

      t = NULL_TREE;
      t = tree_cons (NULL_TREE, init_expr, t);
      for (i = 1; i < nunits; i++)
        {
          /* Create: new_name_i = new_name + step_expr.  */
          enum tree_code code = POINTER_TYPE_P (scalar_type)
                                ? POINTER_PLUS_EXPR : PLUS_EXPR;
          init_stmt = gimple_build_assign_with_ops (code, new_var,
                                                    new_name, step_expr);
          new_name = make_ssa_name (new_var, init_stmt);
          gimple_assign_set_lhs (init_stmt, new_name);

          new_bb = gsi_insert_on_edge_immediate (pe, init_stmt);
          gcc_assert (!new_bb);

          if (vect_print_dump_info (REPORT_DETAILS))
            {
              fprintf (vect_dump, "created new init_stmt: ");
              print_gimple_stmt (vect_dump, init_stmt, 0, TDF_SLIM);
            }
          t = tree_cons (NULL_TREE, new_name, t);
        }
      /* Create a vector from
         [new_name_0, new_name_1, ..., new_name_nunits-1].  */
      vec = build_constructor_from_list (vectype, nreverse (t));
      vec_init = vect_init_vector (iv_phi, vec, vectype, NULL);
    }
  /* Create the vector that holds the step of the induction.  */
  if (nested_in_vect_loop)
    /* iv_loop is nested in the loop to be vectorized.  Generate:
       vec_step = [S, S, S, S].  */
    new_name = step_expr;
  else
    {
      /* iv_loop is the loop to be vectorized.  Generate:
         vec_step = [VF*S, VF*S, VF*S, VF*S].  */
      expr = build_int_cst (scalar_type, vf);
      new_name = fold_build2 (MULT_EXPR, scalar_type, expr, step_expr);
    }

  t = NULL_TREE;
  for (i = 0; i < nunits; i++)
    t = tree_cons (NULL_TREE, unshare_expr (new_name), t);
  gcc_assert (CONSTANT_CLASS_P (new_name));
  vec = build_vector (vectype, t);
  vec_step = vect_init_vector (iv_phi, vec, vectype, NULL);
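  /* For example (hypothetical): with S == 1, VF == 4 and nunits == 4, the
     code above builds vec_step = {4, 4, 4, 4}, so adding vec_step to the
     vector IV advances every lane by one full vector iteration.  */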
  /* Create the following def-use cycle:
     loop prolog:
       vec_init = ...
       vec_step = ...
     loop:
       vec_iv = PHI <vec_init, vec_loop>
       ...
       STMT
       ...
       vec_loop = vec_iv + vec_step;  */

  /* Create the induction-phi that defines the induction-operand.  */
  vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
  add_referenced_var (vec_dest);
  induction_phi = create_phi_node (vec_dest, iv_loop->header);
  set_vinfo_for_stmt (induction_phi,
                      new_stmt_vec_info (induction_phi, loop_vinfo));
  induc_def = PHI_RESULT (induction_phi);

  /* Create the iv update inside the loop.  */
  new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, vec_dest,
                                           induc_def, vec_step);
  vec_def = make_ssa_name (vec_dest, new_stmt);
  gimple_assign_set_lhs (new_stmt, vec_def);
  gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
  set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo));

  /* Set the arguments of the phi node:  */
  add_phi_arg (induction_phi, vec_init, pe);
  add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop));
  /* In case that vectorization factor (VF) is bigger than the number
     of elements that we can fit in a vectype (nunits), we have to generate
     more than one vector stmt - i.e - we need to "unroll" the
     vector stmt by a factor VF/nunits.  For more details see documentation
     in vectorizable_operation.  */
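  /* For example (hypothetical): VF == 8 with 4-element vectors gives
     ncopies == 2, so besides the phi, one extra vector add is emitted per
     copy below, each advancing by vec_step = [nunits*S, ...].  */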
  if (ncopies > 1)
    {
      stmt_vec_info prev_stmt_vinfo;
      /* FORNOW.  This restriction should be relaxed.  */
      gcc_assert (!nested_in_vect_loop);

      /* Create the vector that holds the step of the induction.  */
      expr = build_int_cst (scalar_type, nunits);
      new_name = fold_build2 (MULT_EXPR, scalar_type, expr, step_expr);
      t = NULL_TREE;
      for (i = 0; i < nunits; i++)
        t = tree_cons (NULL_TREE, unshare_expr (new_name), t);
      gcc_assert (CONSTANT_CLASS_P (new_name));
      vec = build_vector (vectype, t);
      vec_step = vect_init_vector (iv_phi, vec, vectype, NULL);

      vec_def = induc_def;
      prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
      for (i = 1; i < ncopies; i++)
        {
          /* vec_i = vec_prev + vec_step.  */
          new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, vec_dest,
                                                   vec_def, vec_step);
          vec_def = make_ssa_name (vec_dest, new_stmt);
          gimple_assign_set_lhs (new_stmt, vec_def);

          gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
          set_vinfo_for_stmt (new_stmt,
                              new_stmt_vec_info (new_stmt, loop_vinfo));
          STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
          prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
        }
    }
  if (nested_in_vect_loop)
    {
      /* Find the loop-closed exit-phi of the induction, and record
         the final vector of induction results:  */
      exit_phi = NULL;
      FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
        {
          if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (USE_STMT (use_p))))
            {
              exit_phi = USE_STMT (use_p);
              break;
            }
        }
      if (exit_phi)
        {
          stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
          /* FORNOW.  Currently not supporting the case that an inner-loop
             induction is not used in the outer-loop (i.e. only outside the
             outer-loop).  */
          gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
                      && !STMT_VINFO_LIVE_P (stmt_vinfo));

          STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
          if (vect_print_dump_info (REPORT_DETAILS))
            {
              fprintf (vect_dump, "vector of inductions after inner-loop:");
              print_gimple_stmt (vect_dump, new_stmt, 0, TDF_SLIM);
            }
        }
    }
  if (vect_print_dump_info (REPORT_DETAILS))
    {
      fprintf (vect_dump, "transform induction: created def-use cycle: ");
      print_gimple_stmt (vect_dump, induction_phi, 0, TDF_SLIM);
      fprintf (vect_dump, "\n");
      print_gimple_stmt (vect_dump, SSA_NAME_DEF_STMT (vec_def), 0, TDF_SLIM);
    }

  STMT_VINFO_VEC_STMT (phi_info) = induction_phi;
  return induc_def;
}
/* Function vect_get_vec_def_for_operand.

   OP is an operand in STMT.  This function returns a (vector) def that will
   be used in the vectorized stmt for STMT.

   In the case that OP is an SSA_NAME which is defined in the loop, then
   STMT_VINFO_VEC_STMT of the defining stmt holds the relevant def.

   In case OP is an invariant or constant, a new stmt that creates a vector
   def needs to be introduced.  */
tree
vect_get_vec_def_for_operand (tree op, gimple stmt, tree *scalar_def)
{
  tree vec_oprnd;
  gimple vec_stmt;
  gimple def_stmt;
  stmt_vec_info def_stmt_info = NULL;
  stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
  tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
  unsigned int nunits = TYPE_VECTOR_SUBPARTS (vectype);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
  tree vec_inv;
  tree vec_cst;
  tree t = NULL_TREE;
  tree def;
  int i;
  enum vect_def_type dt;
  bool is_simple_use;
  tree vector_type;

  if (vect_print_dump_info (REPORT_DETAILS))
    {
      fprintf (vect_dump, "vect_get_vec_def_for_operand: ");
      print_generic_expr (vect_dump, op, TDF_SLIM);
    }

  is_simple_use = vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt);
  gcc_assert (is_simple_use);
  if (vect_print_dump_info (REPORT_DETAILS))
    {
      if (def)
        {
          fprintf (vect_dump, "def = ");
          print_generic_expr (vect_dump, def, TDF_SLIM);
        }
      if (def_stmt)
        {
          fprintf (vect_dump, " def_stmt = ");
          print_gimple_stmt (vect_dump, def_stmt, 0, TDF_SLIM);
        }
    }
  switch (dt)
    {
    /* Case 1: operand is a constant.  */
    case vect_constant_def:
      {
        if (scalar_def)
          *scalar_def = op;

        /* Create 'vect_cst_ = {cst,cst,...,cst}'  */
        if (vect_print_dump_info (REPORT_DETAILS))
          fprintf (vect_dump, "Create vector_cst. nunits = %d", nunits);

        for (i = nunits - 1; i >= 0; --i)
          t = tree_cons (NULL_TREE, op, t);

        vec_cst = build_vector (vectype, t);
        return vect_init_vector (stmt, vec_cst, vectype, NULL);
      }

    /* Case 2: operand is defined outside the loop - loop invariant.  */
    case vect_invariant_def:
      {
        vector_type = get_vectype_for_scalar_type (TREE_TYPE (def));
        gcc_assert (vector_type);
        nunits = TYPE_VECTOR_SUBPARTS (vector_type);

        if (scalar_def)
          *scalar_def = def;

        /* Create 'vec_inv = {inv,inv,..,inv}'  */
        if (vect_print_dump_info (REPORT_DETAILS))
          fprintf (vect_dump, "Create vector_inv.");

        for (i = nunits - 1; i >= 0; --i)
          t = tree_cons (NULL_TREE, def, t);

        /* FIXME: use build_constructor directly.  */
        vec_inv = build_constructor_from_list (vector_type, t);
        return vect_init_vector (stmt, vec_inv, vector_type, NULL);
      }

    /* Case 3: operand is defined inside the loop.  */
    case vect_loop_def:
      {
        if (scalar_def)
          *scalar_def = NULL /* FIXME tuples: def_stmt */;

        /* Get the def from the vectorized stmt.  */
        def_stmt_info = vinfo_for_stmt (def_stmt);
        vec_stmt = STMT_VINFO_VEC_STMT (def_stmt_info);
        gcc_assert (vec_stmt);
        if (gimple_code (vec_stmt) == GIMPLE_PHI)
          vec_oprnd = PHI_RESULT (vec_stmt);
        else if (is_gimple_call (vec_stmt))
          vec_oprnd = gimple_call_lhs (vec_stmt);
        else
          vec_oprnd = gimple_assign_lhs (vec_stmt);
        return vec_oprnd;
      }

    /* Case 4: operand is defined by a loop header phi - reduction.  */
    case vect_reduction_def:
      {
        struct loop *loop;

        gcc_assert (gimple_code (def_stmt) == GIMPLE_PHI);
        loop = (gimple_bb (def_stmt))->loop_father;

        /* Get the def before the loop.  */
        op = PHI_ARG_DEF_FROM_EDGE (def_stmt, loop_preheader_edge (loop));
        return get_initial_def_for_reduction (stmt, op, scalar_def);
      }

    /* Case 5: operand is defined by loop-header phi - induction.  */
    case vect_induction_def:
      {
        gcc_assert (gimple_code (def_stmt) == GIMPLE_PHI);

        /* Get the def from the vectorized stmt.  */
        def_stmt_info = vinfo_for_stmt (def_stmt);
        vec_stmt = STMT_VINFO_VEC_STMT (def_stmt_info);
        gcc_assert (vec_stmt && gimple_code (vec_stmt) == GIMPLE_PHI);
        vec_oprnd = PHI_RESULT (vec_stmt);
        return vec_oprnd;
      }

    default:
      gcc_unreachable ();
    }
}
/* Function vect_get_vec_def_for_stmt_copy

   Return a vector-def for an operand.  This function is used when the
   vectorized stmt to be created (by the caller to this function) is a "copy"
   created in case the vectorized result cannot fit in one vector, and several
   copies of the vector-stmt are required.  In this case the vector-def is
   retrieved from the vector stmt recorded in the STMT_VINFO_RELATED_STMT
   field of the stmt that defines VEC_OPRND.
   DT is the type of the vector def VEC_OPRND.

   Context:
        In case the vectorization factor (VF) is bigger than the number
   of elements that can fit in a vectype (nunits), we have to generate
   more than one vector stmt to vectorize the scalar stmt.  This situation
   arises when there are multiple data-types operated upon in the loop; the
   smallest data-type determines the VF, and as a result, when vectorizing
   stmts operating on wider types we need to create 'VF/nunits' "copies" of
   the vector stmt (each computing a vector of 'nunits' results, and together
   computing 'VF' results in each iteration).  This function is called when
   vectorizing such a stmt (e.g. vectorizing S2 in the illustration below, in
   which VF=16 and nunits=4, so the number of copies required is 4):

   scalar stmt:      vectorized into:           STMT_VINFO_RELATED_STMT

   S1: x = load      VS1.0: vx.0 = memref0      VS1.1
                     VS1.1: vx.1 = memref1      VS1.2
                     VS1.2: vx.2 = memref2      VS1.3
                     VS1.3: vx.3 = memref3

   S2: z = x + ...   VSnew.0: vz0 = vx.0 + ...  VSnew.1
                     VSnew.1: vz1 = vx.1 + ...  VSnew.2
                     VSnew.2: vz2 = vx.2 + ...  VSnew.3
                     VSnew.3: vz3 = vx.3 + ...

   The vectorization of S1 is explained in vectorizable_load.
   The vectorization of S2:
        To create the first vector-stmt out of the 4 copies - VSnew.0 -
   the function 'vect_get_vec_def_for_operand' is called to
   get the relevant vector-def for each operand of S2.  For operand x it
   returns the vector-def 'vx.0'.

        To create the remaining copies of the vector-stmt (VSnew.j), this
   function is called to get the relevant vector-def for each operand.  It is
   obtained from the respective VS1.j stmt, which is recorded in the
   STMT_VINFO_RELATED_STMT field of the stmt that defines VEC_OPRND.

        For example, to obtain the vector-def 'vx.1' in order to create the
   vector stmt 'VSnew.1', this function is called with VEC_OPRND='vx.0'.
   Given 'vx.0' we obtain the stmt that defines it ('VS1.0'); from the
   STMT_VINFO_RELATED_STMT field of 'VS1.0' we obtain the next copy - 'VS1.1',
   and return its def ('vx.1').
   Overall, to create the above sequence this function will be called 3 times:
        vx.1 = vect_get_vec_def_for_stmt_copy (dt, vx.0);
        vx.2 = vect_get_vec_def_for_stmt_copy (dt, vx.1);
        vx.3 = vect_get_vec_def_for_stmt_copy (dt, vx.2);  */
tree
vect_get_vec_def_for_stmt_copy (enum vect_def_type dt, tree vec_oprnd)
{
  gimple vec_stmt_for_operand;
  stmt_vec_info def_stmt_info;

  /* Do nothing; can reuse same def.  */
  if (dt == vect_invariant_def || dt == vect_constant_def)
    return vec_oprnd;

  vec_stmt_for_operand = SSA_NAME_DEF_STMT (vec_oprnd);
  def_stmt_info = vinfo_for_stmt (vec_stmt_for_operand);
  gcc_assert (def_stmt_info);
  vec_stmt_for_operand = STMT_VINFO_RELATED_STMT (def_stmt_info);
  gcc_assert (vec_stmt_for_operand);
  if (gimple_code (vec_stmt_for_operand) == GIMPLE_PHI)
    vec_oprnd = PHI_RESULT (vec_stmt_for_operand);
  else
    vec_oprnd = gimple_get_lhs (vec_stmt_for_operand);
  return vec_oprnd;
}
/* Get vectorized definitions for the operands to create a copy of an
   original stmt.  See vect_get_vec_def_for_stmt_copy () for details.  */

static void
vect_get_vec_defs_for_stmt_copy (enum vect_def_type *dt,
                                 VEC(tree,heap) **vec_oprnds0,
                                 VEC(tree,heap) **vec_oprnds1)
{
  tree vec_oprnd = VEC_pop (tree, *vec_oprnds0);

  vec_oprnd = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd);
  VEC_quick_push (tree, *vec_oprnds0, vec_oprnd);

  if (vec_oprnds1 && *vec_oprnds1)
    {
      vec_oprnd = VEC_pop (tree, *vec_oprnds1);
      vec_oprnd = vect_get_vec_def_for_stmt_copy (dt[1], vec_oprnd);
      VEC_quick_push (tree, *vec_oprnds1, vec_oprnd);
    }
}
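/* Illustrative usage sketch (added for exposition, not part of the original
   code): a caller vectorizing the J-th copy of a binary stmt typically
   alternates between vect_get_vec_defs (defined below) and the helper above:

       if (j == 0)
         vect_get_vec_defs (op0, op1, stmt, &vec_oprnds0, &vec_oprnds1,
                            slp_node);
       else
         vect_get_vec_defs_for_stmt_copy (dt, &vec_oprnds0, &vec_oprnds1);

   which is the pattern followed by the vectorizable_* routines in this
   file.  */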
/* Get vectorized definitions for OP0 and OP1, or from SLP_NODE if it is not
   NULL.  */

static void
vect_get_vec_defs (tree op0, tree op1, gimple stmt,
                   VEC(tree,heap) **vec_oprnds0, VEC(tree,heap) **vec_oprnds1,
                   slp_tree slp_node)
{
  if (slp_node)
    vect_get_slp_defs (slp_node, vec_oprnds0, vec_oprnds1);
  else
    {
      tree vec_oprnd;

      *vec_oprnds0 = VEC_alloc (tree, heap, 1);
      vec_oprnd = vect_get_vec_def_for_operand (op0, stmt, NULL);
      VEC_quick_push (tree, *vec_oprnds0, vec_oprnd);

      if (op1)
        {
          *vec_oprnds1 = VEC_alloc (tree, heap, 1);
          vec_oprnd = vect_get_vec_def_for_operand (op1, stmt, NULL);
          VEC_quick_push (tree, *vec_oprnds1, vec_oprnd);
        }
    }
}
/* Function vect_finish_stmt_generation.

   Insert a new stmt.  */

static void
vect_finish_stmt_generation (gimple stmt, gimple vec_stmt,
                             gimple_stmt_iterator *gsi)
{
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);

  gcc_assert (gimple_code (stmt) != GIMPLE_LABEL);

  gsi_insert_before (gsi, vec_stmt, GSI_SAME_STMT);

  set_vinfo_for_stmt (vec_stmt, new_stmt_vec_info (vec_stmt, loop_vinfo));

  if (vect_print_dump_info (REPORT_DETAILS))
    {
      fprintf (vect_dump, "add new stmt: ");
      print_gimple_stmt (vect_dump, vec_stmt, 0, TDF_SLIM);
    }

  gimple_set_location (vec_stmt, gimple_location (gsi_stmt (*gsi)));
}
/* Function get_initial_def_for_reduction

   Input:
   STMT - a stmt that performs a reduction operation in the loop.
   INIT_VAL - the initial value of the reduction variable.

   Output:
   ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
        of the reduction (used for adjusting the epilog - see below).
   Return a vector variable, initialized according to the operation that STMT
        performs.  This vector will be used as the initial value of the
        vector of partial results.

   Option1 (adjust in epilog): Initialize the vector as follows:
     add:         [0,0,...,0,0]
     mult:        [1,1,...,1,1]
     min/max:     [init_val,init_val,..,init_val,init_val]
     bit and/or:  [init_val,init_val,..,init_val,init_val]
   and when necessary (e.g. add/mult case) let the caller know
   that it needs to adjust the result by init_val.

   Option2: Initialize the vector as follows:
     add:         [0,0,...,0,init_val]
     mult:        [1,1,...,1,init_val]
     min/max:     [init_val,init_val,...,init_val]
     bit and/or:  [init_val,init_val,...,init_val]
   and no adjustments are needed.

   For example, for the following code:

     s = init_val;
     for (i = 0; i < n; i++)
       s = s + a[i];

   STMT is 's = s + a[i]', and the reduction variable is 's'.
   For a vector of 4 units, we want to return either [0,0,0,init_val],
   or [0,0,0,0] and let the caller know that it needs to adjust
   the result at the end by 'init_val'.

   FORNOW, we are using the 'adjust in epilog' scheme, because this way the
   initialization vector is simpler (same element in all entries).
   A cost model should help decide between these two schemes.  */
static tree
get_initial_def_for_reduction (gimple stmt, tree init_val,
                               tree *adjustment_def)
{
  stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
  int nunits = TYPE_VECTOR_SUBPARTS (vectype);
  tree scalar_type = TREE_TYPE (vectype);
  enum tree_code code = gimple_assign_rhs_code (stmt);
  tree type = TREE_TYPE (init_val);
  tree vecdef;
  tree def_for_init;
  tree init_def;
  tree t = NULL_TREE;
  int i;
  bool nested_in_vect_loop = false;

  gcc_assert (POINTER_TYPE_P (type) || INTEGRAL_TYPE_P (type)
              || SCALAR_FLOAT_TYPE_P (type));
  if (nested_in_vect_loop_p (loop, stmt))
    nested_in_vect_loop = true;
  else
    gcc_assert (loop == (gimple_bb (stmt))->loop_father);

  vecdef = vect_get_vec_def_for_operand (init_val, stmt, NULL);

  switch (code)
    {
    case WIDEN_SUM_EXPR:
    case DOT_PROD_EXPR:
    case PLUS_EXPR:
      if (nested_in_vect_loop)
        *adjustment_def = vecdef;
      else
        *adjustment_def = init_val;
      /* Create a vector of zeros for init_def.  */
      if (SCALAR_FLOAT_TYPE_P (scalar_type))
        def_for_init = build_real (scalar_type, dconst0);
      else
        def_for_init = build_int_cst (scalar_type, 0);
      for (i = nunits - 1; i >= 0; --i)
        t = tree_cons (NULL_TREE, def_for_init, t);
      init_def = build_vector (vectype, t);
      break;

    case MIN_EXPR:
    case MAX_EXPR:
      *adjustment_def = NULL_TREE;
      init_def = vecdef;
      break;

    default:
      gcc_unreachable ();
    }

  return init_def;
}
/* Function vect_create_epilog_for_reduction

   Create code at the loop-epilog to finalize the result of a reduction
   computation.

   VECT_DEF is a vector of partial results.
   REDUC_CODE is the tree-code for the epilog reduction.
   NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
     number of elements that we can fit in a vectype (nunits).  In this case
     we have to generate more than one vector stmt - i.e. - we need to
     "unroll" the vector stmt by a factor VF/nunits.  For more details see
     documentation in vectorizable_operation.
   STMT is the scalar reduction stmt that is being vectorized.
   REDUCTION_PHI is the phi-node that carries the reduction computation.

   This function:
   1. Creates the reduction def-use cycle: sets the arguments for
      REDUCTION_PHI:
      The loop-entry argument is the vectorized initial-value of the
      reduction.  The loop-latch argument is VECT_DEF - the vector of partial
      sums.
   2. "Reduces" the vector of partial results VECT_DEF into a single result,
      by applying the operation specified by REDUC_CODE if available, or by
      other means (whole-vector shifts or a scalar loop).
      The function also creates a new phi node at the loop exit to preserve
      loop-closed form, as illustrated below.

     The flow at the entry to this function:

        loop:
          vec_def = phi <null, null>            # REDUCTION_PHI
          VECT_DEF = vector_stmt                # vectorized form of STMT
          s_loop = scalar_stmt                  # (scalar) STMT
        loop_exit:
          s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
          use <s_out0>
          use <s_out0>

     The above is transformed by this function into:

        loop:
          vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
          VECT_DEF = vector_stmt                # vectorized form of STMT
          s_loop = scalar_stmt                  # (scalar) STMT
        loop_exit:
          s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
          v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
          v_out2 = reduce <v_out1>
          s_out3 = extract_field <v_out2, 0>
          s_out4 = adjust_result <s_out3>
          use <s_out4>
          use <s_out4>
*/
static void
vect_create_epilog_for_reduction (tree vect_def, gimple stmt,
                                  int ncopies,
                                  enum tree_code reduc_code,
                                  gimple reduction_phi)
{
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  stmt_vec_info prev_phi_info;
  tree vectype;
  enum machine_mode mode;
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  basic_block exit_bb;
  tree scalar_dest;
  tree scalar_type;
  gimple new_phi = NULL, phi;
  gimple_stmt_iterator exit_gsi;
  tree vec_dest;
  tree new_temp = NULL_TREE;
  tree new_name;
  gimple epilog_stmt = NULL;
  tree new_scalar_dest, new_dest;
  gimple exit_phi;
  tree bitsize, bitpos, bytesize;
  enum tree_code code = gimple_assign_rhs_code (stmt);
  tree adjustment_def;
  tree vec_initial_def, def;
  tree orig_name;
  imm_use_iterator imm_iter;
  use_operand_p use_p;
  bool extract_scalar_result = false;
  tree reduction_op, expr;
  gimple orig_stmt;
  gimple use_stmt;
  bool nested_in_vect_loop = false;
  VEC(gimple,heap) *phis = NULL;
  enum vect_def_type dt = vect_unknown_def_type;
  int j, i;

  if (nested_in_vect_loop_p (loop, stmt))
    {
      loop = loop->inner;
      nested_in_vect_loop = true;
    }

  switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
    {
    case GIMPLE_SINGLE_RHS:
      gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt))
                  == ternary_op);
      reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), 2);
      break;
    case GIMPLE_UNARY_RHS:
      reduction_op = gimple_assign_rhs1 (stmt);
      break;
    case GIMPLE_BINARY_RHS:
      reduction_op = gimple_assign_rhs2 (stmt);
      break;
    default:
      gcc_unreachable ();
    }

  vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
  gcc_assert (vectype);
  mode = TYPE_MODE (vectype);
  /*** 1. Create the reduction def-use cycle  ***/

  /* For the case of reduction, vect_get_vec_def_for_operand returns
     the scalar def before the loop, that defines the initial value
     of the reduction variable.  */
  vec_initial_def = vect_get_vec_def_for_operand (reduction_op, stmt,
                                                  &adjustment_def);

  phi = reduction_phi;
  def = vect_def;
  for (j = 0; j < ncopies; j++)
    {
      /* 1.1 set the loop-entry arg of the reduction-phi:  */
      add_phi_arg (phi, vec_initial_def, loop_preheader_edge (loop));

      /* 1.2 set the loop-latch arg for the reduction-phi:  */
      if (j > 0)
        def = vect_get_vec_def_for_stmt_copy (dt, def);
      add_phi_arg (phi, def, loop_latch_edge (loop));

      if (vect_print_dump_info (REPORT_DETAILS))
        {
          fprintf (vect_dump, "transform reduction: created def-use cycle: ");
          print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
          fprintf (vect_dump, "\n");
          print_gimple_stmt (vect_dump, SSA_NAME_DEF_STMT (def), 0, TDF_SLIM);
        }

      phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
    }
  /*** 2. Create epilog code.
          The reduction epilog code operates across the elements of the vector
          of partial results computed by the vectorized loop.
          The reduction epilog code consists of:
          step 1: compute the scalar result in a vector (v_out2)
          step 2: extract the scalar result (s_out3) from the vector (v_out2)
          step 3: adjust the scalar result (s_out3) if needed.

          Step 1 can be accomplished using one of the following three schemes:
          (scheme 1) using reduc_code, if available.
          (scheme 2) using whole-vector shifts, if available.
          (scheme 3) using a scalar loop.  In this case steps 1+2 above are
                     combined.

          The overall epilog code looks like this:

          s_out0 = phi <s_loop>                 # original EXIT_PHI
          v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
          v_out2 = reduce <v_out1>              # step 1
          s_out3 = extract_field <v_out2, 0>    # step 2
          s_out4 = adjust_result <s_out3>       # step 3

          (step 3 is optional, and steps 1 and 2 may be combined).
          Lastly, the uses of s_out0 are replaced by s_out4.  ***/
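  /* Worked example (added for exposition, not from the original sources):
     reducing a V4SI of partial sums v = {a0,a1,a2,a3} with scheme 2, where
     vec_size_in_bits == 128 and element_bitsize == 32:

         v' = vec_shift <v, 64>       #  v' = {a2,a3,_,_}
         v  = v + v'                  #  v  = {a0+a2, a1+a3, _, _}
         v' = vec_shift <v, 32>       #  v' = {a1+a3, _, _, _}
         v  = v + v'                  #  v  = {a0+a1+a2+a3, _, _, _}

     after which step 2.4 below extracts the scalar sum from one element.
     Scheme 3 would instead extract all four elements with BIT_FIELD_REFs
     and combine them with scalar adds.  */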
  /* 2.1 Create new loop-exit-phi to preserve loop-closed form:
        v_out1 = phi <v_loop>  */

  exit_bb = single_exit (loop)->dest;
  def = vect_def;
  prev_phi_info = NULL;
  for (j = 0; j < ncopies; j++)
    {
      phi = create_phi_node (SSA_NAME_VAR (vect_def), exit_bb);
      set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo));
      if (j == 0)
        new_phi = phi;
      else
        {
          def = vect_get_vec_def_for_stmt_copy (dt, def);
          STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
        }
      SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
      prev_phi_info = vinfo_for_stmt (phi);
    }
  exit_gsi = gsi_after_labels (exit_bb);
  /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
         (i.e. when reduc_code is not available) and in the final adjustment
         code (if needed).  Also get the original scalar reduction variable as
         defined in the loop.  In case STMT is a "pattern-stmt" (i.e. - it
         represents a reduction pattern), the tree-code and scalar-def are
         taken from the original stmt that the pattern-stmt (STMT) replaces.
         Otherwise (it is a regular reduction) - the tree-code and scalar-def
         are taken from STMT.  */
  orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
  if (!orig_stmt)
    {
      /* Regular reduction.  */
      orig_stmt = stmt;
    }
  else
    {
      /* Reduction pattern.  */
      stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
      gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
      gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
    }
  code = gimple_assign_rhs_code (orig_stmt);
  scalar_dest = gimple_assign_lhs (orig_stmt);
  scalar_type = TREE_TYPE (scalar_dest);
  new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
  bitsize = TYPE_SIZE (scalar_type);
  bytesize = TYPE_SIZE_UNIT (scalar_type);
  /* In case this is a reduction in an inner-loop while vectorizing an outer
     loop - we don't need to extract a single scalar result at the end of the
     inner-loop.  The final vector of partial results will be used in the
     vectorized outer-loop, or reduced to a scalar result at the end of the
     outer-loop.  */
  if (nested_in_vect_loop)
    goto vect_finalize_reduction;

  /* FORNOW */
  gcc_assert (ncopies == 1);
  /* 2.3 Create the reduction code, using one of the three schemes described
         above.  */

  if (reduc_code < NUM_TREE_CODES)
    {
      tree tmp;

      /*** Case 1:  Create:
           v_out2 = reduc_expr <v_out1>  */

      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "Reduce using direct vector reduction.");

      vec_dest = vect_create_destination_var (scalar_dest, vectype);
      tmp = build1 (reduc_code, vectype, PHI_RESULT (new_phi));
      epilog_stmt = gimple_build_assign (vec_dest, tmp);
      new_temp = make_ssa_name (vec_dest, epilog_stmt);
      gimple_assign_set_lhs (epilog_stmt, new_temp);
      gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);

      extract_scalar_result = true;
    }
  else
    {
      enum tree_code shift_code = 0;
      bool have_whole_vector_shift = true;
      int bit_offset;
      int element_bitsize = tree_low_cst (bitsize, 1);
      int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
      tree vec_temp;

      if (optab_handler (vec_shr_optab, mode)->insn_code != CODE_FOR_nothing)
        shift_code = VEC_RSHIFT_EXPR;
      else
        have_whole_vector_shift = false;

      /* Regardless of whether we have a whole vector shift, if we're
         emulating the operation via tree-vect-generic, we don't want
         to use it.  Only the first round of the reduction is likely
         to still be profitable via emulation.  */
      /* ??? It might be better to emit a reduction tree code here, so that
         tree-vect-generic can expand the first round via bit tricks.  */
      if (!VECTOR_MODE_P (mode))
        have_whole_vector_shift = false;
      else
        {
          optab optab = optab_for_tree_code (code, vectype, optab_default);
          if (optab_handler (optab, mode)->insn_code == CODE_FOR_nothing)
            have_whole_vector_shift = false;
        }

      if (have_whole_vector_shift)
        {
          /*** Case 2: Create:
             for (offset = VS/2; offset >= element_size; offset/=2)
               {
                 Create:  va' = vec_shift <va, offset>
                 Create:  va = vop <va, va'>
               }  */

          if (vect_print_dump_info (REPORT_DETAILS))
            fprintf (vect_dump, "Reduce using vector shifts");

          vec_dest = vect_create_destination_var (scalar_dest, vectype);
          new_temp = PHI_RESULT (new_phi);

          for (bit_offset = vec_size_in_bits / 2;
               bit_offset >= element_bitsize;
               bit_offset /= 2)
            {
              tree bitpos = size_int (bit_offset);
              epilog_stmt = gimple_build_assign_with_ops (shift_code, vec_dest,
                                                          new_temp, bitpos);
              new_name = make_ssa_name (vec_dest, epilog_stmt);
              gimple_assign_set_lhs (epilog_stmt, new_name);
              gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);

              epilog_stmt = gimple_build_assign_with_ops (code, vec_dest,
                                                          new_name, new_temp);
              new_temp = make_ssa_name (vec_dest, epilog_stmt);
              gimple_assign_set_lhs (epilog_stmt, new_temp);
              gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
            }

          extract_scalar_result = true;
        }
      else
        {
          tree rhs;

          /*** Case 3: Create:
             s = extract_field <v_out2, 0>
             for (offset = element_size;
                  offset < vector_size;
                  offset += element_size;)
               {
                 Create:  s' = extract_field <v_out2, offset>
                 Create:  s = op <s, s'>
               }  */

          if (vect_print_dump_info (REPORT_DETAILS))
            fprintf (vect_dump, "Reduce using scalar code. ");

          vec_temp = PHI_RESULT (new_phi);
          vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
          rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
                        bitsize_zero_node);
          epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
          new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
          gimple_assign_set_lhs (epilog_stmt, new_temp);
          gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);

          for (bit_offset = element_bitsize;
               bit_offset < vec_size_in_bits;
               bit_offset += element_bitsize)
            {
              tree bitpos = bitsize_int (bit_offset);
              tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
                                 bitpos);

              epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
              new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
              gimple_assign_set_lhs (epilog_stmt, new_name);
              gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);

              epilog_stmt = gimple_build_assign_with_ops (code,
                                                          new_scalar_dest,
                                                          new_name, new_temp);
              new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
              gimple_assign_set_lhs (epilog_stmt, new_temp);
              gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
            }

          extract_scalar_result = false;
        }
    }
  /* 2.4  Extract the final scalar result.  Create:
         s_out3 = extract_field <v_out2, bitpos>  */

  if (extract_scalar_result)
    {
      tree rhs;

      gcc_assert (!nested_in_vect_loop);
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "extract scalar result");

      if (BYTES_BIG_ENDIAN)
        bitpos = size_binop (MULT_EXPR,
                             bitsize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1),
                             TYPE_SIZE (scalar_type));
      else
        bitpos = bitsize_zero_node;

      rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp, bitsize, bitpos);
      epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
      new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
      gimple_assign_set_lhs (epilog_stmt, new_temp);
      gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
    }
vect_finalize_reduction:

  /* 2.5 Adjust the final result by the initial value of the reduction
         variable.  (When such adjustment is not needed, then
         'adjustment_def' is zero).  For example, if code is PLUS we create:
         new_temp = loop_exit_def + adjustment_def  */

  if (adjustment_def)
    {
      if (nested_in_vect_loop)
        {
          gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
          expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
          new_dest = vect_create_destination_var (scalar_dest, vectype);
        }
      else
        {
          gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
          expr = build2 (code, scalar_type, new_temp, adjustment_def);
          new_dest = vect_create_destination_var (scalar_dest, scalar_type);
        }

      epilog_stmt = gimple_build_assign (new_dest, expr);
      new_temp = make_ssa_name (new_dest, epilog_stmt);
      gimple_assign_set_lhs (epilog_stmt, new_temp);
      SSA_NAME_DEF_STMT (new_temp) = epilog_stmt;
      gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
    }
  /* 2.6  Handle the loop-exit phi.  */

  /* Replace uses of s_out0 with uses of s_out3:
     Find the loop-closed-use at the loop exit of the original scalar result.
     (The reduction result is expected to have two immediate uses - one at the
     latch block, and one at the loop exit).  */
  phis = VEC_alloc (gimple, heap, 10);
  FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
    {
      if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
        {
          exit_phi = USE_STMT (use_p);
          VEC_quick_push (gimple, phis, exit_phi);
        }
    }
  /* We expect to have found an exit_phi because of loop-closed-ssa form.  */
  gcc_assert (!VEC_empty (gimple, phis));

  for (i = 0; VEC_iterate (gimple, phis, i, exit_phi); i++)
    {
      if (nested_in_vect_loop)
        {
          stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);

          /* FORNOW.  Currently not supporting the case that an inner-loop
             reduction is not used in the outer-loop (but only outside the
             outer-loop).  */
          gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
                      && !STMT_VINFO_LIVE_P (stmt_vinfo));

          epilog_stmt = adjustment_def ? epilog_stmt : new_phi;
          STMT_VINFO_VEC_STMT (stmt_vinfo) = epilog_stmt;
          set_vinfo_for_stmt (epilog_stmt,
                              new_stmt_vec_info (epilog_stmt, loop_vinfo));
          if (adjustment_def)
            STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
              STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
          continue;
        }

      /* Replace the uses:  */
      orig_name = PHI_RESULT (exit_phi);
      FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
        FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
          SET_USE (use_p, new_temp);
    }
  VEC_free (gimple, heap, phis);
}
/* Function vectorizable_reduction.

   Check if STMT performs a reduction operation that can be vectorized.
   If VEC_STMT is also passed, vectorize the STMT: create a vectorized
   stmt to replace it, put it in VEC_STMT, and insert it at GSI.
   Return FALSE if not a vectorizable STMT, TRUE otherwise.

   This function also handles reduction idioms (patterns) that have been
   recognized in advance during vect_pattern_recog.  In this case, STMT may be
   of the form:
     X = pattern_expr (arg0, arg1, ..., X)
   and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
   sequence that had been detected and replaced by the pattern-stmt (STMT).

   In some cases of reduction patterns, the type of the reduction variable X
   is different than the type of the other arguments of STMT.
   In such cases, the vectype that is used when transforming STMT into a
   vector stmt is different than the vectype that is used to determine the
   vectorization factor, because it consists of a different number of elements
   than the actual number of elements that are being operated upon in
   parallel.

   For example, consider an accumulation of shorts into an int accumulator.
   On some targets it's possible to vectorize this pattern operating on 8
   shorts at a time (hence, the vectype for purposes of determining the
   vectorization factor should be V8HI); on the other hand, the vectype that
   is used to create the vector form is actually V4SI (the type of the
   result).

   Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
   indicates what is the actual level of parallelism (V8HI in the example), so
   that the right vectorization factor would be derived.  This vectype
   corresponds to the type of arguments to the reduction stmt, and should
   *NOT* be used to create the vectorized stmt.  The right vectype for the
   vectorized stmt is obtained from the type of the result X:
        get_vectype_for_scalar_type (TREE_TYPE (X))

   This means that, contrary to "regular" reductions (or "regular" stmts in
   general), the following equation:
      STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
   does *NOT* necessarily hold for reduction patterns.  */
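/* Illustrative example (added for exposition, not from the original sources):
   for the widening-summation pattern

       S1: short_x = ...
       S2: int_x   = (int) short_x;
       S3: int_acc = int_acc + int_x;

   vect_pattern_recog may replace S2+S3 by

       STMT: int_acc = WIDEN_SUM_EXPR <short_x, int_acc>

   Here STMT_VINFO_VECTYPE is V8HI (8 shorts determine the VF), while the
   vectorized stmt produces a V4SI result, whose vectype is obtained from
   get_vectype_for_scalar_type (TREE_TYPE (int_acc)).  */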
bool
vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
                        gimple *vec_stmt)
{
  tree vec_dest;
  tree scalar_dest;
  tree loop_vec_def0 = NULL_TREE, loop_vec_def1 = NULL_TREE;
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  enum tree_code code, orig_code, epilog_reduc_code = 0;
  enum machine_mode vec_mode;
  int op_type;
  optab optab, reduc_optab;
  tree new_temp = NULL_TREE;
  tree def, scalar_type;
  gimple def_stmt;
  enum vect_def_type dt;
  gimple new_phi = NULL;
  tree reduc_def;
  bool is_simple_use;
  gimple orig_stmt;
  stmt_vec_info orig_stmt_info;
  tree expr = NULL_TREE;
  int i, j;
  int nunits = TYPE_VECTOR_SUBPARTS (vectype);
  int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
  int epilog_copies;
  stmt_vec_info prev_stmt_info, prev_phi_info;
  gimple first_phi = NULL;
  bool single_defuse_cycle = false;
  tree ops[3];
  gimple new_stmt = NULL;

  if (nested_in_vect_loop_p (loop, stmt))
    loop = loop->inner;

  gcc_assert (ncopies >= 1);
  /* FORNOW: SLP not supported.  */
  if (STMT_SLP_TYPE (stmt_info))
    return false;

  /* 1. Is vectorizable reduction?  */

  /* Not supportable if the reduction variable is used in the loop.  */
  if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer)
    return false;

  /* Reductions that are not used even in an enclosing outer-loop,
     are expected to be "live" (used out of the loop).  */
  if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_loop
      && !STMT_VINFO_LIVE_P (stmt_info))
    return false;

  /* Make sure it was already recognized as a reduction computation.  */
  if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def)
    return false;

  /* 2. Has this been recognized as a reduction pattern?

     Check if STMT represents a pattern that has been recognized
     in earlier analysis stages.  For stmts that represent a pattern,
     the STMT_VINFO_RELATED_STMT field records the last stmt in
     the original sequence that constitutes the pattern.  */

  orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
  if (orig_stmt)
    {
      orig_stmt_info = vinfo_for_stmt (orig_stmt);
      gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt);
      gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
      gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
    }
  /* 3. Check the operands of the operation.  The first operands are defined
        inside the loop body.  The last operand is the reduction variable,
        which is defined by the loop-header-phi.  */

  gcc_assert (is_gimple_assign (stmt));

  switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
    {
    case GIMPLE_SINGLE_RHS:
      op_type = TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt));
      if (op_type == ternary_op)
        {
          tree rhs = gimple_assign_rhs1 (stmt);
          ops[0] = TREE_OPERAND (rhs, 0);
          ops[1] = TREE_OPERAND (rhs, 1);
          ops[2] = TREE_OPERAND (rhs, 2);
          code = TREE_CODE (rhs);
        }
      else
        return false;
      break;

    case GIMPLE_BINARY_RHS:
      code = gimple_assign_rhs_code (stmt);
      op_type = TREE_CODE_LENGTH (code);
      gcc_assert (op_type == binary_op);
      ops[0] = gimple_assign_rhs1 (stmt);
      ops[1] = gimple_assign_rhs2 (stmt);
      break;

    case GIMPLE_UNARY_RHS:
      return false;

    default:
      gcc_unreachable ();
    }

  scalar_dest = gimple_assign_lhs (stmt);
  scalar_type = TREE_TYPE (scalar_dest);
  if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
      && !SCALAR_FLOAT_TYPE_P (scalar_type))
    return false;

  /* All uses but the last are expected to be defined in the loop.
     The last use is the reduction variable.  */
  for (i = 0; i < op_type - 1; i++)
    {
      is_simple_use = vect_is_simple_use (ops[i], loop_vinfo, &def_stmt,
                                          &def, &dt);
      gcc_assert (is_simple_use);
      if (dt != vect_loop_def
          && dt != vect_invariant_def
          && dt != vect_constant_def
          && dt != vect_induction_def)
        return false;
    }

  is_simple_use = vect_is_simple_use (ops[i], loop_vinfo, &def_stmt, &def,
                                      &dt);
  gcc_assert (is_simple_use);
  gcc_assert (dt == vect_reduction_def);
  gcc_assert (gimple_code (def_stmt) == GIMPLE_PHI);
  if (orig_stmt)
    gcc_assert (orig_stmt == vect_is_simple_reduction (loop_vinfo, def_stmt));
  else
    gcc_assert (stmt == vect_is_simple_reduction (loop_vinfo, def_stmt));

  if (STMT_VINFO_LIVE_P (vinfo_for_stmt (def_stmt)))
    return false;
  /* 4. Supportable by target?  */

  /* 4.1. Check support for the operation in the loop.  */
  optab = optab_for_tree_code (code, vectype, optab_default);
  if (!optab)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "no optab.");
      return false;
    }
  vec_mode = TYPE_MODE (vectype);
  if (optab_handler (optab, vec_mode)->insn_code == CODE_FOR_nothing)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "op not supported by target.");
      if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
          || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
             < vect_min_worthwhile_factor (code))
        return false;
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "proceeding using word mode.");
    }

  /* Worthwhile without SIMD support?  */
  if (!VECTOR_MODE_P (TYPE_MODE (vectype))
      && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
         < vect_min_worthwhile_factor (code))
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "not worthwhile without SIMD support.");
      return false;
    }
  /* 4.2. Check support for the epilog operation.

          If STMT represents a reduction pattern, then the type of the
          reduction variable may be different than the type of the rest
          of the arguments.  For example, consider the case of accumulation
          of shorts into an int accumulator; the original code:
                        S1: int_a = (int) short_a;
          orig_stmt->   S2: int_acc = plus <int_a ,int_acc>;

          was replaced with:
                        STMT: int_acc = widen_sum <short_a, int_acc>

          This means that:
          1. The tree-code that is used to create the vector operation in the
             epilog code (that reduces the partial results) is not the
             tree-code of STMT, but is rather the tree-code of the original
             stmt from the pattern that STMT is replacing.  I.e, in the
             example above we want to use 'widen_sum' in the loop, but 'plus'
             in the epilog.
          2. The type (mode) we use to check available target support
             for the vector operation to be created in the *epilog*, is
             determined by the type of the reduction variable (in the example
             above we'd check this: plus_optab[vect_int_mode]).
             However the type (mode) we use to check available target support
             for the vector operation to be created *inside the loop*, is
             determined by the type of the other arguments to STMT (in the
             example we'd check this: widen_sum_optab[vect_short_mode]).

          This is contrary to "regular" reductions, in which the types of all
          the arguments are the same as the type of the reduction variable.
          For "regular" reductions we can therefore use the same vector type
          (and also the same tree-code) when generating the epilog code and
          when generating the code inside the loop.  */
  if (orig_stmt)
    {
      /* This is a reduction pattern: get the vectype from the type of the
         reduction variable, and get the tree-code from orig_stmt.  */
      orig_code = gimple_assign_rhs_code (orig_stmt);
      vectype = get_vectype_for_scalar_type (TREE_TYPE (def));
      if (!vectype)
        {
          if (vect_print_dump_info (REPORT_DETAILS))
            {
              fprintf (vect_dump, "unsupported data-type ");
              print_generic_expr (vect_dump, TREE_TYPE (def), TDF_SLIM);
            }
          return false;
        }

      vec_mode = TYPE_MODE (vectype);
    }
  else
    {
      /* Regular reduction: use the same vectype and tree-code as used for
         the vector code inside the loop can be used for the epilog code.  */
      orig_code = code;
    }

  if (!reduction_code_for_scalar_code (orig_code, &epilog_reduc_code))
    return false;
  reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype,
                                     optab_default);
  if (!reduc_optab)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "no optab for reduction.");
      epilog_reduc_code = NUM_TREE_CODES;
    }
  if (optab_handler (reduc_optab, vec_mode)->insn_code == CODE_FOR_nothing)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "reduc op not supported by target.");
      epilog_reduc_code = NUM_TREE_CODES;
    }
  if (!vec_stmt) /* transformation not required.  */
    {
      STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
      if (!vect_model_reduction_cost (stmt_info, epilog_reduc_code, ncopies))
        return false;
      return true;
    }

  /** Transform.  **/

  if (vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump, "transform reduction.");

  /* Create the destination vector.  */
  vec_dest = vect_create_destination_var (scalar_dest, vectype);

  /* In case the vectorization factor (VF) is bigger than the number
     of elements that we can fit in a vectype (nunits), we have to generate
     more than one vector stmt - i.e. - we need to "unroll" the
     vector stmt by a factor VF/nunits.  For more details see documentation
     in vectorizable_operation.  */
  /* If the reduction is used in an outer loop we need to generate
     VF intermediate results, like so (e.g. for ncopies=2):
        r0 = phi (init, r0)
        r1 = phi (init, r1)
        r0 = x0 + r0;
        r1 = x1 + r1;
     (i.e. we generate VF results in 2 registers).
     In this case we have a separate def-use cycle for each copy, and
     therefore for each copy we get the vector def for the reduction variable
     from the respective phi node created for this copy.

     Otherwise (the reduction is unused in the loop nest), we can combine
     together intermediate results, like so (e.g. for ncopies=2):
        r = phi (init, r)
        r = x0 + r;
        r = x1 + r;
     (i.e. we generate VF/2 results in a single register).
     In this case for each copy we get the vector def for the reduction
     variable from the vectorized reduction operation generated in the
     previous iteration.  */

  if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_loop)
    {
      single_defuse_cycle = true;
      epilog_copies = 1;
    }
  else
    epilog_copies = ncopies;

  prev_stmt_info = NULL;
  prev_phi_info = NULL;
  for (j = 0; j < ncopies; j++)
    {
      if (j == 0 || !single_defuse_cycle)
        {
          /* Create the reduction-phi that defines the reduction-operand.  */
          new_phi = create_phi_node (vec_dest, loop->header);
          set_vinfo_for_stmt (new_phi,
                              new_stmt_vec_info (new_phi, loop_vinfo));
        }

      /* Handle uses.  */
      if (j == 0)
        {
          loop_vec_def0 = vect_get_vec_def_for_operand (ops[0], stmt, NULL);
          if (op_type == ternary_op)
            loop_vec_def1 = vect_get_vec_def_for_operand (ops[1], stmt, NULL);

          /* Get the vector def for the reduction variable from the phi
             node.  */
          reduc_def = PHI_RESULT (new_phi);
          first_phi = new_phi;
        }
      else
        {
          enum vect_def_type dt = vect_unknown_def_type; /* Dummy */
          loop_vec_def0 = vect_get_vec_def_for_stmt_copy (dt, loop_vec_def0);
          if (op_type == ternary_op)
            loop_vec_def1 = vect_get_vec_def_for_stmt_copy (dt, loop_vec_def1);

          if (single_defuse_cycle)
            reduc_def = gimple_assign_lhs (new_stmt);
          else
            reduc_def = PHI_RESULT (new_phi);

          STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
        }

      /* Arguments are ready.  Create the new vector stmt.  */
      if (op_type == binary_op)
        expr = build2 (code, vectype, loop_vec_def0, reduc_def);
      else
        expr = build3 (code, vectype, loop_vec_def0, loop_vec_def1,
                       reduc_def);
      new_stmt = gimple_build_assign (vec_dest, expr);
      new_temp = make_ssa_name (vec_dest, new_stmt);
      gimple_assign_set_lhs (new_stmt, new_temp);
      vect_finish_stmt_generation (stmt, new_stmt, gsi);

      if (j == 0)
        STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
      else
        STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
      prev_stmt_info = vinfo_for_stmt (new_stmt);
      prev_phi_info = vinfo_for_stmt (new_phi);
    }
  /* Finalize the reduction-phi (set its arguments) and create the
     epilog reduction code.  */
  if (!single_defuse_cycle)
    new_temp = gimple_assign_lhs (*vec_stmt);
  vect_create_epilog_for_reduction (new_temp, stmt, epilog_copies,
                                    epilog_reduc_code, first_phi);
  return true;
}
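/* End-to-end sketch (added for exposition, illustrative only): vectorizing
   's = s + a[i]' with V4SI and ncopies == 1 produces roughly

       loop:
         vs  = PHI <vec_init, vs'>   # reduction phi; args set in the epilog
         vx  = <vectorized load of a[i:i+3]>
         vs' = vx + vs               # the vector stmt built in the loop above
       loop_exit:
         ...                         # vect_create_epilog_for_reduction then
                                     # reduces vs' to a scalar and adjusts it
                                     # by the initial value of 's'.  */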
/* Checks if CALL can be vectorized in type VECTYPE.  Returns
   a function declaration if the target has a vectorized version
   of the function, or NULL_TREE if the function cannot be vectorized.  */

tree
vectorizable_function (gimple call, tree vectype_out, tree vectype_in)
{
  tree fndecl = gimple_call_fndecl (call);
  enum built_in_function code;

  /* We only handle functions that do not read or clobber memory -- i.e.
     const or novops ones.  */
  if (!(gimple_call_flags (call) & (ECF_CONST | ECF_NOVOPS)))
    return NULL_TREE;

  if (!fndecl
      || TREE_CODE (fndecl) != FUNCTION_DECL
      || !DECL_BUILT_IN (fndecl))
    return NULL_TREE;

  code = DECL_FUNCTION_CODE (fndecl);
  return targetm.vectorize.builtin_vectorized_function (code, vectype_out,
                                                        vectype_in);
}
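/* For example (added for exposition, illustrative only): for a call to sinf
   with V4SF input and output vectypes, the target hook is asked for a
   vectorized variant of BUILT_IN_SINF and returns the decl of a suitable
   target builtin, or NULL_TREE if the target provides none.  */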
/* Function vectorizable_call.

   Check if STMT performs a function call that can be vectorized.
   If VEC_STMT is also passed, vectorize the STMT: create a vectorized
   stmt to replace it, put it in VEC_STMT, and insert it at GSI.
   Return FALSE if not a vectorizable STMT, TRUE otherwise.  */

bool
vectorizable_call (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt)
{
  tree vec_dest;
  tree scalar_dest;
  tree op, type;
  tree vec_oprnd0 = NULL_TREE, vec_oprnd1 = NULL_TREE;
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt), prev_stmt_info;
  tree vectype_out, vectype_in;
  int nunits_in;
  int nunits_out;
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  tree fndecl, new_temp, def, rhs_type, lhs_type;
  gimple def_stmt;
  enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
  gimple new_stmt;
  int ncopies, j;
  VEC(tree, heap) *vargs = NULL;
  enum { NARROW, NONE, WIDEN } modifier;
  size_t i, nargs;

  if (!STMT_VINFO_RELEVANT_P (stmt_info))
    return false;

  if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
    return false;

  /* FORNOW: SLP not supported.  */
  if (STMT_SLP_TYPE (stmt_info))
    return false;

  /* Is STMT a vectorizable call?  */
  if (!is_gimple_call (stmt))
    return false;

  if (TREE_CODE (gimple_call_lhs (stmt)) != SSA_NAME)
    return false;
  /* Process function arguments.  */
  rhs_type = NULL_TREE;
  nargs = gimple_call_num_args (stmt);

  /* Bail out if the function has more than two arguments; we do not have
     interesting builtin functions to vectorize with more than two arguments.
     Zero arguments is not supported either.  */
  if (nargs == 0 || nargs > 2)
    return false;

  for (i = 0; i < nargs; i++)
    {
      op = gimple_call_arg (stmt, i);

      /* We can only handle calls with arguments of the same type.  */
      if (rhs_type
          && rhs_type != TREE_TYPE (op))
        {
          if (vect_print_dump_info (REPORT_DETAILS))
            fprintf (vect_dump, "argument types differ.");
          return false;
        }
      rhs_type = TREE_TYPE (op);

      if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt[i]))
        {
          if (vect_print_dump_info (REPORT_DETAILS))
            fprintf (vect_dump, "use not simple.");
          return false;
        }
    }

  vectype_in = get_vectype_for_scalar_type (rhs_type);
  if (!vectype_in)
    return false;
  nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);

  lhs_type = TREE_TYPE (gimple_call_lhs (stmt));
  vectype_out = get_vectype_for_scalar_type (lhs_type);
  if (!vectype_out)
    return false;
  nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);

  if (nunits_in == nunits_out / 2)
    modifier = NARROW;
  else if (nunits_out == nunits_in)
    modifier = NONE;
  else if (nunits_out == nunits_in / 2)
    modifier = WIDEN;
  else
    return false;

  /* For now, we only vectorize functions if a target specific builtin
     is available.  TODO -- in some cases, it might be profitable to
     insert the calls for pieces of the vector, in order to be able
     to vectorize other operations in the loop.  */
  fndecl = vectorizable_function (stmt, vectype_out, vectype_in);
  if (fndecl == NULL_TREE)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "function is not vectorizable.");
      return false;
    }

  gcc_assert (ZERO_SSA_OPERANDS (stmt, SSA_OP_ALL_VIRTUALS));

  if (modifier == NARROW)
    ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_out;
  else
    ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in;

  /* Sanity check: make sure that at least one copy of the vectorized stmt
     needs to be generated.  */
  gcc_assert (ncopies >= 1);
  if (!vec_stmt) /* transformation not required.  */
    {
      STMT_VINFO_TYPE (stmt_info) = call_vec_info_type;
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "=== vectorizable_call ===");
      vect_model_simple_cost (stmt_info, ncopies, dt, NULL);
      return true;
    }

  /** Transform.  **/

  if (vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump, "transform operation.");

  /* Handle def.  */
  scalar_dest = gimple_call_lhs (stmt);
  vec_dest = vect_create_destination_var (scalar_dest, vectype_out);

  prev_stmt_info = NULL;
  switch (modifier)
    {
    case NONE:
      for (j = 0; j < ncopies; ++j)
        {
          /* Build argument list for the vectorized call.  */
          if (j == 0)
            vargs = VEC_alloc (tree, heap, nargs);
          else
            VEC_truncate (tree, vargs, 0);

          for (i = 0; i < nargs; i++)
            {
              op = gimple_call_arg (stmt, i);
              if (j == 0)
                vec_oprnd0
                  = vect_get_vec_def_for_operand (op, stmt, NULL);
              else
                vec_oprnd0
                  = vect_get_vec_def_for_stmt_copy (dt[i], vec_oprnd0);

              VEC_quick_push (tree, vargs, vec_oprnd0);
            }

          new_stmt = gimple_build_call_vec (fndecl, vargs);
          new_temp = make_ssa_name (vec_dest, new_stmt);
          gimple_call_set_lhs (new_stmt, new_temp);

          vect_finish_stmt_generation (stmt, new_stmt, gsi);

          if (j == 0)
            STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
          else
            STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;

          prev_stmt_info = vinfo_for_stmt (new_stmt);
        }

      break;

    case NARROW:
      for (j = 0; j < ncopies; ++j)
        {
          /* Build argument list for the vectorized call.  */
          if (j == 0)
            vargs = VEC_alloc (tree, heap, nargs * 2);
          else
            VEC_truncate (tree, vargs, 0);

          for (i = 0; i < nargs; i++)
            {
              op = gimple_call_arg (stmt, i);
              if (j == 0)
                {
                  vec_oprnd0
                    = vect_get_vec_def_for_operand (op, stmt, NULL);
                  vec_oprnd1
                    = vect_get_vec_def_for_stmt_copy (dt[i], vec_oprnd0);
                }
              else
                {
                  vec_oprnd0
                    = vect_get_vec_def_for_stmt_copy (dt[i], vec_oprnd1);
                  vec_oprnd1
                    = vect_get_vec_def_for_stmt_copy (dt[i], vec_oprnd0);
                }

              VEC_quick_push (tree, vargs, vec_oprnd0);
              VEC_quick_push (tree, vargs, vec_oprnd1);
            }

          new_stmt = gimple_build_call_vec (fndecl, vargs);
          new_temp = make_ssa_name (vec_dest, new_stmt);
          gimple_call_set_lhs (new_stmt, new_temp);

          vect_finish_stmt_generation (stmt, new_stmt, gsi);

          if (j == 0)
            STMT_VINFO_VEC_STMT (stmt_info) = new_stmt;
          else
            STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;

          prev_stmt_info = vinfo_for_stmt (new_stmt);
        }

      *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);

      break;

    case WIDEN:
      /* No current target implements this case.  */
      return false;
    }

  VEC_free (tree, heap, vargs);

  /* The call in STMT might prevent it from being removed in dce.
     We however cannot remove it here, due to the way the ssa name
     it defines is mapped to the new definition.  So just replace
     rhs of the statement with something harmless.  */

  type = TREE_TYPE (scalar_dest);
  new_stmt = gimple_build_assign (gimple_call_lhs (stmt),
                                  fold_convert (type, integer_zero_node));
  set_vinfo_for_stmt (new_stmt, stmt_info);
  set_vinfo_for_stmt (stmt, NULL);
  STMT_VINFO_STMT (stmt_info) = new_stmt;
  gsi_replace (gsi, new_stmt, false);
  SSA_NAME_DEF_STMT (gimple_assign_lhs (new_stmt)) = new_stmt;

  return true;
}
/* Function vect_gen_widened_results_half

   Create a vector stmt whose code, type, number of arguments, and result
   variable are CODE, OP_TYPE, and VEC_DEST, and its arguments are
   VEC_OPRND0 and VEC_OPRND1.  The new vector stmt is to be inserted at GSI.
   In the case that CODE is a CALL_EXPR, this means that a call to DECL
   needs to be created (DECL is a function-decl of a target-builtin).
   STMT is the original scalar stmt that we are vectorizing.  */

static gimple
vect_gen_widened_results_half (enum tree_code code,
                               tree decl,
                               tree vec_oprnd0, tree vec_oprnd1, int op_type,
                               tree vec_dest, gimple_stmt_iterator *gsi,
                               gimple stmt)
{
  gimple new_stmt;
  tree new_temp;
  tree sym;
  ssa_op_iter iter;

  /* Generate half of the widened result:  */
  if (code == CALL_EXPR)
    {
      /* Target specific support.  */
      if (op_type == binary_op)
        new_stmt = gimple_build_call (decl, 2, vec_oprnd0, vec_oprnd1);
      else
        new_stmt = gimple_build_call (decl, 1, vec_oprnd0);
      new_temp = make_ssa_name (vec_dest, new_stmt);
      gimple_call_set_lhs (new_stmt, new_temp);
    }
  else
    {
      /* Generic support.  */
      gcc_assert (op_type == TREE_CODE_LENGTH (code));
      if (op_type != binary_op)
        vec_oprnd1 = NULL;
      new_stmt = gimple_build_assign_with_ops (code, vec_dest, vec_oprnd0,
                                               vec_oprnd1);
      new_temp = make_ssa_name (vec_dest, new_stmt);
      gimple_assign_set_lhs (new_stmt, new_temp);
    }
  vect_finish_stmt_generation (stmt, new_stmt, gsi);

  if (code == CALL_EXPR)
    {
      FOR_EACH_SSA_TREE_OPERAND (sym, new_stmt, iter, SSA_OP_ALL_VIRTUALS)
        {
          if (TREE_CODE (sym) == SSA_NAME)
            sym = SSA_NAME_VAR (sym);
          mark_sym_for_renaming (sym);
        }
    }

  return new_stmt;
}
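/* Illustrative note (added for exposition, not from the original sources):
   a widening caller invokes this function twice, once with CODE1/DECL1 and
   once with CODE2/DECL2, typically a lo/hi pair such as

       vx.lo = VEC_UNPACK_LO_EXPR <vx>
       vx.hi = VEC_UNPACK_HI_EXPR <vx>

   so that the two halves together cover all input elements of vx.  */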
/* Function vectorizable_conversion.

   Check if STMT performs a conversion operation that can be vectorized.
   If VEC_STMT is also passed, vectorize the STMT: create a vectorized
   stmt to replace it, put it in VEC_STMT, and insert it at GSI.
   Return FALSE if not a vectorizable STMT, TRUE otherwise.  */

bool
vectorizable_conversion (gimple stmt, gimple_stmt_iterator *gsi,
                         gimple *vec_stmt, slp_tree slp_node)
{
  tree vec_dest;
  tree scalar_dest;
  tree op0;
  tree vec_oprnd0 = NULL_TREE, vec_oprnd1 = NULL_TREE;
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  enum tree_code code, code1 = ERROR_MARK, code2 = ERROR_MARK;
  tree decl1 = NULL_TREE, decl2 = NULL_TREE;
  tree new_temp;
  tree def;
  gimple def_stmt;
  enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
  gimple new_stmt = NULL;
  stmt_vec_info prev_stmt_info;
  int nunits_in;
  int nunits_out;
  tree vectype_out, vectype_in;
  int ncopies, j;
  tree rhs_type, lhs_type;
  tree builtin_decl;
  tree integral_type;
  enum { NARROW, NONE, WIDEN } modifier;
  int i;
  tree sym;
  ssa_op_iter iter;
  VEC(tree,heap) *vec_oprnds0 = NULL;
  tree vop0;
  VEC(tree,heap) *dummy = NULL;
  int dummy_int;
  /* Is STMT a vectorizable conversion?  */

  if (!STMT_VINFO_RELEVANT_P (stmt_info))
    return false;

  if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
    return false;

  if (!is_gimple_assign (stmt))
    return false;

  if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
    return false;

  code = gimple_assign_rhs_code (stmt);
  if (code != FIX_TRUNC_EXPR && code != FLOAT_EXPR)
    return false;

  /* Check types of lhs and rhs.  */
  op0 = gimple_assign_rhs1 (stmt);
  rhs_type = TREE_TYPE (op0);
  vectype_in = get_vectype_for_scalar_type (rhs_type);
  if (!vectype_in)
    return false;
  nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);

  scalar_dest = gimple_assign_lhs (stmt);
  lhs_type = TREE_TYPE (scalar_dest);
  vectype_out = get_vectype_for_scalar_type (lhs_type);
  if (!vectype_out)
    return false;
  nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);

  if (nunits_in == nunits_out / 2)
    modifier = NARROW;
  else if (nunits_out == nunits_in)
    modifier = NONE;
  else if (nunits_out == nunits_in / 2)
    modifier = WIDEN;
  else
    return false;

  if (modifier == NONE)
    gcc_assert (STMT_VINFO_VECTYPE (stmt_info) == vectype_out);

  /* Bail out if the types are both integral or non-integral.  */
  if ((INTEGRAL_TYPE_P (rhs_type) && INTEGRAL_TYPE_P (lhs_type))
      || (!INTEGRAL_TYPE_P (rhs_type) && !INTEGRAL_TYPE_P (lhs_type)))
    return false;

  integral_type = INTEGRAL_TYPE_P (rhs_type) ? vectype_in : vectype_out;

  if (modifier == NARROW)
    ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_out;
  else
    ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in;

  /* FORNOW: SLP with multiple types is not supported.  The SLP analysis
     verifies this, so we can safely override NCOPIES with 1 here.  */
  if (slp_node)
    ncopies = 1;

  /* Sanity check: make sure that at least one copy of the vectorized stmt
     needs to be generated.  */
  gcc_assert (ncopies >= 1);
  /* Check the operands of the operation.  */
  if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt[0]))
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "use not simple.");
      return false;
    }

  /* Supportable by target?  */
  if ((modifier == NONE
       && !targetm.vectorize.builtin_conversion (code, integral_type))
      || (modifier == WIDEN
          && !supportable_widening_operation (code, stmt, vectype_in,
                                              &decl1, &decl2,
                                              &code1, &code2,
                                              &dummy_int, &dummy))
      || (modifier == NARROW
          && !supportable_narrowing_operation (code, stmt, vectype_in,
                                               &code1, &dummy_int, &dummy)))
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "conversion not supported by target.");
      return false;
    }

  if (modifier != NONE)
    {
      STMT_VINFO_VECTYPE (stmt_info) = vectype_in;
      /* FORNOW: SLP not supported.  */
      if (STMT_SLP_TYPE (stmt_info))
        return false;
    }

  if (!vec_stmt) /* transformation not required.  */
    {
      STMT_VINFO_TYPE (stmt_info) = type_conversion_vec_info_type;
      return true;
    }

  /** Transform.  **/
  if (vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump, "transform conversion.");

  /* Handle def.  */
  vec_dest = vect_create_destination_var (scalar_dest, vectype_out);

  if (modifier == NONE && !slp_node)
    vec_oprnds0 = VEC_alloc (tree, heap, 1);

  prev_stmt_info = NULL;
  switch (modifier)
    {
    case NONE:
      for (j = 0; j < ncopies; j++)
        {
          if (j == 0)
            vect_get_vec_defs (op0, NULL, stmt, &vec_oprnds0, NULL, slp_node);
          else
            vect_get_vec_defs_for_stmt_copy (dt, &vec_oprnds0, NULL);

          builtin_decl
            = targetm.vectorize.builtin_conversion (code, integral_type);
          for (i = 0; VEC_iterate (tree, vec_oprnds0, i, vop0); i++)
            {
              /* Arguments are ready.  Create the new vector stmt.  */
              new_stmt = gimple_build_call (builtin_decl, 1, vop0);
              new_temp = make_ssa_name (vec_dest, new_stmt);
              gimple_call_set_lhs (new_stmt, new_temp);
              vect_finish_stmt_generation (stmt, new_stmt, gsi);
              FOR_EACH_SSA_TREE_OPERAND (sym, new_stmt, iter,
                                         SSA_OP_ALL_VIRTUALS)
                {
                  if (TREE_CODE (sym) == SSA_NAME)
                    sym = SSA_NAME_VAR (sym);
                  mark_sym_for_renaming (sym);
                }
              if (slp_node)
                VEC_quick_push (gimple, SLP_TREE_VEC_STMTS (slp_node),
                                new_stmt);
            }

          if (j == 0)
            STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
          else
            STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
          prev_stmt_info = vinfo_for_stmt (new_stmt);
        }
      break;

    case WIDEN:
      /* In case the vectorization factor (VF) is bigger than the number
         of elements that we can fit in a vectype (nunits), we have to
         generate more than one vector stmt - i.e. - we need to "unroll"
         the vector stmt by a factor VF/nunits.  */
      for (j = 0; j < ncopies; j++)
        {
          if (j == 0)
            vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
          else
            vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd0);

          STMT_VINFO_VECTYPE (stmt_info) = vectype_in;

          /* Generate first half of the widened result:  */
          new_stmt
            = vect_gen_widened_results_half (code1, decl1,
                                             vec_oprnd0, vec_oprnd1,
                                             unary_op, vec_dest, gsi, stmt);
          if (j == 0)
            STMT_VINFO_VEC_STMT (stmt_info) = new_stmt;
          else
            STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
          prev_stmt_info = vinfo_for_stmt (new_stmt);

          /* Generate second half of the widened result:  */
          new_stmt
            = vect_gen_widened_results_half (code2, decl2,
                                             vec_oprnd0, vec_oprnd1,
                                             unary_op, vec_dest, gsi, stmt);
          STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
          prev_stmt_info = vinfo_for_stmt (new_stmt);
        }
      break;

    case NARROW:
3762 /* In case the vectorization factor (VF) is bigger than the number
3763 of elements that we can fit in a vectype (nunits), we have to
3764 generate more than one vector stmt - i.e - we need to "unroll"
3765 the vector stmt by a factor VF/nunits. */
3766 for (j = 0; j < ncopies; j++)
3771 vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
3772 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd0);
3776 vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd1);
3777 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd0);
3780 /* Arguments are ready. Create the new vector stmt. */
3781 expr = build2 (code1, vectype_out, vec_oprnd0, vec_oprnd1);
3782 new_stmt = gimple_build_assign_with_ops (code1, vec_dest, vec_oprnd0,
3784 new_temp = make_ssa_name (vec_dest, new_stmt);
3785 gimple_assign_set_lhs (new_stmt, new_temp);
3786 vect_finish_stmt_generation (stmt, new_stmt, gsi);
3789 STMT_VINFO_VEC_STMT (stmt_info) = new_stmt;
3791 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3793 prev_stmt_info = vinfo_for_stmt (new_stmt);
3796 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
3800 VEC_free (tree, heap, vec_oprnds0);
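/* Editor's illustration (not part of the original file): with VF = 8 and
   4-element vectors, NCOPIES = VF/nunits = 2, so the loop above emits two
   vector stmts per scalar conversion stmt.  A rough scalar-level sketch of
   the effect, assuming GNU C vector extensions and aligned data (v4si,
   v4sf and convert_vf8 are hypothetical names):

     typedef int   v4si __attribute__ ((vector_size (16)));
     typedef float v4sf __attribute__ ((vector_size (16)));

     static void
     convert_vf8 (const int *s, float *f, int n)
     {
       int i;
       for (i = 0; i + 8 <= n; i += 8)
         {
           const v4si *vs = (const v4si *) (s + i);
           v4sf *vf = (v4sf *) (f + i);
           vf[0] = (v4sf) { (float) vs[0][0], (float) vs[0][1],
                            (float) vs[0][2], (float) vs[0][3] };  // copy 0
           vf[1] = (v4sf) { (float) vs[1][0], (float) vs[1][1],
                            (float) vs[1][2], (float) vs[1][3] };  // copy 1
         }
       for (; i < n; i++)   // scalar epilogue for the remainder
         f[i] = (float) s[i];
     }
*/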
3806 /* Function vectorizable_assignment.
3808 Check if STMT performs an assignment (copy) that can be vectorized.
3809 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
3810 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
3811 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
3814 vectorizable_assignment (gimple stmt, gimple_stmt_iterator *gsi,
3815 gimple *vec_stmt, slp_tree slp_node)
3820 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3821 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3822 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3826 enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
3827 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
3830 VEC(tree,heap) *vec_oprnds = NULL;
3833 /* Multiple types in SLP are handled by creating the appropriate number of
3834 vectorized stmts for each SLP node. Hence, NCOPIES is always 1 in case of SLP. */
3839 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
3841 gcc_assert (ncopies >= 1);
3843 if (ncopies > 1)
       return false; /* FORNOW */
3845 if (!STMT_VINFO_RELEVANT_P (stmt_info))
3848 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
3851 /* Is vectorizable assignment? */
3852 if (!is_gimple_assign (stmt))
3855 scalar_dest = gimple_assign_lhs (stmt);
3856 if (TREE_CODE (scalar_dest) != SSA_NAME)
3859 if (gimple_assign_single_p (stmt)
3860 || gimple_assign_rhs_code (stmt) == PAREN_EXPR)
3861 op = gimple_assign_rhs1 (stmt);
3865 if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt[0]))
3867 if (vect_print_dump_info (REPORT_DETAILS))
3868 fprintf (vect_dump, "use not simple.");
3872 if (!vec_stmt) /* transformation not required. */
3874 STMT_VINFO_TYPE (stmt_info) = assignment_vec_info_type;
3875 if (vect_print_dump_info (REPORT_DETAILS))
3876 fprintf (vect_dump, "=== vectorizable_assignment ===");
3877 vect_model_simple_cost (stmt_info, ncopies, dt, NULL);
3882 if (vect_print_dump_info (REPORT_DETAILS))
3883 fprintf (vect_dump, "transform assignment.");
3886 vec_dest = vect_create_destination_var (scalar_dest, vectype);
3889 vect_get_vec_defs (op, NULL, stmt, &vec_oprnds, NULL, slp_node);
3891 /* Arguments are ready. Create the new vector stmt. */
3892 for (i = 0; VEC_iterate (tree, vec_oprnds, i, vop); i++)
3894 *vec_stmt = gimple_build_assign (vec_dest, vop);
3895 new_temp = make_ssa_name (vec_dest, *vec_stmt);
3896 gimple_assign_set_lhs (*vec_stmt, new_temp);
3897 vect_finish_stmt_generation (stmt, *vec_stmt, gsi);
3898 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt;
3901 VEC_quick_push (gimple, SLP_TREE_VEC_STMTS (slp_node), *vec_stmt);
3904 VEC_free (tree, heap, vec_oprnds);
3909 /* Function vect_min_worthwhile_factor.
3911 For a loop where we could vectorize the operation indicated by CODE,
3912 return the minimum vectorization factor that makes it worthwhile
3913 to use generic vectors. */
3915 vect_min_worthwhile_factor (enum tree_code code)
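{
  /* Editor's note: the body was missing from this extract.  The following
     reconstruction matches the documented intent (cheap ops are worthwhile
     at smaller factors); treat the exact thresholds as an assumption
     rather than the authoritative values.  */
  switch (code)
    {
    case PLUS_EXPR:
    case MINUS_EXPR:
    case NEGATE_EXPR:
      return 4;

    case BIT_AND_EXPR:
    case BIT_IOR_EXPR:
    case BIT_XOR_EXPR:
    case BIT_NOT_EXPR:
      return 2;

    default:
      /* Anything else is assumed never worthwhile with generic vectors.  */
      return INT_MAX;
    }
}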
3936 /* Function vectorizable_induction
3938 Check if PHI performs an induction computation that can be vectorized.
3939 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
3940 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
3941 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
3944 vectorizable_induction (gimple phi, gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
3947 stmt_vec_info stmt_info = vinfo_for_stmt (phi);
3948 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3949 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3950 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3951 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
3952 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
3955 gcc_assert (ncopies >= 1);
3956 /* FORNOW. This restriction should be relaxed. */
3957 if (nested_in_vect_loop_p (loop, phi) && ncopies > 1)
3959 if (vect_print_dump_info (REPORT_DETAILS))
3960 fprintf (vect_dump, "multiple types in nested loop.");
3964 if (!STMT_VINFO_RELEVANT_P (stmt_info))
3967 /* FORNOW: SLP not supported. */
3968 if (STMT_SLP_TYPE (stmt_info))
       return false;
3971 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def);
3973 if (gimple_code (phi) != GIMPLE_PHI)
3976 if (!vec_stmt) /* transformation not required. */
3978 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
3979 if (vect_print_dump_info (REPORT_DETAILS))
3980 fprintf (vect_dump, "=== vectorizable_induction ===");
3981 vect_model_induction_cost (stmt_info, ncopies);
3987 if (vect_print_dump_info (REPORT_DETAILS))
3988 fprintf (vect_dump, "transform induction phi.");
3990 vec_def = get_initial_def_for_induction (phi);
3991 *vec_stmt = SSA_NAME_DEF_STMT (vec_def);
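/* Editor's illustration (not part of the original file): for an induction
   variable with step 1 and VF = 4, get_initial_def_for_induction builds the
   vector {i0, i0+1, i0+2, i0+3} and the loop bumps it by {4, 4, 4, 4} once
   per vector iteration.  A rough sketch in GNU C (v4si and iv_sketch are
   hypothetical names):

     typedef int v4si __attribute__ ((vector_size (16)));

     static void
     iv_sketch (int *out, int i0, int n)
     {
       v4si iv = { i0, i0 + 1, i0 + 2, i0 + 3 };  // initial vector def
       const v4si step = { 4, 4, 4, 4 };          // VF * scalar step
       int i;
       for (i = 0; i + 4 <= n; i += 4)
         {
           __builtin_memcpy (out + i, &iv, sizeof iv);  // use of the vector IV
           iv += step;                                  // one bump per vector iter
         }
     }
*/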
3996 /* Function vectorizable_operation.
3998 Check if STMT performs a binary or unary operation that can be vectorized.
3999 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
4000 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
4001 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
4004 vectorizable_operation (gimple stmt, gimple_stmt_iterator *gsi,
4005 gimple *vec_stmt, slp_tree slp_node)
4009 tree op0, op1 = NULL;
4010 tree vec_oprnd1 = NULL_TREE;
4011 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4012 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4013 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4014 enum tree_code code;
4015 enum machine_mode vec_mode;
4020 enum machine_mode optab_op2_mode;
4023 enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
4024 gimple new_stmt = NULL;
4025 stmt_vec_info prev_stmt_info;
4026 int nunits_in = TYPE_VECTOR_SUBPARTS (vectype);
4031 VEC(tree,heap) *vec_oprnds0 = NULL, *vec_oprnds1 = NULL;
4034 bool shift_p = false;
4035 bool scalar_shift_arg = false;
4037 /* Multiple types in SLP are handled by creating the appropriate number of
4038 vectorized stmts for each SLP node. Hence, NCOPIES is always 1 in case of SLP. */
4043 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in;
4045 gcc_assert (ncopies >= 1);
4047 if (!STMT_VINFO_RELEVANT_P (stmt_info))
4050 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
4053 /* Is STMT a vectorizable binary/unary operation? */
4054 if (!is_gimple_assign (stmt))
4057 if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
4060 scalar_dest = gimple_assign_lhs (stmt);
4061 vectype_out = get_vectype_for_scalar_type (TREE_TYPE (scalar_dest));
4064 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
4065 if (nunits_out != nunits_in)
4068 code = gimple_assign_rhs_code (stmt);
4070 /* For pointer addition, we should use the normal plus for
4071 the vector addition. */
4072 if (code == POINTER_PLUS_EXPR)
4075 /* Support only unary or binary operations. */
4076 op_type = TREE_CODE_LENGTH (code);
4077 if (op_type != unary_op && op_type != binary_op)
4079 if (vect_print_dump_info (REPORT_DETAILS))
4080 fprintf (vect_dump, "num. args = %d (not unary/binary op).", op_type);
4084 op0 = gimple_assign_rhs1 (stmt);
4085 if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt[0]))
4087 if (vect_print_dump_info (REPORT_DETAILS))
4088 fprintf (vect_dump, "use not simple.");
4092 if (op_type == binary_op)
4094 op1 = gimple_assign_rhs2 (stmt);
4095 if (!vect_is_simple_use (op1, loop_vinfo, &def_stmt, &def, &dt[1]))
4097 if (vect_print_dump_info (REPORT_DETAILS))
4098 fprintf (vect_dump, "use not simple.");
4103 /* If this is a shift/rotate, determine whether the shift amount is a vector,
4104 or scalar. If the shift/rotate amount is a vector, use the vector/vector shift optabs. */
4106 if (code == LSHIFT_EXPR || code == RSHIFT_EXPR || code == LROTATE_EXPR
4107 || code == RROTATE_EXPR)
4111 /* vector shifted by vector */
4112 if (dt[1] == vect_loop_def)
4114 optab = optab_for_tree_code (code, vectype, optab_vector);
4115 if (vect_print_dump_info (REPORT_DETAILS))
4116 fprintf (vect_dump, "vector/vector shift/rotate found.");
4119 /* See if the machine has a vector shifted by scalar insn and if not
4120 then see if it has a vector shifted by vector insn. */
4121 else if (dt[1] == vect_constant_def || dt[1] == vect_invariant_def)
4123 optab = optab_for_tree_code (code, vectype, optab_scalar);
4125 if (optab && (optab_handler (optab, TYPE_MODE (vectype))->insn_code
4126 != CODE_FOR_nothing))
4128 scalar_shift_arg = true;
4129 if (vect_print_dump_info (REPORT_DETAILS))
4130 fprintf (vect_dump, "vector/scalar shift/rotate found.");
4134 optab = optab_for_tree_code (code, vectype, optab_vector);
4135 if (optab && vect_print_dump_info (REPORT_DETAILS)
4137 && (optab_handler (optab, TYPE_MODE (vectype))->insn_code
4138 != CODE_FOR_nothing))
4139 fprintf (vect_dump, "vector/vector shift/rotate found.");
4145 if (vect_print_dump_info (REPORT_DETAILS))
4146 fprintf (vect_dump, "operand mode requires invariant argument.");
4151 optab = optab_for_tree_code (code, vectype, optab_default);
4153 /* Supportable by target? */
     if (!optab)
       {
4156 if (vect_print_dump_info (REPORT_DETAILS))
4157 fprintf (vect_dump, "no optab.");
         return false;
       }
4160 vec_mode = TYPE_MODE (vectype);
4161 icode = (int) optab_handler (optab, vec_mode)->insn_code;
4162 if (icode == CODE_FOR_nothing)
4164 if (vect_print_dump_info (REPORT_DETAILS))
4165 fprintf (vect_dump, "op not supported by target.");
4166 /* Check only during analysis. */
4167 if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
4168 || (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4169 < vect_min_worthwhile_factor (code) && !vec_stmt))
       return false;
4172 if (vect_print_dump_info (REPORT_DETAILS))
4173 fprintf (vect_dump, "proceeding using word mode.");
4176 /* Worthwhile without SIMD support? Check only during analysis. */
4177 if (!VECTOR_MODE_P (TYPE_MODE (vectype))
4178 && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4179 < vect_min_worthwhile_factor (code) && !vec_stmt)
     {
4182 if (vect_print_dump_info (REPORT_DETAILS))
4183 fprintf (vect_dump, "not worthwhile without SIMD support.");
       return false;
     }
4187 if (!vec_stmt) /* transformation not required. */
4189 STMT_VINFO_TYPE (stmt_info) = op_vec_info_type;
4190 if (vect_print_dump_info (REPORT_DETAILS))
4191 fprintf (vect_dump, "=== vectorizable_operation ===");
4192 vect_model_simple_cost (stmt_info, ncopies, dt, NULL);
4198 if (vect_print_dump_info (REPORT_DETAILS))
4199 fprintf (vect_dump, "transform binary/unary operation.");
4202 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4204 /* Allocate VECs for vector operands. In case of SLP, vector operands are
4205 created in the previous stages of the recursion, so no allocation is
4206 needed, except for the case of shift with scalar shift argument. In that
4207 case we store the scalar operand in VEC_OPRNDS1 for every vector stmt to
4208 be created to vectorize the SLP group, i.e., SLP_NODE->VEC_STMTS_SIZE.
4209 In case of loop-based vectorization we allocate VECs of size 1. We
4210 allocate VEC_OPRNDS1 only in case of binary operation. */
4213 vec_oprnds0 = VEC_alloc (tree, heap, 1);
4214 if (op_type == binary_op)
4215 vec_oprnds1 = VEC_alloc (tree, heap, 1);
4217 else if (scalar_shift_arg)
4218 vec_oprnds1 = VEC_alloc (tree, heap, slp_node->vec_stmts_size);
4220 /* In case the vectorization factor (VF) is bigger than the number
4221 of elements that we can fit in a vectype (nunits), we have to generate
4222 more than one vector stmt - i.e., we need to "unroll" the
4223 vector stmt by a factor VF/nunits. In doing so, we record a pointer
4224 from one copy of the vector stmt to the next, in the field
4225 STMT_VINFO_RELATED_STMT. This is necessary in order to allow following
4226 stages to find the correct vector defs to be used when vectorizing
4227 stmts that use the defs of the current stmt. The example below illustrates
4228 the vectorization process when VF=16 and nunits=4 (i.e - we need to create
4229 4 vectorized stmts):
4231 before vectorization:
4232 RELATED_STMT VEC_STMT
4236 step 1: vectorize stmt S1 (done in vectorizable_load. See more details there):
4238 RELATED_STMT VEC_STMT
4239 VS1_0: vx0 = memref0 VS1_1 -
4240 VS1_1: vx1 = memref1 VS1_2 -
4241 VS1_2: vx2 = memref2 VS1_3 -
4242 VS1_3: vx3 = memref3 - -
4243 S1: x = load - VS1_0
4246 step2: vectorize stmt S2 (done here):
4247 To vectorize stmt S2 we first need to find the relevant vector
4248 def for the first operand 'x'. This is, as usual, obtained from
4249 the vector stmt recorded in the STMT_VINFO_VEC_STMT of the stmt
4250 that defines 'x' (S1). This way we find the stmt VS1_0, and the
4251 relevant vector def 'vx0'. Having found 'vx0' we can generate
4252 the vector stmt VS2_0, and as usual, record it in the
4253 STMT_VINFO_VEC_STMT of stmt S2.
4254 When creating the second copy (VS2_1), we obtain the relevant vector
4255 def from the vector stmt recorded in the STMT_VINFO_RELATED_STMT of
4256 stmt VS1_0. This way we find the stmt VS1_1 and the relevant
4257 vector def 'vx1'. Using 'vx1' we create stmt VS2_1 and record a
4258 pointer to it in the STMT_VINFO_RELATED_STMT of the vector stmt VS2_0.
4259 Similarly when creating stmts VS2_2 and VS2_3. This is the resulting
4260 chain of stmts and pointers:
4261 RELATED_STMT VEC_STMT
4262 VS1_0: vx0 = memref0 VS1_1 -
4263 VS1_1: vx1 = memref1 VS1_2 -
4264 VS1_2: vx2 = memref2 VS1_3 -
4265 VS1_3: vx3 = memref3 - -
4266 S1: x = load - VS1_0
4267 VS2_0: vz0 = vx0 + v1 VS2_1 -
4268 VS2_1: vz1 = vx1 + v1 VS2_2 -
4269 VS2_2: vz2 = vx2 + v1 VS2_3 -
4270 VS2_3: vz3 = vx3 + v1 - -
4271 S2: z = x + 1 - VS2_0 */
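/* Editor's note (illustrative, not part of the original file): in other
   words, the J-th copy's operand is found by hopping J times along
   STMT_VINFO_RELATED_STMT starting from the VEC_STMT of the defining
   scalar stmt, roughly:

     gimple vs = STMT_VINFO_VEC_STMT (vinfo_for_stmt (def_stmt));   // VS1_0
     for (k = 0; k < j; k++)
       vs = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (vs));          // VS1_k
*/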
4273 prev_stmt_info = NULL;
4274 for (j = 0; j < ncopies; j++)
4279 if (op_type == binary_op && scalar_shift_arg)
4281 /* Vector shl and shr insn patterns can be defined with scalar
4282 operand 2 (shift operand). In this case, use constant or loop
4283 invariant op1 directly, without extending it to vector mode. */
4285 optab_op2_mode = insn_data[icode].operand[2].mode;
4286 if (!VECTOR_MODE_P (optab_op2_mode))
4288 if (vect_print_dump_info (REPORT_DETAILS))
4289 fprintf (vect_dump, "operand 1 using scalar mode.");
     vec_oprnd1 = op1;
4291 VEC_quick_push (tree, vec_oprnds1, vec_oprnd1);
4294 /* Store vec_oprnd1 for every vector stmt to be created
4295 for SLP_NODE. We check during the analysis that all the
4296 shift arguments are the same.
4297 TODO: Allow different constants for different vector
4298 stmts generated for an SLP instance. */
4299 for (k = 0; k < slp_node->vec_stmts_size - 1; k++)
4300 VEC_quick_push (tree, vec_oprnds1, vec_oprnd1);
4305 /* vec_oprnd1 is available if operand 1 should be of a scalar-type
4306 (a special case for certain kind of vector shifts); otherwise,
4307 operand 1 should be of a vector type (the usual case). */
4308 if (op_type == binary_op && !vec_oprnd1)
4309 vect_get_vec_defs (op0, op1, stmt, &vec_oprnds0, &vec_oprnds1,
4312 vect_get_vec_defs (op0, NULL_TREE, stmt, &vec_oprnds0, NULL,
4316 vect_get_vec_defs_for_stmt_copy (dt, &vec_oprnds0, &vec_oprnds1);
4318 /* Arguments are ready. Create the new vector stmt. */
4319 for (i = 0; VEC_iterate (tree, vec_oprnds0, i, vop0); i++)
4321 vop1 = ((op_type == binary_op)
4322 ? VEC_index (tree, vec_oprnds1, i) : NULL);
4323 new_stmt = gimple_build_assign_with_ops (code, vec_dest, vop0, vop1);
4324 new_temp = make_ssa_name (vec_dest, new_stmt);
4325 gimple_assign_set_lhs (new_stmt, new_temp);
4326 vect_finish_stmt_generation (stmt, new_stmt, gsi);
4328 VEC_quick_push (gimple, SLP_TREE_VEC_STMTS (slp_node), new_stmt);
4335 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
4337 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
4338 prev_stmt_info = vinfo_for_stmt (new_stmt);
4341 VEC_free (tree, heap, vec_oprnds0);
4343 VEC_free (tree, heap, vec_oprnds1);
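/* Editor's illustration (not part of the original file): the difference
   between the optab_scalar and optab_vector shift forms, in GNU C terms
   (v4su and both function names are hypothetical; assumes a compiler that
   accepts vector-scalar and vector-vector shifts):

     typedef unsigned int v4su __attribute__ ((vector_size (16)));

     static v4su
     shift_by_scalar (v4su x, int amount)      // vector/scalar form:
     {                                         // one invariant amount
       return x >> amount;                     // applied to every lane
     }

     static v4su
     shift_by_vector (v4su x, v4su amounts)    // vector/vector form:
     {                                         // a per-lane shift amount
       return x >> amounts;
     }
*/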
4349 /* Get vectorized definitions for loop-based vectorization. For the first
4350 operand we call vect_get_vec_def_for_operand() (with OPRND containing
4351 scalar operand), and for the rest we get a copy with
4352 vect_get_vec_def_for_stmt_copy() using the previous vector definition
4353 (stored in OPRND). See vect_get_vec_def_for_stmt_copy() for details.
4354 The vectors are collected into VEC_OPRNDS. */
4357 vect_get_loop_based_defs (tree *oprnd, gimple stmt, enum vect_def_type dt,
4358 VEC (tree, heap) **vec_oprnds, int multi_step_cvt)
4362 /* Get first vector operand. */
4363 /* All the vector operands except the very first one (that is a scalar oprnd) are stmt copies. */
4365 if (TREE_CODE (TREE_TYPE (*oprnd)) != VECTOR_TYPE)
4366 vec_oprnd = vect_get_vec_def_for_operand (*oprnd, stmt, NULL);
4368 vec_oprnd = vect_get_vec_def_for_stmt_copy (dt, *oprnd);
4370 VEC_quick_push (tree, *vec_oprnds, vec_oprnd);
4372 /* Get second vector operand. */
4373 vec_oprnd = vect_get_vec_def_for_stmt_copy (dt, vec_oprnd);
4374 VEC_quick_push (tree, *vec_oprnds, vec_oprnd);
4378 /* For conversion in multiple steps, continue to get operands recursively. */
4381 vect_get_loop_based_defs (oprnd, stmt, dt, vec_oprnds, multi_step_cvt - 1);
4385 /* Create vectorized demotion statements for vector operands from VEC_OPRNDS.
4386 For multi-step conversions store the resulting vectors and call the function recursively. */
4390 vect_create_vectorized_demotion_stmts (VEC (tree, heap) **vec_oprnds,
4391 int multi_step_cvt, gimple stmt,
4392 VEC (tree, heap) *vec_dsts,
4393 gimple_stmt_iterator *gsi,
4394 slp_tree slp_node, enum tree_code code,
4395 stmt_vec_info *prev_stmt_info)
4398 tree vop0, vop1, new_tmp, vec_dest;
4400 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4402 vec_dest = VEC_pop (tree, vec_dsts);
4404 for (i = 0; i < VEC_length (tree, *vec_oprnds); i += 2)
4406 /* Create demotion operation. */
4407 vop0 = VEC_index (tree, *vec_oprnds, i);
4408 vop1 = VEC_index (tree, *vec_oprnds, i + 1);
4409 new_stmt = gimple_build_assign_with_ops (code, vec_dest, vop0, vop1);
4410 new_tmp = make_ssa_name (vec_dest, new_stmt);
4411 gimple_assign_set_lhs (new_stmt, new_tmp);
4412 vect_finish_stmt_generation (stmt, new_stmt, gsi);
4415 /* Store the resulting vector for next recursive call. */
4416 VEC_replace (tree, *vec_oprnds, i/2, new_tmp);
4419 /* This is the last step of the conversion sequence. Store the
4420 vectors in SLP_NODE or in vector info of the scalar statement
4421 (or in STMT_VINFO_RELATED_STMT chain). */
4423 VEC_quick_push (gimple, SLP_TREE_VEC_STMTS (slp_node), new_stmt);
4426 if (!*prev_stmt_info)
4427 STMT_VINFO_VEC_STMT (stmt_info) = new_stmt;
4429 STMT_VINFO_RELATED_STMT (*prev_stmt_info) = new_stmt;
4431 *prev_stmt_info = vinfo_for_stmt (new_stmt);
4436 /* For multi-step demotion operations we first generate demotion operations
4437 from the source type to the intermediate types, and then combine the
4438 results (stored in VEC_OPRNDS) in a demotion operation to the destination type. */
4442 /* At each level of recursion we have half of the operands we had at the previous level. */
4444 VEC_truncate (tree, *vec_oprnds, (i+1)/2);
4445 vect_create_vectorized_demotion_stmts (vec_oprnds, multi_step_cvt - 1,
4446 stmt, vec_dsts, gsi, slp_node,
4447 code, prev_stmt_info);
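/* Editor's illustration (not part of the original file): a single demotion
   stmt packs a pair of input vectors into one narrower vector, which is
   why the loop above walks VEC_OPRNDS two at a time and a K-step demotion
   needs 2^K inputs per final result.  Sketch in GNU C (v4si, v8hi and
   pack_trunc_sketch are hypothetical names):

     typedef int   v4si __attribute__ ((vector_size (16)));
     typedef short v8hi __attribute__ ((vector_size (16)));

     static v8hi
     pack_trunc_sketch (v4si a, v4si b)   // analogue of one demotion stmt
     {
       return (v8hi) { (short) a[0], (short) a[1], (short) a[2], (short) a[3],
                       (short) b[0], (short) b[1], (short) b[2], (short) b[3] };
     }
*/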
4452 /* Function vectorizable_type_demotion
4454 Check if STMT performs a binary or unary operation that involves
4455 type demotion, and if it can be vectorized.
4456 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
4457 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
4458 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
4461 vectorizable_type_demotion (gimple stmt, gimple_stmt_iterator *gsi,
4462 gimple *vec_stmt, slp_tree slp_node)
4467 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4468 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4469 enum tree_code code, code1 = ERROR_MARK;
4472 enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
4473 stmt_vec_info prev_stmt_info;
4480 int multi_step_cvt = 0;
4481 VEC (tree, heap) *vec_oprnds0 = NULL;
4482 VEC (tree, heap) *vec_dsts = NULL, *interm_types = NULL, *tmp_vec_dsts = NULL;
4483 tree last_oprnd, intermediate_type;
4485 if (!STMT_VINFO_RELEVANT_P (stmt_info))
4488 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
4491 /* Is STMT a vectorizable type-demotion operation? */
4492 if (!is_gimple_assign (stmt))
4495 if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
4498 code = gimple_assign_rhs_code (stmt);
4499 if (!CONVERT_EXPR_CODE_P (code))
4502 op0 = gimple_assign_rhs1 (stmt);
4503 vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op0));
4506 nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
4508 scalar_dest = gimple_assign_lhs (stmt);
4509 vectype_out = get_vectype_for_scalar_type (TREE_TYPE (scalar_dest));
4512 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
4513 if (nunits_in >= nunits_out)
4516 /* Multiple types in SLP are handled by creating the appropriate number of
4517 vectorized stmts for each SLP node. Hence, NCOPIES is always 1 in case of SLP. */
4522 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_out;
4524 gcc_assert (ncopies >= 1);
4526 if (! ((INTEGRAL_TYPE_P (TREE_TYPE (scalar_dest))
4527 && INTEGRAL_TYPE_P (TREE_TYPE (op0)))
4528 || (SCALAR_FLOAT_TYPE_P (TREE_TYPE (scalar_dest))
4529 && SCALAR_FLOAT_TYPE_P (TREE_TYPE (op0))
4530 && CONVERT_EXPR_CODE_P (code))))
4533 /* Check the operands of the operation. */
4534 if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt[0]))
4536 if (vect_print_dump_info (REPORT_DETAILS))
4537 fprintf (vect_dump, "use not simple.");
4541 /* Supportable by target? */
4542 if (!supportable_narrowing_operation (code, stmt, vectype_in, &code1,
4543 &multi_step_cvt, &interm_types))
4546 STMT_VINFO_VECTYPE (stmt_info) = vectype_in;
4548 if (!vec_stmt) /* transformation not required. */
4550 STMT_VINFO_TYPE (stmt_info) = type_demotion_vec_info_type;
4551 if (vect_print_dump_info (REPORT_DETAILS))
4552 fprintf (vect_dump, "=== vectorizable_demotion ===");
4553 vect_model_simple_cost (stmt_info, ncopies, dt, NULL);
4558 if (vect_print_dump_info (REPORT_DETAILS))
4559 fprintf (vect_dump, "transform type demotion operation. ncopies = %d.", ncopies);
4562 /* In case of multi-step demotion, we first generate demotion operations to
4563 the intermediate types, and then from those types to the final one.
4564 We create vector destinations for the intermediate type (TYPES) received
4565 from supportable_narrowing_operation, and store them in the correct order
4566 for future use in vect_create_vectorized_demotion_stmts(). */
4568 vec_dsts = VEC_alloc (tree, heap, multi_step_cvt + 1);
4570 vec_dsts = VEC_alloc (tree, heap, 1);
4572 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
4573 VEC_quick_push (tree, vec_dsts, vec_dest);
4577 for (i = VEC_length (tree, interm_types) - 1;
4578 VEC_iterate (tree, interm_types, i, intermediate_type); i--)
4580 vec_dest = vect_create_destination_var (scalar_dest, intermediate_type);
4582 VEC_quick_push (tree, vec_dsts, vec_dest);
4586 /* In case the vectorization factor (VF) is bigger than the number
4587 of elements that we can fit in a vectype (nunits), we have to generate
4588 more than one vector stmt - i.e., we need to "unroll" the
4589 vector stmt by a factor VF/nunits. */
4591 prev_stmt_info = NULL;
4592 for (j = 0; j < ncopies; j++)
4596 vect_get_slp_defs (slp_node, &vec_oprnds0, NULL);
4599 VEC_free (tree, heap, vec_oprnds0);
4600 vec_oprnds0 = VEC_alloc (tree, heap,
4601 (multi_step_cvt ? vect_pow2 (multi_step_cvt) * 2 : 2));
4602 vect_get_loop_based_defs (&last_oprnd, stmt, dt[0], &vec_oprnds0,
4603 vect_pow2 (multi_step_cvt) - 1);
4606 /* Arguments are ready. Create the new vector stmts. */
4607 tmp_vec_dsts = VEC_copy (tree, heap, vec_dsts);
4608 vect_create_vectorized_demotion_stmts (&vec_oprnds0,
4609 multi_step_cvt, stmt, tmp_vec_dsts,
4610 gsi, slp_node, code1,
4614 VEC_free (tree, heap, vec_oprnds0);
4615 VEC_free (tree, heap, vec_dsts);
4616 VEC_free (tree, heap, tmp_vec_dsts);
4617 VEC_free (tree, heap, interm_types);
4619 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
4624 /* Create vectorized promotion statements for vector operands from VEC_OPRNDS0
4625 and VEC_OPRNDS1 (for binary operations). For multi-step conversions store
4626 the resulting vectors and call the function recursively. */
4629 vect_create_vectorized_promotion_stmts (VEC (tree, heap) **vec_oprnds0,
4630 VEC (tree, heap) **vec_oprnds1,
4631 int multi_step_cvt, gimple stmt,
4632 VEC (tree, heap) *vec_dsts,
4633 gimple_stmt_iterator *gsi,
4634 slp_tree slp_node, enum tree_code code1,
4635 enum tree_code code2, tree decl1,
4636 tree decl2, int op_type,
4637 stmt_vec_info *prev_stmt_info)
4640 tree vop0, vop1, new_tmp1, new_tmp2, vec_dest;
4641 gimple new_stmt1, new_stmt2;
4642 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4643 VEC (tree, heap) *vec_tmp;
4645 vec_dest = VEC_pop (tree, vec_dsts);
4646 vec_tmp = VEC_alloc (tree, heap, VEC_length (tree, *vec_oprnds0) * 2);
4648 for (i = 0; VEC_iterate (tree, *vec_oprnds0, i, vop0); i++)
4650 if (op_type == binary_op)
4651 vop1 = VEC_index (tree, *vec_oprnds1, i);
4655 /* Generate the two halves of promotion operation. */
4656 new_stmt1 = vect_gen_widened_results_half (code1, decl1, vop0, vop1,
4657 op_type, vec_dest, gsi, stmt);
4658 new_stmt2 = vect_gen_widened_results_half (code2, decl2, vop0, vop1,
4659 op_type, vec_dest, gsi, stmt);
4660 if (is_gimple_call (new_stmt1))
4662 new_tmp1 = gimple_call_lhs (new_stmt1);
4663 new_tmp2 = gimple_call_lhs (new_stmt2);
4667 new_tmp1 = gimple_assign_lhs (new_stmt1);
4668 new_tmp2 = gimple_assign_lhs (new_stmt2);
4673 /* Store the results for the recursive call. */
4674 VEC_quick_push (tree, vec_tmp, new_tmp1);
4675 VEC_quick_push (tree, vec_tmp, new_tmp2);
4679 /* Last step of the promotion sequence - store the results. */
4682 VEC_quick_push (gimple, SLP_TREE_VEC_STMTS (slp_node), new_stmt1);
4683 VEC_quick_push (gimple, SLP_TREE_VEC_STMTS (slp_node), new_stmt2);
4687 if (!*prev_stmt_info)
4688 STMT_VINFO_VEC_STMT (stmt_info) = new_stmt1;
4690 STMT_VINFO_RELATED_STMT (*prev_stmt_info) = new_stmt1;
4692 *prev_stmt_info = vinfo_for_stmt (new_stmt1);
4693 STMT_VINFO_RELATED_STMT (*prev_stmt_info) = new_stmt2;
4694 *prev_stmt_info = vinfo_for_stmt (new_stmt2);
4701 /* For a multi-step promotion operation we call the
4702 function recursively for every stage. We start from the input type,
4703 create promotion operations to the intermediate types, and then
4704 create promotions to the output type. */
4705 *vec_oprnds0 = VEC_copy (tree, heap, vec_tmp);
4706 VEC_free (tree, heap, vec_tmp);
4707 vect_create_vectorized_promotion_stmts (vec_oprnds0, vec_oprnds1,
4708 multi_step_cvt - 1, stmt,
4709 vec_dsts, gsi, slp_node, code1,
4710 code2, decl1, decl2, op_type,
       prev_stmt_info);
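/* Editor's illustration (not part of the original file): one promotion
   step widens a vector into two halves (the "widened results half" calls
   above), so a K-step promotion doubles the number of vectors at every
   level.  Sketch in GNU C (v8hi, v4si and unpack_sketch are hypothetical
   names; which half is "high" depends on endianness):

     typedef short v8hi __attribute__ ((vector_size (16)));
     typedef int   v4si __attribute__ ((vector_size (16)));

     static void
     unpack_sketch (v8hi x, v4si *lo, v4si *hi)
     {
       *lo = (v4si) { x[0], x[1], x[2], x[3] };   // first widened half
       *hi = (v4si) { x[4], x[5], x[6], x[7] };   // second widened half
     }
*/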
4716 /* Function vectorizable_type_promotion
4718 Check if STMT performs a binary or unary operation that involves
4719 type promotion, and if it can be vectorized.
4720 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
4721 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
4722 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
4725 vectorizable_type_promotion (gimple stmt, gimple_stmt_iterator *gsi,
4726 gimple *vec_stmt, slp_tree slp_node)
4730 tree op0, op1 = NULL;
4731 tree vec_oprnd0=NULL, vec_oprnd1=NULL;
4732 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4733 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4734 enum tree_code code, code1 = ERROR_MARK, code2 = ERROR_MARK;
4735 tree decl1 = NULL_TREE, decl2 = NULL_TREE;
4739 enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
4740 stmt_vec_info prev_stmt_info;
4747 tree intermediate_type = NULL_TREE;
4748 int multi_step_cvt = 0;
4749 VEC (tree, heap) *vec_oprnds0 = NULL, *vec_oprnds1 = NULL;
4750 VEC (tree, heap) *vec_dsts = NULL, *interm_types = NULL, *tmp_vec_dsts = NULL;
4752 if (!STMT_VINFO_RELEVANT_P (stmt_info))
4755 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
4758 /* Is STMT a vectorizable type-promotion operation? */
4759 if (!is_gimple_assign (stmt))
4762 if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
4765 code = gimple_assign_rhs_code (stmt);
4766 if (!CONVERT_EXPR_CODE_P (code)
4767 && code != WIDEN_MULT_EXPR)
4770 op0 = gimple_assign_rhs1 (stmt);
4771 vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op0));
4774 nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
4776 scalar_dest = gimple_assign_lhs (stmt);
4777 vectype_out = get_vectype_for_scalar_type (TREE_TYPE (scalar_dest));
4780 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
4781 if (nunits_in <= nunits_out)
4784 /* Multiple types in SLP are handled by creating the appropriate number of
4785 vectorized stmts for each SLP node. Hence, NCOPIES is always 1 in case of SLP. */
4790 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in;
4792 gcc_assert (ncopies >= 1);
4794 if (! ((INTEGRAL_TYPE_P (TREE_TYPE (scalar_dest))
4795 && INTEGRAL_TYPE_P (TREE_TYPE (op0)))
4796 || (SCALAR_FLOAT_TYPE_P (TREE_TYPE (scalar_dest))
4797 && SCALAR_FLOAT_TYPE_P (TREE_TYPE (op0))
4798 && CONVERT_EXPR_CODE_P (code))))
4801 /* Check the operands of the operation. */
4802 if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt[0]))
4804 if (vect_print_dump_info (REPORT_DETAILS))
4805 fprintf (vect_dump, "use not simple.");
4809 op_type = TREE_CODE_LENGTH (code);
4810 if (op_type == binary_op)
4812 op1 = gimple_assign_rhs2 (stmt);
4813 if (!vect_is_simple_use (op1, loop_vinfo, &def_stmt, &def, &dt[1]))
4815 if (vect_print_dump_info (REPORT_DETAILS))
4816 fprintf (vect_dump, "use not simple.");
4821 /* Supportable by target? */
4822 if (!supportable_widening_operation (code, stmt, vectype_in,
4823 &decl1, &decl2, &code1, &code2,
4824 &multi_step_cvt, &interm_types))
4827 /* Binary widening operations can only be supported directly by the architecture. */
4829 gcc_assert (!(multi_step_cvt && op_type == binary_op));
4831 STMT_VINFO_VECTYPE (stmt_info) = vectype_in;
4833 if (!vec_stmt) /* transformation not required. */
4835 STMT_VINFO_TYPE (stmt_info) = type_promotion_vec_info_type;
4836 if (vect_print_dump_info (REPORT_DETAILS))
4837 fprintf (vect_dump, "=== vectorizable_promotion ===");
4838 vect_model_simple_cost (stmt_info, 2*ncopies, dt, NULL);
4844 if (vect_print_dump_info (REPORT_DETAILS))
4845 fprintf (vect_dump, "transform type promotion operation. ncopies = %d.", ncopies);
4849 /* In case of multi-step promotion, we first generate promotion operations
4850 to the intermediate types, and then from those types to the final one.
4851 We store the vector destinations in VEC_DSTS in the correct order for
4852 recursive creation of promotion operations in
4853 vect_create_vectorized_promotion_stmts(). Vector destinations are created
4854 according to TYPES received from supportable_widening_operation(). */
4856 vec_dsts = VEC_alloc (tree, heap, multi_step_cvt + 1);
4858 vec_dsts = VEC_alloc (tree, heap, 1);
4860 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
4861 VEC_quick_push (tree, vec_dsts, vec_dest);
4865 for (i = VEC_length (tree, interm_types) - 1;
4866 VEC_iterate (tree, interm_types, i, intermediate_type); i--)
4868 vec_dest = vect_create_destination_var (scalar_dest, intermediate_type);
4870 VEC_quick_push (tree, vec_dsts, vec_dest);
4876 vec_oprnds0 = VEC_alloc (tree, heap,
4877 (multi_step_cvt ? vect_pow2 (multi_step_cvt) : 1));
4878 if (op_type == binary_op)
4879 vec_oprnds1 = VEC_alloc (tree, heap, 1);
4882 /* In case the vectorization factor (VF) is bigger than the number
4883 of elements that we can fit in a vectype (nunits), we have to generate
4884 more than one vector stmt - i.e., we need to "unroll" the
4885 vector stmt by a factor VF/nunits. */
4887 prev_stmt_info = NULL;
4888 for (j = 0; j < ncopies; j++)
4894 vect_get_slp_defs (slp_node, &vec_oprnds0, &vec_oprnds1);
4897 vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
4898 VEC_quick_push (tree, vec_oprnds0, vec_oprnd0);
4899 if (op_type == binary_op)
4901 vec_oprnd1 = vect_get_vec_def_for_operand (op1, stmt, NULL);
4902 VEC_quick_push (tree, vec_oprnds1, vec_oprnd1);
4908 vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd0);
4909 VEC_replace (tree, vec_oprnds0, 0, vec_oprnd0);
4910 if (op_type == binary_op)
4912 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt[1], vec_oprnd1);
4913 VEC_replace (tree, vec_oprnds1, 0, vec_oprnd1);
4917 /* Arguments are ready. Create the new vector stmts. */
4918 tmp_vec_dsts = VEC_copy (tree, heap, vec_dsts);
4919 vect_create_vectorized_promotion_stmts (&vec_oprnds0, &vec_oprnds1,
4920 multi_step_cvt, stmt,
4922 gsi, slp_node, code1, code2,
4923 decl1, decl2, op_type,
4927 VEC_free (tree, heap, vec_dsts);
4928 VEC_free (tree, heap, tmp_vec_dsts);
4929 VEC_free (tree, heap, interm_types);
4930 VEC_free (tree, heap, vec_oprnds0);
4931 VEC_free (tree, heap, vec_oprnds1);
4933 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
4938 /* Function vect_strided_store_supported.
4940 Returns TRUE if INTERLEAVE_HIGH and INTERLEAVE_LOW operations are supported,
4941 and FALSE otherwise. */
4944 vect_strided_store_supported (tree vectype)
4946 optab interleave_high_optab, interleave_low_optab;
4949 mode = (int) TYPE_MODE (vectype);
4951 /* Check that the operation is supported. */
4952 interleave_high_optab = optab_for_tree_code (VEC_INTERLEAVE_HIGH_EXPR,
4953 vectype, optab_default);
4954 interleave_low_optab = optab_for_tree_code (VEC_INTERLEAVE_LOW_EXPR,
4955 vectype, optab_default);
4956 if (!interleave_high_optab || !interleave_low_optab)
4958 if (vect_print_dump_info (REPORT_DETAILS))
4959 fprintf (vect_dump, "no optab for interleave.");
4963 if (optab_handler (interleave_high_optab, mode)->insn_code
        == CODE_FOR_nothing
4965 || optab_handler (interleave_low_optab, mode)->insn_code
4966 == CODE_FOR_nothing)
4968 if (vect_print_dump_info (REPORT_DETAILS))
4969 fprintf (vect_dump, "interleave op not supported by target.");
4977 /* Function vect_permute_store_chain.
4979 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
4980 a power of 2, generate interleave_high/low stmts to reorder the data
4981 correctly for the stores. Return the final references for stores in
4984 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4985 The input is 4 vectors each containing 8 elements. We assign a number to each
4986 element, the input sequence is:
4988 1st vec: 0 1 2 3 4 5 6 7
4989 2nd vec: 8 9 10 11 12 13 14 15
4990 3rd vec: 16 17 18 19 20 21 22 23
4991 4th vec: 24 25 26 27 28 29 30 31
4993 The output sequence should be:
4995 1st vec: 0 8 16 24 1 9 17 25
4996 2nd vec: 2 10 18 26 3 11 19 27
4997 3rd vec: 4 12 20 28 5 13 21 29
4998 4th vec: 6 14 22 30 7 15 23 31
5000 i.e., we interleave the contents of the four vectors in their order.
5002 We use interleave_high/low instructions to create such output. The input of
5003 each interleave_high/low operation is two vectors:

       1st vec    2nd vec
       0 1 2 3    4 5 6 7
5006 the even elements of the result vector are obtained left-to-right from the
5007 high/low elements of the first vector. The odd elements of the result are
5008 obtained left-to-right from the high/low elements of the second vector.
5009 The output of interleave_high will be: 0 4 1 5
5010 and of interleave_low: 2 6 3 7
5013 The permutation is done in log2 (LENGTH) stages. In each stage interleave_high
5014 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
5015 where the first argument is taken from the first half of DR_CHAIN and the
5016 second argument from its second half.
5019 I1: interleave_high (1st vec, 3rd vec)
5020 I2: interleave_low (1st vec, 3rd vec)
5021 I3: interleave_high (2nd vec, 4th vec)
5022 I4: interleave_low (2nd vec, 4th vec)
5024 The output for the first stage is:
5026 I1: 0 16 1 17 2 18 3 19
5027 I2: 4 20 5 21 6 22 7 23
5028 I3: 8 24 9 25 10 26 11 27
5029 I4: 12 28 13 29 14 30 15 31
5031 The output of the second stage, i.e. the final result is:
5033 I1: 0 8 16 24 1 9 17 25
5034 I2: 2 10 18 26 3 11 19 27
5035 I3: 4 12 20 28 5 13 21 29
5036 I4: 6 14 22 30 7 15 23 31. */
5039 vect_permute_store_chain (VEC(tree,heap) *dr_chain,
5040 unsigned int length,
5042 gimple_stmt_iterator *gsi,
5043 VEC(tree,heap) **result_chain)
5045 tree perm_dest, vect1, vect2, high, low;
5047 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5051 enum tree_code high_code, low_code;
5053 scalar_dest = gimple_assign_lhs (stmt);
5055 /* Check that the operation is supported. */
5056 if (!vect_strided_store_supported (vectype))
5059 *result_chain = VEC_copy (tree, heap, dr_chain);
5061 for (i = 0; i < exact_log2 (length); i++)
5063 for (j = 0; j < length/2; j++)
5065 vect1 = VEC_index (tree, dr_chain, j);
5066 vect2 = VEC_index (tree, dr_chain, j+length/2);
5068 /* Create interleaving stmt:
5069 in the case of big endian:
5070 high = interleave_high (vect1, vect2)
5071 and in the case of little endian:
5072 high = interleave_low (vect1, vect2). */
5073 perm_dest = create_tmp_var (vectype, "vect_inter_high");
5074 DECL_GIMPLE_REG_P (perm_dest) = 1;
5075 add_referenced_var (perm_dest);
5076 if (BYTES_BIG_ENDIAN)
5078 high_code = VEC_INTERLEAVE_HIGH_EXPR;
5079 low_code = VEC_INTERLEAVE_LOW_EXPR;
5083 low_code = VEC_INTERLEAVE_HIGH_EXPR;
5084 high_code = VEC_INTERLEAVE_LOW_EXPR;
5086 perm_stmt = gimple_build_assign_with_ops (high_code, perm_dest, vect1, vect2);
5088 high = make_ssa_name (perm_dest, perm_stmt);
5089 gimple_assign_set_lhs (perm_stmt, high);
5090 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5091 VEC_replace (tree, *result_chain, 2*j, high);
5093 /* Create interleaving stmt:
5094 in the case of big endian:
5095 low = interleave_low (vect1, vect2)
5096 and in the case of little endian:
5097 low = interleave_high (vect1, vect2). */
5098 perm_dest = create_tmp_var (vectype, "vect_inter_low");
5099 DECL_GIMPLE_REG_P (perm_dest) = 1;
5100 add_referenced_var (perm_dest);
5101 perm_stmt = gimple_build_assign_with_ops (low_code, perm_dest, vect1, vect2);
5103 low = make_ssa_name (perm_dest, perm_stmt);
5104 gimple_assign_set_lhs (perm_stmt, low);
5105 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5106 VEC_replace (tree, *result_chain, 2*j+1, low);
5108 dr_chain = VEC_copy (tree, heap, *result_chain);
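/* Editor's illustration (not part of the original file): the effect of one
   interleave_high/low pair on 4-element vectors, matching the 0 4 1 5 /
   2 6 3 7 example in the comment above (v4si and interleave_sketch are
   hypothetical names):

     typedef int v4si __attribute__ ((vector_size (16)));

     static void
     interleave_sketch (v4si a, v4si b, v4si *high, v4si *low)
     {
       *high = (v4si) { a[0], b[0], a[1], b[1] };  // e.g. {0,4,1,5}
       *low  = (v4si) { a[2], b[2], a[3], b[3] };  // e.g. {2,6,3,7}
     }
*/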
5114 /* Function vectorizable_store.
5116 Check if STMT defines a non-scalar data-ref (array/pointer/structure) that can be vectorized.
5118 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
5119 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
5120 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
5123 vectorizable_store (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt,
5129 tree vec_oprnd = NULL_TREE;
5130 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5131 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info), *first_dr = NULL;
5132 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5133 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5134 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5135 enum machine_mode vec_mode;
5137 enum dr_alignment_support alignment_support_scheme;
5140 enum vect_def_type dt;
5141 stmt_vec_info prev_stmt_info = NULL;
5142 tree dataref_ptr = NULL_TREE;
5143 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
5146 gimple next_stmt, first_stmt = NULL;
5147 bool strided_store = false;
5148 unsigned int group_size, i;
5149 VEC(tree,heap) *dr_chain = NULL, *oprnds = NULL, *result_chain = NULL;
5151 VEC(tree,heap) *vec_oprnds = NULL;
5152 bool slp = (slp_node != NULL);
5153 stmt_vec_info first_stmt_vinfo;
5154 unsigned int vec_num;
5156 /* Multiple types in SLP are handled by creating the appropriate number of
5157 vectorized stmts for each SLP node. Hence, NCOPIES is always 1 in case of SLP. */
5162 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
5164 gcc_assert (ncopies >= 1);
5166 /* FORNOW. This restriction should be relaxed. */
5167 if (nested_in_vect_loop_p (loop, stmt) && ncopies > 1)
5169 if (vect_print_dump_info (REPORT_DETAILS))
5170 fprintf (vect_dump, "multiple types in nested loop.");
5174 if (!STMT_VINFO_RELEVANT_P (stmt_info))
5177 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
5180 /* Is vectorizable store? */
5182 if (!is_gimple_assign (stmt))
5185 scalar_dest = gimple_assign_lhs (stmt);
5186 if (TREE_CODE (scalar_dest) != ARRAY_REF
5187 && TREE_CODE (scalar_dest) != INDIRECT_REF
5188 && !STMT_VINFO_STRIDED_ACCESS (stmt_info))
5191 gcc_assert (gimple_assign_single_p (stmt));
5192 op = gimple_assign_rhs1 (stmt);
5193 if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt))
5195 if (vect_print_dump_info (REPORT_DETAILS))
5196 fprintf (vect_dump, "use not simple.");
5200 /* The scalar rhs type needs to be trivially convertible to the vector
5201 component type. This should always be the case. */
5202 if (!useless_type_conversion_p (TREE_TYPE (vectype), TREE_TYPE (op)))
5204 if (vect_print_dump_info (REPORT_DETAILS))
5205 fprintf (vect_dump, "??? operands of different types");
5209 vec_mode = TYPE_MODE (vectype);
5210 /* FORNOW. In some cases we can vectorize even if the data-type is not supported
5211 (e.g., array initialization with 0). */
5212 if (optab_handler (mov_optab, (int)vec_mode)->insn_code == CODE_FOR_nothing)
5215 if (!STMT_VINFO_DATA_REF (stmt_info))
5218 if (STMT_VINFO_STRIDED_ACCESS (stmt_info))
5220 strided_store = true;
5221 first_stmt = DR_GROUP_FIRST_DR (stmt_info);
5222 if (!vect_strided_store_supported (vectype)
5223 && !PURE_SLP_STMT (stmt_info) && !slp)
5226 if (first_stmt == stmt)
5228 /* STMT is the leader of the group. Check the operands of all the
5229 stmts of the group. */
5230 next_stmt = DR_GROUP_NEXT_DR (stmt_info);
5233 gcc_assert (gimple_assign_single_p (next_stmt));
5234 op = gimple_assign_rhs1 (next_stmt);
5235 if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt))
5237 if (vect_print_dump_info (REPORT_DETAILS))
5238 fprintf (vect_dump, "use not simple.");
5241 next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt));
5246 if (!vec_stmt) /* transformation not required. */
5248 STMT_VINFO_TYPE (stmt_info) = store_vec_info_type;
5249 vect_model_store_cost (stmt_info, ncopies, dt, NULL);
5257 first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
5258 group_size = DR_GROUP_SIZE (vinfo_for_stmt (first_stmt));
5260 DR_GROUP_STORE_COUNT (vinfo_for_stmt (first_stmt))++;
5263 gcc_assert (!nested_in_vect_loop_p (loop, stmt));
5265 /* We vectorize all the stmts of the interleaving group when we
5266 reach the last stmt in the group. */
5267 if (DR_GROUP_STORE_COUNT (vinfo_for_stmt (first_stmt))
5268 < DR_GROUP_SIZE (vinfo_for_stmt (first_stmt))
5276 strided_store = false;
5278 /* VEC_NUM is the number of vect stmts to be created for this group. */
5280 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
5282 vec_num = group_size;
5288 group_size = vec_num = 1;
5289 first_stmt_vinfo = stmt_info;
5292 if (vect_print_dump_info (REPORT_DETAILS))
5293 fprintf (vect_dump, "transform store. ncopies = %d",ncopies);
5295 dr_chain = VEC_alloc (tree, heap, group_size);
5296 oprnds = VEC_alloc (tree, heap, group_size);
5298 alignment_support_scheme = vect_supportable_dr_alignment (first_dr);
5299 gcc_assert (alignment_support_scheme);
5300 gcc_assert (alignment_support_scheme == dr_aligned); /* FORNOW */
5302 /* In case the vectorization factor (VF) is bigger than the number
5303 of elements that we can fit in a vectype (nunits), we have to generate
5304 more than one vector stmt - i.e., we need to "unroll" the
5305 vector stmt by a factor VF/nunits. For more details see documentation in
5306 vect_get_vec_def_for_copy_stmt. */
5308 /* In case of interleaving (non-unit strided access):

        S1:  &base + 2 = x2
        S2:  &base = x0
        S3:  &base + 1 = x1
        S4:  &base + 3 = x3

5315 We create vectorized stores starting from base address (the access of the
5316 first stmt in the chain (S2 in the above example), when the last store stmt
5317 of the chain (S4) is reached:

        VS1: &base = vx2
5320 VS2: &base + vec_size*1 = vx0
5321 VS3: &base + vec_size*2 = vx1
5322 VS4: &base + vec_size*3 = vx3
5324 Then permutation statements are generated:
5326 VS5: vx5 = VEC_INTERLEAVE_HIGH_EXPR < vx0, vx3 >
5327 VS6: vx6 = VEC_INTERLEAVE_LOW_EXPR < vx0, vx3 >
5330 And they are put in STMT_VINFO_VEC_STMT of the corresponding scalar stmts
5331 (the order of the data-refs in the output of vect_permute_store_chain
5332 corresponds to the order of scalar stmts in the interleaving chain - see
5333 the documentation of vect_permute_store_chain()).
5335 In case of both multiple types and interleaving, above vector stores and
5336 permutation stmts are created for every copy. The result vector stmts are
5337 put in STMT_VINFO_VEC_STMT for the first copy and in the corresponding
5338 STMT_VINFO_RELATED_STMT for the next copies.
5341 prev_stmt_info = NULL;
5342 for (j = 0; j < ncopies; j++)
5351 /* Get vectorized arguments for SLP_NODE. */
5352 vect_get_slp_defs (slp_node, &vec_oprnds, NULL);
5354 vec_oprnd = VEC_index (tree, vec_oprnds, 0);
5358 /* For interleaved stores we collect vectorized defs for all the
5359 stores in the group in DR_CHAIN and OPRNDS. DR_CHAIN is then
5360 used as an input to vect_permute_store_chain(), and OPRNDS as
5361 an input to vect_get_vec_def_for_stmt_copy() for the next copy.
5363 If the store is not strided, GROUP_SIZE is 1, and DR_CHAIN and
5364 OPRNDS are of size 1. */
5365 next_stmt = first_stmt;
5366 for (i = 0; i < group_size; i++)
5368 /* Since gaps are not supported for interleaved stores,
5369 GROUP_SIZE is the exact number of stmts in the chain.
5370 Therefore, NEXT_STMT can't be NULL_TREE. In case
5371 there is no interleaving, GROUP_SIZE is 1, and only one
5372 iteration of the loop will be executed. */
5373 gcc_assert (next_stmt
5374 && gimple_assign_single_p (next_stmt));
5375 op = gimple_assign_rhs1 (next_stmt);
5377 vec_oprnd = vect_get_vec_def_for_operand (op, next_stmt, NULL);
5379 VEC_quick_push (tree, dr_chain, vec_oprnd);
5380 VEC_quick_push (tree, oprnds, vec_oprnd);
5381 next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt));
5385 /* We should have caught mismatched types earlier. */
5386 gcc_assert (useless_type_conversion_p (vectype,
5387 TREE_TYPE (vec_oprnd)));
5388 dataref_ptr = vect_create_data_ref_ptr (first_stmt, NULL, NULL_TREE,
5389 &dummy, &ptr_incr, false, &inv_p, NULL_TREE);
5391 gcc_assert (!inv_p);
5395 /* For interleaved stores we created vectorized defs for all the
5396 defs stored in OPRNDS in the previous iteration (previous copy).
5397 DR_CHAIN is then used as an input to vect_permute_store_chain(),
5398 and OPRNDS as an input to vect_get_vec_def_for_stmt_copy() for the
5400 If the store is not strided, GROUP_SIZE is 1, and DR_CHAIN and
5401 OPRNDS are of size 1. */
5402 for (i = 0; i < group_size; i++)
5404 op = VEC_index (tree, oprnds, i);
5405 vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt);
5406 vec_oprnd = vect_get_vec_def_for_stmt_copy (dt, op);
5407 VEC_replace (tree, dr_chain, i, vec_oprnd);
5408 VEC_replace (tree, oprnds, i, vec_oprnd);
5411 dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, gsi, stmt, NULL_TREE);
5416 result_chain = VEC_alloc (tree, heap, group_size);
5418 if (!vect_permute_store_chain (dr_chain, group_size, stmt, gsi, &result_chain))
       return false;
5423 next_stmt = first_stmt;
5424 for (i = 0; i < vec_num; i++)
5427 /* Bump the vector pointer. */
5428 dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, gsi, stmt, NULL_TREE);
5432 vec_oprnd = VEC_index (tree, vec_oprnds, i);
5433 else if (strided_store)
5434 /* For strided stores vectorized defs are interleaved in
5435 vect_permute_store_chain(). */
5436 vec_oprnd = VEC_index (tree, result_chain, i);
5438 data_ref = build_fold_indirect_ref (dataref_ptr);
5440 /* Arguments are ready. Create the new vector stmt. */
5441 new_stmt = gimple_build_assign (data_ref, vec_oprnd);
5442 vect_finish_stmt_generation (stmt, new_stmt, gsi);
5443 mark_symbols_for_renaming (new_stmt);
5449 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
5451 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
5453 prev_stmt_info = vinfo_for_stmt (new_stmt);
5454 next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt));
5460 VEC_free (tree, heap, dr_chain);
5461 VEC_free (tree, heap, oprnds);
5463 VEC_free (tree, heap, result_chain);
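/* Editor's illustration (not part of the original file): for a group of
   two interleaved stores (out[2*i] = a[i]; out[2*i+1] = b[i];) with
   4-element vectors, the transformation above amounts to interleaving the
   two defs and issuing two adjacent vector stores.  Sketch in GNU C
   (v4si and store_pair_sketch are hypothetical names; assumes aligned OUT):

     typedef int v4si __attribute__ ((vector_size (16)));

     static void
     store_pair_sketch (int *out, v4si va, v4si vb)   // out = &base
     {
       v4si hi = (v4si) { va[0], vb[0], va[1], vb[1] };
       v4si lo = (v4si) { va[2], vb[2], va[3], vb[3] };
       *(v4si *) out       = hi;   // &base + vec_size*0
       *(v4si *) (out + 4) = lo;   // &base + vec_size*1
     }
*/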
5469 /* Function vect_setup_realignment
5471 This function is called when vectorizing an unaligned load using
5472 the dr_explicit_realign[_optimized] scheme.
5473 This function generates the following code at the loop prolog:
5476 x msq_init = *(floor(p)); # prolog load
5477 realignment_token = call target_builtin;
5479 x msq = phi (msq_init, ---)
5481 The stmts marked with x are generated only for the case of
5482 dr_explicit_realign_optimized.
5484 The code above sets up a new (vector) pointer, pointing to the first
5485 location accessed by STMT, and a "floor-aligned" load using that pointer.
5486 It also generates code to compute the "realignment-token" (if the relevant
5487 target hook was defined), and creates a phi-node at the loop-header bb
5488 whose arguments are the result of the prolog-load (created by this
5489 function) and the result of a load that takes place in the loop (to be
5490 created by the caller to this function).
5492 For the case of dr_explicit_realign_optimized:
5493 The caller to this function uses the phi-result (msq) to create the
5494 realignment code inside the loop, and sets up the missing phi argument,
5497 msq = phi (msq_init, lsq)
5498 lsq = *(floor(p')); # load in loop
5499 result = realign_load (msq, lsq, realignment_token);
5501 For the case of dr_explicit_realign:
5503 msq = *(floor(p)); # load in loop
       p' = p + (VS-1);
5505 lsq = *(floor(p')); # load in loop
5506 result = realign_load (msq, lsq, realignment_token);
5509 STMT - (scalar) load stmt to be vectorized. This load accesses
5510 a memory location that may be unaligned.
5511 BSI - place where new code is to be inserted.
5512 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes is used.
5516 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
5517 target hook, if defined.
5518 Return value - the result of the loop-header phi node. */
5521 vect_setup_realignment (gimple stmt, gimple_stmt_iterator *gsi,
5522 tree *realignment_token,
5523 enum dr_alignment_support alignment_support_scheme,
5525 struct loop **at_loop)
5527 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5528 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5529 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5530 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5532 tree scalar_dest = gimple_assign_lhs (stmt);
5539 tree msq_init = NULL_TREE;
5542 tree msq = NULL_TREE;
5543 gimple_seq stmts = NULL;
5545 bool compute_in_loop = false;
5546 bool nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
5547 struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
5548 struct loop *loop_for_initial_load;
5550 gcc_assert (alignment_support_scheme == dr_explicit_realign
5551 || alignment_support_scheme == dr_explicit_realign_optimized);
5553 /* We need to generate three things:
5554 1. the misalignment computation
5555 2. the extra vector load (for the optimized realignment scheme).
5556 3. the phi node for the two vectors from which the realignment is
5557 done (for the optimized realignment scheme).
5560 /* 1. Determine where to generate the misalignment computation.
5562 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
5563 calculation will be generated by this function, outside the loop (in the
5564 preheader). Otherwise, INIT_ADDR had already been computed for us by the
5565 caller, inside the loop.
5567 Background: If the misalignment remains fixed throughout the iterations of
5568 the loop, then both realignment schemes are applicable, and also the
5569 misalignment computation can be done outside LOOP. This is because we are
5570 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
5571 are a multiple of VS (the Vector Size), and therefore the misalignment in
5572 different vectorized LOOP iterations is always the same.
5573 The problem arises only if the memory access is in an inner-loop nested
5574 inside LOOP, which is now being vectorized using outer-loop vectorization.
5575 This is the only case when the misalignment of the memory access may not
5576 remain fixed throughout the iterations of the inner-loop (as explained in
5577 detail in vect_supportable_dr_alignment). In this case, not only is the
5578 optimized realignment scheme not applicable, but also the misalignment
5579 computation (and generation of the realignment token that is passed to
5580 REALIGN_LOAD) have to be done inside the loop.
5582 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
5583 or not, which in turn determines if the misalignment is computed inside
5584 the inner-loop, or outside LOOP. */
5586 if (init_addr != NULL_TREE)
5588 compute_in_loop = true;
5589 gcc_assert (alignment_support_scheme == dr_explicit_realign);
5593 /* 2. Determine where to generate the extra vector load.
5595 For the optimized realignment scheme, instead of generating two vector
5596 loads in each iteration, we generate a single extra vector load in the
5597 preheader of the loop, and in each iteration reuse the result of the
5598 vector load from the previous iteration. In case the memory access is in
5599 an inner-loop nested inside LOOP, which is now being vectorized using
5600 outer-loop vectorization, we need to determine whether this initial vector
5601 load should be generated at the preheader of the inner-loop, or can be
5602 generated at the preheader of LOOP. If the memory access has no evolution
5603 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
5604 to be generated inside LOOP (in the preheader of the inner-loop). */
5606 if (nested_in_vect_loop)
5608 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
5609 bool invariant_in_outerloop =
5610 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
5611 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
5614 loop_for_initial_load = loop;
5616 *at_loop = loop_for_initial_load;
5618 /* 3. For the case of the optimized realignment, create the first vector
5619 load at the loop preheader. */
5621 if (alignment_support_scheme == dr_explicit_realign_optimized)
5623 /* Create msq_init = *(floor(p1)) in the loop preheader */
5625 gcc_assert (!compute_in_loop);
5626 pe = loop_preheader_edge (loop_for_initial_load);
5627 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5628 ptr = vect_create_data_ref_ptr (stmt, loop_for_initial_load, NULL_TREE,
5629 &init_addr, &inc, true, &inv_p, NULL_TREE);
5630 data_ref = build1 (ALIGN_INDIRECT_REF, vectype, ptr);
5631 new_stmt = gimple_build_assign (vec_dest, data_ref);
5632 new_temp = make_ssa_name (vec_dest, new_stmt);
5633 gimple_assign_set_lhs (new_stmt, new_temp);
5634 mark_symbols_for_renaming (new_stmt);
5635 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5636 gcc_assert (!new_bb);
5637 msq_init = gimple_assign_lhs (new_stmt);
5640 /* 4. Create realignment token using a target builtin, if available.
5641 It is done either inside the containing loop, or before LOOP (as
5642 determined above). */
5644 if (targetm.vectorize.builtin_mask_for_load)
5648 /* Compute INIT_ADDR - the initial address accessed by this memref. */
5649 if (compute_in_loop)
5650 gcc_assert (init_addr); /* already computed by the caller. */
5653 /* Generate the INIT_ADDR computation outside LOOP. */
5654 init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts, NULL_TREE, loop);
5656 pe = loop_preheader_edge (loop);
5657 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
5658 gcc_assert (!new_bb);
5661 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5662 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
5664 vec_dest = vect_create_destination_var (scalar_dest,
5665 gimple_call_return_type (new_stmt));
5666 new_temp = make_ssa_name (vec_dest, new_stmt);
5667 gimple_call_set_lhs (new_stmt, new_temp);
5669 if (compute_in_loop)
5670 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5673 /* Generate the misalignment computation outside LOOP. */
5674 pe = loop_preheader_edge (loop);
5675 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5676 gcc_assert (!new_bb);
5679 *realignment_token = gimple_call_lhs (new_stmt);
5681 /* The result of the CALL_EXPR to this builtin is determined from
5682 the value of the parameter and no global variables are touched
5683 which makes the builtin a "const" function. Requiring the
5684 builtin to have the "const" attribute makes it unnecessary
5685 to call mark_call_clobbered. */
5686 gcc_assert (TREE_READONLY (builtin_decl));
5689 if (alignment_support_scheme == dr_explicit_realign)
       return msq;
5692 gcc_assert (!compute_in_loop);
5693 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5696 /* 5. Create msq = phi <msq_init, lsq> in loop */
5698 pe = loop_preheader_edge (containing_loop);
5699 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5700 msq = make_ssa_name (vec_dest, NULL);
5701 phi_stmt = create_phi_node (msq, containing_loop->header);
5702 SSA_NAME_DEF_STMT (msq) = phi_stmt;
5703 add_phi_arg (phi_stmt, msq_init, pe);
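/* Editor's illustration (not part of the original file): conceptually,
   realign_load concatenates the two floor-aligned loads and extracts VS
   elements starting at the misalignment offset encoded by the token.  A
   lane-level model for 4-element int vectors (v4si, realign_sketch and the
   plain integer OFS argument are simplifications of the target-specific
   realignment token):

     typedef int v4si __attribute__ ((vector_size (16)));

     static v4si
     realign_sketch (v4si msq, v4si lsq, int ofs)   // 0 <= ofs < 4
     {
       v4si r;
       int k;
       for (k = 0; k < 4; k++)
         r[k] = (k + ofs < 4) ? msq[k + ofs] : lsq[k + ofs - 4];
       return r;
     }
*/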
5709 /* Function vect_strided_load_supported.
5711 Returns TRUE if EXTRACT_EVEN and EXTRACT_ODD operations are supported,
5712 and FALSE otherwise. */
5715 vect_strided_load_supported (tree vectype)
5717 optab perm_even_optab, perm_odd_optab;
5720 mode = (int) TYPE_MODE (vectype);
5722 perm_even_optab = optab_for_tree_code (VEC_EXTRACT_EVEN_EXPR, vectype,
5724 if (!perm_even_optab)
5726 if (vect_print_dump_info (REPORT_DETAILS))
5727 fprintf (vect_dump, "no optab for perm_even.");
5731 if (optab_handler (perm_even_optab, mode)->insn_code == CODE_FOR_nothing)
5733 if (vect_print_dump_info (REPORT_DETAILS))
5734 fprintf (vect_dump, "perm_even op not supported by target.");
5738 perm_odd_optab = optab_for_tree_code (VEC_EXTRACT_ODD_EXPR, vectype,
5740 if (!perm_odd_optab)
5742 if (vect_print_dump_info (REPORT_DETAILS))
5743 fprintf (vect_dump, "no optab for perm_odd.");
5747 if (optab_handler (perm_odd_optab, mode)->insn_code == CODE_FOR_nothing)
5749 if (vect_print_dump_info (REPORT_DETAILS))
5750 fprintf (vect_dump, "perm_odd op not supported by target.");
5757 /* Function vect_permute_load_chain.
5759 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5760 a power of 2, generate extract_even/odd stmts to reorder the input data
5761 correctly. Return the final references for loads in RESULT_CHAIN.
5763 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5764 The input is 4 vectors each containing 8 elements. We assign a number to
5765 each element; the input sequence is:
5767 1st vec: 0 1 2 3 4 5 6 7
5768 2nd vec: 8 9 10 11 12 13 14 15
5769 3rd vec: 16 17 18 19 20 21 22 23
5770 4th vec: 24 25 26 27 28 29 30 31
5772 The output sequence should be:
5774 1st vec: 0 4 8 12 16 20 24 28
5775 2nd vec: 1 5 9 13 17 21 25 29
5776 3rd vec: 2 6 10 14 18 22 26 30
5777 4th vec: 3 7 11 15 19 23 27 31
5779 i.e., the first output vector should contain the first elements of each
5780 interleaving group, etc.
5782 We use extract_even/odd instructions to create such output. The input of each
5783 extract_even/odd operation is two vectors
5787 and the output is the vector of extracted even/odd elements. The output of
5788 extract_even will be: 0 2 4 6
5789 and of extract_odd: 1 3 5 7
5792 The permutation is done in log2 (LENGTH) stages. In each stage extract_even and
5793 extract_odd stmts are created for each pair of vectors in DR_CHAIN in their
5794 order. In our example,
5796 E1: extract_even (1st vec, 2nd vec)
5797 E2: extract_odd (1st vec, 2nd vec)
5798 E3: extract_even (3rd vec, 4th vec)
5799 E4: extract_odd (3rd vec, 4th vec)
5801 The output for the first stage will be:
5803 E1: 0 2 4 6 8 10 12 14
5804 E2: 1 3 5 7 9 11 13 15
5805 E3: 16 18 20 22 24 26 28 30
5806 E4: 17 19 21 23 25 27 29 31
5808 In order to proceed and create the correct sequence for the next stage (or
5809 for the correct output, if the second stage is the last one, as in our
5810 example), we first put the output of extract_even operation and then the
5811 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5812 The input for the second stage is:
5814 1st vec (E1): 0 2 4 6 8 10 12 14
5815 2nd vec (E3): 16 18 20 22 24 26 28 30
5816 3rd vec (E2): 1 3 5 7 9 11 13 15
5817 4th vec (E4): 17 19 21 23 25 27 29 31
5819 The output of the second stage:
5821 E1: 0 4 8 12 16 20 24 28
5822 E2: 2 6 10 14 18 22 26 30
5823 E3: 1 5 9 13 17 21 25 29
5824 E4: 3 7 11 15 19 23 27 31
5826 And RESULT_CHAIN after reordering:
5828 1st vec (E1): 0 4 8 12 16 20 24 28
5829 2nd vec (E3): 1 5 9 13 17 21 25 29
5830 3rd vec (E2): 2 6 10 14 18 22 26 30
5831 4th vec (E4): 3 7 11 15 19 23 27 31. */
5834 vect_permute_load_chain (VEC(tree,heap) *dr_chain,
5835 unsigned int length,
5837 gimple_stmt_iterator *gsi,
5838 VEC(tree,heap) **result_chain)
5840 tree perm_dest, data_ref, first_vect, second_vect;
5842 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5846 /* Check that the operation is supported. */
5847 if (!vect_strided_load_supported (vectype))
5850 *result_chain = VEC_copy (tree, heap, dr_chain);
5851 for (i = 0; i < exact_log2 (length); i++)
5853 for (j = 0; j < length; j += 2)
5855 first_vect = VEC_index (tree, dr_chain, j);
5856 second_vect = VEC_index (tree, dr_chain, j+1);
5858 /* data_ref = permute_even (first_data_ref, second_data_ref); */
5859 perm_dest = create_tmp_var (vectype, "vect_perm_even");
5860 DECL_GIMPLE_REG_P (perm_dest) = 1;
5861 add_referenced_var (perm_dest);
5863 perm_stmt = gimple_build_assign_with_ops (VEC_EXTRACT_EVEN_EXPR,
5864 perm_dest, first_vect,
5867 data_ref = make_ssa_name (perm_dest, perm_stmt);
5868 gimple_assign_set_lhs (perm_stmt, data_ref);
5869 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5870 mark_symbols_for_renaming (perm_stmt);
5872 VEC_replace (tree, *result_chain, j/2, data_ref);
5874 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
5875 perm_dest = create_tmp_var (vectype, "vect_perm_odd");
5876 DECL_GIMPLE_REG_P (perm_dest) = 1;
5877 add_referenced_var (perm_dest);
5879 perm_stmt = gimple_build_assign_with_ops (VEC_EXTRACT_ODD_EXPR,
5880 perm_dest, first_vect,
5882 data_ref = make_ssa_name (perm_dest, perm_stmt);
5883 gimple_assign_set_lhs (perm_stmt, data_ref);
5884 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5885 mark_symbols_for_renaming (perm_stmt);
5887 VEC_replace (tree, *result_chain, j/2+length/2, data_ref);
5889 dr_chain = VEC_copy (tree, heap, *result_chain);
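/* The following stand-alone sketch (illustrative only, kept under "#if 0"
   so it is never compiled; all names in it are invented) models the
   extract_even/odd reordering documented above on plain integer arrays,
   and reproduces the stage-by-stage RESULT_CHAIN contents shown in the
   comment for vect_permute_load_chain.  */
#if 0
#include <stdio.h>
#include <string.h>

#define NUNITS 8		/* elements per vector */
#define LENGTH 4		/* vectors in the chain; a power of two */

/* Model of one extract_even/odd pair: EVEN and ODD receive the even-
   and odd-indexed elements of the concatenation of A and B.  */
static void
extract_even_odd (const int *a, const int *b, int *even, int *odd)
{
  int k;
  for (k = 0; k < NUNITS; k++)
    {
      int e = 2 * k, o = 2 * k + 1;
      even[k] = e < NUNITS ? a[e] : b[e - NUNITS];
      odd[k] = o < NUNITS ? a[o] : b[o - NUNITS];
    }
}

int
main (void)
{
  int chain[LENGTH][NUNITS], result[LENGTH][NUNITS];
  int stages = 0, s, j, k;

  for (j = 0; j < LENGTH; j++)	/* elements numbered 0..31 as above */
    for (k = 0; k < NUNITS; k++)
      chain[j][k] = j * NUNITS + k;

  for (j = LENGTH; j > 1; j >>= 1)	/* log2 (LENGTH) stages */
    stages++;

  for (s = 0; s < stages; s++)
    {
      /* Evens go to the first half of RESULT, odds to the second.  */
      for (j = 0; j < LENGTH; j += 2)
	extract_even_odd (chain[j], chain[j + 1],
			  result[j / 2], result[j / 2 + LENGTH / 2]);
      memcpy (chain, result, sizeof (chain));
    }

  for (j = 0; j < LENGTH; j++)	/* prints 0 4 8 ... 28, then 1 5 9 ... etc. */
    {
      for (k = 0; k < NUNITS; k++)
	printf ("%d ", chain[j][k]);
      printf ("\n");
    }
  return 0;
}
#endif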
5895 /* Function vect_transform_strided_load.
5897 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
5898 to perform their permutation and assign the resulting vectorized statements to
5899 the scalar statements.
5903 vect_transform_strided_load (gimple stmt, VEC(tree,heap) *dr_chain, int size,
5904 gimple_stmt_iterator *gsi)
5906 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5907 gimple first_stmt = DR_GROUP_FIRST_DR (stmt_info);
5908 gimple next_stmt, new_stmt;
5909 VEC(tree,heap) *result_chain = NULL;
5910 unsigned int i, gap_count;
5913 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
5914 RESULT_CHAIN is the output of vect_permute_load_chain; it contains permuted
5915 vectors that are ready for vector computation. */
5916 result_chain = VEC_alloc (tree, heap, size);
5918 if (!vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain))
5921 /* Put a permuted data-ref in the VECTORIZED_STMT field.
5922 Since we scan the chain starting from its first node, their order
5923 corresponds to the order of data-refs in RESULT_CHAIN. */
5924 next_stmt = first_stmt;
5926 for (i = 0; VEC_iterate (tree, result_chain, i, tmp_data_ref); i++)
5931 /* Skip the gaps. Loads created for the gaps will be removed later by the
5932 dead code elimination pass. No need to check for the first stmt in
5933 the group, since it always exists.
5934 DR_GROUP_GAP is the number of steps in elements from the previous
5935 access (if there is no gap DR_GROUP_GAP is 1). We skip loads that
5936 correspond to the gaps.
5938 if (next_stmt != first_stmt
5939 && gap_count < DR_GROUP_GAP (vinfo_for_stmt (next_stmt)))
5947 new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
5948 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
5949 copies, and we put the new vector statement in the first available
5950 RELATED_STMT field. */
5951 if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
5952 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
5955 if (!DR_GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5958 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
5960 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
5963 prev_stmt = rel_stmt;
5965 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
5968 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) =
5973 next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt));
5975 /* If NEXT_STMT accesses the same DR as the previous statement,
5976 put the same TMP_DATA_REF as its vectorized statement; otherwise
5977 get the next data-ref from RESULT_CHAIN. */
5978 if (!next_stmt || !DR_GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5983 VEC_free (tree, heap, result_chain);
5988 /* Create NCOPIES permutation statements using the mask MASK_BYTES (by
5989 building a vector of type MASK_TYPE from it) and two input vectors placed in
5990 DR_CHAIN at FIRST_VEC_INDX and SECOND_VEC_INDX for the first copy and
5991 shifting by STRIDE elements of DR_CHAIN for every copy.
5992 (STRIDE is the number of vectorized stmts for NODE divided by the number of
5993 copies).
5994 VECT_STMTS_COUNTER specifies the index in the vectorized stmts of NODE, where
5995 the created stmts must be inserted. */
5998 vect_create_mask_and_perm (gimple stmt, gimple next_scalar_stmt,
5999 int *mask_array, int mask_nunits,
6000 tree mask_element_type, tree mask_type,
6001 int first_vec_indx, int second_vec_indx,
6002 gimple_stmt_iterator *gsi, slp_tree node,
6003 tree builtin_decl, tree vectype,
6004 VEC(tree,heap) *dr_chain,
6005 int ncopies, int vect_stmts_counter)
6007 tree t = NULL_TREE, mask_vec, mask, perm_dest;
6008 gimple perm_stmt = NULL;
6009 stmt_vec_info next_stmt_info;
6010 int i, group_size, stride, dr_chain_size;
6011 tree first_vec, second_vec, data_ref;
6014 VEC (tree, heap) *params = NULL;
6016 /* Create a vector mask. */
6017 for (i = mask_nunits - 1; i >= 0; --i)
6018 t = tree_cons (NULL_TREE, build_int_cst (mask_element_type, mask_array[i]),
6020 mask_vec = build_vector (mask_type, t);
6021 mask = vect_init_vector (stmt, mask_vec, mask_type, NULL);
6023 group_size = VEC_length (gimple, SLP_TREE_SCALAR_STMTS (node));
6024 stride = SLP_TREE_NUMBER_OF_VEC_STMTS (node) / ncopies;
6025 dr_chain_size = VEC_length (tree, dr_chain);
6027 /* Initialize the vect stmts of NODE to properly insert the generated
6028 stmts later. */
6029 for (i = VEC_length (gimple, SLP_TREE_VEC_STMTS (node));
6030 i < (int) SLP_TREE_NUMBER_OF_VEC_STMTS (node); i++)
6031 VEC_quick_push (gimple, SLP_TREE_VEC_STMTS (node), NULL);
6033 perm_dest = vect_create_destination_var (gimple_assign_lhs (stmt), vectype);
6034 for (i = 0; i < ncopies; i++)
6036 first_vec = VEC_index (tree, dr_chain, first_vec_indx);
6037 second_vec = VEC_index (tree, dr_chain, second_vec_indx);
6039 /* Build argument list for the vectorized call. */
6040 VEC_free (tree, heap, params);
6041 params = VEC_alloc (tree, heap, 3);
6042 VEC_quick_push (tree, params, first_vec);
6043 VEC_quick_push (tree, params, second_vec);
6044 VEC_quick_push (tree, params, mask);
6046 /* Generate the permute statement. */
6047 perm_stmt = gimple_build_call_vec (builtin_decl, params);
6048 data_ref = make_ssa_name (perm_dest, perm_stmt);
6049 gimple_call_set_lhs (perm_stmt, data_ref);
6050 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6051 FOR_EACH_SSA_TREE_OPERAND (sym, perm_stmt, iter, SSA_OP_ALL_VIRTUALS)
6053 if (TREE_CODE (sym) == SSA_NAME)
6054 sym = SSA_NAME_VAR (sym);
6055 mark_sym_for_renaming (sym);
6058 /* Store the vector statement in NODE. */
6059 VEC_replace (gimple, SLP_TREE_VEC_STMTS (node),
6060 stride * i + vect_stmts_counter, perm_stmt);
6062 first_vec_indx += stride;
6063 second_vec_indx += stride;
6066 /* Mark the scalar stmt as vectorized. */
6067 next_stmt_info = vinfo_for_stmt (next_scalar_stmt);
6068 STMT_VINFO_VEC_STMT (next_stmt_info) = perm_stmt;
6072 /* Given FIRST_MASK_ELEMENT - the mask element in element representation,
6073 return in CURRENT_MASK_ELEMENT its equivalent in target specific
6074 representation. Check that the mask is valid and return FALSE if not.
6075 Return TRUE in NEED_NEXT_VECTOR if the permutation requires moving to
6076 the next vector, i.e., the current first vector is not needed. */
6079 vect_get_mask_element (gimple stmt, int first_mask_element, int m,
6080 int mask_nunits, bool only_one_vec, int index,
6081 int *mask, int *current_mask_element,
6082 bool *need_next_vector)
6085 static int number_of_mask_fixes = 1;
6086 static bool mask_fixed = false;
6087 static bool needs_first_vector = false;
6089 /* Convert to target specific representation. */
6090 *current_mask_element = first_mask_element + m;
6091 /* Adjust the value in case it's a mask for the second and third vectors. */
6092 *current_mask_element -= mask_nunits * (number_of_mask_fixes - 1);
6094 if (*current_mask_element < mask_nunits)
6095 needs_first_vector = true;
6097 /* We have only one input vector to permute but the mask accesses values in
6098 the next vector as well. */
6099 if (only_one_vec && *current_mask_element >= mask_nunits)
6101 if (vect_print_dump_info (REPORT_DETAILS))
6103 fprintf (vect_dump, "permutation requires at least two vectors ");
6104 print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
6110 /* The mask requires the next vector. */
6111 if (*current_mask_element >= mask_nunits * 2)
6113 if (needs_first_vector || mask_fixed)
6115 /* We either need the first vector too or have already moved to the
6116 next vector. In both cases, this permutation needs three
6117 vectors. */
6118 if (vect_print_dump_info (REPORT_DETAILS))
6120 fprintf (vect_dump, "permutation requires at "
6121 "least three vectors ");
6122 print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
6128 /* We move to the next vector, dropping the first one and working with
6129 the second and the third - we need to adjust the values of the mask
6130 accordingly. */
6131 *current_mask_element -= mask_nunits * number_of_mask_fixes;
6133 for (i = 0; i < index; i++)
6134 mask[i] -= mask_nunits * number_of_mask_fixes;
6136 (number_of_mask_fixes)++;
6140 *need_next_vector = mask_fixed;
6142 /* This was the last element of this mask. Start a new one. */
6143 if (index == mask_nunits - 1)
6145 number_of_mask_fixes = 1;
6147 needs_first_vector = false;
6154 /* Generate vector permute statements from a list of loads in DR_CHAIN.
6155 If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
6156 permute statements for SLP_NODE_INSTANCE. */
6158 vect_transform_slp_perm_load (gimple stmt, VEC (tree, heap) *dr_chain,
6159 gimple_stmt_iterator *gsi, int vf,
6160 slp_instance slp_node_instance, bool analyze_only)
6162 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6163 tree mask_element_type = NULL_TREE, mask_type;
6164 int i, j, k, m, scale, mask_nunits, nunits, vec_index = 0, scalar_index;
6166 tree vectype = STMT_VINFO_VECTYPE (stmt_info), builtin_decl;
6167 gimple next_scalar_stmt;
6168 int group_size = SLP_INSTANCE_GROUP_SIZE (slp_node_instance);
6169 int first_mask_element;
6170 int index, unroll_factor, *mask, current_mask_element, ncopies;
6171 bool only_one_vec = false, need_next_vector = false;
6172 int first_vec_index, second_vec_index, orig_vec_stmts_num, vect_stmts_counter;
6174 if (!targetm.vectorize.builtin_vec_perm)
6176 if (vect_print_dump_info (REPORT_DETAILS))
6178 fprintf (vect_dump, "no builtin for vect permute for ");
6179 print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
6185 builtin_decl = targetm.vectorize.builtin_vec_perm (vectype,
6186 &mask_element_type);
6187 if (!builtin_decl || !mask_element_type)
6189 if (vect_print_dump_info (REPORT_DETAILS))
6191 fprintf (vect_dump, "no builtin for vect permute for ");
6192 print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
6198 mask_type = get_vectype_for_scalar_type (mask_element_type);
6199 mask_nunits = TYPE_VECTOR_SUBPARTS (mask_type);
6200 mask = (int *) xmalloc (sizeof (int) * mask_nunits);
6201 nunits = TYPE_VECTOR_SUBPARTS (vectype);
6202 scale = mask_nunits / nunits;
6203 unroll_factor = SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance);
6205 /* The number of vector stmts to generate based only on SLP_NODE_INSTANCE
6206 unrolling factor. */
6207 orig_vec_stmts_num = group_size *
6208 SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance) / nunits;
6209 if (orig_vec_stmts_num == 1)
6210 only_one_vec = true;
6212 /* The number of copies is determined by the final vectorization factor
6213 relative to the SLP_NODE_INSTANCE unrolling factor. */
6214 ncopies = vf / SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance);
6216 /* Generate permutation masks for every NODE. Number of masks for each NODE
6217 is equal to GROUP_SIZE.
6218 E.g., we have a group of three nodes with three loads from the same
6219 location in each node, and the vector size is 4, i.e., we have an
6220 a0b0c0a1b1c1... sequence and we need to create the following vectors:
6221 for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
6222 for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
6225 The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9} (in target
6226 specific type, e.g., in bytes for AltiVec).
6227 The last mask is illegal since we assume two operands for the permute
6228 operation, and the mask element values can't be outside that range. Hence,
6229 the last mask must be converted into {2,5,5,5}.
6230 For the first two permutations we need the first and the second input
6231 vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
6232 we need the second and the third vectors: {b1,c1,a2,b2} and
6233 {c2,a3,b3,c3}. */
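/* A stand-alone sketch (illustrative only, under "#if 0"; the helper below
   is invented and is a simplification of the stateful logic in
   vect_get_mask_element) of the rebasing shown above: each mask element
   must index into the current pair of input vectors, i.e. lie in
   [0, 2*nunits); an overflowing mask is shifted down by nunits and the
   window of input vectors advances.  */
#if 0
#include <stdio.h>

#define NUNITS 4

static void
rebase_mask (int *mask, int *first_vec, int *second_vec)
{
  int i, max = 0;
  for (i = 0; i < NUNITS; i++)
    if (mask[i] > max)
      max = mask[i];
  if (max >= 2 * NUNITS)	/* mask reaches past the second vector */
    {
      for (i = 0; i < NUNITS; i++)
	mask[i] -= NUNITS;
      (*first_vec)++;		/* drop the first vector, slide the window */
      (*second_vec)++;
    }
}

int
main (void)
{
  /* The masks for the a's from the example above; the last one is out of
     range and must become {2,5,5,5} on vectors 1 and 2.  */
  int masks[3][NUNITS] = { {0, 0, 0, 3}, {3, 3, 6, 6}, {6, 9, 9, 9} };
  int m, i, first = 0, second = 1;

  for (m = 0; m < 3; m++)
    {
      rebase_mask (masks[m], &first, &second);
      printf ("vectors (%d,%d) mask:", first, second);
      for (i = 0; i < NUNITS; i++)
	printf (" %d", masks[m][i]);
      printf ("\n");		/* last line: vectors (1,2) mask: 2 5 5 5 */
    }
  return 0;
}
#endif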
6236 VEC_iterate (slp_tree, SLP_INSTANCE_LOADS (slp_node_instance),
6242 vect_stmts_counter = 0;
6244 first_vec_index = vec_index++;
6246 second_vec_index = first_vec_index;
6248 second_vec_index = vec_index++;
6250 for (j = 0; j < unroll_factor; j++)
6252 for (k = 0; k < group_size; k++)
6254 first_mask_element = (i + j * group_size) * scale;
6255 for (m = 0; m < scale; m++)
6257 if (!vect_get_mask_element (stmt, first_mask_element, m,
6258 mask_nunits, only_one_vec, index, mask,
6259 &current_mask_element, &need_next_vector))
6262 mask[index++] = current_mask_element;
6265 if (index == mask_nunits)
6270 if (need_next_vector)
6272 first_vec_index = second_vec_index;
6273 second_vec_index = vec_index;
6276 next_scalar_stmt = VEC_index (gimple,
6277 SLP_TREE_SCALAR_STMTS (node), scalar_index++);
6279 vect_create_mask_and_perm (stmt, next_scalar_stmt,
6280 mask, mask_nunits, mask_element_type, mask_type,
6281 first_vec_index, second_vec_index, gsi, node,
6282 builtin_decl, vectype, dr_chain, ncopies,
6283 vect_stmts_counter++);
6294 /* vectorizable_load.
6296 Check if STMT reads a non-scalar data-ref (array/pointer/structure) that
6297 can be vectorized.
6298 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
6299 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
6300 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
6303 vectorizable_load (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt,
6304 slp_tree slp_node, slp_instance slp_node_instance)
6307 tree vec_dest = NULL;
6308 tree data_ref = NULL;
6309 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6310 stmt_vec_info prev_stmt_info;
6311 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6312 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6313 struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
6314 bool nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
6315 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info), *first_dr;
6316 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6319 gimple new_stmt = NULL;
6321 enum dr_alignment_support alignment_support_scheme;
6322 tree dataref_ptr = NULL_TREE;
6324 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
6326 int i, j, group_size;
6327 tree msq = NULL_TREE, lsq;
6328 tree offset = NULL_TREE;
6329 tree realignment_token = NULL_TREE;
6331 VEC(tree,heap) *dr_chain = NULL;
6332 bool strided_load = false;
6336 bool compute_in_loop = false;
6337 struct loop *at_loop;
6339 bool slp = (slp_node != NULL);
6340 bool slp_perm = false;
6341 enum tree_code code;
6343 /* Multiple types in SLP are handled by creating the appropriate number of
6344 vectorized stmts for each SLP node. Hence, NCOPIES is always 1 in
6345 case of SLP. */
6346 if (slp)
6347 ncopies = 1;
6348 else
6349 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
6351 gcc_assert (ncopies >= 1);
6353 /* FORNOW. This restriction should be relaxed. */
6354 if (nested_in_vect_loop && ncopies > 1)
6356 if (vect_print_dump_info (REPORT_DETAILS))
6357 fprintf (vect_dump, "multiple types in nested loop.");
6361 if (slp && SLP_INSTANCE_LOAD_PERMUTATION (slp_node_instance))
6364 if (!STMT_VINFO_RELEVANT_P (stmt_info))
6367 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
6370 /* Is vectorizable load? */
6371 if (!is_gimple_assign (stmt))
6374 scalar_dest = gimple_assign_lhs (stmt);
6375 if (TREE_CODE (scalar_dest) != SSA_NAME)
6378 code = gimple_assign_rhs_code (stmt);
6379 if (code != ARRAY_REF
6380 && code != INDIRECT_REF
6381 && !STMT_VINFO_STRIDED_ACCESS (stmt_info))
6384 if (!STMT_VINFO_DATA_REF (stmt_info))
6387 scalar_type = TREE_TYPE (DR_REF (dr));
6388 mode = (int) TYPE_MODE (vectype);
6390 /* FORNOW. In some cases can vectorize even if data-type not supported
6391 (e.g. - data copies). */
6392 if (optab_handler (mov_optab, mode)->insn_code == CODE_FOR_nothing)
6394 if (vect_print_dump_info (REPORT_DETAILS))
6395 fprintf (vect_dump, "Aligned load, but unsupported type.");
6399 /* The vector component type needs to be trivially convertible to the
6400 scalar lhs. This should always be the case. */
6401 if (!useless_type_conversion_p (TREE_TYPE (scalar_dest), TREE_TYPE (vectype)))
6403 if (vect_print_dump_info (REPORT_DETAILS))
6404 fprintf (vect_dump, "??? operands of different types");
6408 /* Check if the load is a part of an interleaving chain. */
6409 if (STMT_VINFO_STRIDED_ACCESS (stmt_info))
6411 strided_load = true;
6413 gcc_assert (! nested_in_vect_loop);
6415 /* Check if interleaving is supported. */
6416 if (!vect_strided_load_supported (vectype)
6417 && !PURE_SLP_STMT (stmt_info) && !slp)
6421 if (!vec_stmt) /* transformation not required. */
6423 STMT_VINFO_TYPE (stmt_info) = load_vec_info_type;
6424 vect_model_load_cost (stmt_info, ncopies, NULL);
6428 if (vect_print_dump_info (REPORT_DETAILS))
6429 fprintf (vect_dump, "transform load.");
6435 first_stmt = DR_GROUP_FIRST_DR (stmt_info);
6436 /* Check if the chain of loads is already vectorized. */
6437 if (STMT_VINFO_VEC_STMT (vinfo_for_stmt (first_stmt)))
6439 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
6442 first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
6443 group_size = DR_GROUP_SIZE (vinfo_for_stmt (first_stmt));
6445 /* VEC_NUM is the number of vect stmts to be created for this group. */
6448 strided_load = false;
6449 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
6452 vec_num = group_size;
6454 dr_chain = VEC_alloc (tree, heap, vec_num);
6460 group_size = vec_num = 1;
6463 alignment_support_scheme = vect_supportable_dr_alignment (first_dr);
6464 gcc_assert (alignment_support_scheme);
6466 /* In case the vectorization factor (VF) is bigger than the number
6467 of elements that we can fit in a vectype (nunits), we have to generate
6468 more than one vector stmt - i.e - we need to "unroll" the
6469 vector stmt by a factor VF/nunits. In doing so, we record a pointer
6470 from one copy of the vector stmt to the next, in the field
6471 STMT_VINFO_RELATED_STMT. This is necessary in order to allow following
6472 stages to find the correct vector defs to be used when vectorizing
6473 stmts that use the defs of the current stmt. The example below illustrates
6474 the vectorization process when VF=16 and nunits=4 (i.e - we need to create
6475 4 vectorized stmts):
6477 before vectorization:
6478 RELATED_STMT VEC_STMT
6482 step 1: vectorize stmt S1:
6483 We first create the vector stmt VS1_0, and, as usual, record a
6484 pointer to it in the STMT_VINFO_VEC_STMT of the scalar stmt S1.
6485 Next, we create the vector stmt VS1_1, and record a pointer to
6486 it in the STMT_VINFO_RELATED_STMT of the vector stmt VS1_0.
6487 Similarly, for VS1_2 and VS1_3. This is the resulting chain of
6489 RELATED_STMT VEC_STMT
6490 VS1_0: vx0 = memref0 VS1_1 -
6491 VS1_1: vx1 = memref1 VS1_2 -
6492 VS1_2: vx2 = memref2 VS1_3 -
6493 VS1_3: vx3 = memref3 - -
6494 S1: x = load - VS1_0
6497 See in documentation in vect_get_vec_def_for_stmt_copy for how the
6498 information we recorded in RELATED_STMT field is used to vectorize
6501 /* In case of interleaving (non-unit strided access):
6508 Vectorized loads are created in the order of memory accesses
6509 starting from the access of the first stmt of the chain:
6512 VS2: vx1 = &base + vec_size*1
6513 VS3: vx3 = &base + vec_size*2
6514 VS4: vx4 = &base + vec_size*3
6516 Then permutation statements are generated:
6518 VS5: vx5 = VEC_EXTRACT_EVEN_EXPR < vx0, vx1 >
6519 VS6: vx6 = VEC_EXTRACT_ODD_EXPR < vx0, vx1 >
6522 And they are put in STMT_VINFO_VEC_STMT of the corresponding scalar stmts
6523 (the order of the data-refs in the output of vect_permute_load_chain
6524 corresponds to the order of scalar stmts in the interleaving chain - see
6525 the documentation of vect_permute_load_chain()).
6526 The generation of permutation stmts and recording them in
6527 STMT_VINFO_VEC_STMT is done in vect_transform_strided_load().
6529 In case of both multiple types and interleaving, the vector loads and
6530 permutation stmts above are created for every copy. The result vector stmts
6531 are put in STMT_VINFO_VEC_STMT for the first copy and in the corresponding
6532 STMT_VINFO_RELATED_STMT for the next copies. */
6534 /* If the data reference is aligned (dr_aligned) or potentially unaligned
6535 on a target that supports unaligned accesses (dr_unaligned_supported)
6536 we generate the following code:
6540 p = p + indx * vectype_size;
6545 Otherwise, the data reference is potentially unaligned on a target that
6546 does not support unaligned accesses (dr_explicit_realign_optimized) -
6547 then generate the following code, in which the data in each iteration is
6548 obtained by two vector loads, one from the previous iteration, and one
6549 from the current iteration:
6551 msq_init = *(floor(p1))
6552 p2 = initial_addr + VS - 1;
6553 realignment_token = call target_builtin;
6556 p2 = p2 + indx * vectype_size
6558 vec_dest = realign_load (msq, lsq, realignment_token)
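/* A stand-alone model (illustrative only, under "#if 0"; names invented)
   of the realignment scheme above: the unaligned vector is composed from
   two aligned loads, MSQ and LSQ, by selecting VS consecutive elements
   starting at the misalignment - the element-level analogue of what
   REALIGN_LOAD does with the realignment token.  */
#if 0
#include <stdio.h>

#define VS 4			/* elements per vector */

static void
realign_load_model (const int *msq, const int *lsq, int mis, int *dest)
{
  int i;
  for (i = 0; i < VS; i++)	/* take VS elements starting at MIS */
    dest[i] = mis + i < VS ? msq[mis + i] : lsq[mis + i - VS];
}

int
main (void)
{
  int mem[2 * VS] = { 0, 1, 2, 3, 4, 5, 6, 7 };
  int mis = 1;			/* access starts one element past alignment */
  int vec[VS], i;

  /* msq = *(floor(p1)); lsq is the next aligned vector.  */
  realign_load_model (&mem[0], &mem[VS], mis, vec);

  for (i = 0; i < VS; i++)
    printf ("%d ", vec[i]);	/* prints 1 2 3 4 */
  printf ("\n");
  return 0;
}
#endif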
6563 /* If the misalignment remains the same throughout the execution of the
6564 loop, we can create the init_addr and permutation mask at the loop
6565 preheader. Otherwise, it needs to be created inside the loop.
6566 This can only occur when vectorizing memory accesses in the inner-loop
6567 nested within an outer-loop that is being vectorized. */
6569 if (nested_in_vect_loop_p (loop, stmt)
6570 && (TREE_INT_CST_LOW (DR_STEP (dr))
6571 % GET_MODE_SIZE (TYPE_MODE (vectype)) != 0))
6573 gcc_assert (alignment_support_scheme != dr_explicit_realign_optimized);
6574 compute_in_loop = true;
6577 if ((alignment_support_scheme == dr_explicit_realign_optimized
6578 || alignment_support_scheme == dr_explicit_realign)
6579 && !compute_in_loop)
6581 msq = vect_setup_realignment (first_stmt, gsi, &realignment_token,
6582 alignment_support_scheme, NULL_TREE,
6584 if (alignment_support_scheme == dr_explicit_realign_optimized)
6586 phi = SSA_NAME_DEF_STMT (msq);
6587 offset = size_int (TYPE_VECTOR_SUBPARTS (vectype) - 1);
6593 prev_stmt_info = NULL;
6594 for (j = 0; j < ncopies; j++)
6596 /* 1. Create the vector pointer update chain. */
6598 dataref_ptr = vect_create_data_ref_ptr (first_stmt,
6600 &dummy, &ptr_incr, false,
6604 bump_vector_ptr (dataref_ptr, ptr_incr, gsi, stmt, NULL_TREE);
6606 for (i = 0; i < vec_num; i++)
6609 dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, gsi, stmt,
6612 /* 2. Create the vector-load in the loop. */
6613 switch (alignment_support_scheme)
6616 gcc_assert (aligned_access_p (first_dr));
6617 data_ref = build_fold_indirect_ref (dataref_ptr);
6619 case dr_unaligned_supported:
6621 int mis = DR_MISALIGNMENT (first_dr);
6622 tree tmis = (mis == -1 ? size_zero_node : size_int (mis));
6624 tmis = size_binop (MULT_EXPR, tmis, size_int(BITS_PER_UNIT));
6626 build2 (MISALIGNED_INDIRECT_REF, vectype, dataref_ptr, tmis);
6629 case dr_explicit_realign:
6632 tree vs_minus_1 = size_int (TYPE_VECTOR_SUBPARTS (vectype) - 1);
6634 if (compute_in_loop)
6635 msq = vect_setup_realignment (first_stmt, gsi,
6637 dr_explicit_realign,
6640 data_ref = build1 (ALIGN_INDIRECT_REF, vectype, dataref_ptr);
6641 vec_dest = vect_create_destination_var (scalar_dest, vectype);
6642 new_stmt = gimple_build_assign (vec_dest, data_ref);
6643 new_temp = make_ssa_name (vec_dest, new_stmt);
6644 gimple_assign_set_lhs (new_stmt, new_temp);
6645 vect_finish_stmt_generation (stmt, new_stmt, gsi);
6646 copy_virtual_operands (new_stmt, stmt);
6647 mark_symbols_for_renaming (new_stmt);
6650 bump = size_binop (MULT_EXPR, vs_minus_1,
6651 TYPE_SIZE_UNIT (scalar_type));
6652 ptr = bump_vector_ptr (dataref_ptr, NULL, gsi, stmt, bump);
6653 data_ref = build1 (ALIGN_INDIRECT_REF, vectype, ptr);
6656 case dr_explicit_realign_optimized:
6657 data_ref = build1 (ALIGN_INDIRECT_REF, vectype, dataref_ptr);
6662 vec_dest = vect_create_destination_var (scalar_dest, vectype);
6663 new_stmt = gimple_build_assign (vec_dest, data_ref);
6664 new_temp = make_ssa_name (vec_dest, new_stmt);
6665 gimple_assign_set_lhs (new_stmt, new_temp);
6666 vect_finish_stmt_generation (stmt, new_stmt, gsi);
6667 mark_symbols_for_renaming (new_stmt);
6669 /* 3. Handle explicit realignment if necessary/supported. Create in
6670 loop: vec_dest = realign_load (msq, lsq, realignment_token) */
6671 if (alignment_support_scheme == dr_explicit_realign_optimized
6672 || alignment_support_scheme == dr_explicit_realign)
6676 lsq = gimple_assign_lhs (new_stmt);
6677 if (!realignment_token)
6678 realignment_token = dataref_ptr;
6679 vec_dest = vect_create_destination_var (scalar_dest, vectype);
6680 tmp = build3 (REALIGN_LOAD_EXPR, vectype, msq, lsq,
6682 new_stmt = gimple_build_assign (vec_dest, tmp);
6683 new_temp = make_ssa_name (vec_dest, new_stmt);
6684 gimple_assign_set_lhs (new_stmt, new_temp);
6685 vect_finish_stmt_generation (stmt, new_stmt, gsi);
6687 if (alignment_support_scheme == dr_explicit_realign_optimized)
6690 if (i == vec_num - 1 && j == ncopies - 1)
6691 add_phi_arg (phi, lsq, loop_latch_edge (containing_loop));
6696 /* 4. Handle invariant-load. */
6699 gcc_assert (!strided_load);
6700 gcc_assert (nested_in_vect_loop_p (loop, stmt));
6705 tree vec_inv, bitpos, bitsize = TYPE_SIZE (scalar_type);
6707 /* CHECKME: bitpos depends on endianness? */
6708 bitpos = bitsize_zero_node;
6709 vec_inv = build3 (BIT_FIELD_REF, scalar_type, new_temp,
6712 vect_create_destination_var (scalar_dest, NULL_TREE);
6713 new_stmt = gimple_build_assign (vec_dest, vec_inv);
6714 new_temp = make_ssa_name (vec_dest, new_stmt);
6715 gimple_assign_set_lhs (new_stmt, new_temp);
6716 vect_finish_stmt_generation (stmt, new_stmt, gsi);
6718 for (k = nunits - 1; k >= 0; --k)
6719 t = tree_cons (NULL_TREE, new_temp, t);
6720 /* FIXME: use build_constructor directly. */
6721 vec_inv = build_constructor_from_list (vectype, t);
6722 new_temp = vect_init_vector (stmt, vec_inv, vectype, gsi);
6723 new_stmt = SSA_NAME_DEF_STMT (new_temp);
6726 gcc_unreachable (); /* FORNOW. */
6729 /* Collect vector loads and later create their permutation in
6730 vect_transform_strided_load (). */
6731 if (strided_load || slp_perm)
6732 VEC_quick_push (tree, dr_chain, new_temp);
6734 /* Store vector loads in the corresponding SLP_NODE. */
6735 if (slp && !slp_perm)
6736 VEC_quick_push (gimple, SLP_TREE_VEC_STMTS (slp_node), new_stmt);
6739 if (slp && !slp_perm)
6744 if (!vect_transform_slp_perm_load (stmt, dr_chain, gsi,
6745 LOOP_VINFO_VECT_FACTOR (loop_vinfo),
6746 slp_node_instance, false))
6748 VEC_free (tree, heap, dr_chain);
6756 if (!vect_transform_strided_load (stmt, dr_chain, group_size, gsi))
6759 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
6760 VEC_free (tree, heap, dr_chain);
6761 dr_chain = VEC_alloc (tree, heap, group_size);
6766 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
6768 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
6769 prev_stmt_info = vinfo_for_stmt (new_stmt);
6775 VEC_free (tree, heap, dr_chain);
6781 /* Function vectorizable_live_operation.
6783 STMT computes a value that is used outside the loop. Check if
6784 it can be supported. */
6787 vectorizable_live_operation (gimple stmt,
6788 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
6789 gimple *vec_stmt ATTRIBUTE_UNUSED)
6791 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6792 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6793 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6799 enum vect_def_type dt;
6800 enum tree_code code;
6801 enum gimple_rhs_class rhs_class;
6803 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
6805 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
6808 if (!is_gimple_assign (stmt))
6811 if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
6814 /* FORNOW. CHECKME. */
6815 if (nested_in_vect_loop_p (loop, stmt))
6818 code = gimple_assign_rhs_code (stmt);
6819 op_type = TREE_CODE_LENGTH (code);
6820 rhs_class = get_gimple_rhs_class (code);
6821 gcc_assert (rhs_class != GIMPLE_UNARY_RHS || op_type == unary_op);
6822 gcc_assert (rhs_class != GIMPLE_BINARY_RHS || op_type == binary_op);
6824 /* FORNOW: support only if all uses are invariant. This means
6825 that the scalar operations can remain in place, unvectorized.
6826 The original last scalar value that they compute will be used. */
6828 for (i = 0; i < op_type; i++)
6830 if (rhs_class == GIMPLE_SINGLE_RHS)
6831 op = TREE_OPERAND (gimple_op (stmt, 1), i);
6833 op = gimple_op (stmt, i + 1);
6834 if (op && !vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt))
6836 if (vect_print_dump_info (REPORT_DETAILS))
6837 fprintf (vect_dump, "use not simple.");
6841 if (dt != vect_invariant_def && dt != vect_constant_def)
6845 /* No transformation is required for the cases we currently support. */
6850 /* Function vect_is_simple_cond.
6853 LOOP - the loop that is being vectorized.
6854 COND - Condition that is checked for simple use.
6856 Returns whether a COND can be vectorized. Checks whether
6857 condition operands are supportable using vect_is_simple_use. */
6860 vect_is_simple_cond (tree cond, loop_vec_info loop_vinfo)
6864 enum vect_def_type dt;
6866 if (!COMPARISON_CLASS_P (cond))
6869 lhs = TREE_OPERAND (cond, 0);
6870 rhs = TREE_OPERAND (cond, 1);
6872 if (TREE_CODE (lhs) == SSA_NAME)
6874 gimple lhs_def_stmt = SSA_NAME_DEF_STMT (lhs);
6875 if (!vect_is_simple_use (lhs, loop_vinfo, &lhs_def_stmt, &def, &dt))
6878 else if (TREE_CODE (lhs) != INTEGER_CST && TREE_CODE (lhs) != REAL_CST
6879 && TREE_CODE (lhs) != FIXED_CST)
6882 if (TREE_CODE (rhs) == SSA_NAME)
6884 gimple rhs_def_stmt = SSA_NAME_DEF_STMT (rhs);
6885 if (!vect_is_simple_use (rhs, loop_vinfo, &rhs_def_stmt, &def, &dt))
6888 else if (TREE_CODE (rhs) != INTEGER_CST && TREE_CODE (rhs) != REAL_CST
6889 && TREE_CODE (rhs) != FIXED_CST)
6895 /* vectorizable_condition.
6897 Check if STMT is a conditional modify expression that can be vectorized.
6898 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
6899 stmt using VEC_COND_EXPR to replace it, put it in VEC_STMT, and insert it
6900 at GSI.
6902 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
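/* A stand-alone sketch (illustrative only, under "#if 0") of the
   element-wise semantics of the VEC_COND_EXPR built below: every lane
   independently selects between the then- and else-value according to
   the comparison.  */
#if 0
#include <stdio.h>

#define NUNITS 4

int
main (void)
{
  int a[NUNITS] = { 1, 5, 3, 7 };
  int b[NUNITS] = { 4, 4, 4, 4 };
  int then_v[NUNITS] = { 10, 11, 12, 13 };
  int else_v[NUNITS] = { 20, 21, 22, 23 };
  int dest[NUNITS], i;

  for (i = 0; i < NUNITS; i++)	/* one VEC_COND_EXPR lane at a time */
    dest[i] = a[i] < b[i] ? then_v[i] : else_v[i];

  for (i = 0; i < NUNITS; i++)
    printf ("%d ", dest[i]);	/* prints 10 21 12 23 */
  printf ("\n");
  return 0;
}
#endif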
6905 vectorizable_condition (gimple stmt, gimple_stmt_iterator *gsi,
6908 tree scalar_dest = NULL_TREE;
6909 tree vec_dest = NULL_TREE;
6910 tree op = NULL_TREE;
6911 tree cond_expr, then_clause, else_clause;
6912 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6913 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6914 tree vec_cond_lhs, vec_cond_rhs, vec_then_clause, vec_else_clause;
6915 tree vec_compare, vec_cond_expr;
6917 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6918 enum machine_mode vec_mode;
6920 enum vect_def_type dt;
6921 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
6922 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
6923 enum tree_code code;
6925 gcc_assert (ncopies >= 1);
6926 if (ncopies > 1)
6927 return false; /* FORNOW */
6929 if (!STMT_VINFO_RELEVANT_P (stmt_info))
6932 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
6935 /* FORNOW: SLP not supported. */
6936 if (STMT_SLP_TYPE (stmt_info))
6939 /* FORNOW: not yet supported. */
6940 if (STMT_VINFO_LIVE_P (stmt_info))
6942 if (vect_print_dump_info (REPORT_DETAILS))
6943 fprintf (vect_dump, "value used after loop.");
6947 /* Is vectorizable conditional operation? */
6948 if (!is_gimple_assign (stmt))
6951 code = gimple_assign_rhs_code (stmt);
6953 if (code != COND_EXPR)
6956 gcc_assert (gimple_assign_single_p (stmt));
6957 op = gimple_assign_rhs1 (stmt);
6958 cond_expr = TREE_OPERAND (op, 0);
6959 then_clause = TREE_OPERAND (op, 1);
6960 else_clause = TREE_OPERAND (op, 2);
6962 if (!vect_is_simple_cond (cond_expr, loop_vinfo))
6965 /* We do not handle two different vector types for the condition
6966 and the values. */
6967 if (TREE_TYPE (TREE_OPERAND (cond_expr, 0)) != TREE_TYPE (vectype))
6970 if (TREE_CODE (then_clause) == SSA_NAME)
6972 gimple then_def_stmt = SSA_NAME_DEF_STMT (then_clause);
6973 if (!vect_is_simple_use (then_clause, loop_vinfo,
6974 &then_def_stmt, &def, &dt))
6977 else if (TREE_CODE (then_clause) != INTEGER_CST
6978 && TREE_CODE (then_clause) != REAL_CST
6979 && TREE_CODE (then_clause) != FIXED_CST)
6982 if (TREE_CODE (else_clause) == SSA_NAME)
6984 gimple else_def_stmt = SSA_NAME_DEF_STMT (else_clause);
6985 if (!vect_is_simple_use (else_clause, loop_vinfo,
6986 &else_def_stmt, &def, &dt))
6989 else if (TREE_CODE (else_clause) != INTEGER_CST
6990 && TREE_CODE (else_clause) != REAL_CST
6991 && TREE_CODE (else_clause) != FIXED_CST)
6995 vec_mode = TYPE_MODE (vectype);
6999 STMT_VINFO_TYPE (stmt_info) = condition_vec_info_type;
7000 return expand_vec_cond_expr_p (op, vec_mode);
7006 scalar_dest = gimple_assign_lhs (stmt);
7007 vec_dest = vect_create_destination_var (scalar_dest, vectype);
7009 /* Handle cond expr. */
7011 vect_get_vec_def_for_operand (TREE_OPERAND (cond_expr, 0), stmt, NULL);
7013 vect_get_vec_def_for_operand (TREE_OPERAND (cond_expr, 1), stmt, NULL);
7014 vec_then_clause = vect_get_vec_def_for_operand (then_clause, stmt, NULL);
7015 vec_else_clause = vect_get_vec_def_for_operand (else_clause, stmt, NULL);
7017 /* Arguments are ready. Create the new vector stmt. */
7018 vec_compare = build2 (TREE_CODE (cond_expr), vectype,
7019 vec_cond_lhs, vec_cond_rhs);
7020 vec_cond_expr = build3 (VEC_COND_EXPR, vectype,
7021 vec_compare, vec_then_clause, vec_else_clause);
7023 *vec_stmt = gimple_build_assign (vec_dest, vec_cond_expr);
7024 new_temp = make_ssa_name (vec_dest, *vec_stmt);
7025 gimple_assign_set_lhs (*vec_stmt, new_temp);
7026 vect_finish_stmt_generation (stmt, *vec_stmt, gsi);
7032 /* Function vect_transform_stmt.
7034 Create a vectorized stmt to replace STMT, and insert it at GSI. */
7037 vect_transform_stmt (gimple stmt, gimple_stmt_iterator *gsi,
7038 bool *strided_store, slp_tree slp_node,
7039 slp_instance slp_node_instance)
7041 bool is_store = false;
7042 gimple vec_stmt = NULL;
7043 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
7044 gimple orig_stmt_in_pattern;
7046 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7047 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7049 switch (STMT_VINFO_TYPE (stmt_info))
7051 case type_demotion_vec_info_type:
7052 done = vectorizable_type_demotion (stmt, gsi, &vec_stmt, slp_node);
7056 case type_promotion_vec_info_type:
7057 done = vectorizable_type_promotion (stmt, gsi, &vec_stmt, slp_node);
7061 case type_conversion_vec_info_type:
7062 done = vectorizable_conversion (stmt, gsi, &vec_stmt, slp_node);
7066 case induc_vec_info_type:
7067 gcc_assert (!slp_node);
7068 done = vectorizable_induction (stmt, gsi, &vec_stmt);
7072 case op_vec_info_type:
7073 done = vectorizable_operation (stmt, gsi, &vec_stmt, slp_node);
7077 case assignment_vec_info_type:
7078 done = vectorizable_assignment (stmt, gsi, &vec_stmt, slp_node);
7082 case load_vec_info_type:
7083 done = vectorizable_load (stmt, gsi, &vec_stmt, slp_node,
7088 case store_vec_info_type:
7089 done = vectorizable_store (stmt, gsi, &vec_stmt, slp_node);
7091 if (STMT_VINFO_STRIDED_ACCESS (stmt_info) && !slp_node)
7093 /* In case of interleaving, the whole chain is vectorized when the
7094 last store in the chain is reached. Store stmts before the last
7095 one are skipped, and their vec_stmt_info shouldn't be freed
7096 meanwhile. */
7097 *strided_store = true;
7098 if (STMT_VINFO_VEC_STMT (stmt_info))
7105 case condition_vec_info_type:
7106 gcc_assert (!slp_node);
7107 done = vectorizable_condition (stmt, gsi, &vec_stmt);
7111 case call_vec_info_type:
7112 gcc_assert (!slp_node);
7113 done = vectorizable_call (stmt, gsi, &vec_stmt);
7116 case reduc_vec_info_type:
7117 gcc_assert (!slp_node);
7118 done = vectorizable_reduction (stmt, gsi, &vec_stmt);
7123 if (!STMT_VINFO_LIVE_P (stmt_info))
7125 if (vect_print_dump_info (REPORT_DETAILS))
7126 fprintf (vect_dump, "stmt not supported.");
7131 /* Handle inner-loop stmts whose DEF is used in the loop-nest that
7132 is being vectorized, but outside the immediately enclosing loop. */
7134 && nested_in_vect_loop_p (loop, stmt)
7135 && STMT_VINFO_TYPE (stmt_info) != reduc_vec_info_type
7136 && (STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_outer
7137 || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_outer_by_reduction))
7139 struct loop *innerloop = loop->inner;
7140 imm_use_iterator imm_iter;
7141 use_operand_p use_p;
7145 if (vect_print_dump_info (REPORT_DETAILS))
7146 fprintf (vect_dump, "Record the vdef for outer-loop vectorization.");
7148 /* Find the relevant loop-exit phi-node, and record the vec_stmt there
7149 (to be used when vectorizing outer-loop stmts that use the DEF of
7150 STMT). */
7151 if (gimple_code (stmt) == GIMPLE_PHI)
7152 scalar_dest = PHI_RESULT (stmt);
7154 scalar_dest = gimple_assign_lhs (stmt);
7156 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
7158 if (!flow_bb_inside_loop_p (innerloop, gimple_bb (USE_STMT (use_p))))
7160 exit_phi = USE_STMT (use_p);
7161 STMT_VINFO_VEC_STMT (vinfo_for_stmt (exit_phi)) = vec_stmt;
7166 /* Handle stmts whose DEF is used outside the loop-nest that is
7167 being vectorized. */
7168 if (STMT_VINFO_LIVE_P (stmt_info)
7169 && STMT_VINFO_TYPE (stmt_info) != reduc_vec_info_type)
7171 done = vectorizable_live_operation (stmt, gsi, &vec_stmt);
7177 STMT_VINFO_VEC_STMT (stmt_info) = vec_stmt;
7178 orig_stmt_in_pattern = STMT_VINFO_RELATED_STMT (stmt_info);
7179 if (orig_stmt_in_pattern)
7181 stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt_in_pattern);
7182 /* STMT was inserted by the vectorizer to replace a computation idiom.
7183 ORIG_STMT_IN_PATTERN is a stmt in the original sequence that
7184 computed this idiom. We need to record a pointer to VEC_STMT in
7185 the stmt_info of ORIG_STMT_IN_PATTERN. See more details in the
7186 documentation of vect_pattern_recog. */
7187 if (STMT_VINFO_IN_PATTERN_P (stmt_vinfo))
7189 gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
7190 STMT_VINFO_VEC_STMT (stmt_vinfo) = vec_stmt;
7199 /* This function builds ni_name = the number of iterations the loop
7200 executes, placing the computation on the loop preheader. */
7203 vect_build_loop_niters (loop_vec_info loop_vinfo)
7206 gimple_seq stmts = NULL;
7208 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7209 tree ni = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo));
7211 var = create_tmp_var (TREE_TYPE (ni), "niters");
7212 add_referenced_var (var);
7213 ni_name = force_gimple_operand (ni, &stmts, false, var);
7215 pe = loop_preheader_edge (loop);
7218 basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7219 gcc_assert (!new_bb);
7226 /* This function generates the following statements:
7228 ni_name = number of iterations loop executes
7229 ratio = ni_name / vf
7230 ratio_mult_vf_name = ratio * vf
7232 and places them at the loop preheader edge. */
7235 vect_generate_tmps_on_preheader (loop_vec_info loop_vinfo,
7237 tree *ratio_mult_vf_name_ptr,
7238 tree *ratio_name_ptr)
7247 tree ratio_mult_vf_name;
7248 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7249 tree ni = LOOP_VINFO_NITERS (loop_vinfo);
7250 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
7253 pe = loop_preheader_edge (loop);
7255 /* Generate a temporary variable that contains the
7256 number of iterations the loop executes. */
7258 ni_name = vect_build_loop_niters (loop_vinfo);
7259 log_vf = build_int_cst (TREE_TYPE (ni), exact_log2 (vf));
7261 /* Create: ratio = ni >> log2(vf) */
7263 ratio_name = fold_build2 (RSHIFT_EXPR, TREE_TYPE (ni_name), ni_name, log_vf);
7264 if (!is_gimple_val (ratio_name))
7266 var = create_tmp_var (TREE_TYPE (ni), "bnd");
7267 add_referenced_var (var);
7270 ratio_name = force_gimple_operand (ratio_name, &stmts, true, var);
7271 pe = loop_preheader_edge (loop);
7272 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7273 gcc_assert (!new_bb);
7276 /* Create: ratio_mult_vf = ratio << log2 (vf). */
7278 ratio_mult_vf_name = fold_build2 (LSHIFT_EXPR, TREE_TYPE (ratio_name),
7279 ratio_name, log_vf);
7280 if (!is_gimple_val (ratio_mult_vf_name))
7282 var = create_tmp_var (TREE_TYPE (ni), "ratio_mult_vf");
7283 add_referenced_var (var);
7286 ratio_mult_vf_name = force_gimple_operand (ratio_mult_vf_name, &stmts,
7288 pe = loop_preheader_edge (loop);
7289 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7290 gcc_assert (!new_bb);
7293 *ni_name_ptr = ni_name;
7294 *ratio_mult_vf_name_ptr = ratio_mult_vf_name;
7295 *ratio_name_ptr = ratio_name;
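/* A stand-alone arithmetic check (illustrative only, under "#if 0") of
   the preheader temporaries generated above, for a hypothetical ni = 103
   and vf = 4.  */
#if 0
#include <stdio.h>

int
main (void)
{
  unsigned ni = 103, log_vf = 2;		/* vf = 4 */
  unsigned ratio = ni >> log_vf;		/* 25 = ni / vf */
  unsigned ratio_mult_vf = ratio << log_vf;	/* 100 = ratio * vf */

  /* The vector loop runs RATIO times; the epilog loop picks up the
     remaining NI - RATIO_MULT_VF = 3 iterations.  */
  printf ("ratio=%u ratio_mult_vf=%u remainder=%u\n",
	  ratio, ratio_mult_vf, ni - ratio_mult_vf);
  return 0;
}
#endif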
7301 /* Function vect_update_ivs_after_vectorizer.
7303 "Advance" the induction variables of LOOP to the value they should take
7304 after the execution of LOOP. This is currently necessary because the
7305 vectorizer does not handle induction variables that are used after the
7306 loop. Such a situation occurs when the last iterations of LOOP are
7307 peeled, because:
7308 1. We introduced new uses after LOOP for IVs that were not originally used
7309 after LOOP: the IVs of LOOP are now used by an epilog loop.
7310 2. LOOP is going to be vectorized; this means that it will iterate N/VF
7311 times, whereas the loop IVs should be bumped N times.
7314 - LOOP - a loop that is going to be vectorized. The last few iterations
7315 of LOOP were peeled.
7316 - NITERS - the number of iterations that LOOP executes (before it is
7317 vectorized). i.e, the number of times the ivs should be bumped.
7318 - UPDATE_E - a successor edge of LOOP->exit that is on the (only) path
7319 coming out from LOOP on which there are uses of the LOOP ivs
7320 (this is the path from LOOP->exit to epilog_loop->preheader).
7322 The new definitions of the ivs are placed in LOOP->exit.
7323 The phi args associated with the edge UPDATE_E in the bb
7324 UPDATE_E->dest are updated accordingly.
7326 Assumption 1: Like the rest of the vectorizer, this function assumes
7327 a single loop exit that has a single predecessor.
7329 Assumption 2: The phi nodes in the LOOP header and in update_bb are
7330 organized in the same order.
7332 Assumption 3: The access function of the ivs is simple enough (see
7333 vect_can_advance_ivs_p). This assumption will be relaxed in the future.
7335 Assumption 4: Exactly one of the successors of LOOP exit-bb is on a path
7336 coming out of LOOP on which the ivs of LOOP are used (this is the path
7337 that leads to the epilog loop; other paths skip the epilog loop). This
7338 path starts with the edge UPDATE_E, and its destination (denoted update_bb)
7339 needs to have its phis updated.
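/* A stand-alone sketch (illustrative only, under "#if 0"; hypothetical
   values) of the value computed per IV: an IV with access function
   {init, +, step} must enter the epilog loop holding
   init + niters * step, which is what the code below builds (with
   POINTER_PLUS_EXPR for pointer IVs).  */
#if 0
#include <stdio.h>

int
main (void)
{
  long init = 16, step = 4, niters = 100;
  long ni = init + niters * step;	/* value fed to the epilog phi */
  printf ("epilog iv start = %ld\n", ni);	/* 416 */
  return 0;
}
#endif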
7343 vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo, tree niters,
7346 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7347 basic_block exit_bb = single_exit (loop)->dest;
7349 gimple_stmt_iterator gsi, gsi1;
7350 basic_block update_bb = update_e->dest;
7352 /* gcc_assert (vect_can_advance_ivs_p (loop_vinfo)); */
7354 /* Make sure there exists a single-predecessor exit bb: */
7355 gcc_assert (single_pred_p (exit_bb));
7357 for (gsi = gsi_start_phis (loop->header), gsi1 = gsi_start_phis (update_bb);
7358 !gsi_end_p (gsi) && !gsi_end_p (gsi1);
7359 gsi_next (&gsi), gsi_next (&gsi1))
7361 tree access_fn = NULL;
7362 tree evolution_part;
7365 tree var, ni, ni_name;
7366 gimple_stmt_iterator last_gsi;
7368 phi = gsi_stmt (gsi);
7369 phi1 = gsi_stmt (gsi1);
7370 if (vect_print_dump_info (REPORT_DETAILS))
7372 fprintf (vect_dump, "vect_update_ivs_after_vectorizer: phi: ");
7373 print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
7376 /* Skip virtual phis. */
7377 if (!is_gimple_reg (SSA_NAME_VAR (PHI_RESULT (phi))))
7379 if (vect_print_dump_info (REPORT_DETAILS))
7380 fprintf (vect_dump, "virtual phi. skip.");
7384 /* Skip reduction phis. */
7385 if (STMT_VINFO_DEF_TYPE (vinfo_for_stmt (phi)) == vect_reduction_def)
7387 if (vect_print_dump_info (REPORT_DETAILS))
7388 fprintf (vect_dump, "reduc phi. skip.");
7392 access_fn = analyze_scalar_evolution (loop, PHI_RESULT (phi));
7393 gcc_assert (access_fn);
7394 STRIP_NOPS (access_fn);
7395 evolution_part =
7396 unshare_expr (evolution_part_in_loop_num (access_fn, loop->num));
7397 gcc_assert (evolution_part != NULL_TREE);
7399 /* FORNOW: We do not support IVs whose evolution function is a polynomial
7400 of degree >= 2 or exponential. */
7401 gcc_assert (!tree_is_chrec (evolution_part));
7403 step_expr = evolution_part;
7404 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn,
7407 if (POINTER_TYPE_P (TREE_TYPE (init_expr)))
7408 ni = fold_build2 (POINTER_PLUS_EXPR, TREE_TYPE (init_expr),
7410 fold_convert (sizetype,
7411 fold_build2 (MULT_EXPR, TREE_TYPE (niters),
7412 niters, step_expr)));
7414 ni = fold_build2 (PLUS_EXPR, TREE_TYPE (init_expr),
7415 fold_build2 (MULT_EXPR, TREE_TYPE (init_expr),
7416 fold_convert (TREE_TYPE (init_expr),
7423 var = create_tmp_var (TREE_TYPE (init_expr), "tmp");
7424 add_referenced_var (var);
7426 last_gsi = gsi_last_bb (exit_bb);
7427 ni_name = force_gimple_operand_gsi (&last_gsi, ni, false, var,
7428 true, GSI_SAME_STMT);
7430 /* Fix phi expressions in the successor bb. */
7431 SET_PHI_ARG_DEF (phi1, update_e->dest_idx, ni_name);
7435 /* Return the more conservative threshold between the
7436 min_profitable_iters returned by the cost model and the user-
7437 specified threshold, if provided. */
7440 conservative_cost_threshold (loop_vec_info loop_vinfo,
7441 int min_profitable_iters)
7444 int min_scalar_loop_bound;
7446 min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
7447 * LOOP_VINFO_VECT_FACTOR (loop_vinfo)) - 1);
7449 /* Use the cost model only if it is more conservative than the
7450 user-specified threshold. */
7451 th = (unsigned) min_scalar_loop_bound;
7452 if (min_profitable_iters
7453 && (!min_scalar_loop_bound
7454 || min_profitable_iters > min_scalar_loop_bound))
7455 th = (unsigned) min_profitable_iters;
7457 if (th && vect_print_dump_info (REPORT_COST))
7458 fprintf (vect_dump, "Vectorization may not be profitable.");
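/* A stand-alone sketch (illustrative only, under "#if 0"; the helper
   mirrors the selection logic above) showing that the returned bound is
   the larger, i.e. more conservative, of the two thresholds.  */
#if 0
#include <stdio.h>

static unsigned
threshold_model (int min_scalar_loop_bound, int min_profitable_iters)
{
  unsigned th = (unsigned) min_scalar_loop_bound;
  if (min_profitable_iters
      && (!min_scalar_loop_bound
	  || min_profitable_iters > min_scalar_loop_bound))
    th = (unsigned) min_profitable_iters;
  return th;
}

int
main (void)
{
  /* E.g. a param-derived bound of 15 vs. a cost-model estimate of 22.  */
  printf ("%u\n", threshold_model (15, 22));	/* prints 22 */
  return 0;
}
#endif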
7463 /* Function vect_do_peeling_for_loop_bound
7465 Peel the last iterations of the loop represented by LOOP_VINFO.
7466 The peeled iterations form a new epilog loop. Given that the loop now
7467 iterates NITERS times, the new epilog loop iterates
7468 NITERS % VECTORIZATION_FACTOR times.
7470 The original loop will later be made to iterate
7471 NITERS / VECTORIZATION_FACTOR times (this value is placed into RATIO). */
7474 vect_do_peeling_for_loop_bound (loop_vec_info loop_vinfo, tree *ratio)
7476 tree ni_name, ratio_mult_vf_name;
7477 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7478 struct loop *new_loop;
7480 basic_block preheader;
7482 bool check_profitability = false;
7483 unsigned int th = 0;
7484 int min_profitable_iters;
7486 if (vect_print_dump_info (REPORT_DETAILS))
7487 fprintf (vect_dump, "=== vect_do_peeling_for_loop_bound ===");
7489 initialize_original_copy_tables ();
7491 /* Generate the following variables on the preheader of original loop:
7493 ni_name = number of iteration the original loop executes
7494 ratio = ni_name / vf
7495 ratio_mult_vf_name = ratio * vf */
7496 vect_generate_tmps_on_preheader (loop_vinfo, &ni_name,
7497 &ratio_mult_vf_name, ratio);
7499 loop_num = loop->num;
7501 /* If the cost model check was not done during versioning and
7502 peeling for alignment. */
7503 if (!VEC_length (gimple, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo))
7504 && !VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo))
7505 && !LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
7507 check_profitability = true;
7509 /* Get profitability threshold for vectorized loop. */
7510 min_profitable_iters = LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo);
7512 th = conservative_cost_threshold (loop_vinfo,
7513 min_profitable_iters);
7516 new_loop = slpeel_tree_peel_loop_to_edge (loop, single_exit (loop),
7517 ratio_mult_vf_name, ni_name, false,
7518 th, check_profitability);
7519 gcc_assert (new_loop);
7520 gcc_assert (loop_num == loop->num);
7521 #ifdef ENABLE_CHECKING
7522 slpeel_verify_cfg_after_peeling (loop, new_loop);
7525 /* A guard that controls whether the new_loop is to be executed or skipped
7526 is placed in LOOP->exit. LOOP->exit therefore has two successors - one
7527 is the preheader of NEW_LOOP, where the IVs from LOOP are used. The other
7528 is a bb after NEW_LOOP, where these IVs are not used. Find the edge that
7529 is on the path where the LOOP IVs are used and need to be updated. */
7531 preheader = loop_preheader_edge (new_loop)->src;
7532 if (EDGE_PRED (preheader, 0)->src == single_exit (loop)->dest)
7533 update_e = EDGE_PRED (preheader, 0);
7535 update_e = EDGE_PRED (preheader, 1);
7537 /* Update IVs of original loop as if they were advanced
7538 by ratio_mult_vf_name steps. */
7539 vect_update_ivs_after_vectorizer (loop_vinfo, ratio_mult_vf_name, update_e);
7541 /* After peeling we have to reset the scalar evolution analyzer. */
7544 free_original_copy_tables ();
7548 /* Function vect_gen_niters_for_prolog_loop
7550 Set the number of iterations for the loop represented by LOOP_VINFO
7551 to the minimum between LOOP_NITERS (the original iteration count of the loop)
7552 and the misalignment of DR - the data reference recorded in
7553 LOOP_VINFO_UNALIGNED_DR (LOOP_VINFO). As a result, after the execution of
7554 this loop, the data reference DR will refer to an aligned location.
7556 The following computation is generated:
7558 If the misalignment of DR is known at compile time:
7559 addr_mis = int mis = DR_MISALIGNMENT (dr);
7560 Else, compute address misalignment in bytes:
7561 addr_mis = addr & (vectype_size - 1)
7563 prolog_niters = min (LOOP_NITERS, ((VF - addr_mis/elem_size)&(VF-1))/step)
7565 (elem_size = element type size; an element is the scalar element whose type
7566 is the inner type of the vectype)
7568 When the step of the data-ref in the loop is not 1 (as in interleaved data
7569 and SLP), the number of iterations of the prolog must be divided by the step
7570 (which is equal to the size of the interleaved group).
7572 The above formulas assume that VF == number of elements in the vector. This
7573 may not hold when there are multiple-types in the loop.
7574 In this case, for some data-references in the loop the VF does not represent
7575 the number of elements that fit in the vector. Therefore, instead of VF we
7576 use TYPE_VECTOR_SUBPARTS. */
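/* A stand-alone check (illustrative only, under "#if 0"; hypothetical
   numbers) of the known-misalignment branch of the formula above:
   4 ints per vector, element size 4, byte misalignment 8, step 1.  */
#if 0
#include <stdio.h>

int
main (void)
{
  int nelements = 4, element_size = 4, step = 1;
  int byte_misalign = 8;
  int elem_misalign = byte_misalign / element_size;	/* 2 */

  /* Peel just enough iterations to reach an aligned address.  */
  int iters = ((nelements - elem_misalign) & (nelements - 1)) / step;
  printf ("prolog iters = %d\n", iters);	/* (4 - 2) & 3 = 2 */
  return 0;
}
#endif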
7579 vect_gen_niters_for_prolog_loop (loop_vec_info loop_vinfo, tree loop_niters)
7581 struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
7582 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7585 tree iters, iters_name;
7588 gimple dr_stmt = DR_STMT (dr);
7589 stmt_vec_info stmt_info = vinfo_for_stmt (dr_stmt);
7590 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7591 int vectype_align = TYPE_ALIGN (vectype) / BITS_PER_UNIT;
7592 tree niters_type = TREE_TYPE (loop_niters);
7594 int element_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
7595 int nelements = TYPE_VECTOR_SUBPARTS (vectype);
7597 if (STMT_VINFO_STRIDED_ACCESS (stmt_info))
7598 step = DR_GROUP_SIZE (vinfo_for_stmt (DR_GROUP_FIRST_DR (stmt_info)));
7600 pe = loop_preheader_edge (loop);
7602 if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
7604 int byte_misalign = LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo);
7605 int elem_misalign = byte_misalign / element_size;
7607 if (vect_print_dump_info (REPORT_DETAILS))
7608 fprintf (vect_dump, "known alignment = %d.", byte_misalign);
7610 iters = build_int_cst (niters_type,
7611 (((nelements - elem_misalign) & (nelements - 1)) / step));
7615 gimple_seq new_stmts = NULL;
7616 tree start_addr = vect_create_addr_base_for_vector_ref (dr_stmt,
7617 &new_stmts, NULL_TREE, loop);
7618 tree ptr_type = TREE_TYPE (start_addr);
7619 tree size = TYPE_SIZE (ptr_type);
7620 tree type = lang_hooks.types.type_for_size (tree_low_cst (size, 1), 1);
7621 tree vectype_size_minus_1 = build_int_cst (type, vectype_align - 1);
7622 tree elem_size_log =
7623 build_int_cst (type, exact_log2 (vectype_align/nelements));
7624 tree nelements_minus_1 = build_int_cst (type, nelements - 1);
7625 tree nelements_tree = build_int_cst (type, nelements);
7629 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmts);
7630 gcc_assert (!new_bb);
7632 /* Create: byte_misalign = addr & (vectype_size - 1) */
7633 byte_misalign =
7634 fold_build2 (BIT_AND_EXPR, type, fold_convert (type, start_addr), vectype_size_minus_1);
7636 /* Create: elem_misalign = byte_misalign / element_size */
7637 elem_misalign =
7638 fold_build2 (RSHIFT_EXPR, type, byte_misalign, elem_size_log);
7640 /* Create: (niters_type) (nelements - elem_misalign)&(nelements - 1) */
7641 iters = fold_build2 (MINUS_EXPR, type, nelements_tree, elem_misalign);
7642 iters = fold_build2 (BIT_AND_EXPR, type, iters, nelements_minus_1);
7643 iters = fold_convert (niters_type, iters);
  /* Create:  prolog_loop_niters = min (iters, loop_niters)  */
  /* If the loop bound is known at compile time we already verified that it is
     greater than vf; since the misalignment ('iters') is at most vf, there's
     no need to generate the MIN_EXPR in this case.  */
  if (TREE_CODE (loop_niters) != INTEGER_CST)
    iters = fold_build2 (MIN_EXPR, niters_type, iters, loop_niters);

  if (vect_print_dump_info (REPORT_DETAILS))
    {
      fprintf (vect_dump, "niters for prolog loop: ");
      print_generic_expr (vect_dump, iters, TDF_SLIM);
    }

  var = create_tmp_var (niters_type, "prolog_loop_niters");
  add_referenced_var (var);
  stmts = NULL;
  iters_name = force_gimple_operand (iters, &stmts, false, var);

  /* Insert stmt on loop preheader edge.  */
  if (stmts)
    {
      basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
      gcc_assert (!new_bb);
    }

  return iters_name;
}
/* Function vect_update_init_of_dr

   NITERS iterations were peeled from LOOP.  DR represents a data reference
   in LOOP.  This function updates the information recorded in DR to
   account for the fact that the first NITERS iterations had already been
   executed.  Specifically, it updates the OFFSET field of DR.  */

static void
vect_update_init_of_dr (struct data_reference *dr, tree niters)
{
  tree offset = DR_OFFSET (dr);

  niters = fold_build2 (MULT_EXPR, sizetype,
                        fold_convert (sizetype, niters),
                        fold_convert (sizetype, DR_STEP (dr)));
  offset = fold_build2 (PLUS_EXPR, sizetype, offset, niters);
  DR_OFFSET (dr) = offset;
}
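/* For instance (an illustrative sketch): if NITERS is 2 and DR_STEP is
   4 bytes, the folding above amounts to DR_OFFSET += 8, so the recorded
   access now describes the location touched on the first unpeeled
   iteration.  */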
/* Function vect_update_inits_of_drs

   NITERS iterations were peeled from the loop represented by LOOP_VINFO.
   This function updates the information recorded for the data references in
   the loop to account for the fact that the first NITERS iterations had
   already been executed.  Specifically, it updates the initial_condition of
   the access_function of all the data_references in the loop.  */

static void
vect_update_inits_of_drs (loop_vec_info loop_vinfo, tree niters)
{
  unsigned int i;
  VEC (data_reference_p, heap) *datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
  struct data_reference *dr;

  if (vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump, "=== vect_update_inits_of_drs ===");

  for (i = 0; VEC_iterate (data_reference_p, datarefs, i, dr); i++)
    vect_update_init_of_dr (dr, niters);
}
/* Function vect_do_peeling_for_alignment

   Peel the first 'niters' iterations of the loop represented by LOOP_VINFO.
   'niters' is set to the misalignment of one of the data references in the
   loop, thereby forcing it to refer to an aligned location at the beginning
   of the execution of this loop.  The data reference for which we are
   peeling is recorded in LOOP_VINFO_UNALIGNED_DR.  */
static void
vect_do_peeling_for_alignment (loop_vec_info loop_vinfo)
{
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  tree niters_of_prolog_loop, ni_name;
  tree n_iters;
  struct loop *new_loop;
  bool check_profitability = false;
  unsigned int th = 0;
  int min_profitable_iters;

  if (vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump, "=== vect_do_peeling_for_alignment ===");

  initialize_original_copy_tables ();

  ni_name = vect_build_loop_niters (loop_vinfo);
  niters_of_prolog_loop = vect_gen_niters_for_prolog_loop (loop_vinfo, ni_name);

  /* If the cost-model check was not done during versioning, do it now.  */
  if (!VEC_length (gimple, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo))
      && !VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
    {
      check_profitability = true;

      /* Get profitability threshold for vectorized loop.  */
      min_profitable_iters = LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo);

      th = conservative_cost_threshold (loop_vinfo,
                                        min_profitable_iters);
    }

  /* Peel the prolog loop and iterate it niters_of_prolog_loop times.  */
  new_loop =
    slpeel_tree_peel_loop_to_edge (loop, loop_preheader_edge (loop),
                                   niters_of_prolog_loop, ni_name, true,
                                   th, check_profitability);

  gcc_assert (new_loop);
#ifdef ENABLE_CHECKING
  slpeel_verify_cfg_after_peeling (new_loop, loop);
#endif

  /* Update number of times loop executes.  */
  n_iters = LOOP_VINFO_NITERS (loop_vinfo);
  LOOP_VINFO_NITERS (loop_vinfo) = fold_build2 (MINUS_EXPR,
                TREE_TYPE (n_iters), n_iters, niters_of_prolog_loop);

  /* Update the init conditions of the access functions of all data refs.  */
  vect_update_inits_of_drs (loop_vinfo, niters_of_prolog_loop);

  /* After peeling we have to reset scalar evolution analyzer.  */
  scev_reset ();

  free_original_copy_tables ();
}
/* Function vect_create_cond_for_align_checks.

   Create a conditional expression that represents the alignment checks for
   all of the data references (array element references) whose alignment must
   be checked at runtime.

   Input:
   COND_EXPR  - input conditional expression.  New conditions will be chained
                with logical AND operation.
   LOOP_VINFO - two fields of the loop information are used.
                LOOP_VINFO_PTR_MASK is the mask used to check the alignment.
                LOOP_VINFO_MAY_MISALIGN_STMTS contains the refs to be checked.

   Output:
   COND_EXPR_STMT_LIST - statements needed to construct the conditional
                         expression.
   The returned value is the conditional expression to be used in the if
   statement that controls which version of the loop gets executed at runtime.

   The algorithm makes two assumptions:
     1) The number of bytes "n" in a vector is a power of 2.
     2) An address "a" is aligned if a%n is zero, and this test can be
        done as a&(n-1) == 0.  For example, for 16-byte vectors the test
        is a&0xf == 0.  */
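/* Concretely (a sketch of the code generated below; the SSA names are the
   temporaries this function creates): for two pointers p and q checked
   against 16-byte vectors (mask = 0xf), the statements amount to

     addr2int0 = (ptrsize_int) p;
     addr2int1 = (ptrsize_int) q;
     orptrs1   = addr2int0 | addr2int1;
     andmask   = orptrs1 & 0xf;
     cond      = (andmask == 0);

   ORing the addresses first is valid because a low bit set in either
   address survives the OR, so the single AND detects misalignment of
   any of the pointers.  */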
static void
vect_create_cond_for_align_checks (loop_vec_info loop_vinfo,
                                   tree *cond_expr,
                                   gimple_seq *cond_expr_stmt_list)
{
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  VEC(gimple,heap) *may_misalign_stmts
    = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
  gimple ref_stmt;
  int mask = LOOP_VINFO_PTR_MASK (loop_vinfo);
  tree mask_cst;
  unsigned int i;
  tree psize;
  tree int_ptrsize_type;
  char tmp_name[20];
  tree or_tmp_name = NULL_TREE;
  tree and_tmp, and_tmp_name;
  gimple and_stmt;
  tree ptrsize_zero;
  tree part_cond_expr;

  /* Check that mask is one less than a power of 2, i.e., mask is
     all zeros followed by all ones.  */
  gcc_assert ((mask != 0) && ((mask & (mask+1)) == 0));

  /* CHECKME: what is the best integer or unsigned type to use to hold a
     cast from a pointer value?  */
  psize = TYPE_SIZE (ptr_type_node);
  int_ptrsize_type
    = lang_hooks.types.type_for_size (tree_low_cst (psize, 1), 0);

  /* Create expression (mask & (dr_1 || ... || dr_n)) where dr_i is the
     address of the first vector of the i'th data reference.  */
  for (i = 0; VEC_iterate (gimple, may_misalign_stmts, i, ref_stmt); i++)
    {
      gimple_seq new_stmt_list = NULL;
      tree addr_base;
      tree addr_tmp, addr_tmp_name;
      tree or_tmp, new_or_tmp_name;
      gimple addr_stmt, or_stmt;

      /* create: addr_tmp = (int)(address_of_first_vector) */
      addr_base =
        vect_create_addr_base_for_vector_ref (ref_stmt, &new_stmt_list,
                                              NULL_TREE, loop);
      if (new_stmt_list != NULL)
        gimple_seq_add_seq (cond_expr_stmt_list, new_stmt_list);

      sprintf (tmp_name, "%s%d", "addr2int", i);
      addr_tmp = create_tmp_var (int_ptrsize_type, tmp_name);
      add_referenced_var (addr_tmp);
      addr_tmp_name = make_ssa_name (addr_tmp, NULL);
      addr_stmt = gimple_build_assign_with_ops (NOP_EXPR, addr_tmp_name,
                                                addr_base, NULL_TREE);
      SSA_NAME_DEF_STMT (addr_tmp_name) = addr_stmt;
      gimple_seq_add_stmt (cond_expr_stmt_list, addr_stmt);

      /* The addresses are ORed together.  */
      if (or_tmp_name != NULL_TREE)
        {
          /* create: or_tmp = or_tmp | addr_tmp */
          sprintf (tmp_name, "%s%d", "orptrs", i);
          or_tmp = create_tmp_var (int_ptrsize_type, tmp_name);
          add_referenced_var (or_tmp);
          new_or_tmp_name = make_ssa_name (or_tmp, NULL);
          or_stmt = gimple_build_assign_with_ops (BIT_IOR_EXPR,
                                                  new_or_tmp_name,
                                                  or_tmp_name, addr_tmp_name);
          SSA_NAME_DEF_STMT (new_or_tmp_name) = or_stmt;
          gimple_seq_add_stmt (cond_expr_stmt_list, or_stmt);
          or_tmp_name = new_or_tmp_name;
        }
      else
        or_tmp_name = addr_tmp_name;
    } /* end for i */
  mask_cst = build_int_cst (int_ptrsize_type, mask);

  /* create: and_tmp = or_tmp & mask  */
  and_tmp = create_tmp_var (int_ptrsize_type, "andmask");
  add_referenced_var (and_tmp);
  and_tmp_name = make_ssa_name (and_tmp, NULL);

  and_stmt = gimple_build_assign_with_ops (BIT_AND_EXPR, and_tmp_name,
                                           or_tmp_name, mask_cst);
  SSA_NAME_DEF_STMT (and_tmp_name) = and_stmt;
  gimple_seq_add_stmt (cond_expr_stmt_list, and_stmt);

  /* Make and_tmp the left operand of the conditional test against zero.
     If and_tmp has a nonzero bit then some address is unaligned.  */
  ptrsize_zero = build_int_cst (int_ptrsize_type, 0);
  part_cond_expr = fold_build2 (EQ_EXPR, boolean_type_node,
                                and_tmp_name, ptrsize_zero);
  if (*cond_expr)
    *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
                              *cond_expr, part_cond_expr);
  else
    *cond_expr = part_cond_expr;
}
/* Function vect_vfa_segment_size.

   Create an expression that computes the size of the segment that will be
   accessed for a data reference.  The function takes into account that
   realignment loads may access one more vector.

   Input:
     DR: The data reference.
     VECT_FACTOR: vectorization factor.

   Return an expression whose value is the size of the segment which will
   be accessed by DR.  */
static tree
vect_vfa_segment_size (struct data_reference *dr, tree vect_factor)
{
  tree segment_length = fold_build2 (MULT_EXPR, integer_type_node,
                                     DR_STEP (dr), vect_factor);

  if (vect_supportable_dr_alignment (dr) == dr_explicit_realign_optimized)
    {
      tree vector_size = TYPE_SIZE_UNIT
                          (STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr))));

      segment_length = fold_build2 (PLUS_EXPR, integer_type_node,
                                    segment_length, vector_size);
    }

  return fold_convert (sizetype, segment_length);
}
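/* For example (illustrative values): a data reference with DR_STEP = 4
   bytes and VF = 4 yields a 16-byte segment.  If the access uses the
   optimized explicit-realignment scheme, one extra vector (here 16 more
   bytes) is added, since such loads may also touch the vector that
   follows the segment.  */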
/* Function vect_create_cond_for_alias_checks.

   Create a conditional expression that represents the run-time checks for
   overlapping of address ranges represented by a list of data reference
   relations passed as input.

   Input:
   COND_EXPR  - input conditional expression.  New conditions will be chained
                with logical AND operation.
   LOOP_VINFO - field LOOP_VINFO_MAY_ALIAS_DDRS contains the list of ddrs
                to be checked.

   Output:
   COND_EXPR - conditional expression.
   COND_EXPR_STMT_LIST - statements needed to construct the conditional
                         expression.

   The returned value is the conditional expression to be used in the if
   statement that controls which version of the loop gets executed at
   runtime.  */
static void
vect_create_cond_for_alias_checks (loop_vec_info loop_vinfo,
                                   tree * cond_expr,
                                   gimple_seq * cond_expr_stmt_list)
{
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  VEC (ddr_p, heap) * may_alias_ddrs =
    LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
  tree vect_factor =
    build_int_cst (integer_type_node, LOOP_VINFO_VECT_FACTOR (loop_vinfo));
  ddr_p ddr;
  unsigned int i;
  tree part_cond_expr;

  /* Create expression
     ((store_ptr_0 + store_segment_length_0) < load_ptr_0)
     || (load_ptr_0 + load_segment_length_0) < store_ptr_0))
     &&
     ...
     &&
     ((store_ptr_n + store_segment_length_n) < load_ptr_n)
     || (load_ptr_n + load_segment_length_n) < store_ptr_n))  */
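  /* E.g. (an illustrative sketch): for one store through address p and one
     load through address q, both with 16-byte segments, the test built
     below is simply

       (p + 16 < q) || (q + 16 < p)

     i.e. the two runtime address ranges must lie entirely on opposite
     sides of each other for the vectorized version to be executed.  */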
  if (VEC_empty (ddr_p, may_alias_ddrs))
    return;

  for (i = 0; VEC_iterate (ddr_p, may_alias_ddrs, i, ddr); i++)
    {
      struct data_reference *dr_a, *dr_b;
      gimple dr_group_first_a, dr_group_first_b;
      tree addr_base_a, addr_base_b;
      tree segment_length_a, segment_length_b;
      gimple stmt_a, stmt_b;

      dr_a = DDR_A (ddr);
      stmt_a = DR_STMT (DDR_A (ddr));
      dr_group_first_a = DR_GROUP_FIRST_DR (vinfo_for_stmt (stmt_a));
      if (dr_group_first_a)
        {
          stmt_a = dr_group_first_a;
          dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
        }

      dr_b = DDR_B (ddr);
      stmt_b = DR_STMT (DDR_B (ddr));
      dr_group_first_b = DR_GROUP_FIRST_DR (vinfo_for_stmt (stmt_b));
      if (dr_group_first_b)
        {
          stmt_b = dr_group_first_b;
          dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
        }
      addr_base_a =
        vect_create_addr_base_for_vector_ref (stmt_a, cond_expr_stmt_list,
                                              NULL_TREE, loop);
      addr_base_b =
        vect_create_addr_base_for_vector_ref (stmt_b, cond_expr_stmt_list,
                                              NULL_TREE, loop);

      segment_length_a = vect_vfa_segment_size (dr_a, vect_factor);
      segment_length_b = vect_vfa_segment_size (dr_b, vect_factor);

      if (vect_print_dump_info (REPORT_DR_DETAILS))
        {
          fprintf (vect_dump,
                   "create runtime check for data references ");
          print_generic_expr (vect_dump, DR_REF (dr_a), TDF_SLIM);
          fprintf (vect_dump, " and ");
          print_generic_expr (vect_dump, DR_REF (dr_b), TDF_SLIM);
        }

      part_cond_expr =
        fold_build2 (TRUTH_OR_EXPR, boolean_type_node,
          fold_build2 (LT_EXPR, boolean_type_node,
            fold_build2 (POINTER_PLUS_EXPR, TREE_TYPE (addr_base_a),
              addr_base_a,
              segment_length_a),
            addr_base_b),
          fold_build2 (LT_EXPR, boolean_type_node,
            fold_build2 (POINTER_PLUS_EXPR, TREE_TYPE (addr_base_b),
              addr_base_b,
              segment_length_b),
            addr_base_a));

      if (*cond_expr)
        *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
                                  *cond_expr, part_cond_expr);
      else
        *cond_expr = part_cond_expr;
    }

  if (vect_print_dump_info (REPORT_VECTORIZED_LOOPS))
    fprintf (vect_dump, "created %u versioning for alias checks.\n",
             VEC_length (ddr_p, may_alias_ddrs));
}
/* Function vect_loop_versioning.

   If the loop has data references that may or may not be aligned and/or
   has data reference relations whose independence was not proven, then
   two versions of the loop need to be generated, one which is vectorized
   and one which isn't.  A test is then generated to control which of the
   loops is executed.  The test checks for the alignment of all of the
   data references that may or may not be aligned.  An additional
   sequence of runtime tests is generated for each pair of DDRs whose
   independence was not proven.  The vectorized version of the loop is
   executed only if both the alias and the alignment tests pass.

   The test generated to check which version of the loop is executed
   is also modified to check for profitability, as indicated by the
   cost model.  */
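/* Schematically, the control flow produced by this function looks like
   (a sketch with invented names, not literal GIMPLE):

     if (n > th && all_addrs_aligned && no_ranges_overlap)
       vectorized_loop;
     else
       scalar_loop;

   where the alignment and alias subconditions are only emitted when the
   corresponding may-misalign / may-alias lists are non-empty.  */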
static void
vect_loop_versioning (loop_vec_info loop_vinfo)
{
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  struct loop *nloop;
  tree cond_expr = NULL_TREE;
  gimple_seq cond_expr_stmt_list = NULL;
  basic_block condition_bb;
  gimple_stmt_iterator gsi, cond_exp_gsi;
  basic_block merge_bb;
  basic_block new_exit_bb;
  edge new_exit_e, e;
  gimple orig_phi, new_phi;
  tree arg;
  unsigned prob = 4 * REG_BR_PROB_BASE / 5;
  gimple_seq gimplify_stmt_list = NULL;
  tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
  int min_profitable_iters = 0;
  unsigned int th;

  /* Get profitability threshold for vectorized loop.  */
  min_profitable_iters = LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo);

  th = conservative_cost_threshold (loop_vinfo,
                                    min_profitable_iters);

  cond_expr =
    build2 (GT_EXPR, boolean_type_node, scalar_loop_iters,
            build_int_cst (TREE_TYPE (scalar_loop_iters), th));

  cond_expr = force_gimple_operand (cond_expr, &cond_expr_stmt_list,
                                    false, NULL_TREE);

  if (VEC_length (gimple, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo)))
    vect_create_cond_for_align_checks (loop_vinfo, &cond_expr,
                                       &cond_expr_stmt_list);

  if (VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
    vect_create_cond_for_alias_checks (loop_vinfo, &cond_expr,
                                       &cond_expr_stmt_list);

  cond_expr =
    fold_build2 (NE_EXPR, boolean_type_node, cond_expr, integer_zero_node);
  cond_expr =
    force_gimple_operand (cond_expr, &gimplify_stmt_list, true, NULL_TREE);
  gimple_seq_add_seq (&cond_expr_stmt_list, gimplify_stmt_list);

  initialize_original_copy_tables ();
  nloop = loop_version (loop, cond_expr, &condition_bb,
                        prob, prob, REG_BR_PROB_BASE - prob, true);
  free_original_copy_tables ();
  /* Loop versioning violates an assumption we try to maintain during
     vectorization - that the loop exit block has a single predecessor.
     After versioning, the exit block of both loop versions is the same
     basic block (i.e. it has two predecessors).  Just in order to simplify
     following transformations in the vectorizer, we fix this situation
     here by adding a new (empty) block on the exit-edge of the loop,
     with the proper loop-exit phis to maintain loop-closed-form.  */

  merge_bb = single_exit (loop)->dest;
  gcc_assert (EDGE_COUNT (merge_bb->preds) == 2);
  new_exit_bb = split_edge (single_exit (loop));
  new_exit_e = single_exit (loop);
  e = EDGE_SUCC (new_exit_bb, 0);

  for (gsi = gsi_start_phis (merge_bb); !gsi_end_p (gsi); gsi_next (&gsi))
    {
      orig_phi = gsi_stmt (gsi);
      new_phi = create_phi_node (SSA_NAME_VAR (PHI_RESULT (orig_phi)),
                                 new_exit_bb);
      arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, e);
      add_phi_arg (new_phi, arg, new_exit_e);
      SET_PHI_ARG_DEF (orig_phi, e->dest_idx, PHI_RESULT (new_phi));
    }

  /* End loop-exit-fixes after versioning.  */

  update_ssa (TODO_update_ssa);
  if (cond_expr_stmt_list)
    {
      cond_exp_gsi = gsi_last_bb (condition_bb);
      gsi_insert_seq_before (&cond_exp_gsi, cond_expr_stmt_list,
                             GSI_SAME_STMT);
    }
}
/* Remove a group of stores (for SLP or interleaving), free their
   stmt_vec_info.  */

static void
vect_remove_stores (gimple first_stmt)
{
  gimple next = first_stmt;
  gimple tmp;
  gimple_stmt_iterator next_si;

  while (next)
    {
      /* Free the attached stmt_vec_info and remove the stmt.  */
      next_si = gsi_for_stmt (next);
      gsi_remove (&next_si, true);
      tmp = DR_GROUP_NEXT_DR (vinfo_for_stmt (next));
      free_stmt_vec_info (next);
      next = tmp;
    }
}
/* Vectorize SLP instance tree in postorder.  */

static bool
vect_schedule_slp_instance (slp_tree node, slp_instance instance,
                            unsigned int vectorization_factor)
{
  gimple stmt;
  bool strided_store, is_store;
  gimple_stmt_iterator si;
  stmt_vec_info stmt_info;
  unsigned int vec_stmts_size, nunits, group_size;
  tree vectype;
  int i;
  slp_tree loads_node;

  if (!node)
    return false;

  vect_schedule_slp_instance (SLP_TREE_LEFT (node), instance,
                              vectorization_factor);
  vect_schedule_slp_instance (SLP_TREE_RIGHT (node), instance,
                              vectorization_factor);

  stmt = VEC_index (gimple, SLP_TREE_SCALAR_STMTS (node), 0);
  stmt_info = vinfo_for_stmt (stmt);

  /* VECTYPE is the type of the destination.  */
  vectype = get_vectype_for_scalar_type (TREE_TYPE (gimple_assign_lhs (stmt)));
  nunits = (unsigned int) TYPE_VECTOR_SUBPARTS (vectype);
  group_size = SLP_INSTANCE_GROUP_SIZE (instance);

  /* For each SLP instance calculate the number of vector stmts to be created
     for the scalar stmts in each node of the SLP tree.  The number of vector
     elements in one vector iteration is the number of scalar elements in
     one scalar iteration (GROUP_SIZE) multiplied by VF divided by the vector
     size.  */
  vec_stmts_size = (vectorization_factor * group_size) / nunits;
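  /* For example (illustrative numbers): an SLP group of 2 strided stores
     (GROUP_SIZE = 2) with VF = 8 and 4-element vectors (NUNITS = 4) needs
     (8 * 2) / 4 = 4 vector statements per node.  */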
  /* In case of load permutation we have to allocate vectorized statements for
     all the nodes that participate in that permutation.  */
  if (SLP_INSTANCE_LOAD_PERMUTATION (instance))
    {
      for (i = 0;
           VEC_iterate (slp_tree, SLP_INSTANCE_LOADS (instance), i,
                        loads_node);
           i++)
        {
          if (!SLP_TREE_VEC_STMTS (loads_node))
            {
              SLP_TREE_VEC_STMTS (loads_node) = VEC_alloc (gimple, heap,
                                                           vec_stmts_size);
              SLP_TREE_NUMBER_OF_VEC_STMTS (loads_node) = vec_stmts_size;
            }
        }
    }

  if (!SLP_TREE_VEC_STMTS (node))
    {
      SLP_TREE_VEC_STMTS (node) = VEC_alloc (gimple, heap, vec_stmts_size);
      SLP_TREE_NUMBER_OF_VEC_STMTS (node) = vec_stmts_size;
    }

  if (vect_print_dump_info (REPORT_DETAILS))
    {
      fprintf (vect_dump, "------>vectorizing SLP node starting from: ");
      print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
    }

  /* Loads should be inserted before the first load.  */
  if (SLP_INSTANCE_FIRST_LOAD_STMT (instance)
      && STMT_VINFO_STRIDED_ACCESS (stmt_info)
      && !REFERENCE_CLASS_P (gimple_get_lhs (stmt)))
    si = gsi_for_stmt (SLP_INSTANCE_FIRST_LOAD_STMT (instance));
  else
    si = gsi_for_stmt (stmt);

  is_store = vect_transform_stmt (stmt, &si, &strided_store, node, instance);
  if (is_store)
    {
      if (DR_GROUP_FIRST_DR (stmt_info))
        /* If IS_STORE is TRUE, the vectorization of the
           interleaving chain was completed - free all the stores in
           the chain.  */
        vect_remove_stores (DR_GROUP_FIRST_DR (stmt_info));
      else
        /* FORNOW: SLP originates only from strided stores.  */
        gcc_unreachable ();

      return true;
    }

  /* FORNOW: SLP originates only from strided stores.  */
  return false;
}
/* Generate vectorized code for all SLP instances in the loop.  */

static bool
vect_schedule_slp (loop_vec_info loop_vinfo)
{
  VEC (slp_instance, heap) *slp_instances =
    LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
  slp_instance instance;
  unsigned int i;
  bool is_store = false;

  for (i = 0; VEC_iterate (slp_instance, slp_instances, i, instance); i++)
    {
      /* Schedule the tree of INSTANCE.  */
      is_store = vect_schedule_slp_instance (SLP_INSTANCE_TREE (instance),
                           instance, LOOP_VINFO_VECT_FACTOR (loop_vinfo));

      if (vect_print_dump_info (REPORT_VECTORIZED_LOOPS)
          || vect_print_dump_info (REPORT_UNVECTORIZED_LOOPS))
        fprintf (vect_dump, "vectorizing stmts using SLP.");
    }

  return is_store;
}
/* Function vect_transform_loop.

   The analysis phase has determined that the loop is vectorizable.
   Vectorize the loop - create vectorized stmts to replace the scalar
   stmts in the loop, and update the loop exit condition.  */

void
vect_transform_loop (loop_vec_info loop_vinfo)
{
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
  int nbbs = loop->num_nodes;
  gimple_stmt_iterator si;
  int i;
  tree ratio = NULL;
  int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
  bool strided_store;
  bool slp_scheduled = false;
  unsigned int nunits;

  if (vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump, "=== vec_transform_loop ===");

  if (VEC_length (gimple, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo))
      || VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
    vect_loop_versioning (loop_vinfo);

  /* CHECKME: we wouldn't need this if we called update_ssa once
     for all loops.  */
  bitmap_zero (vect_memsyms_to_rename);

  /* Peel the loop if there are data refs with unknown alignment.
     Only one data ref with unknown alignment is allowed.  */

  if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
    vect_do_peeling_for_alignment (loop_vinfo);

  /* If the loop has a symbolic number of iterations 'n' (i.e. it's not a
     compile time constant), or it is a constant that doesn't divide by the
     vectorization factor, then an epilog loop needs to be created.
     We therefore duplicate the loop: the original loop will be vectorized,
     and will compute the first (n/VF) iterations.  The second copy of the
     loop will remain scalar and will compute the remaining (n%VF)
     iterations.  (VF is the vectorization factor).  */
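  /* For instance (an illustrative sketch): with n = 103 and VF = 4, the
     vectorized loop executes floor (103/4) = 25 iterations and the scalar
     epilog loop executes the remaining 103 % 4 = 3 iterations.  */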
  if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
      || (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
          && LOOP_VINFO_INT_NITERS (loop_vinfo) % vectorization_factor != 0))
    vect_do_peeling_for_loop_bound (loop_vinfo, &ratio);
  else
    ratio = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
                LOOP_VINFO_INT_NITERS (loop_vinfo) / vectorization_factor);

  /* 1) Make sure the loop header has exactly two entries
     2) Make sure we have a preheader basic block.  */

  gcc_assert (EDGE_COUNT (loop->header->preds) == 2);

  split_edge (loop_preheader_edge (loop));

  /* FORNOW: the vectorizer supports only loops whose body consists of one
     basic block (header + empty latch).  When the vectorizer supports more
     involved loop forms, the order in which the BBs are traversed will
     need to be reconsidered.  */
  for (i = 0; i < nbbs; i++)
    {
      basic_block bb = bbs[i];
      stmt_vec_info stmt_info;
      gimple phi;

      for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
        {
          phi = gsi_stmt (si);
          if (vect_print_dump_info (REPORT_DETAILS))
            {
              fprintf (vect_dump, "------>vectorizing phi: ");
              print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
            }
          stmt_info = vinfo_for_stmt (phi);
          if (!stmt_info)
            continue;

          if (!STMT_VINFO_RELEVANT_P (stmt_info)
              && !STMT_VINFO_LIVE_P (stmt_info))
            continue;

          if ((TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
               != (unsigned HOST_WIDE_INT) vectorization_factor)
              && vect_print_dump_info (REPORT_DETAILS))
            fprintf (vect_dump, "multiple-types.");

          if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
            {
              if (vect_print_dump_info (REPORT_DETAILS))
                fprintf (vect_dump, "transform phi.");
              vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
            }
        }
      for (si = gsi_start_bb (bb); !gsi_end_p (si);)
        {
          gimple stmt = gsi_stmt (si);
          bool is_store;

          if (vect_print_dump_info (REPORT_DETAILS))
            {
              fprintf (vect_dump, "------>vectorizing statement: ");
              print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
            }
          stmt_info = vinfo_for_stmt (stmt);

          /* vector stmts created in the outer-loop during vectorization of
             stmts in an inner-loop may not have a stmt_info, and do not
             need to be vectorized.  */
          if (!stmt_info)
            {
              gsi_next (&si);
              continue;
            }
          if (!STMT_VINFO_RELEVANT_P (stmt_info)
              && !STMT_VINFO_LIVE_P (stmt_info))
            {
              gsi_next (&si);
              continue;
            }

          gcc_assert (STMT_VINFO_VECTYPE (stmt_info));
          nunits = (unsigned int) TYPE_VECTOR_SUBPARTS
                                    (STMT_VINFO_VECTYPE (stmt_info));
          if (!STMT_SLP_TYPE (stmt_info)
              && nunits != (unsigned int) vectorization_factor
              && vect_print_dump_info (REPORT_DETAILS))
            /* For SLP, VF is set according to the unrolling factor, and not
               to the vector size, hence for SLP this print is not valid.  */
            fprintf (vect_dump, "multiple-types.");

          /* SLP.  Schedule all the SLP instances when the first SLP stmt is
             reached.  */
          if (STMT_SLP_TYPE (stmt_info))
            {
              if (!slp_scheduled)
                {
                  slp_scheduled = true;
                  if (vect_print_dump_info (REPORT_DETAILS))
                    fprintf (vect_dump, "=== scheduling SLP instances ===");

                  is_store = vect_schedule_slp (loop_vinfo);

                  /* IS_STORE is true if STMT is a store.  Stores cannot be
                     of hybrid SLP type.  They are removed in
                     vect_schedule_slp_instance and their vinfo is
                     destroyed.  */
                  if (is_store)
                    {
                      gsi_next (&si);
                      continue;
                    }
                }

              /* Hybrid SLP stmts must be vectorized in addition to SLP.  */
              if (PURE_SLP_STMT (stmt_info))
                {
                  gsi_next (&si);
                  continue;
                }
            }
          /* -------- vectorize statement ------------ */
          if (vect_print_dump_info (REPORT_DETAILS))
            fprintf (vect_dump, "transform statement.");

          strided_store = false;
          is_store = vect_transform_stmt (stmt, &si, &strided_store, NULL,
                                          NULL);
          if (is_store)
            {
              if (STMT_VINFO_STRIDED_ACCESS (stmt_info))
                {
                  /* Interleaving.  If IS_STORE is TRUE, the vectorization of
                     the interleaving chain was completed - free all the
                     stores in the chain.  */
                  vect_remove_stores (DR_GROUP_FIRST_DR (stmt_info));
                  gsi_remove (&si, true);
                  continue;
                }
              else
                {
                  /* Free the attached stmt_vec_info and remove the stmt.  */
                  free_stmt_vec_info (stmt);
                  gsi_remove (&si, true);
                  continue;
                }
            }
          gsi_next (&si);
        }                       /* stmts in BB */
    }                           /* BBs in loop */

  slpeel_make_loop_iterate_ntimes (loop, ratio);

  mark_set_for_renaming (vect_memsyms_to_rename);

  /* The memory tags and pointers in vectorized statements need to
     have their SSA forms updated.  FIXME, why can't this be delayed
     until all the loops have been transformed?  */
  update_ssa (TODO_update_ssa);

  if (vect_print_dump_info (REPORT_VECTORIZED_LOOPS))
    fprintf (vect_dump, "LOOP VECTORIZED.");
  if (loop->inner && vect_print_dump_info (REPORT_VECTORIZED_LOOPS))
    fprintf (vect_dump, "OUTER LOOP VECTORIZED.");
}