1 /* Transformation Utilities for Loop Vectorization.
2 Copyright (C) 2003, 2004, 2005, 2006, 2007 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
5 This file is part of GCC.
7 GCC is free software; you can redistribute it and/or modify it under
8 the terms of the GNU General Public License as published by the Free
9 Software Foundation; either version 3, or (at your option) any later
12 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
13 WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
23 #include "coretypes.h"
29 #include "basic-block.h"
30 #include "diagnostic.h"
31 #include "tree-flow.h"
32 #include "tree-dump.h"
39 #include "tree-data-ref.h"
40 #include "tree-chrec.h"
41 #include "tree-scalar-evolution.h"
42 #include "tree-vectorizer.h"
43 #include "langhooks.h"
44 #include "tree-pass.h"
48 /* Utility functions for the code transformation. */
49 static bool vect_transform_stmt (tree, block_stmt_iterator *, bool *);
50 static tree vect_create_destination_var (tree, tree);
51 static tree vect_create_data_ref_ptr
52 (tree, struct loop*, tree, tree *, tree *, bool, tree, bool *);
53 static tree vect_create_addr_base_for_vector_ref
54 (tree, tree *, tree, struct loop *);
55 static tree vect_get_new_vect_var (tree, enum vect_var_kind, const char *);
56 static tree vect_get_vec_def_for_operand (tree, tree, tree *);
57 static tree vect_init_vector (tree, tree, tree, block_stmt_iterator *);
58 static void vect_finish_stmt_generation
59 (tree stmt, tree vec_stmt, block_stmt_iterator *);
60 static bool vect_is_simple_cond (tree, loop_vec_info);
61 static void vect_create_epilog_for_reduction (tree, tree, enum tree_code, tree);
62 static tree get_initial_def_for_reduction (tree, tree, tree *);
64 /* Utility function dealing with loop peeling (not peeling itself). */
65 static void vect_generate_tmps_on_preheader
66 (loop_vec_info, tree *, tree *, tree *);
67 static tree vect_build_loop_niters (loop_vec_info);
68 static void vect_update_ivs_after_vectorizer (loop_vec_info, tree, edge);
69 static tree vect_gen_niters_for_prolog_loop (loop_vec_info, tree);
70 static void vect_update_init_of_dr (struct data_reference *, tree niters);
71 static void vect_update_inits_of_drs (loop_vec_info, tree);
72 static int vect_min_worthwhile_factor (enum tree_code);
76 cost_for_stmt (tree stmt)
78 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
80 switch (STMT_VINFO_TYPE (stmt_info))
82 case load_vec_info_type:
83 return TARG_SCALAR_LOAD_COST;
84 case store_vec_info_type:
85 return TARG_SCALAR_STORE_COST;
86 case op_vec_info_type:
87 case condition_vec_info_type:
88 case assignment_vec_info_type:
89 case reduc_vec_info_type:
90 case induc_vec_info_type:
91 case type_promotion_vec_info_type:
92 case type_demotion_vec_info_type:
93 case type_conversion_vec_info_type:
94 case call_vec_info_type:
95 return TARG_SCALAR_STMT_COST;
96 case undef_vec_info_type:
103 /* Function vect_estimate_min_profitable_iters
105 Return the number of iterations required for the vector version of the
106 loop to be profitable relative to the cost of the scalar version of the
109 TODO: Take profile info into account before making vectorization
110 decisions, if available. */
113 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo)
116 int min_profitable_iters;
117 int peel_iters_prologue;
118 int peel_iters_epilogue;
119 int vec_inside_cost = 0;
120 int vec_outside_cost = 0;
121 int scalar_single_iter_cost = 0;
122 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
123 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
124 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
125 int nbbs = loop->num_nodes;
127 int innerloop_iters = 0, factor;
129 /* Cost model disabled. */
130 if (!flag_vect_cost_model)
132 if (vect_print_dump_info (REPORT_DETAILS))
133 fprintf (vect_dump, "cost model disabled.");
137 /* Requires loop versioning tests to handle misalignment.
138 FIXME: Make cost depend on number of stmts in may_misalign list. */
140 if (VEC_length (tree, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo)))
142 vec_outside_cost += TARG_COND_BRANCH_COST;
143 if (vect_print_dump_info (REPORT_DETAILS))
144 fprintf (vect_dump, "cost model: Adding cost of checks for loop "
148 /* Count statements in scalar loop. Using this as scalar cost for a single
151 TODO: Add outer loop support.
153 TODO: Consider assigning different costs to different scalar
158 innerloop_iters = 50; /* FIXME */
160 for (i = 0; i < nbbs; i++)
162 block_stmt_iterator si;
163 basic_block bb = bbs[i];
165 if (bb->loop_father == loop->inner)
166 factor = innerloop_iters;
170 for (si = bsi_start (bb); !bsi_end_p (si); bsi_next (&si))
172 tree stmt = bsi_stmt (si);
173 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
174 if (!STMT_VINFO_RELEVANT_P (stmt_info)
175 && !STMT_VINFO_LIVE_P (stmt_info))
177 scalar_single_iter_cost += cost_for_stmt (stmt) * factor;
178 vec_inside_cost += STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) * factor;
179 /* FIXME: for stmts in the inner-loop in outer-loop vectorization,
180 some of the "outside" costs are generated inside the outer-loop. */
181 vec_outside_cost += STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info);
185 /* Add additional cost for the peeled instructions in prologue and epilogue
188 FORNOW: If we dont know the value of peel_iters for prologue or epilogue
189 at compile-time - we assume it's (vf-1)/2 (the worst would be vf-1).
191 TODO: Build an expression that represents peel_iters for prologue and
192 epilogue to be used in a run-time test. */
194 byte_misalign = LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo);
196 if (byte_misalign < 0)
198 peel_iters_prologue = (vf - 1)/2;
199 if (vect_print_dump_info (REPORT_DETAILS))
200 fprintf (vect_dump, "cost model: "
201 "prologue peel iters set to (vf-1)/2.");
203 /* If peeling for alignment is unknown, loop bound of main loop becomes
205 peel_iters_epilogue = (vf - 1)/2;
206 if (vect_print_dump_info (REPORT_DETAILS))
207 fprintf (vect_dump, "cost model: "
208 "epilogue peel iters set to (vf-1)/2 because "
209 "peeling for alignment is unknown .");
215 struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
216 int element_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
217 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr)));
218 int nelements = TYPE_VECTOR_SUBPARTS (vectype);
220 peel_iters_prologue = nelements - (byte_misalign / element_size);
223 peel_iters_prologue = 0;
225 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
227 peel_iters_epilogue = (vf - 1)/2;
228 if (vect_print_dump_info (REPORT_DETAILS))
229 fprintf (vect_dump, "cost model: "
230 "epilogue peel iters set to (vf-1)/2 because "
231 "loop iterations are unknown .");
235 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
236 peel_iters_prologue = niters < peel_iters_prologue ?
237 niters : peel_iters_prologue;
238 peel_iters_epilogue = (niters - peel_iters_prologue) % vf;
242 /* Requires a prologue loop when peeling to handle misalignment. Add cost of
243 two guards, one for the peeled loop and one for the vector loop. */
245 if (peel_iters_prologue)
247 vec_outside_cost += 2 * TARG_COND_BRANCH_COST;
248 if (vect_print_dump_info (REPORT_DETAILS))
249 fprintf (vect_dump, "cost model: Adding cost of checks for "
253 /* Requires an epilogue loop to finish up remaining iterations after vector
254 loop. Add cost of two guards, one for the peeled loop and one for the
257 if (peel_iters_epilogue
258 || !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
259 || LOOP_VINFO_INT_NITERS (loop_vinfo) % vf)
261 vec_outside_cost += 2 * TARG_COND_BRANCH_COST;
262 if (vect_print_dump_info (REPORT_DETAILS))
263 fprintf (vect_dump, "cost model : Adding cost of checks for "
267 vec_outside_cost += (peel_iters_prologue * scalar_single_iter_cost)
268 + (peel_iters_epilogue * scalar_single_iter_cost);
270 /* Allow targets add additional (outside-of-loop) costs. FORNOW, the only
271 information we provide for the target is whether testing against the
272 threshold involves a runtime test. */
273 if (targetm.vectorize.builtin_vectorization_cost)
275 bool runtime_test = false;
277 /* If the number of iterations is unknown, or the
278 peeling-for-misalignment amount is unknown, we eill have to generate
279 a runtime test to test the loop count against the threshold. */
280 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
281 || (byte_misalign < 0))
284 targetm.vectorize.builtin_vectorization_cost (runtime_test);
285 if (vect_print_dump_info (REPORT_DETAILS))
286 fprintf (vect_dump, "cost model : Adding target out-of-loop cost = %d",
287 targetm.vectorize.builtin_vectorization_cost (runtime_test));
290 /* Calculate number of iterations required to make the vector version
291 profitable, relative to the loop bodies only. The following condition
292 must hold true: ((SIC*VF)-VIC)*niters > VOC*VF, where
293 SIC = scalar iteration cost, VIC = vector iteration cost,
294 VOC = vector outside cost and VF = vectorization factor. */
296 if ((scalar_single_iter_cost * vf) > vec_inside_cost)
298 if (vec_outside_cost == 0)
299 min_profitable_iters = 1;
302 min_profitable_iters = (vec_outside_cost * vf)
303 / ((scalar_single_iter_cost * vf)
306 if ((scalar_single_iter_cost * vf * min_profitable_iters)
307 <= ((vec_inside_cost * min_profitable_iters)
308 + (vec_outside_cost * vf)))
309 min_profitable_iters++;
312 /* vector version will never be profitable. */
315 if (vect_print_dump_info (REPORT_DETAILS))
316 fprintf (vect_dump, "cost model: vector iteration cost = %d "
317 "is divisible by scalar iteration cost = %d by a factor "
318 "greater than or equal to the vectorization factor = %d .",
319 vec_inside_cost, scalar_single_iter_cost, vf);
323 if (vect_print_dump_info (REPORT_DETAILS))
325 fprintf (vect_dump, "Cost model analysis: \n");
326 fprintf (vect_dump, " Vector inside of loop cost: %d\n",
328 fprintf (vect_dump, " Vector outside of loop cost: %d\n",
330 fprintf (vect_dump, " Scalar cost: %d\n", scalar_single_iter_cost);
331 fprintf (vect_dump, " prologue iterations: %d\n",
332 peel_iters_prologue);
333 fprintf (vect_dump, " epilogue iterations: %d\n",
334 peel_iters_epilogue);
335 fprintf (vect_dump, " Calculated minimum iters for profitability: %d\n",
336 min_profitable_iters);
337 fprintf (vect_dump, " Actual minimum iters for profitability: %d\n",
338 min_profitable_iters < vf ? vf : min_profitable_iters);
341 min_profitable_iters =
342 min_profitable_iters < vf ? vf : min_profitable_iters;
344 /* Because the condition we create is:
345 if (niters <= min_profitable_iters)
346 then skip the vectorized loop. */
347 min_profitable_iters--;
348 return min_profitable_iters;
352 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
353 functions. Design better to avoid maintenance issues. */
355 /* Function vect_model_reduction_cost.
357 Models cost for a reduction operation, including the vector ops
358 generated within the strip-mine loop, the initial definition before
359 the loop, and the epilogue code that must be generated. */
362 vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
371 enum machine_mode mode;
372 tree operation = GIMPLE_STMT_OPERAND (STMT_VINFO_STMT (stmt_info), 1);
373 int op_type = TREE_CODE_LENGTH (TREE_CODE (operation));
374 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
375 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
377 /* Cost of reduction op inside loop. */
378 STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) += ncopies * TARG_VEC_STMT_COST;
380 reduction_op = TREE_OPERAND (operation, op_type-1);
381 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
382 mode = TYPE_MODE (vectype);
383 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
386 orig_stmt = STMT_VINFO_STMT (stmt_info);
388 code = TREE_CODE (GIMPLE_STMT_OPERAND (orig_stmt, 1));
390 /* Add in cost for initial definition. */
391 outer_cost += TARG_SCALAR_TO_VEC_COST;
393 /* Determine cost of epilogue code.
395 We have a reduction operator that will reduce the vector in one statement.
396 Also requires scalar extract. */
398 if (!nested_in_vect_loop_p (loop, orig_stmt))
400 if (reduc_code < NUM_TREE_CODES)
401 outer_cost += TARG_VEC_STMT_COST + TARG_VEC_TO_SCALAR_COST;
404 int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
406 TYPE_SIZE (TREE_TYPE ( GIMPLE_STMT_OPERAND (orig_stmt, 0)));
407 int element_bitsize = tree_low_cst (bitsize, 1);
408 int nelements = vec_size_in_bits / element_bitsize;
410 optab = optab_for_tree_code (code, vectype);
412 /* We have a whole vector shift available. */
413 if (VECTOR_MODE_P (mode)
414 && optab_handler (optab, mode)->insn_code != CODE_FOR_nothing
415 && optab_handler (vec_shr_optab, mode)->insn_code != CODE_FOR_nothing)
416 /* Final reduction via vector shifts and the reduction operator. Also
417 requires scalar extract. */
418 outer_cost += ((exact_log2(nelements) * 2) * TARG_VEC_STMT_COST
419 + TARG_VEC_TO_SCALAR_COST);
421 /* Use extracts and reduction op for final reduction. For N elements,
422 we have N extracts and N-1 reduction ops. */
423 outer_cost += ((nelements + nelements - 1) * TARG_VEC_STMT_COST);
427 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info) = outer_cost;
429 if (vect_print_dump_info (REPORT_DETAILS))
430 fprintf (vect_dump, "vect_model_reduction_cost: inside_cost = %d, "
431 "outside_cost = %d .", STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info),
432 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info));
436 /* Function vect_model_induction_cost.
438 Models cost for induction operations. */
441 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
443 /* loop cost for vec_loop. */
444 STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) = ncopies * TARG_VEC_STMT_COST;
445 /* prologue cost for vec_init and vec_step. */
446 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info) = 2 * TARG_SCALAR_TO_VEC_COST;
448 if (vect_print_dump_info (REPORT_DETAILS))
449 fprintf (vect_dump, "vect_model_induction_cost: inside_cost = %d, "
450 "outside_cost = %d .", STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info),
451 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info));
455 /* Function vect_model_simple_cost.
457 Models cost for simple operations, i.e. those that only emit ncopies of a
458 single op. Right now, this does not account for multiple insns that could
459 be generated for the single vector op. We will handle that shortly. */
462 vect_model_simple_cost (stmt_vec_info stmt_info, int ncopies, enum vect_def_type *dt)
466 STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) = ncopies * TARG_VEC_STMT_COST;
468 /* FORNOW: Assuming maximum 2 args per stmts. */
471 if (dt[i] == vect_constant_def || dt[i] == vect_invariant_def)
472 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info) += TARG_SCALAR_TO_VEC_COST;
475 if (vect_print_dump_info (REPORT_DETAILS))
476 fprintf (vect_dump, "vect_model_simple_cost: inside_cost = %d, "
477 "outside_cost = %d .", STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info),
478 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info));
482 /* Function vect_cost_strided_group_size
484 For strided load or store, return the group_size only if it is the first
485 load or store of a group, else return 1. This ensures that group size is
486 only returned once per group. */
489 vect_cost_strided_group_size (stmt_vec_info stmt_info)
491 tree first_stmt = DR_GROUP_FIRST_DR (stmt_info);
493 if (first_stmt == STMT_VINFO_STMT (stmt_info))
494 return DR_GROUP_SIZE (stmt_info);
500 /* Function vect_model_store_cost
502 Models cost for stores. In the case of strided accesses, one access
503 has the overhead of the strided access attributed to it. */
506 vect_model_store_cost (stmt_vec_info stmt_info, int ncopies, enum vect_def_type dt)
511 if (dt == vect_constant_def || dt == vect_invariant_def)
512 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info) = TARG_SCALAR_TO_VEC_COST;
514 /* Strided access? */
515 if (DR_GROUP_FIRST_DR (stmt_info))
516 group_size = vect_cost_strided_group_size (stmt_info);
517 /* Not a strided access. */
521 /* Is this an access in a group of stores, which provide strided access?
522 If so, add in the cost of the permutes. */
525 /* Uses a high and low interleave operation for each needed permute. */
526 cost = ncopies * exact_log2(group_size) * group_size
527 * TARG_VEC_STMT_COST;
529 if (vect_print_dump_info (REPORT_DETAILS))
530 fprintf (vect_dump, "vect_model_store_cost: strided group_size = %d .",
535 /* Costs of the stores. */
536 cost += ncopies * TARG_VEC_STORE_COST;
538 STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) = cost;
540 if (vect_print_dump_info (REPORT_DETAILS))
541 fprintf (vect_dump, "vect_model_store_cost: inside_cost = %d, "
542 "outside_cost = %d .", STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info),
543 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info));
547 /* Function vect_model_load_cost
549 Models cost for loads. In the case of strided accesses, the last access
550 has the overhead of the strided access attributed to it. Since unaligned
551 accesses are supported for loads, we also account for the costs of the
552 access scheme chosen. */
555 vect_model_load_cost (stmt_vec_info stmt_info, int ncopies)
560 int alignment_support_cheme;
562 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info), *first_dr;
564 /* Strided accesses? */
565 first_stmt = DR_GROUP_FIRST_DR (stmt_info);
568 group_size = vect_cost_strided_group_size (stmt_info);
569 first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
571 /* Not a strided access. */
578 alignment_support_cheme = vect_supportable_dr_alignment (first_dr);
580 /* Is this an access in a group of loads providing strided access?
581 If so, add in the cost of the permutes. */
584 /* Uses an even and odd extract operations for each needed permute. */
585 inner_cost = ncopies * exact_log2(group_size) * group_size
586 * TARG_VEC_STMT_COST;
588 if (vect_print_dump_info (REPORT_DETAILS))
589 fprintf (vect_dump, "vect_model_load_cost: strided group_size = %d .",
594 /* The loads themselves. */
595 switch (alignment_support_cheme)
599 inner_cost += ncopies * TARG_VEC_LOAD_COST;
601 if (vect_print_dump_info (REPORT_DETAILS))
602 fprintf (vect_dump, "vect_model_load_cost: aligned.");
606 case dr_unaligned_supported:
608 /* Here, we assign an additional cost for the unaligned load. */
609 inner_cost += ncopies * TARG_VEC_UNALIGNED_LOAD_COST;
611 if (vect_print_dump_info (REPORT_DETAILS))
612 fprintf (vect_dump, "vect_model_load_cost: unaligned supported by "
617 case dr_explicit_realign:
619 inner_cost += ncopies * (2*TARG_VEC_LOAD_COST + TARG_VEC_STMT_COST);
621 /* FIXME: If the misalignment remains fixed across the iterations of
622 the containing loop, the following cost should be added to the
624 if (targetm.vectorize.builtin_mask_for_load)
625 inner_cost += TARG_VEC_STMT_COST;
629 case dr_explicit_realign_optimized:
633 if (vect_print_dump_info (REPORT_DETAILS))
634 fprintf (vect_dump, "vect_model_load_cost: unaligned software "
637 /* Unaligned software pipeline has a load of an address, an initial
638 load, and possibly a mask operation to "prime" the loop. However,
639 if this is an access in a group of loads, which provide strided
640 access, then the above cost should only be considered for one
641 access in the group. Inside the loop, there is a load op
642 and a realignment op. */
644 if ((!DR_GROUP_FIRST_DR (stmt_info)) || group_size > 1)
646 outer_cost = 2*TARG_VEC_STMT_COST;
647 if (targetm.vectorize.builtin_mask_for_load)
648 outer_cost += TARG_VEC_STMT_COST;
651 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info) = outer_cost;
653 inner_cost += ncopies * (TARG_VEC_LOAD_COST + TARG_VEC_STMT_COST);
662 STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) = inner_cost;
664 if (vect_print_dump_info (REPORT_DETAILS))
665 fprintf (vect_dump, "vect_model_load_cost: inside_cost = %d, "
666 "outside_cost = %d .", STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info),
667 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info));
672 /* Function vect_get_new_vect_var.
674 Returns a name for a new variable. The current naming scheme appends the
675 prefix "vect_" or "vect_p" (depending on the value of VAR_KIND) to
676 the name of vectorizer generated variables, and appends that to NAME if
680 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
687 case vect_simple_var:
690 case vect_scalar_var:
693 case vect_pointer_var:
702 char* tmp = concat (prefix, name, NULL);
703 new_vect_var = create_tmp_var (type, tmp);
707 new_vect_var = create_tmp_var (type, prefix);
709 /* Mark vector typed variable as a gimple register variable. */
710 if (TREE_CODE (type) == VECTOR_TYPE)
711 DECL_GIMPLE_REG_P (new_vect_var) = true;
717 /* Function vect_create_addr_base_for_vector_ref.
719 Create an expression that computes the address of the first memory location
720 that will be accessed for a data reference.
723 STMT: The statement containing the data reference.
724 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
725 OFFSET: Optional. If supplied, it is be added to the initial address.
726 LOOP: Specify relative to which loop-nest should the address be computed.
727 For example, when the dataref is in an inner-loop nested in an
728 outer-loop that is now being vectorized, LOOP can be either the
729 outer-loop, or the inner-loop. The first memory location accessed
730 by the following dataref ('in' points to short):
737 if LOOP=i_loop: &in (relative to i_loop)
738 if LOOP=j_loop: &in+i*2B (relative to j_loop)
741 1. Return an SSA_NAME whose value is the address of the memory location of
742 the first vector of the data reference.
743 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
744 these statement(s) which define the returned SSA_NAME.
746 FORNOW: We are only handling array accesses with step 1. */
749 vect_create_addr_base_for_vector_ref (tree stmt,
754 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
755 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
756 struct loop *containing_loop = (bb_for_stmt (stmt))->loop_father;
757 tree data_ref_base = unshare_expr (DR_BASE_ADDRESS (dr));
759 tree data_ref_base_var;
762 tree addr_base, addr_expr;
764 tree base_offset = unshare_expr (DR_OFFSET (dr));
765 tree init = unshare_expr (DR_INIT (dr));
766 tree vect_ptr_type, addr_expr2;
767 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
770 if (loop != containing_loop)
772 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
773 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
775 gcc_assert (nested_in_vect_loop_p (loop, stmt));
777 data_ref_base = unshare_expr (STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
778 base_offset = unshare_expr (STMT_VINFO_DR_OFFSET (stmt_info));
779 init = unshare_expr (STMT_VINFO_DR_INIT (stmt_info));
782 /* Create data_ref_base */
783 base_name = build_fold_indirect_ref (data_ref_base);
784 data_ref_base_var = create_tmp_var (TREE_TYPE (data_ref_base), "batmp");
785 add_referenced_var (data_ref_base_var);
786 data_ref_base = force_gimple_operand (data_ref_base, &new_base_stmt,
787 true, data_ref_base_var);
788 append_to_statement_list_force(new_base_stmt, new_stmt_list);
790 /* Create base_offset */
791 base_offset = size_binop (PLUS_EXPR, base_offset, init);
792 base_offset = fold_convert (sizetype, base_offset);
793 dest = create_tmp_var (TREE_TYPE (base_offset), "base_off");
794 add_referenced_var (dest);
795 base_offset = force_gimple_operand (base_offset, &new_stmt, true, dest);
796 append_to_statement_list_force (new_stmt, new_stmt_list);
800 tree tmp = create_tmp_var (sizetype, "offset");
802 add_referenced_var (tmp);
803 offset = fold_build2 (MULT_EXPR, TREE_TYPE (offset), offset, step);
804 base_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (base_offset),
805 base_offset, offset);
806 base_offset = force_gimple_operand (base_offset, &new_stmt, false, tmp);
807 append_to_statement_list_force (new_stmt, new_stmt_list);
810 /* base + base_offset */
811 addr_base = fold_build2 (POINTER_PLUS_EXPR, TREE_TYPE (data_ref_base),
812 data_ref_base, base_offset);
814 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
816 /* addr_expr = addr_base */
817 addr_expr = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var,
818 get_name (base_name));
819 add_referenced_var (addr_expr);
820 vec_stmt = fold_convert (vect_ptr_type, addr_base);
821 addr_expr2 = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var,
822 get_name (base_name));
823 add_referenced_var (addr_expr2);
824 vec_stmt = force_gimple_operand (vec_stmt, &new_stmt, false, addr_expr2);
825 append_to_statement_list_force (new_stmt, new_stmt_list);
827 if (vect_print_dump_info (REPORT_DETAILS))
829 fprintf (vect_dump, "created ");
830 print_generic_expr (vect_dump, vec_stmt, TDF_SLIM);
836 /* Function vect_create_data_ref_ptr.
838 Create a new pointer to vector type (vp), that points to the first location
839 accessed in the loop by STMT, along with the def-use update chain to
840 appropriately advance the pointer through the loop iterations. Also set
841 aliasing information for the pointer. This vector pointer is used by the
842 callers to this function to create a memory reference expression for vector
846 1. STMT: a stmt that references memory. Expected to be of the form
847 GIMPLE_MODIFY_STMT <name, data-ref> or
848 GIMPLE_MODIFY_STMT <data-ref, name>.
849 2. AT_LOOP: the loop where the vector memref is to be created.
850 3. OFFSET (optional): an offset to be added to the initial address accessed
851 by the data-ref in STMT.
852 4. ONLY_INIT: indicate if vp is to be updated in the loop, or remain
853 pointing to the initial address.
854 5. TYPE: if not NULL indicates the required type of the data-ref
857 1. Declare a new ptr to vector_type, and have it point to the base of the
858 data reference (initial addressed accessed by the data reference).
859 For example, for vector of type V8HI, the following code is generated:
862 vp = (v8hi *)initial_address;
864 if OFFSET is not supplied:
865 initial_address = &a[init];
866 if OFFSET is supplied:
867 initial_address = &a[init + OFFSET];
869 Return the initial_address in INITIAL_ADDRESS.
871 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
872 update the pointer in each iteration of the loop.
874 Return the increment stmt that updates the pointer in PTR_INCR.
876 3. Set INV_P to true if the access pattern of the data reference in the
877 vectorized loop is invariant. Set it to false otherwise.
879 4. Return the pointer. */
882 vect_create_data_ref_ptr (tree stmt, struct loop *at_loop,
883 tree offset, tree *initial_address, tree *ptr_incr,
884 bool only_init, tree type, bool *inv_p)
887 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
888 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
889 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
890 bool nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
891 struct loop *containing_loop = (bb_for_stmt (stmt))->loop_father;
892 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
898 tree new_stmt_list = NULL_TREE;
902 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
904 block_stmt_iterator incr_bsi;
906 tree indx_before_incr, indx_after_incr;
910 /* Check the step (evolution) of the load in LOOP, and record
911 whether it's invariant. */
912 if (nested_in_vect_loop)
913 step = STMT_VINFO_DR_STEP (stmt_info);
915 step = DR_STEP (STMT_VINFO_DATA_REF (stmt_info));
917 if (tree_int_cst_compare (step, size_zero_node) == 0)
922 /* Create an expression for the first address accessed by this load
924 base_name = build_fold_indirect_ref (unshare_expr (DR_BASE_ADDRESS (dr)));
926 if (vect_print_dump_info (REPORT_DETAILS))
928 tree data_ref_base = base_name;
929 fprintf (vect_dump, "create vector-pointer variable to type: ");
930 print_generic_expr (vect_dump, vectype, TDF_SLIM);
931 if (TREE_CODE (data_ref_base) == VAR_DECL)
932 fprintf (vect_dump, " vectorizing a one dimensional array ref: ");
933 else if (TREE_CODE (data_ref_base) == ARRAY_REF)
934 fprintf (vect_dump, " vectorizing a multidimensional array ref: ");
935 else if (TREE_CODE (data_ref_base) == COMPONENT_REF)
936 fprintf (vect_dump, " vectorizing a record based array ref: ");
937 else if (TREE_CODE (data_ref_base) == SSA_NAME)
938 fprintf (vect_dump, " vectorizing a pointer ref: ");
939 print_generic_expr (vect_dump, base_name, TDF_SLIM);
942 /** (1) Create the new vector-pointer variable: **/
944 vect_ptr_type = build_pointer_type (type);
946 vect_ptr_type = build_pointer_type (vectype);
947 vect_ptr = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var,
948 get_name (base_name));
949 add_referenced_var (vect_ptr);
951 /** (2) Add aliasing information to the new vector-pointer:
952 (The points-to info (DR_PTR_INFO) may be defined later.) **/
954 tag = DR_SYMBOL_TAG (dr);
957 /* If tag is a variable (and NOT_A_TAG) than a new symbol memory
958 tag must be created with tag added to its may alias list. */
960 new_type_alias (vect_ptr, tag, DR_REF (dr));
962 set_symbol_mem_tag (vect_ptr, tag);
964 var_ann (vect_ptr)->subvars = DR_SUBVARS (dr);
966 /** Note: If the dataref is in an inner-loop nested in LOOP, and we are
967 vectorizing LOOP (i.e. outer-loop vectorization), we need to create two
968 def-use update cycles for the pointer: One relative to the outer-loop
969 (LOOP), which is what steps (3) and (4) below do. The other is relative
970 to the inner-loop (which is the inner-most loop containing the dataref),
971 and this is done be step (5) below.
973 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
974 inner-most loop, and so steps (3),(4) work the same, and step (5) is
975 redundant. Steps (3),(4) create the following:
978 LOOP: vp1 = phi(vp0,vp2)
984 If there is an inner-loop nested in loop, then step (5) will also be
985 applied, and an additional update in the inner-loop will be created:
988 LOOP: vp1 = phi(vp0,vp2)
990 inner: vp3 = phi(vp1,vp4)
991 vp4 = vp3 + inner_step
997 /** (3) Calculate the initial address the vector-pointer, and set
998 the vector-pointer to point to it before the loop: **/
1000 /* Create: (&(base[init_val+offset]) in the loop preheader. */
1002 new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
1004 pe = loop_preheader_edge (loop);
1005 new_bb = bsi_insert_on_edge_immediate (pe, new_stmt_list);
1006 gcc_assert (!new_bb);
1007 *initial_address = new_temp;
1009 /* Create: p = (vectype *) initial_base */
1010 vec_stmt = fold_convert (vect_ptr_type, new_temp);
1011 vec_stmt = build_gimple_modify_stmt (vect_ptr, vec_stmt);
1012 vect_ptr_init = make_ssa_name (vect_ptr, vec_stmt);
1013 GIMPLE_STMT_OPERAND (vec_stmt, 0) = vect_ptr_init;
1014 new_bb = bsi_insert_on_edge_immediate (pe, vec_stmt);
1015 gcc_assert (!new_bb);
1018 /** (4) Handle the updating of the vector-pointer inside the loop.
1019 This is needed when ONLY_INIT is false, and also when AT_LOOP
1020 is the inner-loop nested in LOOP (during outer-loop vectorization).
1023 if (only_init && at_loop == loop) /* No update in loop is required. */
1025 /* Copy the points-to information if it exists. */
1026 if (DR_PTR_INFO (dr))
1027 duplicate_ssa_name_ptr_info (vect_ptr_init, DR_PTR_INFO (dr));
1028 vptr = vect_ptr_init;
1032 /* The step of the vector pointer is the Vector Size. */
1033 tree step = TYPE_SIZE_UNIT (vectype);
1034 /* One exception to the above is when the scalar step of the load in
1035 LOOP is zero. In this case the step here is also zero. */
1037 step = size_zero_node;
1039 standard_iv_increment_position (loop, &incr_bsi, &insert_after);
1041 create_iv (vect_ptr_init,
1042 fold_convert (vect_ptr_type, step),
1043 NULL_TREE, loop, &incr_bsi, insert_after,
1044 &indx_before_incr, &indx_after_incr);
1045 incr = bsi_stmt (incr_bsi);
1046 set_stmt_info (stmt_ann (incr),
1047 new_stmt_vec_info (incr, loop_vinfo));
1049 /* Copy the points-to information if it exists. */
1050 if (DR_PTR_INFO (dr))
1052 duplicate_ssa_name_ptr_info (indx_before_incr, DR_PTR_INFO (dr));
1053 duplicate_ssa_name_ptr_info (indx_after_incr, DR_PTR_INFO (dr));
1055 merge_alias_info (vect_ptr_init, indx_before_incr);
1056 merge_alias_info (vect_ptr_init, indx_after_incr);
1060 vptr = indx_before_incr;
1063 if (!nested_in_vect_loop || only_init)
1067 /** (5) Handle the updating of the vector-pointer inside the inner-loop
1068 nested in LOOP, if exists: **/
1070 gcc_assert (nested_in_vect_loop);
1073 standard_iv_increment_position (containing_loop, &incr_bsi,
1075 create_iv (vptr, fold_convert (vect_ptr_type, DR_STEP (dr)), NULL_TREE,
1076 containing_loop, &incr_bsi, insert_after, &indx_before_incr,
1078 incr = bsi_stmt (incr_bsi);
1079 set_stmt_info (stmt_ann (incr), new_stmt_vec_info (incr, loop_vinfo));
1081 /* Copy the points-to information if it exists. */
1082 if (DR_PTR_INFO (dr))
1084 duplicate_ssa_name_ptr_info (indx_before_incr, DR_PTR_INFO (dr));
1085 duplicate_ssa_name_ptr_info (indx_after_incr, DR_PTR_INFO (dr));
1087 merge_alias_info (vect_ptr_init, indx_before_incr);
1088 merge_alias_info (vect_ptr_init, indx_after_incr);
1092 return indx_before_incr;
1099 /* Function bump_vector_ptr
1101 Increment a pointer (to a vector type) by vector-size. If requested,
1102 i.e. if PTR-INCR is given, then also connect the new increment stmt
1103 to the existing def-use update-chain of the pointer, by modifying
1104 the PTR_INCR as illustrated below:
1106 The pointer def-use update-chain before this function:
1107 DATAREF_PTR = phi (p_0, p_2)
1109 PTR_INCR: p_2 = DATAREF_PTR + step
1111 The pointer def-use update-chain after this function:
1112 DATAREF_PTR = phi (p_0, p_2)
1114 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
1116 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
1119 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
1121 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
1122 the loop. The increment amount across iterations is expected
1124 BSI - location where the new update stmt is to be placed.
1125 STMT - the original scalar memory-access stmt that is being vectorized.
1126 BUMP - optional. The offset by which to bump the pointer. If not given,
1127 the offset is assumed to be vector_size.
1129 Output: Return NEW_DATAREF_PTR as illustrated above.
1134 bump_vector_ptr (tree dataref_ptr, tree ptr_incr, block_stmt_iterator *bsi,
1135 tree stmt, tree bump)
1137 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1138 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
1139 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1140 tree vptr_type = TREE_TYPE (dataref_ptr);
1141 tree ptr_var = SSA_NAME_VAR (dataref_ptr);
1142 tree update = TYPE_SIZE_UNIT (vectype);
1145 use_operand_p use_p;
1146 tree new_dataref_ptr;
1151 incr_stmt = build_gimple_modify_stmt (ptr_var,
1152 build2 (POINTER_PLUS_EXPR, vptr_type,
1153 dataref_ptr, update));
1154 new_dataref_ptr = make_ssa_name (ptr_var, incr_stmt);
1155 GIMPLE_STMT_OPERAND (incr_stmt, 0) = new_dataref_ptr;
1156 vect_finish_stmt_generation (stmt, incr_stmt, bsi);
1158 /* Copy the points-to information if it exists. */
1159 if (DR_PTR_INFO (dr))
1160 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
1161 merge_alias_info (new_dataref_ptr, dataref_ptr);
1164 return new_dataref_ptr;
1166 /* Update the vector-pointer's cross-iteration increment. */
1167 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
1169 tree use = USE_FROM_PTR (use_p);
1171 if (use == dataref_ptr)
1172 SET_USE (use_p, new_dataref_ptr);
1174 gcc_assert (tree_int_cst_compare (use, update) == 0);
1177 return new_dataref_ptr;
1181 /* Function vect_create_destination_var.
1183 Create a new temporary of type VECTYPE. */
1186 vect_create_destination_var (tree scalar_dest, tree vectype)
1189 const char *new_name;
1191 enum vect_var_kind kind;
1193 kind = vectype ? vect_simple_var : vect_scalar_var;
1194 type = vectype ? vectype : TREE_TYPE (scalar_dest);
1196 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
1198 new_name = get_name (scalar_dest);
1201 vec_dest = vect_get_new_vect_var (type, kind, new_name);
1202 add_referenced_var (vec_dest);
1208 /* Function vect_init_vector.
1210 Insert a new stmt (INIT_STMT) that initializes a new vector variable with
1211 the vector elements of VECTOR_VAR. Place the initialization at BSI if it
1212 is not NULL. Otherwise, place the initialization at the loop preheader.
1213 Return the DEF of INIT_STMT.
1214 It will be used in the vectorization of STMT. */
1217 vect_init_vector (tree stmt, tree vector_var, tree vector_type,
1218 block_stmt_iterator *bsi)
1220 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
1228 new_var = vect_get_new_vect_var (vector_type, vect_simple_var, "cst_");
1229 add_referenced_var (new_var);
1230 init_stmt = build_gimple_modify_stmt (new_var, vector_var);
1231 new_temp = make_ssa_name (new_var, init_stmt);
1232 GIMPLE_STMT_OPERAND (init_stmt, 0) = new_temp;
1235 vect_finish_stmt_generation (stmt, init_stmt, bsi);
1238 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
1239 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1241 if (nested_in_vect_loop_p (loop, stmt))
1243 pe = loop_preheader_edge (loop);
1244 new_bb = bsi_insert_on_edge_immediate (pe, init_stmt);
1245 gcc_assert (!new_bb);
1248 if (vect_print_dump_info (REPORT_DETAILS))
1250 fprintf (vect_dump, "created new init_stmt: ");
1251 print_generic_expr (vect_dump, init_stmt, TDF_SLIM);
1254 vec_oprnd = GIMPLE_STMT_OPERAND (init_stmt, 0);
1259 /* Function get_initial_def_for_induction
1262 STMT - a stmt that performs an induction operation in the loop.
1263 IV_PHI - the initial value of the induction variable
1266 Return a vector variable, initialized with the first VF values of
1267 the induction variable. E.g., for an iv with IV_PHI='X' and
1268 evolution S, for a vector of 4 units, we want to return:
1269 [X, X + S, X + 2*S, X + 3*S]. */
1272 get_initial_def_for_induction (tree iv_phi)
1274 stmt_vec_info stmt_vinfo = vinfo_for_stmt (iv_phi);
1275 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
1276 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1277 tree scalar_type = TREE_TYPE (PHI_RESULT_TREE (iv_phi));
1278 tree vectype = get_vectype_for_scalar_type (scalar_type);
1279 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
1280 edge pe = loop_preheader_edge (loop);
1281 struct loop *iv_loop;
1283 tree vec, vec_init, vec_step, t;
1288 tree induction_phi, induc_def, new_stmt, vec_def, vec_dest;
1289 tree init_expr, step_expr;
1290 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1293 int ncopies = vf / nunits;
1295 stmt_vec_info phi_info = vinfo_for_stmt (iv_phi);
1296 bool nested_in_vect_loop = false;
1298 imm_use_iterator imm_iter;
1299 use_operand_p use_p;
1303 block_stmt_iterator si;
1304 basic_block bb = bb_for_stmt (iv_phi);
1306 gcc_assert (phi_info);
1307 gcc_assert (ncopies >= 1);
1309 /* Find the first insertion point in the BB. */
1310 si = bsi_after_labels (bb);
1312 if (INTEGRAL_TYPE_P (scalar_type))
1313 step_expr = build_int_cst (scalar_type, 0);
1315 step_expr = build_real (scalar_type, dconst0);
1317 /* Is phi in an inner-loop, while vectorizing an enclosing outer-loop? */
1318 if (nested_in_vect_loop_p (loop, iv_phi))
1320 nested_in_vect_loop = true;
1321 iv_loop = loop->inner;
1325 gcc_assert (iv_loop == (bb_for_stmt (iv_phi))->loop_father);
1327 latch_e = loop_latch_edge (iv_loop);
1328 loop_arg = PHI_ARG_DEF_FROM_EDGE (iv_phi, latch_e);
1330 access_fn = analyze_scalar_evolution (iv_loop, PHI_RESULT (iv_phi));
1331 gcc_assert (access_fn);
1332 ok = vect_is_simple_iv_evolution (iv_loop->num, access_fn,
1333 &init_expr, &step_expr);
1335 pe = loop_preheader_edge (iv_loop);
1337 /* Create the vector that holds the initial_value of the induction. */
1338 if (nested_in_vect_loop)
1340 /* iv_loop is nested in the loop to be vectorized. init_expr had already
1341 been created during vectorization of previous stmts; We obtain it from
1342 the STMT_VINFO_VEC_STMT of the defining stmt. */
1343 tree iv_def = PHI_ARG_DEF_FROM_EDGE (iv_phi, loop_preheader_edge (iv_loop));
1344 vec_init = vect_get_vec_def_for_operand (iv_def, iv_phi, NULL);
1348 /* iv_loop is the loop to be vectorized. Create:
1349 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
1350 new_var = vect_get_new_vect_var (scalar_type, vect_scalar_var, "var_");
1351 add_referenced_var (new_var);
1353 new_name = force_gimple_operand (init_expr, &stmts, false, new_var);
1356 new_bb = bsi_insert_on_edge_immediate (pe, stmts);
1357 gcc_assert (!new_bb);
1361 t = tree_cons (NULL_TREE, init_expr, t);
1362 for (i = 1; i < nunits; i++)
1366 /* Create: new_name_i = new_name + step_expr */
1367 tmp = fold_build2 (PLUS_EXPR, scalar_type, new_name, step_expr);
1368 init_stmt = build_gimple_modify_stmt (new_var, tmp);
1369 new_name = make_ssa_name (new_var, init_stmt);
1370 GIMPLE_STMT_OPERAND (init_stmt, 0) = new_name;
1372 new_bb = bsi_insert_on_edge_immediate (pe, init_stmt);
1373 gcc_assert (!new_bb);
1375 if (vect_print_dump_info (REPORT_DETAILS))
1377 fprintf (vect_dump, "created new init_stmt: ");
1378 print_generic_expr (vect_dump, init_stmt, TDF_SLIM);
1380 t = tree_cons (NULL_TREE, new_name, t);
1382 /* Create a vector from [new_name_0, new_name_1, ..., new_name_nunits-1] */
1383 vec = build_constructor_from_list (vectype, nreverse (t));
1384 vec_init = vect_init_vector (iv_phi, vec, vectype, NULL);
1388 /* Create the vector that holds the step of the induction. */
1389 if (nested_in_vect_loop)
1390 /* iv_loop is nested in the loop to be vectorized. Generate:
1391 vec_step = [S, S, S, S] */
1392 new_name = step_expr;
1395 /* iv_loop is the loop to be vectorized. Generate:
1396 vec_step = [VF*S, VF*S, VF*S, VF*S] */
1397 expr = build_int_cst (scalar_type, vf);
1398 new_name = fold_build2 (MULT_EXPR, scalar_type, expr, step_expr);
1402 for (i = 0; i < nunits; i++)
1403 t = tree_cons (NULL_TREE, unshare_expr (new_name), t);
1404 vec = build_constructor_from_list (vectype, t);
1405 vec_step = vect_init_vector (iv_phi, vec, vectype, NULL);
1408 /* Create the following def-use cycle:
1413 vec_iv = PHI <vec_init, vec_loop>
1417 vec_loop = vec_iv + vec_step; */
1419 /* Create the induction-phi that defines the induction-operand. */
1420 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
1421 add_referenced_var (vec_dest);
1422 induction_phi = create_phi_node (vec_dest, iv_loop->header);
1423 set_stmt_info (get_stmt_ann (induction_phi),
1424 new_stmt_vec_info (induction_phi, loop_vinfo));
1425 induc_def = PHI_RESULT (induction_phi);
1427 /* Create the iv update inside the loop */
1428 new_stmt = build_gimple_modify_stmt (NULL_TREE,
1429 build2 (PLUS_EXPR, vectype,
1430 induc_def, vec_step));
1431 vec_def = make_ssa_name (vec_dest, new_stmt);
1432 GIMPLE_STMT_OPERAND (new_stmt, 0) = vec_def;
1433 bsi_insert_before (&si, new_stmt, BSI_SAME_STMT);
1434 set_stmt_info (get_stmt_ann (new_stmt),
1435 new_stmt_vec_info (new_stmt, loop_vinfo));
1437 /* Set the arguments of the phi node: */
1438 add_phi_arg (induction_phi, vec_init, pe);
1439 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop));
1442 /* In case that vectorization factor (VF) is bigger than the number
1443 of elements that we can fit in a vectype (nunits), we have to generate
1444 more than one vector stmt - i.e - we need to "unroll" the
1445 vector stmt by a factor VF/nunits. For more details see documentation
1446 in vectorizable_operation. */
1450 stmt_vec_info prev_stmt_vinfo;
1451 /* FORNOW. This restriction should be relaxed. */
1452 gcc_assert (!nested_in_vect_loop);
1454 /* Create the vector that holds the step of the induction. */
1455 expr = build_int_cst (scalar_type, nunits);
1456 new_name = fold_build2 (MULT_EXPR, scalar_type, expr, step_expr);
1458 for (i = 0; i < nunits; i++)
1459 t = tree_cons (NULL_TREE, unshare_expr (new_name), t);
1460 vec = build_constructor_from_list (vectype, t);
1461 vec_step = vect_init_vector (iv_phi, vec, vectype, NULL);
1463 vec_def = induc_def;
1464 prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
1465 for (i = 1; i < ncopies; i++)
1469 /* vec_i = vec_prev + vec_step */
1470 tmp = build2 (PLUS_EXPR, vectype, vec_def, vec_step);
1471 new_stmt = build_gimple_modify_stmt (NULL_TREE, tmp);
1472 vec_def = make_ssa_name (vec_dest, new_stmt);
1473 GIMPLE_STMT_OPERAND (new_stmt, 0) = vec_def;
1474 bsi_insert_before (&si, new_stmt, BSI_SAME_STMT);
1475 set_stmt_info (get_stmt_ann (new_stmt),
1476 new_stmt_vec_info (new_stmt, loop_vinfo));
1477 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
1478 prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
1482 if (nested_in_vect_loop)
1484 /* Find the loop-closed exit-phi of the induction, and record
1485 the final vector of induction results: */
1487 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
1489 if (!flow_bb_inside_loop_p (iv_loop, bb_for_stmt (USE_STMT (use_p))))
1491 exit_phi = USE_STMT (use_p);
1497 stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
1498 /* FORNOW. Currently not supporting the case that an inner-loop induction
1499 is not used in the outer-loop (i.e. only outside the outer-loop). */
1500 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
1501 && !STMT_VINFO_LIVE_P (stmt_vinfo));
1503 STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
1504 if (vect_print_dump_info (REPORT_DETAILS))
1506 fprintf (vect_dump, "vector of inductions after inner-loop:");
1507 print_generic_expr (vect_dump, new_stmt, TDF_SLIM);
1513 if (vect_print_dump_info (REPORT_DETAILS))
1515 fprintf (vect_dump, "transform induction: created def-use cycle:");
1516 print_generic_expr (vect_dump, induction_phi, TDF_SLIM);
1517 fprintf (vect_dump, "\n");
1518 print_generic_expr (vect_dump, SSA_NAME_DEF_STMT (vec_def), TDF_SLIM);
1521 STMT_VINFO_VEC_STMT (phi_info) = induction_phi;
1526 /* Function vect_get_vec_def_for_operand.
1528 OP is an operand in STMT. This function returns a (vector) def that will be
1529 used in the vectorized stmt for STMT.
1531 In the case that OP is an SSA_NAME which is defined in the loop, then
1532 STMT_VINFO_VEC_STMT of the defining stmt holds the relevant def.
1534 In case OP is an invariant or constant, a new stmt that creates a vector def
1535 needs to be introduced. */
1538 vect_get_vec_def_for_operand (tree op, tree stmt, tree *scalar_def)
1543 stmt_vec_info def_stmt_info = NULL;
1544 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
1545 tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
1546 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
1547 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
1553 enum vect_def_type dt;
1557 if (vect_print_dump_info (REPORT_DETAILS))
1559 fprintf (vect_dump, "vect_get_vec_def_for_operand: ");
1560 print_generic_expr (vect_dump, op, TDF_SLIM);
1563 is_simple_use = vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt);
1564 gcc_assert (is_simple_use);
1565 if (vect_print_dump_info (REPORT_DETAILS))
1569 fprintf (vect_dump, "def = ");
1570 print_generic_expr (vect_dump, def, TDF_SLIM);
1574 fprintf (vect_dump, " def_stmt = ");
1575 print_generic_expr (vect_dump, def_stmt, TDF_SLIM);
1581 /* Case 1: operand is a constant. */
1582 case vect_constant_def:
1587 /* Create 'vect_cst_ = {cst,cst,...,cst}' */
1588 if (vect_print_dump_info (REPORT_DETAILS))
1589 fprintf (vect_dump, "Create vector_cst. nunits = %d", nunits);
1591 for (i = nunits - 1; i >= 0; --i)
1593 t = tree_cons (NULL_TREE, op, t);
1595 vector_type = get_vectype_for_scalar_type (TREE_TYPE (op));
1596 vec_cst = build_vector (vector_type, t);
1598 return vect_init_vector (stmt, vec_cst, vector_type, NULL);
1601 /* Case 2: operand is defined outside the loop - loop invariant. */
1602 case vect_invariant_def:
1607 /* Create 'vec_inv = {inv,inv,..,inv}' */
1608 if (vect_print_dump_info (REPORT_DETAILS))
1609 fprintf (vect_dump, "Create vector_inv.");
1611 for (i = nunits - 1; i >= 0; --i)
1613 t = tree_cons (NULL_TREE, def, t);
1616 /* FIXME: use build_constructor directly. */
1617 vector_type = get_vectype_for_scalar_type (TREE_TYPE (def));
1618 vec_inv = build_constructor_from_list (vector_type, t);
1619 return vect_init_vector (stmt, vec_inv, vector_type, NULL);
1622 /* Case 3: operand is defined inside the loop. */
1626 *scalar_def = def_stmt;
1628 /* Get the def from the vectorized stmt. */
1629 def_stmt_info = vinfo_for_stmt (def_stmt);
1630 vec_stmt = STMT_VINFO_VEC_STMT (def_stmt_info);
1631 gcc_assert (vec_stmt);
1632 if (TREE_CODE (vec_stmt) == PHI_NODE)
1633 vec_oprnd = PHI_RESULT (vec_stmt);
1635 vec_oprnd = GIMPLE_STMT_OPERAND (vec_stmt, 0);
1639 /* Case 4: operand is defined by a loop header phi - reduction */
1640 case vect_reduction_def:
1644 gcc_assert (TREE_CODE (def_stmt) == PHI_NODE);
1645 loop = (bb_for_stmt (def_stmt))->loop_father;
1647 /* Get the def before the loop */
1648 op = PHI_ARG_DEF_FROM_EDGE (def_stmt, loop_preheader_edge (loop));
1649 return get_initial_def_for_reduction (stmt, op, scalar_def);
1652 /* Case 5: operand is defined by loop-header phi - induction. */
1653 case vect_induction_def:
1655 gcc_assert (TREE_CODE (def_stmt) == PHI_NODE);
1657 /* Get the def from the vectorized stmt. */
1658 def_stmt_info = vinfo_for_stmt (def_stmt);
1659 vec_stmt = STMT_VINFO_VEC_STMT (def_stmt_info);
1660 gcc_assert (vec_stmt && (TREE_CODE (vec_stmt) == PHI_NODE));
1661 vec_oprnd = PHI_RESULT (vec_stmt);
1671 /* Function vect_get_vec_def_for_stmt_copy
1673 Return a vector-def for an operand. This function is used when the
1674 vectorized stmt to be created (by the caller to this function) is a "copy"
1675 created in case the vectorized result cannot fit in one vector, and several
1676 copies of the vector-stmt are required. In this case the vector-def is
1677 retrieved from the vector stmt recorded in the STMT_VINFO_RELATED_STMT field
1678 of the stmt that defines VEC_OPRND.
1679 DT is the type of the vector def VEC_OPRND.
1682 In case the vectorization factor (VF) is bigger than the number
1683 of elements that can fit in a vectype (nunits), we have to generate
1684 more than one vector stmt to vectorize the scalar stmt. This situation
1685 arises when there are multiple data-types operated upon in the loop; the
1686 smallest data-type determines the VF, and as a result, when vectorizing
1687 stmts operating on wider types we need to create 'VF/nunits' "copies" of the
1688 vector stmt (each computing a vector of 'nunits' results, and together
1689 computing 'VF' results in each iteration). This function is called when
1690 vectorizing such a stmt (e.g. vectorizing S2 in the illustration below, in
1691 which VF=16 and nunits=4, so the number of copies required is 4):
1693 scalar stmt: vectorized into: STMT_VINFO_RELATED_STMT
1695 S1: x = load VS1.0: vx.0 = memref0 VS1.1
1696 VS1.1: vx.1 = memref1 VS1.2
1697 VS1.2: vx.2 = memref2 VS1.3
1698 VS1.3: vx.3 = memref3
1700 S2: z = x + ... VSnew.0: vz0 = vx.0 + ... VSnew.1
1701 VSnew.1: vz1 = vx.1 + ... VSnew.2
1702 VSnew.2: vz2 = vx.2 + ... VSnew.3
1703 VSnew.3: vz3 = vx.3 + ...
1705 The vectorization of S1 is explained in vectorizable_load.
1706 The vectorization of S2:
1707 To create the first vector-stmt out of the 4 copies - VSnew.0 -
1708 the function 'vect_get_vec_def_for_operand' is called to
1709 get the relevant vector-def for each operand of S2. For operand x it
1710 returns the vector-def 'vx.0'.
1712 To create the remaining copies of the vector-stmt (VSnew.j), this
1713 function is called to get the relevant vector-def for each operand. It is
1714 obtained from the respective VS1.j stmt, which is recorded in the
1715 STMT_VINFO_RELATED_STMT field of the stmt that defines VEC_OPRND.
1717 For example, to obtain the vector-def 'vx.1' in order to create the
1718 vector stmt 'VSnew.1', this function is called with VEC_OPRND='vx.0'.
1719 Given 'vx0' we obtain the stmt that defines it ('VS1.0'); from the
1720 STMT_VINFO_RELATED_STMT field of 'VS1.0' we obtain the next copy - 'VS1.1',
1721 and return its def ('vx.1').
1722 Overall, to create the above sequence this function will be called 3 times:
1723 vx.1 = vect_get_vec_def_for_stmt_copy (dt, vx.0);
1724 vx.2 = vect_get_vec_def_for_stmt_copy (dt, vx.1);
1725 vx.3 = vect_get_vec_def_for_stmt_copy (dt, vx.2); */
1728 vect_get_vec_def_for_stmt_copy (enum vect_def_type dt, tree vec_oprnd)
1730 tree vec_stmt_for_operand;
1731 stmt_vec_info def_stmt_info;
1733 /* Do nothing; can reuse same def. */
1734 if (dt == vect_invariant_def || dt == vect_constant_def )
1737 vec_stmt_for_operand = SSA_NAME_DEF_STMT (vec_oprnd);
1738 def_stmt_info = vinfo_for_stmt (vec_stmt_for_operand);
1739 gcc_assert (def_stmt_info);
1740 vec_stmt_for_operand = STMT_VINFO_RELATED_STMT (def_stmt_info);
1741 gcc_assert (vec_stmt_for_operand);
1742 vec_oprnd = GIMPLE_STMT_OPERAND (vec_stmt_for_operand, 0);
1747 /* Function vect_finish_stmt_generation.
1749 Insert a new stmt. */
1752 vect_finish_stmt_generation (tree stmt, tree vec_stmt,
1753 block_stmt_iterator *bsi)
1755 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1756 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1758 gcc_assert (stmt == bsi_stmt (*bsi));
1759 gcc_assert (TREE_CODE (stmt) != LABEL_EXPR);
1761 bsi_insert_before (bsi, vec_stmt, BSI_SAME_STMT);
1763 set_stmt_info (get_stmt_ann (vec_stmt),
1764 new_stmt_vec_info (vec_stmt, loop_vinfo));
1766 if (vect_print_dump_info (REPORT_DETAILS))
1768 fprintf (vect_dump, "add new stmt: ");
1769 print_generic_expr (vect_dump, vec_stmt, TDF_SLIM);
1772 /* Make sure bsi points to the stmt that is being vectorized. */
1773 gcc_assert (stmt == bsi_stmt (*bsi));
1775 #ifdef USE_MAPPED_LOCATION
1776 SET_EXPR_LOCATION (vec_stmt, EXPR_LOCATION (stmt));
1778 SET_EXPR_LOCUS (vec_stmt, EXPR_LOCUS (stmt));
1783 /* Function get_initial_def_for_reduction
1786 STMT - a stmt that performs a reduction operation in the loop.
1787 INIT_VAL - the initial value of the reduction variable
1790 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
1791 of the reduction (used for adjusting the epilog - see below).
1792 Return a vector variable, initialized according to the operation that STMT
1793 performs. This vector will be used as the initial value of the
1794 vector of partial results.
1796 Option1 (adjust in epilog): Initialize the vector as follows:
1799 min/max: [init_val,init_val,..,init_val,init_val]
1800 bit and/or: [init_val,init_val,..,init_val,init_val]
1801 and when necessary (e.g. add/mult case) let the caller know
1802 that it needs to adjust the result by init_val.
1804 Option2: Initialize the vector as follows:
1805 add: [0,0,...,0,init_val]
1806 mult: [1,1,...,1,init_val]
1807 min/max: [init_val,init_val,...,init_val]
1808 bit and/or: [init_val,init_val,...,init_val]
1809 and no adjustments are needed.
1811 For example, for the following code:
1817 STMT is 's = s + a[i]', and the reduction variable is 's'.
1818 For a vector of 4 units, we want to return either [0,0,0,init_val],
1819 or [0,0,0,0] and let the caller know that it needs to adjust
1820 the result at the end by 'init_val'.
1822 FORNOW, we are using the 'adjust in epilog' scheme, because this way the
1823 initialization vector is simpler (same element in all entries).
1824 A cost model should help decide between these two schemes. */
1827 get_initial_def_for_reduction (tree stmt, tree init_val, tree *adjustment_def)
1829 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
1830 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
1831 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1832 tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
1833 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
1834 enum tree_code code = TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 1));
1835 tree type = TREE_TYPE (init_val);
1842 bool nested_in_vect_loop = false;
1844 gcc_assert (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type));
1845 if (nested_in_vect_loop_p (loop, stmt))
1846 nested_in_vect_loop = true;
1848 gcc_assert (loop == (bb_for_stmt (stmt))->loop_father);
1850 vecdef = vect_get_vec_def_for_operand (init_val, stmt, NULL);
1854 case WIDEN_SUM_EXPR:
1857 if (nested_in_vect_loop)
1858 *adjustment_def = vecdef;
1860 *adjustment_def = init_val;
1861 /* Create a vector of zeros for init_def. */
1862 if (INTEGRAL_TYPE_P (type))
1863 def_for_init = build_int_cst (type, 0);
1865 def_for_init = build_real (type, dconst0);
1866 for (i = nunits - 1; i >= 0; --i)
1867 t = tree_cons (NULL_TREE, def_for_init, t);
1868 vector_type = get_vectype_for_scalar_type (TREE_TYPE (def_for_init));
1869 init_def = build_vector (vector_type, t);
1874 *adjustment_def = NULL_TREE;
1886 /* Function vect_create_epilog_for_reduction
1888 Create code at the loop-epilog to finalize the result of a reduction
1891 VECT_DEF is a vector of partial results.
1892 REDUC_CODE is the tree-code for the epilog reduction.
1893 STMT is the scalar reduction stmt that is being vectorized.
1894 REDUCTION_PHI is the phi-node that carries the reduction computation.
1897 1. Creates the reduction def-use cycle: sets the arguments for
1899 The loop-entry argument is the vectorized initial-value of the reduction.
1900 The loop-latch argument is VECT_DEF - the vector of partial sums.
1901 2. "Reduces" the vector of partial results VECT_DEF into a single result,
1902 by applying the operation specified by REDUC_CODE if available, or by
1903 other means (whole-vector shifts or a scalar loop).
1904 The function also creates a new phi node at the loop exit to preserve
1905 loop-closed form, as illustrated below.
1907 The flow at the entry to this function:
1910 vec_def = phi <null, null> # REDUCTION_PHI
1911 VECT_DEF = vector_stmt # vectorized form of STMT
1912 s_loop = scalar_stmt # (scalar) STMT
1914 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
1918 The above is transformed by this function into:
1921 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
1922 VECT_DEF = vector_stmt # vectorized form of STMT
1923 s_loop = scalar_stmt # (scalar) STMT
1925 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
1926 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
1927 v_out2 = reduce <v_out1>
1928 s_out3 = extract_field <v_out2, 0>
1929 s_out4 = adjust_result <s_out3>
1935 vect_create_epilog_for_reduction (tree vect_def, tree stmt,
1936 enum tree_code reduc_code, tree reduction_phi)
1938 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1940 enum machine_mode mode;
1941 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1942 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1943 basic_block exit_bb;
1947 block_stmt_iterator exit_bsi;
1949 tree new_temp = NULL_TREE;
1951 tree epilog_stmt = NULL_TREE;
1952 tree new_scalar_dest, exit_phi, new_dest;
1953 tree bitsize, bitpos, bytesize;
1954 enum tree_code code = TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 1));
1955 tree adjustment_def;
1956 tree vec_initial_def;
1958 imm_use_iterator imm_iter;
1959 use_operand_p use_p;
1960 bool extract_scalar_result = false;
1961 tree reduction_op, expr;
1964 tree operation = GIMPLE_STMT_OPERAND (stmt, 1);
1965 bool nested_in_vect_loop = false;
1967 VEC(tree,heap) *phis = NULL;
1970 if (nested_in_vect_loop_p (loop, stmt))
1973 nested_in_vect_loop = true;
1976 op_type = TREE_OPERAND_LENGTH (operation);
1977 reduction_op = TREE_OPERAND (operation, op_type-1);
1978 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
1979 mode = TYPE_MODE (vectype);
1981 /*** 1. Create the reduction def-use cycle ***/
1983 /* 1.1 set the loop-entry arg of the reduction-phi: */
1984 /* For the case of reduction, vect_get_vec_def_for_operand returns
1985 the scalar def before the loop, that defines the initial value
1986 of the reduction variable. */
1987 vec_initial_def = vect_get_vec_def_for_operand (reduction_op, stmt,
1989 add_phi_arg (reduction_phi, vec_initial_def, loop_preheader_edge (loop));
1991 /* 1.2 set the loop-latch arg for the reduction-phi: */
1992 add_phi_arg (reduction_phi, vect_def, loop_latch_edge (loop));
1994 if (vect_print_dump_info (REPORT_DETAILS))
1996 fprintf (vect_dump, "transform reduction: created def-use cycle:");
1997 print_generic_expr (vect_dump, reduction_phi, TDF_SLIM);
1998 fprintf (vect_dump, "\n");
1999 print_generic_expr (vect_dump, SSA_NAME_DEF_STMT (vect_def), TDF_SLIM);
2003 /*** 2. Create epilog code
2004 The reduction epilog code operates across the elements of the vector
2005 of partial results computed by the vectorized loop.
2006 The reduction epilog code consists of:
2007 step 1: compute the scalar result in a vector (v_out2)
2008 step 2: extract the scalar result (s_out3) from the vector (v_out2)
2009 step 3: adjust the scalar result (s_out3) if needed.
2011 Step 1 can be accomplished using one the following three schemes:
2012 (scheme 1) using reduc_code, if available.
2013 (scheme 2) using whole-vector shifts, if available.
2014 (scheme 3) using a scalar loop. In this case steps 1+2 above are
2017 The overall epilog code looks like this:
2019 s_out0 = phi <s_loop> # original EXIT_PHI
2020 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
2021 v_out2 = reduce <v_out1> # step 1
2022 s_out3 = extract_field <v_out2, 0> # step 2
2023 s_out4 = adjust_result <s_out3> # step 3
2025 (step 3 is optional, and step2 1 and 2 may be combined).
2026 Lastly, the uses of s_out0 are replaced by s_out4.
2030 /* 2.1 Create new loop-exit-phi to preserve loop-closed form:
2031 v_out1 = phi <v_loop> */
2033 exit_bb = single_exit (loop)->dest;
2034 new_phi = create_phi_node (SSA_NAME_VAR (vect_def), exit_bb);
2035 SET_PHI_ARG_DEF (new_phi, single_exit (loop)->dest_idx, vect_def);
2036 exit_bsi = bsi_after_labels (exit_bb);
2038 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
2039 (i.e. when reduc_code is not available) and in the final adjustment
2040 code (if needed). Also get the original scalar reduction variable as
2041 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
2042 represents a reduction pattern), the tree-code and scalar-def are
2043 taken from the original stmt that the pattern-stmt (STMT) replaces.
2044 Otherwise (it is a regular reduction) - the tree-code and scalar-def
2045 are taken from STMT. */
2047 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
2050 /* Regular reduction */
2055 /* Reduction pattern */
2056 stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
2057 gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
2058 gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
2060 code = TREE_CODE (GIMPLE_STMT_OPERAND (orig_stmt, 1));
2061 scalar_dest = GIMPLE_STMT_OPERAND (orig_stmt, 0);
2062 scalar_type = TREE_TYPE (scalar_dest);
2063 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
2064 bitsize = TYPE_SIZE (scalar_type);
2065 bytesize = TYPE_SIZE_UNIT (scalar_type);
2068 /* In case this is a reduction in an inner-loop while vectorizing an outer
2069 loop - we don't need to extract a single scalar result at the end of the
2070 inner-loop. The final vector of partial results will be used in the
2071 vectorized outer-loop, or reduced to a scalar result at the end of the
2073 if (nested_in_vect_loop)
2074 goto vect_finalize_reduction;
2076 /* 2.3 Create the reduction code, using one of the three schemes described
2079 if (reduc_code < NUM_TREE_CODES)
2083 /*** Case 1: Create:
2084 v_out2 = reduc_expr <v_out1> */
2086 if (vect_print_dump_info (REPORT_DETAILS))
2087 fprintf (vect_dump, "Reduce using direct vector reduction.");
2089 vec_dest = vect_create_destination_var (scalar_dest, vectype);
2090 tmp = build1 (reduc_code, vectype, PHI_RESULT (new_phi));
2091 epilog_stmt = build_gimple_modify_stmt (vec_dest, tmp);
2092 new_temp = make_ssa_name (vec_dest, epilog_stmt);
2093 GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_temp;
2094 bsi_insert_before (&exit_bsi, epilog_stmt, BSI_SAME_STMT);
2096 extract_scalar_result = true;
2100 enum tree_code shift_code = 0;
2101 bool have_whole_vector_shift = true;
2103 int element_bitsize = tree_low_cst (bitsize, 1);
2104 int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
2107 if (optab_handler (vec_shr_optab, mode)->insn_code != CODE_FOR_nothing)
2108 shift_code = VEC_RSHIFT_EXPR;
2110 have_whole_vector_shift = false;
2112 /* Regardless of whether we have a whole vector shift, if we're
2113 emulating the operation via tree-vect-generic, we don't want
2114 to use it. Only the first round of the reduction is likely
2115 to still be profitable via emulation. */
2116 /* ??? It might be better to emit a reduction tree code here, so that
2117 tree-vect-generic can expand the first round via bit tricks. */
2118 if (!VECTOR_MODE_P (mode))
2119 have_whole_vector_shift = false;
2122 optab optab = optab_for_tree_code (code, vectype);
2123 if (optab_handler (optab, mode)->insn_code == CODE_FOR_nothing)
2124 have_whole_vector_shift = false;
2127 if (have_whole_vector_shift)
2129 /*** Case 2: Create:
2130 for (offset = VS/2; offset >= element_size; offset/=2)
2132 Create: va' = vec_shift <va, offset>
2133 Create: va = vop <va, va'>
2136 if (vect_print_dump_info (REPORT_DETAILS))
2137 fprintf (vect_dump, "Reduce using vector shifts");
2139 vec_dest = vect_create_destination_var (scalar_dest, vectype);
2140 new_temp = PHI_RESULT (new_phi);
2142 for (bit_offset = vec_size_in_bits/2;
2143 bit_offset >= element_bitsize;
2146 tree bitpos = size_int (bit_offset);
2147 tree tmp = build2 (shift_code, vectype, new_temp, bitpos);
2148 epilog_stmt = build_gimple_modify_stmt (vec_dest, tmp);
2149 new_name = make_ssa_name (vec_dest, epilog_stmt);
2150 GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_name;
2151 bsi_insert_before (&exit_bsi, epilog_stmt, BSI_SAME_STMT);
2153 tmp = build2 (code, vectype, new_name, new_temp);
2154 epilog_stmt = build_gimple_modify_stmt (vec_dest, tmp);
2155 new_temp = make_ssa_name (vec_dest, epilog_stmt);
2156 GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_temp;
2157 bsi_insert_before (&exit_bsi, epilog_stmt, BSI_SAME_STMT);
2160 extract_scalar_result = true;
2166 /*** Case 3: Create:
2167 s = extract_field <v_out2, 0>
2168 for (offset = element_size;
2169 offset < vector_size;
2170 offset += element_size;)
2172 Create: s' = extract_field <v_out2, offset>
2173 Create: s = op <s, s'>
2176 if (vect_print_dump_info (REPORT_DETAILS))
2177 fprintf (vect_dump, "Reduce using scalar code. ");
2179 vec_temp = PHI_RESULT (new_phi);
2180 vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
2181 rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
2183 BIT_FIELD_REF_UNSIGNED (rhs) = TYPE_UNSIGNED (scalar_type);
2184 epilog_stmt = build_gimple_modify_stmt (new_scalar_dest, rhs);
2185 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
2186 GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_temp;
2187 bsi_insert_before (&exit_bsi, epilog_stmt, BSI_SAME_STMT);
2189 for (bit_offset = element_bitsize;
2190 bit_offset < vec_size_in_bits;
2191 bit_offset += element_bitsize)
2194 tree bitpos = bitsize_int (bit_offset);
2195 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
2198 BIT_FIELD_REF_UNSIGNED (rhs) = TYPE_UNSIGNED (scalar_type);
2199 epilog_stmt = build_gimple_modify_stmt (new_scalar_dest, rhs);
2200 new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
2201 GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_name;
2202 bsi_insert_before (&exit_bsi, epilog_stmt, BSI_SAME_STMT);
2204 tmp = build2 (code, scalar_type, new_name, new_temp);
2205 epilog_stmt = build_gimple_modify_stmt (new_scalar_dest, tmp);
2206 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
2207 GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_temp;
2208 bsi_insert_before (&exit_bsi, epilog_stmt, BSI_SAME_STMT);
2211 extract_scalar_result = false;
2215 /* 2.4 Extract the final scalar result. Create:
2216 s_out3 = extract_field <v_out2, bitpos> */
2218 if (extract_scalar_result)
2222 gcc_assert (!nested_in_vect_loop);
2223 if (vect_print_dump_info (REPORT_DETAILS))
2224 fprintf (vect_dump, "extract scalar result");
2226 if (BYTES_BIG_ENDIAN)
2227 bitpos = size_binop (MULT_EXPR,
2228 bitsize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1),
2229 TYPE_SIZE (scalar_type));
2231 bitpos = bitsize_zero_node;
2233 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp, bitsize, bitpos);
2234 BIT_FIELD_REF_UNSIGNED (rhs) = TYPE_UNSIGNED (scalar_type);
2235 epilog_stmt = build_gimple_modify_stmt (new_scalar_dest, rhs);
2236 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
2237 GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_temp;
2238 bsi_insert_before (&exit_bsi, epilog_stmt, BSI_SAME_STMT);
2241 vect_finalize_reduction:
2243 /* 2.5 Adjust the final result by the initial value of the reduction
2244 variable. (When such adjustment is not needed, then
2245 'adjustment_def' is zero). For example, if code is PLUS we create:
2246 new_temp = loop_exit_def + adjustment_def */
2250 if (nested_in_vect_loop)
2252 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
2253 expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
2254 new_dest = vect_create_destination_var (scalar_dest, vectype);
2258 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
2259 expr = build2 (code, scalar_type, new_temp, adjustment_def);
2260 new_dest = vect_create_destination_var (scalar_dest, scalar_type);
2262 epilog_stmt = build_gimple_modify_stmt (new_dest, expr);
2263 new_temp = make_ssa_name (new_dest, epilog_stmt);
2264 GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_temp;
2265 bsi_insert_before (&exit_bsi, epilog_stmt, BSI_SAME_STMT);
2269 /* 2.6 Handle the loop-exit phi */
2271 /* Replace uses of s_out0 with uses of s_out3:
2272 Find the loop-closed-use at the loop exit of the original scalar result.
2273 (The reduction result is expected to have two immediate uses - one at the
2274 latch block, and one at the loop exit). */
2275 phis = VEC_alloc (tree, heap, 10);
2276 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
2278 if (!flow_bb_inside_loop_p (loop, bb_for_stmt (USE_STMT (use_p))))
2280 exit_phi = USE_STMT (use_p);
2281 VEC_quick_push (tree, phis, exit_phi);
2284 /* We expect to have found an exit_phi because of loop-closed-ssa form. */
2285 gcc_assert (!VEC_empty (tree, phis));
2287 for (i = 0; VEC_iterate (tree, phis, i, exit_phi); i++)
2289 if (nested_in_vect_loop)
2291 stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
2293 /* FORNOW. Currently not supporting the case that an inner-loop reduction
2294 is not used in the outer-loop (but only outside the outer-loop). */
2295 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
2296 && !STMT_VINFO_LIVE_P (stmt_vinfo));
2298 epilog_stmt = adjustment_def ? epilog_stmt : new_phi;
2299 STMT_VINFO_VEC_STMT (stmt_vinfo) = epilog_stmt;
2300 set_stmt_info (get_stmt_ann (epilog_stmt),
2301 new_stmt_vec_info (epilog_stmt, loop_vinfo));
2305 /* Replace the uses: */
2306 orig_name = PHI_RESULT (exit_phi);
2307 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
2308 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
2309 SET_USE (use_p, new_temp);
2311 VEC_free (tree, heap, phis);
2315 /* Function vectorizable_reduction.
2317 Check if STMT performs a reduction operation that can be vectorized.
2318 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
2319 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
2320 Return FALSE if not a vectorizable STMT, TRUE otherwise.
2322 This function also handles reduction idioms (patterns) that have been
2323 recognized in advance during vect_pattern_recog. In this case, STMT may be
2325 X = pattern_expr (arg0, arg1, ..., X)
2326 and it's STMT_VINFO_RELATED_STMT points to the last stmt in the original
2327 sequence that had been detected and replaced by the pattern-stmt (STMT).
2329 In some cases of reduction patterns, the type of the reduction variable X is
2330 different than the type of the other arguments of STMT.
2331 In such cases, the vectype that is used when transforming STMT into a vector
2332 stmt is different than the vectype that is used to determine the
2333 vectorization factor, because it consists of a different number of elements
2334 than the actual number of elements that are being operated upon in parallel.
2336 For example, consider an accumulation of shorts into an int accumulator.
2337 On some targets it's possible to vectorize this pattern operating on 8
2338 shorts at a time (hence, the vectype for purposes of determining the
2339 vectorization factor should be V8HI); on the other hand, the vectype that
2340 is used to create the vector form is actually V4SI (the type of the result).
2342 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
2343 indicates what is the actual level of parallelism (V8HI in the example), so
2344 that the right vectorization factor would be derived. This vectype
2345 corresponds to the type of arguments to the reduction stmt, and should *NOT*
2346 be used to create the vectorized stmt. The right vectype for the vectorized
2347 stmt is obtained from the type of the result X:
2348 get_vectype_for_scalar_type (TREE_TYPE (X))
2350 This means that, contrary to "regular" reductions (or "regular" stmts in
2351 general), the following equation:
2352 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
2353 does *NOT* necessarily hold for reduction patterns. */
2356 vectorizable_reduction (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
2361 tree loop_vec_def0 = NULL_TREE, loop_vec_def1 = NULL_TREE;
2362 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2363 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2364 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2365 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2367 enum tree_code code, orig_code, epilog_reduc_code = 0;
2368 enum machine_mode vec_mode;
2370 optab optab, reduc_optab;
2371 tree new_temp = NULL_TREE;
2373 enum vect_def_type dt;
2378 stmt_vec_info orig_stmt_info;
2379 tree expr = NULL_TREE;
2381 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
2382 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
2383 stmt_vec_info prev_stmt_info;
2385 tree new_stmt = NULL_TREE;
2388 if (nested_in_vect_loop_p (loop, stmt))
2391 /* FORNOW. This restriction should be relaxed. */
2394 if (vect_print_dump_info (REPORT_DETAILS))
2395 fprintf (vect_dump, "multiple types in nested loop.");
2400 gcc_assert (ncopies >= 1);
2402 /* 1. Is vectorizable reduction? */
2404 /* Not supportable if the reduction variable is used in the loop. */
2405 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer)
2408 /* Reductions that are not used even in an enclosing outer-loop,
2409 are expected to be "live" (used out of the loop). */
2410 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_loop
2411 && !STMT_VINFO_LIVE_P (stmt_info))
2414 /* Make sure it was already recognized as a reduction computation. */
2415 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def)
2418 /* 2. Has this been recognized as a reduction pattern?
2420 Check if STMT represents a pattern that has been recognized
2421 in earlier analysis stages. For stmts that represent a pattern,
2422 the STMT_VINFO_RELATED_STMT field records the last stmt in
2423 the original sequence that constitutes the pattern. */
2425 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
2428 orig_stmt_info = vinfo_for_stmt (orig_stmt);
2429 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt);
2430 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
2431 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
2434 /* 3. Check the operands of the operation. The first operands are defined
2435 inside the loop body. The last operand is the reduction variable,
2436 which is defined by the loop-header-phi. */
2438 gcc_assert (TREE_CODE (stmt) == GIMPLE_MODIFY_STMT);
2440 operation = GIMPLE_STMT_OPERAND (stmt, 1);
2441 code = TREE_CODE (operation);
2442 op_type = TREE_OPERAND_LENGTH (operation);
2443 if (op_type != binary_op && op_type != ternary_op)
2445 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
2446 scalar_type = TREE_TYPE (scalar_dest);
2448 /* All uses but the last are expected to be defined in the loop.
2449 The last use is the reduction variable. */
2450 for (i = 0; i < op_type-1; i++)
2452 op = TREE_OPERAND (operation, i);
2453 is_simple_use = vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt);
2454 gcc_assert (is_simple_use);
2455 if (dt != vect_loop_def
2456 && dt != vect_invariant_def
2457 && dt != vect_constant_def
2458 && dt != vect_induction_def)
2462 op = TREE_OPERAND (operation, i);
2463 is_simple_use = vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt);
2464 gcc_assert (is_simple_use);
2465 gcc_assert (dt == vect_reduction_def);
2466 gcc_assert (TREE_CODE (def_stmt) == PHI_NODE);
2468 gcc_assert (orig_stmt == vect_is_simple_reduction (loop_vinfo, def_stmt));
2470 gcc_assert (stmt == vect_is_simple_reduction (loop_vinfo, def_stmt));
2472 if (STMT_VINFO_LIVE_P (vinfo_for_stmt (def_stmt)))
2475 /* 4. Supportable by target? */
2477 /* 4.1. check support for the operation in the loop */
2478 optab = optab_for_tree_code (code, vectype);
2481 if (vect_print_dump_info (REPORT_DETAILS))
2482 fprintf (vect_dump, "no optab.");
2485 vec_mode = TYPE_MODE (vectype);
2486 if (optab_handler (optab, vec_mode)->insn_code == CODE_FOR_nothing)
2488 if (vect_print_dump_info (REPORT_DETAILS))
2489 fprintf (vect_dump, "op not supported by target.");
2490 if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
2491 || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
2492 < vect_min_worthwhile_factor (code))
2494 if (vect_print_dump_info (REPORT_DETAILS))
2495 fprintf (vect_dump, "proceeding using word mode.");
2498 /* Worthwhile without SIMD support? */
2499 if (!VECTOR_MODE_P (TYPE_MODE (vectype))
2500 && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
2501 < vect_min_worthwhile_factor (code))
2503 if (vect_print_dump_info (REPORT_DETAILS))
2504 fprintf (vect_dump, "not worthwhile without SIMD support.");
2508 /* 4.2. Check support for the epilog operation.
2510 If STMT represents a reduction pattern, then the type of the
2511 reduction variable may be different than the type of the rest
2512 of the arguments. For example, consider the case of accumulation
2513 of shorts into an int accumulator; The original code:
2514 S1: int_a = (int) short_a;
2515 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
2518 STMT: int_acc = widen_sum <short_a, int_acc>
2521 1. The tree-code that is used to create the vector operation in the
2522 epilog code (that reduces the partial results) is not the
2523 tree-code of STMT, but is rather the tree-code of the original
2524 stmt from the pattern that STMT is replacing. I.e, in the example
2525 above we want to use 'widen_sum' in the loop, but 'plus' in the
2527 2. The type (mode) we use to check available target support
2528 for the vector operation to be created in the *epilog*, is
2529 determined by the type of the reduction variable (in the example
2530 above we'd check this: plus_optab[vect_int_mode]).
2531 However the type (mode) we use to check available target support
2532 for the vector operation to be created *inside the loop*, is
2533 determined by the type of the other arguments to STMT (in the
2534 example we'd check this: widen_sum_optab[vect_short_mode]).
2536 This is contrary to "regular" reductions, in which the types of all
2537 the arguments are the same as the type of the reduction variable.
2538 For "regular" reductions we can therefore use the same vector type
2539 (and also the same tree-code) when generating the epilog code and
2540 when generating the code inside the loop. */
2544 /* This is a reduction pattern: get the vectype from the type of the
2545 reduction variable, and get the tree-code from orig_stmt. */
2546 orig_code = TREE_CODE (GIMPLE_STMT_OPERAND (orig_stmt, 1));
2547 vectype = get_vectype_for_scalar_type (TREE_TYPE (def));
2548 vec_mode = TYPE_MODE (vectype);
2552 /* Regular reduction: use the same vectype and tree-code as used for
2553 the vector code inside the loop can be used for the epilog code. */
2557 if (!reduction_code_for_scalar_code (orig_code, &epilog_reduc_code))
2559 reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype);
2562 if (vect_print_dump_info (REPORT_DETAILS))
2563 fprintf (vect_dump, "no optab for reduction.");
2564 epilog_reduc_code = NUM_TREE_CODES;
2566 if (optab_handler (reduc_optab, vec_mode)->insn_code == CODE_FOR_nothing)
2568 if (vect_print_dump_info (REPORT_DETAILS))
2569 fprintf (vect_dump, "reduc op not supported by target.");
2570 epilog_reduc_code = NUM_TREE_CODES;
2573 if (!vec_stmt) /* transformation not required. */
2575 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
2576 vect_model_reduction_cost (stmt_info, epilog_reduc_code, ncopies);
2582 if (vect_print_dump_info (REPORT_DETAILS))
2583 fprintf (vect_dump, "transform reduction.");
2585 /* Create the destination vector */
2586 vec_dest = vect_create_destination_var (scalar_dest, vectype);
2588 /* Create the reduction-phi that defines the reduction-operand. */
2589 new_phi = create_phi_node (vec_dest, loop->header);
2591 /* In case the vectorization factor (VF) is bigger than the number
2592 of elements that we can fit in a vectype (nunits), we have to generate
2593 more than one vector stmt - i.e - we need to "unroll" the
2594 vector stmt by a factor VF/nunits. For more details see documentation
2595 in vectorizable_operation. */
2597 prev_stmt_info = NULL;
2598 for (j = 0; j < ncopies; j++)
2603 op = TREE_OPERAND (operation, 0);
2604 loop_vec_def0 = vect_get_vec_def_for_operand (op, stmt, NULL);
2605 if (op_type == ternary_op)
2607 op = TREE_OPERAND (operation, 1);
2608 loop_vec_def1 = vect_get_vec_def_for_operand (op, stmt, NULL);
2611 /* Get the vector def for the reduction variable from the phi node */
2612 reduc_def = PHI_RESULT (new_phi);
2616 enum vect_def_type dt = vect_unknown_def_type; /* Dummy */
2617 loop_vec_def0 = vect_get_vec_def_for_stmt_copy (dt, loop_vec_def0);
2618 if (op_type == ternary_op)
2619 loop_vec_def1 = vect_get_vec_def_for_stmt_copy (dt, loop_vec_def1);
2621 /* Get the vector def for the reduction variable from the vectorized
2622 reduction operation generated in the previous iteration (j-1) */
2623 reduc_def = GIMPLE_STMT_OPERAND (new_stmt ,0);
2626 /* Arguments are ready. create the new vector stmt. */
2627 if (op_type == binary_op)
2628 expr = build2 (code, vectype, loop_vec_def0, reduc_def);
2630 expr = build3 (code, vectype, loop_vec_def0, loop_vec_def1,
2632 new_stmt = build_gimple_modify_stmt (vec_dest, expr);
2633 new_temp = make_ssa_name (vec_dest, new_stmt);
2634 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
2635 vect_finish_stmt_generation (stmt, new_stmt, bsi);
2638 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
2640 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
2641 prev_stmt_info = vinfo_for_stmt (new_stmt);
2644 /* Finalize the reduction-phi (set it's arguments) and create the
2645 epilog reduction code. */
2646 vect_create_epilog_for_reduction (new_temp, stmt, epilog_reduc_code, new_phi);
2650 /* Checks if CALL can be vectorized in type VECTYPE. Returns
2651 a function declaration if the target has a vectorized version
2652 of the function, or NULL_TREE if the function cannot be vectorized. */
2655 vectorizable_function (tree call, tree vectype_out, tree vectype_in)
2657 tree fndecl = get_callee_fndecl (call);
2658 enum built_in_function code;
2660 /* We only handle functions that do not read or clobber memory -- i.e.
2661 const or novops ones. */
2662 if (!(call_expr_flags (call) & (ECF_CONST | ECF_NOVOPS)))
2666 || TREE_CODE (fndecl) != FUNCTION_DECL
2667 || !DECL_BUILT_IN (fndecl))
2670 code = DECL_FUNCTION_CODE (fndecl);
2671 return targetm.vectorize.builtin_vectorized_function (code, vectype_out,
2675 /* Function vectorizable_call.
2677 Check if STMT performs a function call that can be vectorized.
2678 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
2679 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
2680 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
2683 vectorizable_call (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
2689 tree vec_oprnd0 = NULL_TREE, vec_oprnd1 = NULL_TREE;
2690 stmt_vec_info stmt_info = vinfo_for_stmt (stmt), prev_stmt_info;
2691 tree vectype_out, vectype_in;
2694 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2695 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2696 tree fndecl, rhs, new_temp, def, def_stmt, rhs_type, lhs_type;
2697 enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
2699 int ncopies, j, nargs;
2700 call_expr_arg_iterator iter;
2702 enum { NARROW, NONE, WIDEN } modifier;
2704 if (!STMT_VINFO_RELEVANT_P (stmt_info))
2707 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
2710 /* FORNOW: not yet supported. */
2711 if (STMT_VINFO_LIVE_P (stmt_info))
2713 if (vect_print_dump_info (REPORT_DETAILS))
2714 fprintf (vect_dump, "value used after loop.");
2718 /* Is STMT a vectorizable call? */
2719 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
2722 if (TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 0)) != SSA_NAME)
2725 operation = GIMPLE_STMT_OPERAND (stmt, 1);
2726 if (TREE_CODE (operation) != CALL_EXPR)
2729 /* Process function arguments. */
2730 rhs_type = NULL_TREE;
2732 FOR_EACH_CALL_EXPR_ARG (op, iter, operation)
2734 /* Bail out if the function has more than two arguments, we
2735 do not have interesting builtin functions to vectorize with
2736 more than two arguments. */
2740 /* We can only handle calls with arguments of the same type. */
2742 && rhs_type != TREE_TYPE (op))
2744 if (vect_print_dump_info (REPORT_DETAILS))
2745 fprintf (vect_dump, "argument types differ.");
2748 rhs_type = TREE_TYPE (op);
2750 if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt[nargs]))
2752 if (vect_print_dump_info (REPORT_DETAILS))
2753 fprintf (vect_dump, "use not simple.");
2760 /* No arguments is also not good. */
2764 vectype_in = get_vectype_for_scalar_type (rhs_type);
2765 nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
2767 lhs_type = TREE_TYPE (GIMPLE_STMT_OPERAND (stmt, 0));
2768 vectype_out = get_vectype_for_scalar_type (lhs_type);
2769 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
2772 if (nunits_in == nunits_out / 2)
2774 else if (nunits_out == nunits_in)
2776 else if (nunits_out == nunits_in / 2)
2781 /* For now, we only vectorize functions if a target specific builtin
2782 is available. TODO -- in some cases, it might be profitable to
2783 insert the calls for pieces of the vector, in order to be able
2784 to vectorize other operations in the loop. */
2785 fndecl = vectorizable_function (operation, vectype_out, vectype_in);
2786 if (fndecl == NULL_TREE)
2788 if (vect_print_dump_info (REPORT_DETAILS))
2789 fprintf (vect_dump, "function is not vectorizable.");
2794 gcc_assert (ZERO_SSA_OPERANDS (stmt, SSA_OP_ALL_VIRTUALS));
2796 if (modifier == NARROW)
2797 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_out;
2799 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in;
2801 /* Sanity check: make sure that at least one copy of the vectorized stmt
2802 needs to be generated. */
2803 gcc_assert (ncopies >= 1);
2805 /* FORNOW. This restriction should be relaxed. */
2806 if (nested_in_vect_loop_p (loop, stmt) && ncopies > 1)
2808 if (vect_print_dump_info (REPORT_DETAILS))
2809 fprintf (vect_dump, "multiple types in nested loop.");
2813 if (!vec_stmt) /* transformation not required. */
2815 STMT_VINFO_TYPE (stmt_info) = call_vec_info_type;
2816 if (vect_print_dump_info (REPORT_DETAILS))
2817 fprintf (vect_dump, "=== vectorizable_call ===");
2818 vect_model_simple_cost (stmt_info, ncopies, dt);
2824 if (vect_print_dump_info (REPORT_DETAILS))
2825 fprintf (vect_dump, "transform operation.");
2827 /* FORNOW. This restriction should be relaxed. */
2828 if (nested_in_vect_loop_p (loop, stmt) && ncopies > 1)
2830 if (vect_print_dump_info (REPORT_DETAILS))
2831 fprintf (vect_dump, "multiple types in nested loop.");
2836 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
2837 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
2839 prev_stmt_info = NULL;
2843 for (j = 0; j < ncopies; ++j)
2845 /* Build argument list for the vectorized call. */
2846 /* FIXME: Rewrite this so that it doesn't
2847 construct a temporary list. */
2850 FOR_EACH_CALL_EXPR_ARG (op, iter, operation)
2854 = vect_get_vec_def_for_operand (op, stmt, NULL);
2857 = vect_get_vec_def_for_stmt_copy (dt[nargs], vec_oprnd0);
2859 vargs = tree_cons (NULL_TREE, vec_oprnd0, vargs);
2863 vargs = nreverse (vargs);
2865 rhs = build_function_call_expr (fndecl, vargs);
2866 new_stmt = build_gimple_modify_stmt (vec_dest, rhs);
2867 new_temp = make_ssa_name (vec_dest, new_stmt);
2868 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
2870 vect_finish_stmt_generation (stmt, new_stmt, bsi);
2873 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
2875 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
2877 prev_stmt_info = vinfo_for_stmt (new_stmt);
2883 for (j = 0; j < ncopies; ++j)
2885 /* Build argument list for the vectorized call. */
2886 /* FIXME: Rewrite this so that it doesn't
2887 construct a temporary list. */
2890 FOR_EACH_CALL_EXPR_ARG (op, iter, operation)
2895 = vect_get_vec_def_for_operand (op, stmt, NULL);
2897 = vect_get_vec_def_for_stmt_copy (dt[nargs], vec_oprnd0);
2902 = vect_get_vec_def_for_stmt_copy (dt[nargs], vec_oprnd1);
2904 = vect_get_vec_def_for_stmt_copy (dt[nargs], vec_oprnd0);
2907 vargs = tree_cons (NULL_TREE, vec_oprnd0, vargs);
2908 vargs = tree_cons (NULL_TREE, vec_oprnd1, vargs);
2912 vargs = nreverse (vargs);
2914 rhs = build_function_call_expr (fndecl, vargs);
2915 new_stmt = build_gimple_modify_stmt (vec_dest, rhs);
2916 new_temp = make_ssa_name (vec_dest, new_stmt);
2917 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
2919 vect_finish_stmt_generation (stmt, new_stmt, bsi);
2922 STMT_VINFO_VEC_STMT (stmt_info) = new_stmt;
2924 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
2926 prev_stmt_info = vinfo_for_stmt (new_stmt);
2929 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
2934 /* No current target implements this case. */
2938 /* The call in STMT might prevent it from being removed in dce.
2939 We however cannot remove it here, due to the way the ssa name
2940 it defines is mapped to the new definition. So just replace
2941 rhs of the statement with something harmless. */
2942 type = TREE_TYPE (scalar_dest);
2943 GIMPLE_STMT_OPERAND (stmt, 1) = fold_convert (type, integer_zero_node);
2950 /* Function vect_gen_widened_results_half
2952 Create a vector stmt whose code, type, number of arguments, and result
2953 variable are CODE, VECTYPE, OP_TYPE, and VEC_DEST, and its arguments are
2954 VEC_OPRND0 and VEC_OPRND1. The new vector stmt is to be inserted at BSI.
2955 In the case that CODE is a CALL_EXPR, this means that a call to DECL
2956 needs to be created (DECL is a function-decl of a target-builtin).
2957 STMT is the original scalar stmt that we are vectorizing. */
2960 vect_gen_widened_results_half (enum tree_code code, tree vectype, tree decl,
2961 tree vec_oprnd0, tree vec_oprnd1, int op_type,
2962 tree vec_dest, block_stmt_iterator *bsi,
2971 /* Generate half of the widened result: */
2972 if (code == CALL_EXPR)
2974 /* Target specific support */
2975 if (op_type == binary_op)
2976 expr = build_call_expr (decl, 2, vec_oprnd0, vec_oprnd1);
2978 expr = build_call_expr (decl, 1, vec_oprnd0);
2982 /* Generic support */
2983 gcc_assert (op_type == TREE_CODE_LENGTH (code));
2984 if (op_type == binary_op)
2985 expr = build2 (code, vectype, vec_oprnd0, vec_oprnd1);
2987 expr = build1 (code, vectype, vec_oprnd0);
2989 new_stmt = build_gimple_modify_stmt (vec_dest, expr);
2990 new_temp = make_ssa_name (vec_dest, new_stmt);
2991 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
2992 vect_finish_stmt_generation (stmt, new_stmt, bsi);
2994 if (code == CALL_EXPR)
2996 FOR_EACH_SSA_TREE_OPERAND (sym, new_stmt, iter, SSA_OP_ALL_VIRTUALS)
2998 if (TREE_CODE (sym) == SSA_NAME)
2999 sym = SSA_NAME_VAR (sym);
3000 mark_sym_for_renaming (sym);
3008 /* Function vectorizable_conversion.
3010 Check if STMT performs a conversion operation, that can be vectorized.
3011 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
3012 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
3013 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
3016 vectorizable_conversion (tree stmt, block_stmt_iterator * bsi,
3023 tree vec_oprnd0 = NULL_TREE, vec_oprnd1 = NULL_TREE;
3024 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3025 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3026 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3027 enum tree_code code, code1 = ERROR_MARK, code2 = ERROR_MARK;
3028 tree decl1 = NULL_TREE, decl2 = NULL_TREE;
3031 enum vect_def_type dt0;
3033 stmt_vec_info prev_stmt_info;
3036 tree vectype_out, vectype_in;
3039 tree rhs_type, lhs_type;
3041 enum { NARROW, NONE, WIDEN } modifier;
3043 /* Is STMT a vectorizable conversion? */
3045 if (!STMT_VINFO_RELEVANT_P (stmt_info))
3048 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
3051 if (STMT_VINFO_LIVE_P (stmt_info))
3053 /* FORNOW: not yet supported. */
3054 if (vect_print_dump_info (REPORT_DETAILS))
3055 fprintf (vect_dump, "value used after loop.");
3059 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
3062 if (TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 0)) != SSA_NAME)
3065 operation = GIMPLE_STMT_OPERAND (stmt, 1);
3066 code = TREE_CODE (operation);
3067 if (code != FIX_TRUNC_EXPR && code != FLOAT_EXPR)
3070 /* Check types of lhs and rhs */
3071 op0 = TREE_OPERAND (operation, 0);
3072 rhs_type = TREE_TYPE (op0);
3073 vectype_in = get_vectype_for_scalar_type (rhs_type);
3074 nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
3076 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
3077 lhs_type = TREE_TYPE (scalar_dest);
3078 vectype_out = get_vectype_for_scalar_type (lhs_type);
3079 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
3082 if (nunits_in == nunits_out / 2)
3084 else if (nunits_out == nunits_in)
3086 else if (nunits_out == nunits_in / 2)
3091 if (modifier == NONE)
3092 gcc_assert (STMT_VINFO_VECTYPE (stmt_info) == vectype_out);
3094 /* Bail out if the types are both integral or non-integral */
3095 if ((INTEGRAL_TYPE_P (rhs_type) && INTEGRAL_TYPE_P (lhs_type))
3096 || (!INTEGRAL_TYPE_P (rhs_type) && !INTEGRAL_TYPE_P (lhs_type)))
3099 if (modifier == NARROW)
3100 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_out;
3102 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in;
3104 /* Sanity check: make sure that at least one copy of the vectorized stmt
3105 needs to be generated. */
3106 gcc_assert (ncopies >= 1);
3108 /* FORNOW. This restriction should be relaxed. */
3109 if (nested_in_vect_loop_p (loop, stmt) && ncopies > 1)
3111 if (vect_print_dump_info (REPORT_DETAILS))
3112 fprintf (vect_dump, "multiple types in nested loop.");
3116 /* Check the operands of the operation. */
3117 if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt0))
3119 if (vect_print_dump_info (REPORT_DETAILS))
3120 fprintf (vect_dump, "use not simple.");
3124 /* Supportable by target? */
3125 if ((modifier == NONE
3126 && !targetm.vectorize.builtin_conversion (code, vectype_in))
3127 || (modifier == WIDEN
3128 && !supportable_widening_operation (code, stmt, vectype_in,
3131 || (modifier == NARROW
3132 && !supportable_narrowing_operation (code, stmt, vectype_in,
3135 if (vect_print_dump_info (REPORT_DETAILS))
3136 fprintf (vect_dump, "op not supported by target.");
3140 if (modifier != NONE)
3141 STMT_VINFO_VECTYPE (stmt_info) = vectype_in;
3143 if (!vec_stmt) /* transformation not required. */
3145 STMT_VINFO_TYPE (stmt_info) = type_conversion_vec_info_type;
3150 if (vect_print_dump_info (REPORT_DETAILS))
3151 fprintf (vect_dump, "transform conversion.");
3154 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
3156 prev_stmt_info = NULL;
3160 for (j = 0; j < ncopies; j++)
3166 vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
3168 vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt0, vec_oprnd0);
3171 targetm.vectorize.builtin_conversion (code, vectype_in);
3172 new_stmt = build_call_expr (builtin_decl, 1, vec_oprnd0);
3174 /* Arguments are ready. create the new vector stmt. */
3175 new_stmt = build_gimple_modify_stmt (vec_dest, new_stmt);
3176 new_temp = make_ssa_name (vec_dest, new_stmt);
3177 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
3178 vect_finish_stmt_generation (stmt, new_stmt, bsi);
3179 FOR_EACH_SSA_TREE_OPERAND (sym, new_stmt, iter, SSA_OP_ALL_VIRTUALS)
3181 if (TREE_CODE (sym) == SSA_NAME)
3182 sym = SSA_NAME_VAR (sym);
3183 mark_sym_for_renaming (sym);
3187 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
3189 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3190 prev_stmt_info = vinfo_for_stmt (new_stmt);
3195 /* In case the vectorization factor (VF) is bigger than the number
3196 of elements that we can fit in a vectype (nunits), we have to
3197 generate more than one vector stmt - i.e - we need to "unroll"
3198 the vector stmt by a factor VF/nunits. */
3199 for (j = 0; j < ncopies; j++)
3202 vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
3204 vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt0, vec_oprnd0);
3206 STMT_VINFO_VECTYPE (stmt_info) = vectype_in;
3208 /* Generate first half of the widened result: */
3210 = vect_gen_widened_results_half (code1, vectype_out, decl1,
3211 vec_oprnd0, vec_oprnd1,
3212 unary_op, vec_dest, bsi, stmt);
3214 STMT_VINFO_VEC_STMT (stmt_info) = new_stmt;
3216 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3217 prev_stmt_info = vinfo_for_stmt (new_stmt);
3219 /* Generate second half of the widened result: */
3221 = vect_gen_widened_results_half (code2, vectype_out, decl2,
3222 vec_oprnd0, vec_oprnd1,
3223 unary_op, vec_dest, bsi, stmt);
3224 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3225 prev_stmt_info = vinfo_for_stmt (new_stmt);
3230 /* In case the vectorization factor (VF) is bigger than the number
3231 of elements that we can fit in a vectype (nunits), we have to
3232 generate more than one vector stmt - i.e - we need to "unroll"
3233 the vector stmt by a factor VF/nunits. */
3234 for (j = 0; j < ncopies; j++)
3239 vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
3240 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt0, vec_oprnd0);
3244 vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt0, vec_oprnd1);
3245 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt0, vec_oprnd0);
3248 /* Arguments are ready. Create the new vector stmt. */
3249 expr = build2 (code1, vectype_out, vec_oprnd0, vec_oprnd1);
3250 new_stmt = build_gimple_modify_stmt (vec_dest, expr);
3251 new_temp = make_ssa_name (vec_dest, new_stmt);
3252 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
3253 vect_finish_stmt_generation (stmt, new_stmt, bsi);
3256 STMT_VINFO_VEC_STMT (stmt_info) = new_stmt;
3258 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3260 prev_stmt_info = vinfo_for_stmt (new_stmt);
3263 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
3269 /* Function vectorizable_assignment.
3271 Check if STMT performs an assignment (copy) that can be vectorized.
3272 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
3273 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
3274 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
3277 vectorizable_assignment (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
3283 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3284 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3285 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3288 enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
3289 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
3290 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
3292 gcc_assert (ncopies >= 1);
3294 return false; /* FORNOW */
3296 if (!STMT_VINFO_RELEVANT_P (stmt_info))
3299 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
3302 /* FORNOW: not yet supported. */
3303 if (STMT_VINFO_LIVE_P (stmt_info))
3305 if (vect_print_dump_info (REPORT_DETAILS))
3306 fprintf (vect_dump, "value used after loop.");
3310 /* Is vectorizable assignment? */
3311 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
3314 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
3315 if (TREE_CODE (scalar_dest) != SSA_NAME)
3318 op = GIMPLE_STMT_OPERAND (stmt, 1);
3319 if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt[0]))
3321 if (vect_print_dump_info (REPORT_DETAILS))
3322 fprintf (vect_dump, "use not simple.");
3326 if (!vec_stmt) /* transformation not required. */
3328 STMT_VINFO_TYPE (stmt_info) = assignment_vec_info_type;
3329 if (vect_print_dump_info (REPORT_DETAILS))
3330 fprintf (vect_dump, "=== vectorizable_assignment ===");
3331 vect_model_simple_cost (stmt_info, ncopies, dt);
3336 if (vect_print_dump_info (REPORT_DETAILS))
3337 fprintf (vect_dump, "transform assignment.");
3340 vec_dest = vect_create_destination_var (scalar_dest, vectype);
3343 op = GIMPLE_STMT_OPERAND (stmt, 1);
3344 vec_oprnd = vect_get_vec_def_for_operand (op, stmt, NULL);
3346 /* Arguments are ready. create the new vector stmt. */
3347 *vec_stmt = build_gimple_modify_stmt (vec_dest, vec_oprnd);
3348 new_temp = make_ssa_name (vec_dest, *vec_stmt);
3349 GIMPLE_STMT_OPERAND (*vec_stmt, 0) = new_temp;
3350 vect_finish_stmt_generation (stmt, *vec_stmt, bsi);
3356 /* Function vect_min_worthwhile_factor.
3358 For a loop where we could vectorize the operation indicated by CODE,
3359 return the minimum vectorization factor that makes it worthwhile
3360 to use generic vectors. */
3362 vect_min_worthwhile_factor (enum tree_code code)
3383 /* Function vectorizable_induction
3385 Check if PHI performs an induction computation that can be vectorized.
3386 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
3387 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
3388 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
3391 vectorizable_induction (tree phi, block_stmt_iterator *bsi ATTRIBUTE_UNUSED,
3394 stmt_vec_info stmt_info = vinfo_for_stmt (phi);
3395 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3396 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3397 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
3398 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
3401 gcc_assert (ncopies >= 1);
3403 if (!STMT_VINFO_RELEVANT_P (stmt_info))
3406 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def);
3408 if (STMT_VINFO_LIVE_P (stmt_info))
3410 /* FORNOW: not yet supported. */
3411 if (vect_print_dump_info (REPORT_DETAILS))
3412 fprintf (vect_dump, "value used after loop.");
3416 if (TREE_CODE (phi) != PHI_NODE)
3419 if (!vec_stmt) /* transformation not required. */
3421 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
3422 if (vect_print_dump_info (REPORT_DETAILS))
3423 fprintf (vect_dump, "=== vectorizable_induction ===");
3424 vect_model_induction_cost (stmt_info, ncopies);
3430 if (vect_print_dump_info (REPORT_DETAILS))
3431 fprintf (vect_dump, "transform induction phi.");
3433 vec_def = get_initial_def_for_induction (phi);
3434 *vec_stmt = SSA_NAME_DEF_STMT (vec_def);
3439 /* Function vectorizable_operation.
3441 Check if STMT performs a binary or unary operation that can be vectorized.
3442 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
3443 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
3444 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
3447 vectorizable_operation (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
3452 tree op0, op1 = NULL;
3453 tree vec_oprnd0 = NULL_TREE, vec_oprnd1 = NULL_TREE;
3454 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3455 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3456 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3457 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3458 enum tree_code code;
3459 enum machine_mode vec_mode;
3464 enum machine_mode optab_op2_mode;
3466 enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
3468 stmt_vec_info prev_stmt_info;
3469 int nunits_in = TYPE_VECTOR_SUBPARTS (vectype);
3472 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in;
3475 gcc_assert (ncopies >= 1);
3476 /* FORNOW. This restriction should be relaxed. */
3477 if (nested_in_vect_loop_p (loop, stmt) && ncopies > 1)
3479 if (vect_print_dump_info (REPORT_DETAILS))
3480 fprintf (vect_dump, "multiple types in nested loop.");
3484 if (!STMT_VINFO_RELEVANT_P (stmt_info))
3487 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
3490 /* FORNOW: not yet supported. */
3491 if (STMT_VINFO_LIVE_P (stmt_info))
3493 if (vect_print_dump_info (REPORT_DETAILS))
3494 fprintf (vect_dump, "value used after loop.");
3498 /* Is STMT a vectorizable binary/unary operation? */
3499 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
3502 if (TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 0)) != SSA_NAME)
3505 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
3506 vectype_out = get_vectype_for_scalar_type (TREE_TYPE (scalar_dest));
3507 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
3508 if (nunits_out != nunits_in)
3511 operation = GIMPLE_STMT_OPERAND (stmt, 1);
3512 code = TREE_CODE (operation);
3514 /* For pointer addition, we should use the normal plus for
3515 the vector addition. */
3516 if (code == POINTER_PLUS_EXPR)
3519 optab = optab_for_tree_code (code, vectype);
3521 /* Support only unary or binary operations. */
3522 op_type = TREE_OPERAND_LENGTH (operation);
3523 if (op_type != unary_op && op_type != binary_op)
3525 if (vect_print_dump_info (REPORT_DETAILS))
3526 fprintf (vect_dump, "num. args = %d (not unary/binary op).", op_type);
3530 op0 = TREE_OPERAND (operation, 0);
3531 if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt[0]))
3533 if (vect_print_dump_info (REPORT_DETAILS))
3534 fprintf (vect_dump, "use not simple.");
3538 if (op_type == binary_op)
3540 op1 = TREE_OPERAND (operation, 1);
3541 if (!vect_is_simple_use (op1, loop_vinfo, &def_stmt, &def, &dt[1]))
3543 if (vect_print_dump_info (REPORT_DETAILS))
3544 fprintf (vect_dump, "use not simple.");
3549 /* Supportable by target? */
3552 if (vect_print_dump_info (REPORT_DETAILS))
3553 fprintf (vect_dump, "no optab.");
3556 vec_mode = TYPE_MODE (vectype);
3557 icode = (int) optab_handler (optab, vec_mode)->insn_code;
3558 if (icode == CODE_FOR_nothing)
3560 if (vect_print_dump_info (REPORT_DETAILS))
3561 fprintf (vect_dump, "op not supported by target.");
3562 if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
3563 || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
3564 < vect_min_worthwhile_factor (code))
3566 if (vect_print_dump_info (REPORT_DETAILS))
3567 fprintf (vect_dump, "proceeding using word mode.");
3570 /* Worthwhile without SIMD support? */
3571 if (!VECTOR_MODE_P (TYPE_MODE (vectype))
3572 && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
3573 < vect_min_worthwhile_factor (code))
3575 if (vect_print_dump_info (REPORT_DETAILS))
3576 fprintf (vect_dump, "not worthwhile without SIMD support.");
3580 if (code == LSHIFT_EXPR || code == RSHIFT_EXPR)
3582 /* FORNOW: not yet supported. */
3583 if (!VECTOR_MODE_P (vec_mode))
3586 /* Invariant argument is needed for a vector shift
3587 by a scalar shift operand. */
3588 optab_op2_mode = insn_data[icode].operand[2].mode;
3589 if (! (VECTOR_MODE_P (optab_op2_mode)
3590 || dt[1] == vect_constant_def
3591 || dt[1] == vect_invariant_def))
3593 if (vect_print_dump_info (REPORT_DETAILS))
3594 fprintf (vect_dump, "operand mode requires invariant argument.");
3599 if (!vec_stmt) /* transformation not required. */
3601 STMT_VINFO_TYPE (stmt_info) = op_vec_info_type;
3602 if (vect_print_dump_info (REPORT_DETAILS))
3603 fprintf (vect_dump, "=== vectorizable_operation ===");
3604 vect_model_simple_cost (stmt_info, ncopies, dt);
3610 if (vect_print_dump_info (REPORT_DETAILS))
3611 fprintf (vect_dump, "transform binary/unary operation.");
3614 vec_dest = vect_create_destination_var (scalar_dest, vectype);
3616 /* In case the vectorization factor (VF) is bigger than the number
3617 of elements that we can fit in a vectype (nunits), we have to generate
3618 more than one vector stmt - i.e - we need to "unroll" the
3619 vector stmt by a factor VF/nunits. In doing so, we record a pointer
3620 from one copy of the vector stmt to the next, in the field
3621 STMT_VINFO_RELATED_STMT. This is necessary in order to allow following
3622 stages to find the correct vector defs to be used when vectorizing
3623 stmts that use the defs of the current stmt. The example below illustrates
3624 the vectorization process when VF=16 and nunits=4 (i.e - we need to create
3625 4 vectorized stmts):
3627 before vectorization:
3628 RELATED_STMT VEC_STMT
3632 step 1: vectorize stmt S1 (done in vectorizable_load. See more details
3634 RELATED_STMT VEC_STMT
3635 VS1_0: vx0 = memref0 VS1_1 -
3636 VS1_1: vx1 = memref1 VS1_2 -
3637 VS1_2: vx2 = memref2 VS1_3 -
3638 VS1_3: vx3 = memref3 - -
3639 S1: x = load - VS1_0
3642 step2: vectorize stmt S2 (done here):
3643 To vectorize stmt S2 we first need to find the relevant vector
3644 def for the first operand 'x'. This is, as usual, obtained from
3645 the vector stmt recorded in the STMT_VINFO_VEC_STMT of the stmt
3646 that defines 'x' (S1). This way we find the stmt VS1_0, and the
3647 relevant vector def 'vx0'. Having found 'vx0' we can generate
3648 the vector stmt VS2_0, and as usual, record it in the
3649 STMT_VINFO_VEC_STMT of stmt S2.
3650 When creating the second copy (VS2_1), we obtain the relevant vector
3651 def from the vector stmt recorded in the STMT_VINFO_RELATED_STMT of
3652 stmt VS1_0. This way we find the stmt VS1_1 and the relevant
3653 vector def 'vx1'. Using 'vx1' we create stmt VS2_1 and record a
3654 pointer to it in the STMT_VINFO_RELATED_STMT of the vector stmt VS2_0.
3655 Similarly when creating stmts VS2_2 and VS2_3. This is the resulting
3656 chain of stmts and pointers:
3657 RELATED_STMT VEC_STMT
3658 VS1_0: vx0 = memref0 VS1_1 -
3659 VS1_1: vx1 = memref1 VS1_2 -
3660 VS1_2: vx2 = memref2 VS1_3 -
3661 VS1_3: vx3 = memref3 - -
3662 S1: x = load - VS1_0
3663 VS2_0: vz0 = vx0 + v1 VS2_1 -
3664 VS2_1: vz1 = vx1 + v1 VS2_2 -
3665 VS2_2: vz2 = vx2 + v1 VS2_3 -
3666 VS2_3: vz3 = vx3 + v1 - -
3667 S2: z = x + 1 - VS2_0 */
3669 prev_stmt_info = NULL;
3670 for (j = 0; j < ncopies; j++)
3675 vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
3676 if (op_type == binary_op)
3678 if (code == LSHIFT_EXPR || code == RSHIFT_EXPR)
3680 /* Vector shl and shr insn patterns can be defined with
3681 scalar operand 2 (shift operand). In this case, use
3682 constant or loop invariant op1 directly, without
3683 extending it to vector mode first. */
3684 optab_op2_mode = insn_data[icode].operand[2].mode;
3685 if (!VECTOR_MODE_P (optab_op2_mode))
3687 if (vect_print_dump_info (REPORT_DETAILS))
3688 fprintf (vect_dump, "operand 1 using scalar mode.");
3693 vec_oprnd1 = vect_get_vec_def_for_operand (op1, stmt, NULL);
3698 vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd0);
3699 if (op_type == binary_op)
3700 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt[1], vec_oprnd1);
3703 /* Arguments are ready. create the new vector stmt. */
3705 if (op_type == binary_op)
3706 new_stmt = build_gimple_modify_stmt (vec_dest,
3707 build2 (code, vectype, vec_oprnd0, vec_oprnd1));
3709 new_stmt = build_gimple_modify_stmt (vec_dest,
3710 build1 (code, vectype, vec_oprnd0));
3711 new_temp = make_ssa_name (vec_dest, new_stmt);
3712 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
3713 vect_finish_stmt_generation (stmt, new_stmt, bsi);
3716 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
3718 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3719 prev_stmt_info = vinfo_for_stmt (new_stmt);
3726 /* Function vectorizable_type_demotion
3728 Check if STMT performs a binary or unary operation that involves
3729 type demotion, and if it can be vectorized.
3730 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
3731 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
3732 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
3735 vectorizable_type_demotion (tree stmt, block_stmt_iterator *bsi,
3742 tree vec_oprnd0=NULL, vec_oprnd1=NULL;
3743 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3744 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3745 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3746 enum tree_code code, code1 = ERROR_MARK;
3749 enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
3751 stmt_vec_info prev_stmt_info;
3760 if (!STMT_VINFO_RELEVANT_P (stmt_info))
3763 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
3766 /* FORNOW: not yet supported. */
3767 if (STMT_VINFO_LIVE_P (stmt_info))
3769 if (vect_print_dump_info (REPORT_DETAILS))
3770 fprintf (vect_dump, "value used after loop.");
3774 /* Is STMT a vectorizable type-demotion operation? */
3775 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
3778 if (TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 0)) != SSA_NAME)
3781 operation = GIMPLE_STMT_OPERAND (stmt, 1);
3782 code = TREE_CODE (operation);
3783 if (code != NOP_EXPR && code != CONVERT_EXPR)
3786 op0 = TREE_OPERAND (operation, 0);
3787 vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op0));
3788 nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
3790 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
3791 vectype_out = get_vectype_for_scalar_type (TREE_TYPE (scalar_dest));
3792 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
3793 if (nunits_in != nunits_out / 2) /* FORNOW */
3796 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_out;
3797 gcc_assert (ncopies >= 1);
3798 /* FORNOW. This restriction should be relaxed. */
3799 if (nested_in_vect_loop_p (loop, stmt) && ncopies > 1)
3801 if (vect_print_dump_info (REPORT_DETAILS))
3802 fprintf (vect_dump, "multiple types in nested loop.");
3806 if (! ((INTEGRAL_TYPE_P (TREE_TYPE (scalar_dest))
3807 && INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3808 || (SCALAR_FLOAT_TYPE_P (TREE_TYPE (scalar_dest))
3809 && SCALAR_FLOAT_TYPE_P (TREE_TYPE (op0))
3810 && (code == NOP_EXPR || code == CONVERT_EXPR))))
3813 /* Check the operands of the operation. */
3814 if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt[0]))
3816 if (vect_print_dump_info (REPORT_DETAILS))
3817 fprintf (vect_dump, "use not simple.");
3821 /* Supportable by target? */
3822 if (!supportable_narrowing_operation (code, stmt, vectype_in, &code1))
3825 STMT_VINFO_VECTYPE (stmt_info) = vectype_in;
3827 if (!vec_stmt) /* transformation not required. */
3829 STMT_VINFO_TYPE (stmt_info) = type_demotion_vec_info_type;
3830 if (vect_print_dump_info (REPORT_DETAILS))
3831 fprintf (vect_dump, "=== vectorizable_demotion ===");
3832 vect_model_simple_cost (stmt_info, ncopies, dt);
3837 if (vect_print_dump_info (REPORT_DETAILS))
3838 fprintf (vect_dump, "transform type demotion operation. ncopies = %d.",
3842 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
3844 /* In case the vectorization factor (VF) is bigger than the number
3845 of elements that we can fit in a vectype (nunits), we have to generate
3846 more than one vector stmt - i.e - we need to "unroll" the
3847 vector stmt by a factor VF/nunits. */
3848 prev_stmt_info = NULL;
3849 for (j = 0; j < ncopies; j++)
3854 vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
3855 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd0);
3859 vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd1);
3860 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd0);
3863 /* Arguments are ready. Create the new vector stmt. */
3864 expr = build2 (code1, vectype_out, vec_oprnd0, vec_oprnd1);
3865 new_stmt = build_gimple_modify_stmt (vec_dest, expr);
3866 new_temp = make_ssa_name (vec_dest, new_stmt);
3867 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
3868 vect_finish_stmt_generation (stmt, new_stmt, bsi);
3871 STMT_VINFO_VEC_STMT (stmt_info) = new_stmt;
3873 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3875 prev_stmt_info = vinfo_for_stmt (new_stmt);
3878 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
3883 /* Function vectorizable_type_promotion
3885 Check if STMT performs a binary or unary operation that involves
3886 type promotion, and if it can be vectorized.
3887 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
3888 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
3889 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
3892 vectorizable_type_promotion (tree stmt, block_stmt_iterator *bsi,
3898 tree op0, op1 = NULL;
3899 tree vec_oprnd0=NULL, vec_oprnd1=NULL;
3900 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3901 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3902 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3903 enum tree_code code, code1 = ERROR_MARK, code2 = ERROR_MARK;
3904 tree decl1 = NULL_TREE, decl2 = NULL_TREE;
3907 enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
3909 stmt_vec_info prev_stmt_info;
3917 if (!STMT_VINFO_RELEVANT_P (stmt_info))
3920 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
3923 /* FORNOW: not yet supported. */
3924 if (STMT_VINFO_LIVE_P (stmt_info))
3926 if (vect_print_dump_info (REPORT_DETAILS))
3927 fprintf (vect_dump, "value used after loop.");
3931 /* Is STMT a vectorizable type-promotion operation? */
3932 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
3935 if (TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 0)) != SSA_NAME)
3938 operation = GIMPLE_STMT_OPERAND (stmt, 1);
3939 code = TREE_CODE (operation);
3940 if (code != NOP_EXPR && code != CONVERT_EXPR
3941 && code != WIDEN_MULT_EXPR)
3944 op0 = TREE_OPERAND (operation, 0);
3945 vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op0));
3946 nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
3948 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
3949 vectype_out = get_vectype_for_scalar_type (TREE_TYPE (scalar_dest));
3950 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
3951 if (nunits_out != nunits_in / 2) /* FORNOW */
3954 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in;
3955 gcc_assert (ncopies >= 1);
3956 /* FORNOW. This restriction should be relaxed. */
3957 if (nested_in_vect_loop_p (loop, stmt) && ncopies > 1)
3959 if (vect_print_dump_info (REPORT_DETAILS))
3960 fprintf (vect_dump, "multiple types in nested loop.");
3964 if (! ((INTEGRAL_TYPE_P (TREE_TYPE (scalar_dest))
3965 && INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3966 || (SCALAR_FLOAT_TYPE_P (TREE_TYPE (scalar_dest))
3967 && SCALAR_FLOAT_TYPE_P (TREE_TYPE (op0))
3968 && (code == CONVERT_EXPR || code == NOP_EXPR))))
3971 /* Check the operands of the operation. */
3972 if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt[0]))
3974 if (vect_print_dump_info (REPORT_DETAILS))
3975 fprintf (vect_dump, "use not simple.");
3979 op_type = TREE_CODE_LENGTH (code);
3980 if (op_type == binary_op)
3982 op1 = TREE_OPERAND (operation, 1);
3983 if (!vect_is_simple_use (op1, loop_vinfo, &def_stmt, &def, &dt[1]))
3985 if (vect_print_dump_info (REPORT_DETAILS))
3986 fprintf (vect_dump, "use not simple.");
3991 /* Supportable by target? */
3992 if (!supportable_widening_operation (code, stmt, vectype_in,
3993 &decl1, &decl2, &code1, &code2))
3996 STMT_VINFO_VECTYPE (stmt_info) = vectype_in;
3998 if (!vec_stmt) /* transformation not required. */
4000 STMT_VINFO_TYPE (stmt_info) = type_promotion_vec_info_type;
4001 if (vect_print_dump_info (REPORT_DETAILS))
4002 fprintf (vect_dump, "=== vectorizable_promotion ===");
4003 vect_model_simple_cost (stmt_info, 2*ncopies, dt);
4009 if (vect_print_dump_info (REPORT_DETAILS))
4010 fprintf (vect_dump, "transform type promotion operation. ncopies = %d.",
4014 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
4016 /* In case the vectorization factor (VF) is bigger than the number
4017 of elements that we can fit in a vectype (nunits), we have to generate
4018 more than one vector stmt - i.e - we need to "unroll" the
4019 vector stmt by a factor VF/nunits. */
4021 prev_stmt_info = NULL;
4022 for (j = 0; j < ncopies; j++)
4027 vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
4028 if (op_type == binary_op)
4029 vec_oprnd1 = vect_get_vec_def_for_operand (op1, stmt, NULL);
4033 vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd0);
4034 if (op_type == binary_op)
4035 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt[1], vec_oprnd1);
4038 /* Arguments are ready. Create the new vector stmt. We are creating
4039 two vector defs because the widened result does not fit in one vector.
4040 The vectorized stmt can be expressed as a call to a taregt builtin,
4041 or a using a tree-code. */
4042 /* Generate first half of the widened result: */
4043 new_stmt = vect_gen_widened_results_half (code1, vectype_out, decl1,
4044 vec_oprnd0, vec_oprnd1, op_type, vec_dest, bsi, stmt);
4046 STMT_VINFO_VEC_STMT (stmt_info) = new_stmt;
4048 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
4049 prev_stmt_info = vinfo_for_stmt (new_stmt);
4051 /* Generate second half of the widened result: */
4052 new_stmt = vect_gen_widened_results_half (code2, vectype_out, decl2,
4053 vec_oprnd0, vec_oprnd1, op_type, vec_dest, bsi, stmt);
4054 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
4055 prev_stmt_info = vinfo_for_stmt (new_stmt);
4059 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
4064 /* Function vect_strided_store_supported.
4066 Returns TRUE is INTERLEAVE_HIGH and INTERLEAVE_LOW operations are supported,
4067 and FALSE otherwise. */
4070 vect_strided_store_supported (tree vectype)
4072 optab interleave_high_optab, interleave_low_optab;
4075 mode = (int) TYPE_MODE (vectype);
4077 /* Check that the operation is supported. */
4078 interleave_high_optab = optab_for_tree_code (VEC_INTERLEAVE_HIGH_EXPR,
4080 interleave_low_optab = optab_for_tree_code (VEC_INTERLEAVE_LOW_EXPR,
4082 if (!interleave_high_optab || !interleave_low_optab)
4084 if (vect_print_dump_info (REPORT_DETAILS))
4085 fprintf (vect_dump, "no optab for interleave.");
4089 if (optab_handler (interleave_high_optab, mode)->insn_code
4091 || optab_handler (interleave_low_optab, mode)->insn_code
4092 == CODE_FOR_nothing)
4094 if (vect_print_dump_info (REPORT_DETAILS))
4095 fprintf (vect_dump, "interleave op not supported by target.");
4102 /* Function vect_permute_store_chain.
4104 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
4105 a power of 2, generate interleave_high/low stmts to reorder the data
4106 correctly for the stores. Return the final references for stores in
4109 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4110 The input is 4 vectors each containing 8 elements. We assign a number to each
4111 element, the input sequence is:
4113 1st vec: 0 1 2 3 4 5 6 7
4114 2nd vec: 8 9 10 11 12 13 14 15
4115 3rd vec: 16 17 18 19 20 21 22 23
4116 4th vec: 24 25 26 27 28 29 30 31
4118 The output sequence should be:
4120 1st vec: 0 8 16 24 1 9 17 25
4121 2nd vec: 2 10 18 26 3 11 19 27
4122 3rd vec: 4 12 20 28 5 13 21 30
4123 4th vec: 6 14 22 30 7 15 23 31
4125 i.e., we interleave the contents of the four vectors in their order.
4127 We use interleave_high/low instructions to create such output. The input of
4128 each interleave_high/low operation is two vectors:
4131 the even elements of the result vector are obtained left-to-right from the
4132 high/low elements of the first vector. The odd elements of the result are
4133 obtained left-to-right from the high/low elements of the second vector.
4134 The output of interleave_high will be: 0 4 1 5
4135 and of interleave_low: 2 6 3 7
4138 The permutation is done in log LENGTH stages. In each stage interleave_high
4139 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
4140 where the first argument is taken from the first half of DR_CHAIN and the
4141 second argument from it's second half.
4144 I1: interleave_high (1st vec, 3rd vec)
4145 I2: interleave_low (1st vec, 3rd vec)
4146 I3: interleave_high (2nd vec, 4th vec)
4147 I4: interleave_low (2nd vec, 4th vec)
4149 The output for the first stage is:
4151 I1: 0 16 1 17 2 18 3 19
4152 I2: 4 20 5 21 6 22 7 23
4153 I3: 8 24 9 25 10 26 11 27
4154 I4: 12 28 13 29 14 30 15 31
4156 The output of the second stage, i.e. the final result is:
4158 I1: 0 8 16 24 1 9 17 25
4159 I2: 2 10 18 26 3 11 19 27
4160 I3: 4 12 20 28 5 13 21 30
4161 I4: 6 14 22 30 7 15 23 31. */
4164 vect_permute_store_chain (VEC(tree,heap) *dr_chain,
4165 unsigned int length,
4167 block_stmt_iterator *bsi,
4168 VEC(tree,heap) **result_chain)
4170 tree perm_dest, perm_stmt, vect1, vect2, high, low;
4171 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
4172 tree scalar_dest, tmp;
4175 VEC(tree,heap) *first, *second;
4177 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
4178 first = VEC_alloc (tree, heap, length/2);
4179 second = VEC_alloc (tree, heap, length/2);
4181 /* Check that the operation is supported. */
4182 if (!vect_strided_store_supported (vectype))
4185 *result_chain = VEC_copy (tree, heap, dr_chain);
4187 for (i = 0; i < exact_log2 (length); i++)
4189 for (j = 0; j < length/2; j++)
4191 vect1 = VEC_index (tree, dr_chain, j);
4192 vect2 = VEC_index (tree, dr_chain, j+length/2);
4194 /* Create interleaving stmt:
4195 in the case of big endian:
4196 high = interleave_high (vect1, vect2)
4197 and in the case of little endian:
4198 high = interleave_low (vect1, vect2). */
4199 perm_dest = create_tmp_var (vectype, "vect_inter_high");
4200 DECL_GIMPLE_REG_P (perm_dest) = 1;
4201 add_referenced_var (perm_dest);
4202 if (BYTES_BIG_ENDIAN)
4203 tmp = build2 (VEC_INTERLEAVE_HIGH_EXPR, vectype, vect1, vect2);
4205 tmp = build2 (VEC_INTERLEAVE_LOW_EXPR, vectype, vect1, vect2);
4206 perm_stmt = build_gimple_modify_stmt (perm_dest, tmp);
4207 high = make_ssa_name (perm_dest, perm_stmt);
4208 GIMPLE_STMT_OPERAND (perm_stmt, 0) = high;
4209 vect_finish_stmt_generation (stmt, perm_stmt, bsi);
4210 VEC_replace (tree, *result_chain, 2*j, high);
4212 /* Create interleaving stmt:
4213 in the case of big endian:
4214 low = interleave_low (vect1, vect2)
4215 and in the case of little endian:
4216 low = interleave_high (vect1, vect2). */
4217 perm_dest = create_tmp_var (vectype, "vect_inter_low");
4218 DECL_GIMPLE_REG_P (perm_dest) = 1;
4219 add_referenced_var (perm_dest);
4220 if (BYTES_BIG_ENDIAN)
4221 tmp = build2 (VEC_INTERLEAVE_LOW_EXPR, vectype, vect1, vect2);
4223 tmp = build2 (VEC_INTERLEAVE_HIGH_EXPR, vectype, vect1, vect2);
4224 perm_stmt = build_gimple_modify_stmt (perm_dest, tmp);
4225 low = make_ssa_name (perm_dest, perm_stmt);
4226 GIMPLE_STMT_OPERAND (perm_stmt, 0) = low;
4227 vect_finish_stmt_generation (stmt, perm_stmt, bsi);
4228 VEC_replace (tree, *result_chain, 2*j+1, low);
4230 dr_chain = VEC_copy (tree, heap, *result_chain);
4236 /* Function vectorizable_store.
4238 Check if STMT defines a non scalar data-ref (array/pointer/structure) that
4240 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
4241 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
4242 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
4245 vectorizable_store (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
4250 tree vec_oprnd = NULL_TREE;
4251 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4252 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info), *first_dr = NULL;
4253 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4254 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4255 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4256 enum machine_mode vec_mode;
4258 enum dr_alignment_support alignment_support_scheme;
4260 enum vect_def_type dt;
4261 stmt_vec_info prev_stmt_info = NULL;
4262 tree dataref_ptr = NULL_TREE;
4263 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
4264 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
4266 tree next_stmt, first_stmt;
4267 bool strided_store = false;
4268 unsigned int group_size, i;
4269 VEC(tree,heap) *dr_chain = NULL, *oprnds = NULL, *result_chain = NULL;
4272 gcc_assert (ncopies >= 1);
4274 /* FORNOW. This restriction should be relaxed. */
4275 if (nested_in_vect_loop_p (loop, stmt) && ncopies > 1)
4277 if (vect_print_dump_info (REPORT_DETAILS))
4278 fprintf (vect_dump, "multiple types in nested loop.");
4282 if (!STMT_VINFO_RELEVANT_P (stmt_info))
4285 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
4288 if (STMT_VINFO_LIVE_P (stmt_info))
4290 if (vect_print_dump_info (REPORT_DETAILS))
4291 fprintf (vect_dump, "value used after loop.");
4295 /* Is vectorizable store? */
4297 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
4300 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
4301 if (TREE_CODE (scalar_dest) != ARRAY_REF
4302 && TREE_CODE (scalar_dest) != INDIRECT_REF
4303 && !DR_GROUP_FIRST_DR (stmt_info))
4306 op = GIMPLE_STMT_OPERAND (stmt, 1);
4307 if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt))
4309 if (vect_print_dump_info (REPORT_DETAILS))
4310 fprintf (vect_dump, "use not simple.");
4314 vec_mode = TYPE_MODE (vectype);
4315 /* FORNOW. In some cases can vectorize even if data-type not supported
4316 (e.g. - array initialization with 0). */
4317 if (optab_handler (mov_optab, (int)vec_mode)->insn_code == CODE_FOR_nothing)
4320 if (!STMT_VINFO_DATA_REF (stmt_info))
4323 if (DR_GROUP_FIRST_DR (stmt_info))
4325 strided_store = true;
4326 if (!vect_strided_store_supported (vectype))
4330 if (!vec_stmt) /* transformation not required. */
4332 STMT_VINFO_TYPE (stmt_info) = store_vec_info_type;
4333 vect_model_store_cost (stmt_info, ncopies, dt);
4341 first_stmt = DR_GROUP_FIRST_DR (stmt_info);
4342 first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
4343 group_size = DR_GROUP_SIZE (vinfo_for_stmt (first_stmt));
4345 DR_GROUP_STORE_COUNT (vinfo_for_stmt (first_stmt))++;
4348 gcc_assert (!nested_in_vect_loop_p (loop, stmt));
4350 /* We vectorize all the stmts of the interleaving group when we
4351 reach the last stmt in the group. */
4352 if (DR_GROUP_STORE_COUNT (vinfo_for_stmt (first_stmt))
4353 < DR_GROUP_SIZE (vinfo_for_stmt (first_stmt)))
4355 *vec_stmt = NULL_TREE;
4366 if (vect_print_dump_info (REPORT_DETAILS))
4367 fprintf (vect_dump, "transform store. ncopies = %d",ncopies);
4369 dr_chain = VEC_alloc (tree, heap, group_size);
4370 oprnds = VEC_alloc (tree, heap, group_size);
4372 alignment_support_scheme = vect_supportable_dr_alignment (first_dr);
4373 gcc_assert (alignment_support_scheme);
4374 gcc_assert (alignment_support_scheme == dr_aligned); /* FORNOW */
4376 /* In case the vectorization factor (VF) is bigger than the number
4377 of elements that we can fit in a vectype (nunits), we have to generate
4378 more than one vector stmt - i.e - we need to "unroll" the
4379 vector stmt by a factor VF/nunits. For more details see documentation in
4380 vect_get_vec_def_for_copy_stmt. */
4382 /* In case of interleaving (non-unit strided access):
4389 We create vectorized stores starting from base address (the access of the
4390 first stmt in the chain (S2 in the above example), when the last store stmt
4391 of the chain (S4) is reached:
4394 VS2: &base + vec_size*1 = vx0
4395 VS3: &base + vec_size*2 = vx1
4396 VS4: &base + vec_size*3 = vx3
4398 Then permutation statements are generated:
4400 VS5: vx5 = VEC_INTERLEAVE_HIGH_EXPR < vx0, vx3 >
4401 VS6: vx6 = VEC_INTERLEAVE_LOW_EXPR < vx0, vx3 >
4404 And they are put in STMT_VINFO_VEC_STMT of the corresponding scalar stmts
4405 (the order of the data-refs in the output of vect_permute_store_chain
4406 corresponds to the order of scalar stmts in the interleaving chain - see
4407 the documentation of vect_permute_store_chain()).
4409 In case of both multiple types and interleaving, above vector stores and
4410 permutation stmts are created for every copy. The result vector stmts are
4411 put in STMT_VINFO_VEC_STMT for the first copy and in the corresponding
4412 STMT_VINFO_RELATED_STMT for the next copies.
4415 prev_stmt_info = NULL;
4416 for (j = 0; j < ncopies; j++)
4423 /* For interleaved stores we collect vectorized defs for all the
4424 stores in the group in DR_CHAIN and OPRNDS. DR_CHAIN is then used
4425 as an input to vect_permute_store_chain(), and OPRNDS as an input
4426 to vect_get_vec_def_for_stmt_copy() for the next copy.
4427 If the store is not strided, GROUP_SIZE is 1, and DR_CHAIN and
4428 OPRNDS are of size 1. */
4429 next_stmt = first_stmt;
4430 for (i = 0; i < group_size; i++)
4432 /* Since gaps are not supported for interleaved stores, GROUP_SIZE
4433 is the exact number of stmts in the chain. Therefore, NEXT_STMT
4434 can't be NULL_TREE. In case that there is no interleaving,
4435 GROUP_SIZE is 1, and only one iteration of the loop will be
4437 gcc_assert (next_stmt);
4438 op = GIMPLE_STMT_OPERAND (next_stmt, 1);
4439 vec_oprnd = vect_get_vec_def_for_operand (op, next_stmt, NULL);
4440 VEC_quick_push(tree, dr_chain, vec_oprnd);
4441 VEC_quick_push(tree, oprnds, vec_oprnd);
4442 next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt));
4444 dataref_ptr = vect_create_data_ref_ptr (first_stmt, NULL, NULL_TREE,
4445 &dummy, &ptr_incr, false,
4446 TREE_TYPE (vec_oprnd), &inv_p);
4447 gcc_assert (!inv_p);
4451 /* For interleaved stores we created vectorized defs for all the
4452 defs stored in OPRNDS in the previous iteration (previous copy).
4453 DR_CHAIN is then used as an input to vect_permute_store_chain(),
4454 and OPRNDS as an input to vect_get_vec_def_for_stmt_copy() for the
4456 If the store is not strided, GROUP_SIZE is 1, and DR_CHAIN and
4457 OPRNDS are of size 1. */
4458 for (i = 0; i < group_size; i++)
4460 vec_oprnd = vect_get_vec_def_for_stmt_copy (dt,
4461 VEC_index (tree, oprnds, i));
4462 VEC_replace(tree, dr_chain, i, vec_oprnd);
4463 VEC_replace(tree, oprnds, i, vec_oprnd);
4466 bump_vector_ptr (dataref_ptr, ptr_incr, bsi, stmt, NULL_TREE);
4471 result_chain = VEC_alloc (tree, heap, group_size);
4473 if (!vect_permute_store_chain (dr_chain, group_size, stmt, bsi,
4478 next_stmt = first_stmt;
4479 for (i = 0; i < group_size; i++)
4481 /* For strided stores vectorized defs are interleaved in
4482 vect_permute_store_chain(). */
4484 vec_oprnd = VEC_index(tree, result_chain, i);
4486 data_ref = build_fold_indirect_ref (dataref_ptr);
4487 /* Arguments are ready. Create the new vector stmt. */
4488 new_stmt = build_gimple_modify_stmt (data_ref, vec_oprnd);
4489 vect_finish_stmt_generation (stmt, new_stmt, bsi);
4490 mark_symbols_for_renaming (new_stmt);
4493 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
4495 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
4497 prev_stmt_info = vinfo_for_stmt (new_stmt);
4498 next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt));
4501 /* Bump the vector pointer. */
4503 bump_vector_ptr (dataref_ptr, ptr_incr, bsi, stmt, NULL_TREE);
4511 /* Function vect_setup_realignment
4513 This function is called when vectorizing an unaligned load using
4514 the dr_explicit_realign[_optimized] scheme.
4515 This function generates the following code at the loop prolog:
4518 x msq_init = *(floor(p)); # prolog load
4519 realignment_token = call target_builtin;
4521 x msq = phi (msq_init, ---)
4523 The stmts marked with x are generated only for the case of
4524 dr_explicit_realign_optimized.
4526 The code above sets up a new (vector) pointer, pointing to the first
4527 location accessed by STMT, and a "floor-aligned" load using that pointer.
4528 It also generates code to compute the "realignment-token" (if the relevant
4529 target hook was defined), and creates a phi-node at the loop-header bb
4530 whose arguments are the result of the prolog-load (created by this
4531 function) and the result of a load that takes place in the loop (to be
4532 created by the caller to this function).
4534 For the case of dr_explicit_realign_optimized:
4535 The caller to this function uses the phi-result (msq) to create the
4536 realignment code inside the loop, and sets up the missing phi argument,
4539 msq = phi (msq_init, lsq)
4540 lsq = *(floor(p')); # load in loop
4541 result = realign_load (msq, lsq, realignment_token);
4543 For the case of dr_explicit_realign:
4545 msq = *(floor(p)); # load in loop
4547 lsq = *(floor(p')); # load in loop
4548 result = realign_load (msq, lsq, realignment_token);
4551 STMT - (scalar) load stmt to be vectorized. This load accesses
4552 a memory location that may be unaligned.
4553 BSI - place where new code is to be inserted.
4554 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
4558 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
4559 target hook, if defined.
4560 Return value - the result of the loop-header phi node. */
4563 vect_setup_realignment (tree stmt, block_stmt_iterator *bsi,
4564 tree *realignment_token,
4565 enum dr_alignment_support alignment_support_scheme,
4567 struct loop **at_loop)
4569 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4570 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4571 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4572 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4574 tree scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
4581 tree msq_init = NULL_TREE;
4584 tree msq = NULL_TREE;
4585 tree stmts = NULL_TREE;
4587 bool compute_in_loop = false;
4588 bool nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4589 struct loop *containing_loop = (bb_for_stmt (stmt))->loop_father;
4590 struct loop *loop_for_initial_load;
4592 gcc_assert (alignment_support_scheme == dr_explicit_realign
4593 || alignment_support_scheme == dr_explicit_realign_optimized);
4595 /* We need to generate three things:
4596 1. the misalignment computation
4597 2. the extra vector load (for the optimized realignment scheme).
4598 3. the phi node for the two vectors from which the realignment is
4599 done (for the optimized realignment scheme).
4602 /* 1. Determine where to generate the misalignment computation.
4604 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
4605 calculation will be generated by this function, outside the loop (in the
4606 preheader). Otherwise, INIT_ADDR had already been computed for us by the
4607 caller, inside the loop.
4609 Background: If the misalignment remains fixed throughout the iterations of
4610 the loop, then both realignment schemes are applicable, and also the
4611 misalignment computation can be done outside LOOP. This is because we are
4612 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
4613 are a multiple of VS (the Vector Size), and therefore the misalignment in
4614 different vectorized LOOP iterations is always the same.
4615 The problem arises only if the memory access is in an inner-loop nested
4616 inside LOOP, which is now being vectorized using outer-loop vectorization.
4617 This is the only case when the misalignment of the memory access may not
4618 remain fixed throughout the iterations of the inner-loop (as explained in
4619 detail in vect_supportable_dr_alignment). In this case, not only is the
4620 optimized realignment scheme not applicable, but also the misalignment
4621 computation (and generation of the realignment token that is passed to
4622 REALIGN_LOAD) have to be done inside the loop.
4624 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
4625 or not, which in turn determines if the misalignment is computed inside
4626 the inner-loop, or outside LOOP. */
4628 if (init_addr != NULL_TREE)
4630 compute_in_loop = true;
4631 gcc_assert (alignment_support_scheme == dr_explicit_realign);
4635 /* 2. Determine where to generate the extra vector load.
4637 For the optimized realignment scheme, instead of generating two vector
4638 loads in each iteration, we generate a single extra vector load in the
4639 preheader of the loop, and in each iteration reuse the result of the
4640 vector load from the previous iteration. In case the memory access is in
4641 an inner-loop nested inside LOOP, which is now being vectorized using
4642 outer-loop vectorization, we need to determine whether this initial vector
4643 load should be generated at the preheader of the inner-loop, or can be
4644 generated at the preheader of LOOP. If the memory access has no evolution
4645 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
4646 to be generated inside LOOP (in the preheader of the inner-loop). */
4648 if (nested_in_vect_loop)
4650 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
4651 bool invariant_in_outerloop =
4652 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
4653 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
4656 loop_for_initial_load = loop;
4658 *at_loop = loop_for_initial_load;
4660 /* 3. For the case of the optimized realignment, create the first vector
4661 load at the loop preheader. */
4663 if (alignment_support_scheme == dr_explicit_realign_optimized)
4665 /* Create msq_init = *(floor(p1)) in the loop preheader */
4667 gcc_assert (!compute_in_loop);
4668 pe = loop_preheader_edge (loop_for_initial_load);
4669 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4670 ptr = vect_create_data_ref_ptr (stmt, loop_for_initial_load, NULL_TREE,
4671 &init_addr, &inc, true, NULL_TREE, &inv_p);
4672 data_ref = build1 (ALIGN_INDIRECT_REF, vectype, ptr);
4673 new_stmt = build_gimple_modify_stmt (vec_dest, data_ref);
4674 new_temp = make_ssa_name (vec_dest, new_stmt);
4675 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
4676 new_bb = bsi_insert_on_edge_immediate (pe, new_stmt);
4677 gcc_assert (!new_bb);
4678 msq_init = GIMPLE_STMT_OPERAND (new_stmt, 0);
4681 /* 4. Create realignment token using a target builtin, if available.
4682 It is done either inside the containing loop, or before LOOP (as
4683 determined above). */
4685 if (targetm.vectorize.builtin_mask_for_load)
4689 /* Compute INIT_ADDR - the initial addressed accessed by this memref. */
4690 if (compute_in_loop)
4691 gcc_assert (init_addr); /* already computed by the caller. */
4694 /* Generate the INIT_ADDR computation outside LOOP. */
4695 init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
4697 pe = loop_preheader_edge (loop);
4698 new_bb = bsi_insert_on_edge_immediate (pe, stmts);
4699 gcc_assert (!new_bb);
4702 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
4703 new_stmt = build_call_expr (builtin_decl, 1, init_addr);
4704 vec_dest = vect_create_destination_var (scalar_dest,
4705 TREE_TYPE (new_stmt));
4706 new_stmt = build_gimple_modify_stmt (vec_dest, new_stmt);
4707 new_temp = make_ssa_name (vec_dest, new_stmt);
4708 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
4710 if (compute_in_loop)
4711 bsi_insert_before (bsi, new_stmt, BSI_SAME_STMT);
4714 /* Generate the misalignment computation outside LOOP. */
4715 pe = loop_preheader_edge (loop);
4716 new_bb = bsi_insert_on_edge_immediate (pe, new_stmt);
4717 gcc_assert (!new_bb);
4720 *realignment_token = GIMPLE_STMT_OPERAND (new_stmt, 0);
4722 /* The result of the CALL_EXPR to this builtin is determined from
4723 the value of the parameter and no global variables are touched
4724 which makes the builtin a "const" function. Requiring the
4725 builtin to have the "const" attribute makes it unnecessary
4726 to call mark_call_clobbered. */
4727 gcc_assert (TREE_READONLY (builtin_decl));
4730 if (alignment_support_scheme == dr_explicit_realign)
4733 gcc_assert (!compute_in_loop);
4734 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
4737 /* 5. Create msq = phi <msq_init, lsq> in loop */
4739 pe = loop_preheader_edge (containing_loop);
4740 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4741 msq = make_ssa_name (vec_dest, NULL_TREE);
4742 phi_stmt = create_phi_node (msq, containing_loop->header);
4743 SSA_NAME_DEF_STMT (msq) = phi_stmt;
4744 add_phi_arg (phi_stmt, msq_init, pe);
4750 /* Function vect_strided_load_supported.
4752 Returns TRUE is EXTRACT_EVEN and EXTRACT_ODD operations are supported,
4753 and FALSE otherwise. */
4756 vect_strided_load_supported (tree vectype)
4758 optab perm_even_optab, perm_odd_optab;
4761 mode = (int) TYPE_MODE (vectype);
4763 perm_even_optab = optab_for_tree_code (VEC_EXTRACT_EVEN_EXPR, vectype);
4764 if (!perm_even_optab)
4766 if (vect_print_dump_info (REPORT_DETAILS))
4767 fprintf (vect_dump, "no optab for perm_even.");
4771 if (optab_handler (perm_even_optab, mode)->insn_code == CODE_FOR_nothing)
4773 if (vect_print_dump_info (REPORT_DETAILS))
4774 fprintf (vect_dump, "perm_even op not supported by target.");
4778 perm_odd_optab = optab_for_tree_code (VEC_EXTRACT_ODD_EXPR, vectype);
4779 if (!perm_odd_optab)
4781 if (vect_print_dump_info (REPORT_DETAILS))
4782 fprintf (vect_dump, "no optab for perm_odd.");
4786 if (optab_handler (perm_odd_optab, mode)->insn_code == CODE_FOR_nothing)
4788 if (vect_print_dump_info (REPORT_DETAILS))
4789 fprintf (vect_dump, "perm_odd op not supported by target.");
4796 /* Function vect_permute_load_chain.
4798 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
4799 a power of 2, generate extract_even/odd stmts to reorder the input data
4800 correctly. Return the final references for loads in RESULT_CHAIN.
4802 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4803 The input is 4 vectors each containing 8 elements. We assign a number to each
4804 element, the input sequence is:
4806 1st vec: 0 1 2 3 4 5 6 7
4807 2nd vec: 8 9 10 11 12 13 14 15
4808 3rd vec: 16 17 18 19 20 21 22 23
4809 4th vec: 24 25 26 27 28 29 30 31
4811 The output sequence should be:
4813 1st vec: 0 4 8 12 16 20 24 28
4814 2nd vec: 1 5 9 13 17 21 25 29
4815 3rd vec: 2 6 10 14 18 22 26 30
4816 4th vec: 3 7 11 15 19 23 27 31
4818 i.e., the first output vector should contain the first elements of each
4819 interleaving group, etc.
4821 We use extract_even/odd instructions to create such output. The input of each
4822 extract_even/odd operation is two vectors
4826 and the output is the vector of extracted even/odd elements. The output of
4827 extract_even will be: 0 2 4 6
4828 and of extract_odd: 1 3 5 7
4831 The permutation is done in log LENGTH stages. In each stage extract_even and
4832 extract_odd stmts are created for each pair of vectors in DR_CHAIN in their
4833 order. In our example,
4835 E1: extract_even (1st vec, 2nd vec)
4836 E2: extract_odd (1st vec, 2nd vec)
4837 E3: extract_even (3rd vec, 4th vec)
4838 E4: extract_odd (3rd vec, 4th vec)
4840 The output for the first stage will be:
4842 E1: 0 2 4 6 8 10 12 14
4843 E2: 1 3 5 7 9 11 13 15
4844 E3: 16 18 20 22 24 26 28 30
4845 E4: 17 19 21 23 25 27 29 31
4847 In order to proceed and create the correct sequence for the next stage (or
4848 for the correct output, if the second stage is the last one, as in our
4849 example), we first put the output of extract_even operation and then the
4850 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
4851 The input for the second stage is:
4853 1st vec (E1): 0 2 4 6 8 10 12 14
4854 2nd vec (E3): 16 18 20 22 24 26 28 30
4855 3rd vec (E2): 1 3 5 7 9 11 13 15
4856 4th vec (E4): 17 19 21 23 25 27 29 31
4858 The output of the second stage:
4860 E1: 0 4 8 12 16 20 24 28
4861 E2: 2 6 10 14 18 22 26 30
4862 E3: 1 5 9 13 17 21 25 29
4863 E4: 3 7 11 15 19 23 27 31
4865 And RESULT_CHAIN after reordering:
4867 1st vec (E1): 0 4 8 12 16 20 24 28
4868 2nd vec (E3): 1 5 9 13 17 21 25 29
4869 3rd vec (E2): 2 6 10 14 18 22 26 30
4870 4th vec (E4): 3 7 11 15 19 23 27 31. */
4873 vect_permute_load_chain (VEC(tree,heap) *dr_chain,
4874 unsigned int length,
4876 block_stmt_iterator *bsi,
4877 VEC(tree,heap) **result_chain)
4879 tree perm_dest, perm_stmt, data_ref, first_vect, second_vect;
4880 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
4885 /* Check that the operation is supported. */
4886 if (!vect_strided_load_supported (vectype))
4889 *result_chain = VEC_copy (tree, heap, dr_chain);
4890 for (i = 0; i < exact_log2 (length); i++)
4892 for (j = 0; j < length; j +=2)
4894 first_vect = VEC_index (tree, dr_chain, j);
4895 second_vect = VEC_index (tree, dr_chain, j+1);
4897 /* data_ref = permute_even (first_data_ref, second_data_ref); */
4898 perm_dest = create_tmp_var (vectype, "vect_perm_even");
4899 DECL_GIMPLE_REG_P (perm_dest) = 1;
4900 add_referenced_var (perm_dest);
4902 tmp = build2 (VEC_EXTRACT_EVEN_EXPR, vectype,
4903 first_vect, second_vect);
4904 perm_stmt = build_gimple_modify_stmt (perm_dest, tmp);
4906 data_ref = make_ssa_name (perm_dest, perm_stmt);
4907 GIMPLE_STMT_OPERAND (perm_stmt, 0) = data_ref;
4908 vect_finish_stmt_generation (stmt, perm_stmt, bsi);
4909 mark_symbols_for_renaming (perm_stmt);
4911 VEC_replace (tree, *result_chain, j/2, data_ref);
4913 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
4914 perm_dest = create_tmp_var (vectype, "vect_perm_odd");
4915 DECL_GIMPLE_REG_P (perm_dest) = 1;
4916 add_referenced_var (perm_dest);
4918 tmp = build2 (VEC_EXTRACT_ODD_EXPR, vectype,
4919 first_vect, second_vect);
4920 perm_stmt = build_gimple_modify_stmt (perm_dest, tmp);
4921 data_ref = make_ssa_name (perm_dest, perm_stmt);
4922 GIMPLE_STMT_OPERAND (perm_stmt, 0) = data_ref;
4923 vect_finish_stmt_generation (stmt, perm_stmt, bsi);
4924 mark_symbols_for_renaming (perm_stmt);
4926 VEC_replace (tree, *result_chain, j/2+length/2, data_ref);
4928 dr_chain = VEC_copy (tree, heap, *result_chain);
4934 /* Function vect_transform_strided_load.
4936 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
4937 to perform their permutation and ascribe the result vectorized statements to
4938 the scalar statements.
4942 vect_transform_strided_load (tree stmt, VEC(tree,heap) *dr_chain, int size,
4943 block_stmt_iterator *bsi)
4945 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4946 tree first_stmt = DR_GROUP_FIRST_DR (stmt_info);
4947 tree next_stmt, new_stmt;
4948 VEC(tree,heap) *result_chain = NULL;
4949 unsigned int i, gap_count;
4952 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
4953 RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
4954 vectors, that are ready for vector computation. */
4955 result_chain = VEC_alloc (tree, heap, size);
4957 if (!vect_permute_load_chain (dr_chain, size, stmt, bsi, &result_chain))
4960 /* Put a permuted data-ref in the VECTORIZED_STMT field.
4961 Since we scan the chain starting from it's first node, their order
4962 corresponds the order of data-refs in RESULT_CHAIN. */
4963 next_stmt = first_stmt;
4965 for (i = 0; VEC_iterate (tree, result_chain, i, tmp_data_ref); i++)
4970 /* Skip the gaps. Loads created for the gaps will be removed by dead
4971 code elimination pass later.
4972 DR_GROUP_GAP is the number of steps in elements from the previous
4973 access (if there is no gap DR_GROUP_GAP is 1). We skip loads that
4974 correspond to the gaps.
4976 if (gap_count < DR_GROUP_GAP (vinfo_for_stmt (next_stmt)))
4984 new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
4985 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
4986 copies, and we put the new vector statement in the first available
4988 if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
4989 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
4992 tree prev_stmt = STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
4993 tree rel_stmt = STMT_VINFO_RELATED_STMT (
4994 vinfo_for_stmt (prev_stmt));
4997 prev_stmt = rel_stmt;
4998 rel_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
5000 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) = new_stmt;
5002 next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt));
5004 /* If NEXT_STMT accesses the same DR as the previous statement,
5005 put the same TMP_DATA_REF as its vectorized statement; otherwise
5006 get the next data-ref from RESULT_CHAIN. */
5007 if (!next_stmt || !DR_GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5015 /* vectorizable_load.
5017 Check if STMT reads a non scalar data-ref (array/pointer/structure) that
5019 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
5020 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
5021 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
5024 vectorizable_load (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
5027 tree vec_dest = NULL;
5028 tree data_ref = NULL;
5030 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5031 stmt_vec_info prev_stmt_info;
5032 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5033 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5034 struct loop *containing_loop = (bb_for_stmt (stmt))->loop_father;
5035 bool nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
5036 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info), *first_dr;
5037 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5040 tree new_stmt = NULL_TREE;
5042 enum dr_alignment_support alignment_support_scheme;
5043 tree dataref_ptr = NULL_TREE;
5045 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
5046 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
5047 int i, j, group_size;
5048 tree msq = NULL_TREE, lsq;
5049 tree offset = NULL_TREE;
5050 tree realignment_token = NULL_TREE;
5051 tree phi = NULL_TREE;
5052 VEC(tree,heap) *dr_chain = NULL;
5053 bool strided_load = false;
5057 bool compute_in_loop = false;
5058 struct loop *at_loop;
5060 gcc_assert (ncopies >= 1);
5062 /* FORNOW. This restriction should be relaxed. */
5063 if (nested_in_vect_loop && ncopies > 1)
5065 if (vect_print_dump_info (REPORT_DETAILS))
5066 fprintf (vect_dump, "multiple types in nested loop.");
5070 if (!STMT_VINFO_RELEVANT_P (stmt_info))
5073 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
5076 /* FORNOW: not yet supported. */
5077 if (STMT_VINFO_LIVE_P (stmt_info))
5079 if (vect_print_dump_info (REPORT_DETAILS))
5080 fprintf (vect_dump, "value used after loop.");
5084 /* Is vectorizable load? */
5085 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
5088 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
5089 if (TREE_CODE (scalar_dest) != SSA_NAME)
5092 op = GIMPLE_STMT_OPERAND (stmt, 1);
5093 if (TREE_CODE (op) != ARRAY_REF
5094 && TREE_CODE (op) != INDIRECT_REF
5095 && !DR_GROUP_FIRST_DR (stmt_info))
5098 if (!STMT_VINFO_DATA_REF (stmt_info))
5101 scalar_type = TREE_TYPE (DR_REF (dr));
5102 mode = (int) TYPE_MODE (vectype);
5104 /* FORNOW. In some cases can vectorize even if data-type not supported
5105 (e.g. - data copies). */
5106 if (optab_handler (mov_optab, mode)->insn_code == CODE_FOR_nothing)
5108 if (vect_print_dump_info (REPORT_DETAILS))
5109 fprintf (vect_dump, "Aligned load, but unsupported type.");
5113 /* Check if the load is a part of an interleaving chain. */
5114 if (DR_GROUP_FIRST_DR (stmt_info))
5116 strided_load = true;
5118 gcc_assert (! nested_in_vect_loop);
5120 /* Check if interleaving is supported. */
5121 if (!vect_strided_load_supported (vectype))
5125 if (!vec_stmt) /* transformation not required. */
5127 STMT_VINFO_TYPE (stmt_info) = load_vec_info_type;
5128 vect_model_load_cost (stmt_info, ncopies);
5132 if (vect_print_dump_info (REPORT_DETAILS))
5133 fprintf (vect_dump, "transform load.");
5139 first_stmt = DR_GROUP_FIRST_DR (stmt_info);
5140 /* Check if the chain of loads is already vectorized. */
5141 if (STMT_VINFO_VEC_STMT (vinfo_for_stmt (first_stmt)))
5143 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
5146 first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
5147 group_size = DR_GROUP_SIZE (vinfo_for_stmt (first_stmt));
5148 dr_chain = VEC_alloc (tree, heap, group_size);
5157 alignment_support_scheme = vect_supportable_dr_alignment (first_dr);
5158 gcc_assert (alignment_support_scheme);
5160 /* In case the vectorization factor (VF) is bigger than the number
5161 of elements that we can fit in a vectype (nunits), we have to generate
5162 more than one vector stmt - i.e - we need to "unroll" the
5163 vector stmt by a factor VF/nunits. In doing so, we record a pointer
5164 from one copy of the vector stmt to the next, in the field
5165 STMT_VINFO_RELATED_STMT. This is necessary in order to allow following
5166 stages to find the correct vector defs to be used when vectorizing
5167 stmts that use the defs of the current stmt. The example below illustrates
5168 the vectorization process when VF=16 and nunits=4 (i.e - we need to create
5169 4 vectorized stmts):
5171 before vectorization:
5172 RELATED_STMT VEC_STMT
5176 step 1: vectorize stmt S1:
5177 We first create the vector stmt VS1_0, and, as usual, record a
5178 pointer to it in the STMT_VINFO_VEC_STMT of the scalar stmt S1.
5179 Next, we create the vector stmt VS1_1, and record a pointer to
5180 it in the STMT_VINFO_RELATED_STMT of the vector stmt VS1_0.
5181 Similarly, for VS1_2 and VS1_3. This is the resulting chain of
5183 RELATED_STMT VEC_STMT
5184 VS1_0: vx0 = memref0 VS1_1 -
5185 VS1_1: vx1 = memref1 VS1_2 -
5186 VS1_2: vx2 = memref2 VS1_3 -
5187 VS1_3: vx3 = memref3 - -
5188 S1: x = load - VS1_0
5191 See in documentation in vect_get_vec_def_for_stmt_copy for how the
5192 information we recorded in RELATED_STMT field is used to vectorize
5195 /* In case of interleaving (non-unit strided access):
5202 Vectorized loads are created in the order of memory accesses
5203 starting from the access of the first stmt of the chain:
5206 VS2: vx1 = &base + vec_size*1
5207 VS3: vx3 = &base + vec_size*2
5208 VS4: vx4 = &base + vec_size*3
5210 Then permutation statements are generated:
5212 VS5: vx5 = VEC_EXTRACT_EVEN_EXPR < vx0, vx1 >
5213 VS6: vx6 = VEC_EXTRACT_ODD_EXPR < vx0, vx1 >
5216 And they are put in STMT_VINFO_VEC_STMT of the corresponding scalar stmts
5217 (the order of the data-refs in the output of vect_permute_load_chain
5218 corresponds to the order of scalar stmts in the interleaving chain - see
5219 the documentation of vect_permute_load_chain()).
5220 The generation of permutation stmts and recording them in
5221 STMT_VINFO_VEC_STMT is done in vect_transform_strided_load().
5223 In case of both multiple types and interleaving, the vector loads and
5224 permutation stmts above are created for every copy. The result vector stmts
5225 are put in STMT_VINFO_VEC_STMT for the first copy and in the corresponding
5226 STMT_VINFO_RELATED_STMT for the next copies. */
5228 /* If the data reference is aligned (dr_aligned) or potentially unaligned
5229 on a target that supports unaligned accesses (dr_unaligned_supported)
5230 we generate the following code:
5234 p = p + indx * vectype_size;
5239 Otherwise, the data reference is potentially unaligned on a target that
5240 does not support unaligned accesses (dr_explicit_realign_optimized) -
5241 then generate the following code, in which the data in each iteration is
5242 obtained by two vector loads, one from the previous iteration, and one
5243 from the current iteration:
5245 msq_init = *(floor(p1))
5246 p2 = initial_addr + VS - 1;
5247 realignment_token = call target_builtin;
5250 p2 = p2 + indx * vectype_size
5252 vec_dest = realign_load (msq, lsq, realignment_token)
5257 /* If the misalignment remains the same throughout the execution of the
5258 loop, we can create the init_addr and permutation mask at the loop
5259 preheader. Otherwise, it needs to be created inside the loop.
5260 This can only occur when vectorizing memory accesses in the inner-loop
5261 nested within an outer-loop that is being vectorized. */
5263 if (nested_in_vect_loop_p (loop, stmt)
5264 && (TREE_INT_CST_LOW (DR_STEP (dr)) % UNITS_PER_SIMD_WORD != 0))
5266 gcc_assert (alignment_support_scheme != dr_explicit_realign_optimized);
5267 compute_in_loop = true;
5270 if ((alignment_support_scheme == dr_explicit_realign_optimized
5271 || alignment_support_scheme == dr_explicit_realign)
5272 && !compute_in_loop)
5274 msq = vect_setup_realignment (first_stmt, bsi, &realignment_token,
5275 alignment_support_scheme, NULL_TREE,
5277 if (alignment_support_scheme == dr_explicit_realign_optimized)
5279 phi = SSA_NAME_DEF_STMT (msq);
5280 offset = size_int (TYPE_VECTOR_SUBPARTS (vectype) - 1);
5286 prev_stmt_info = NULL;
5287 for (j = 0; j < ncopies; j++)
5289 /* 1. Create the vector pointer update chain. */
5291 dataref_ptr = vect_create_data_ref_ptr (first_stmt,
5293 &dummy, &ptr_incr, false,
5297 bump_vector_ptr (dataref_ptr, ptr_incr, bsi, stmt, NULL_TREE);
5299 for (i = 0; i < group_size; i++)
5301 /* 2. Create the vector-load in the loop. */
5302 switch (alignment_support_scheme)
5305 gcc_assert (aligned_access_p (first_dr));
5306 data_ref = build_fold_indirect_ref (dataref_ptr);
5308 case dr_unaligned_supported:
5310 int mis = DR_MISALIGNMENT (first_dr);
5311 tree tmis = (mis == -1 ? size_zero_node : size_int (mis));
5313 tmis = size_binop (MULT_EXPR, tmis, size_int(BITS_PER_UNIT));
5315 build2 (MISALIGNED_INDIRECT_REF, vectype, dataref_ptr, tmis);
5318 case dr_explicit_realign:
5321 tree vs_minus_1 = size_int (TYPE_VECTOR_SUBPARTS (vectype) - 1);
5323 if (compute_in_loop)
5324 msq = vect_setup_realignment (first_stmt, bsi,
5326 dr_explicit_realign,
5329 data_ref = build1 (ALIGN_INDIRECT_REF, vectype, dataref_ptr);
5330 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5331 new_stmt = build_gimple_modify_stmt (vec_dest, data_ref);
5332 new_temp = make_ssa_name (vec_dest, new_stmt);
5333 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
5334 vect_finish_stmt_generation (stmt, new_stmt, bsi);
5335 copy_virtual_operands (new_stmt, stmt);
5336 mark_symbols_for_renaming (new_stmt);
5339 bump = size_binop (MULT_EXPR, vs_minus_1,
5340 TYPE_SIZE_UNIT (scalar_type));
5341 ptr = bump_vector_ptr (dataref_ptr, NULL_TREE, bsi, stmt, bump);
5342 data_ref = build1 (ALIGN_INDIRECT_REF, vectype, ptr);
5345 case dr_explicit_realign_optimized:
5346 data_ref = build1 (ALIGN_INDIRECT_REF, vectype, dataref_ptr);
5351 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5352 new_stmt = build_gimple_modify_stmt (vec_dest, data_ref);
5353 new_temp = make_ssa_name (vec_dest, new_stmt);
5354 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
5355 vect_finish_stmt_generation (stmt, new_stmt, bsi);
5356 mark_symbols_for_renaming (new_stmt);
5358 /* 3. Handle explicit realignment if necessary/supported. Create in
5359 loop: vec_dest = realign_load (msq, lsq, realignment_token) */
5360 if (alignment_support_scheme == dr_explicit_realign_optimized
5361 || alignment_support_scheme == dr_explicit_realign)
5363 lsq = GIMPLE_STMT_OPERAND (new_stmt, 0);
5364 if (!realignment_token)
5365 realignment_token = dataref_ptr;
5366 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5367 new_stmt = build3 (REALIGN_LOAD_EXPR, vectype, msq, lsq,
5369 new_stmt = build_gimple_modify_stmt (vec_dest, new_stmt);
5370 new_temp = make_ssa_name (vec_dest, new_stmt);
5371 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
5372 vect_finish_stmt_generation (stmt, new_stmt, bsi);
5374 if (alignment_support_scheme == dr_explicit_realign_optimized)
5376 if (i == group_size - 1 && j == ncopies - 1)
5377 add_phi_arg (phi, lsq, loop_latch_edge (containing_loop));
5382 /* 4. Handle invariant-load. */
5385 gcc_assert (!strided_load);
5386 gcc_assert (nested_in_vect_loop_p (loop, stmt));
5391 tree vec_inv, bitpos, bitsize = TYPE_SIZE (scalar_type);
5393 /* CHECKME: bitpos depends on endianess? */
5394 bitpos = bitsize_zero_node;
5395 vec_inv = build3 (BIT_FIELD_REF, scalar_type, new_temp,
5397 BIT_FIELD_REF_UNSIGNED (vec_inv) =
5398 TYPE_UNSIGNED (scalar_type);
5400 vect_create_destination_var (scalar_dest, NULL_TREE);
5401 new_stmt = build_gimple_modify_stmt (vec_dest, vec_inv);
5402 new_temp = make_ssa_name (vec_dest, new_stmt);
5403 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
5404 vect_finish_stmt_generation (stmt, new_stmt, bsi);
5406 for (k = nunits - 1; k >= 0; --k)
5407 t = tree_cons (NULL_TREE, new_temp, t);
5408 /* FIXME: use build_constructor directly. */
5409 vec_inv = build_constructor_from_list (vectype, t);
5410 new_temp = vect_init_vector (stmt, vec_inv, vectype, bsi);
5411 new_stmt = SSA_NAME_DEF_STMT (new_temp);
5414 gcc_unreachable (); /* FORNOW. */
5418 VEC_quick_push (tree, dr_chain, new_temp);
5419 if (i < group_size - 1)
5421 bump_vector_ptr (dataref_ptr, ptr_incr, bsi, stmt, NULL_TREE);
5426 if (!vect_transform_strided_load (stmt, dr_chain, group_size, bsi))
5428 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
5429 dr_chain = VEC_alloc (tree, heap, group_size);
5434 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
5436 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
5437 prev_stmt_info = vinfo_for_stmt (new_stmt);
5445 /* Function vectorizable_live_operation.
5447 STMT computes a value that is used outside the loop. Check if
5448 it can be supported. */
5451 vectorizable_live_operation (tree stmt,
5452 block_stmt_iterator *bsi ATTRIBUTE_UNUSED,
5453 tree *vec_stmt ATTRIBUTE_UNUSED)
5456 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5457 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5458 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5463 enum vect_def_type dt;
5465 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
5467 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
5470 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
5473 if (TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 0)) != SSA_NAME)
5476 /* FORNOW. CHECKME. */
5477 if (nested_in_vect_loop_p (loop, stmt))
5480 operation = GIMPLE_STMT_OPERAND (stmt, 1);
5481 op_type = TREE_OPERAND_LENGTH (operation);
5483 /* FORNOW: support only if all uses are invariant. This means
5484 that the scalar operations can remain in place, unvectorized.
5485 The original last scalar value that they compute will be used. */
5487 for (i = 0; i < op_type; i++)
5489 op = TREE_OPERAND (operation, i);
5490 if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt))
5492 if (vect_print_dump_info (REPORT_DETAILS))
5493 fprintf (vect_dump, "use not simple.");
5497 if (dt != vect_invariant_def && dt != vect_constant_def)
5501 /* No transformation is required for the cases we currently support. */
5506 /* Function vect_is_simple_cond.
5509 LOOP - the loop that is being vectorized.
5510 COND - Condition that is checked for simple use.
5512 Returns whether a COND can be vectorized. Checks whether
5513 condition operands are supportable using vec_is_simple_use. */
5516 vect_is_simple_cond (tree cond, loop_vec_info loop_vinfo)
5520 enum vect_def_type dt;
5522 if (!COMPARISON_CLASS_P (cond))
5525 lhs = TREE_OPERAND (cond, 0);
5526 rhs = TREE_OPERAND (cond, 1);
5528 if (TREE_CODE (lhs) == SSA_NAME)
5530 tree lhs_def_stmt = SSA_NAME_DEF_STMT (lhs);
5531 if (!vect_is_simple_use (lhs, loop_vinfo, &lhs_def_stmt, &def, &dt))
5534 else if (TREE_CODE (lhs) != INTEGER_CST && TREE_CODE (lhs) != REAL_CST
5535 && TREE_CODE (lhs) != FIXED_CST)
5538 if (TREE_CODE (rhs) == SSA_NAME)
5540 tree rhs_def_stmt = SSA_NAME_DEF_STMT (rhs);
5541 if (!vect_is_simple_use (rhs, loop_vinfo, &rhs_def_stmt, &def, &dt))
5544 else if (TREE_CODE (rhs) != INTEGER_CST && TREE_CODE (rhs) != REAL_CST
5545 && TREE_CODE (rhs) != FIXED_CST)
5551 /* vectorizable_condition.
5553 Check if STMT is conditional modify expression that can be vectorized.
5554 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
5555 stmt using VEC_COND_EXPR to replace it, put it in VEC_STMT, and insert it
5558 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
5561 vectorizable_condition (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
5563 tree scalar_dest = NULL_TREE;
5564 tree vec_dest = NULL_TREE;
5565 tree op = NULL_TREE;
5566 tree cond_expr, then_clause, else_clause;
5567 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5568 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5569 tree vec_cond_lhs, vec_cond_rhs, vec_then_clause, vec_else_clause;
5570 tree vec_compare, vec_cond_expr;
5572 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5573 enum machine_mode vec_mode;
5575 enum vect_def_type dt;
5576 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
5577 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
5579 gcc_assert (ncopies >= 1);
5581 return false; /* FORNOW */
5583 if (!STMT_VINFO_RELEVANT_P (stmt_info))
5586 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
5589 /* FORNOW: not yet supported. */
5590 if (STMT_VINFO_LIVE_P (stmt_info))
5592 if (vect_print_dump_info (REPORT_DETAILS))
5593 fprintf (vect_dump, "value used after loop.");
5597 /* Is vectorizable conditional operation? */
5598 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
5601 op = GIMPLE_STMT_OPERAND (stmt, 1);
5603 if (TREE_CODE (op) != COND_EXPR)
5606 cond_expr = TREE_OPERAND (op, 0);
5607 then_clause = TREE_OPERAND (op, 1);
5608 else_clause = TREE_OPERAND (op, 2);
5610 if (!vect_is_simple_cond (cond_expr, loop_vinfo))
5613 /* We do not handle two different vector types for the condition
5615 if (TREE_TYPE (TREE_OPERAND (cond_expr, 0)) != TREE_TYPE (vectype))
5618 if (TREE_CODE (then_clause) == SSA_NAME)
5620 tree then_def_stmt = SSA_NAME_DEF_STMT (then_clause);
5621 if (!vect_is_simple_use (then_clause, loop_vinfo,
5622 &then_def_stmt, &def, &dt))
5625 else if (TREE_CODE (then_clause) != INTEGER_CST
5626 && TREE_CODE (then_clause) != REAL_CST
5627 && TREE_CODE (then_clause) != FIXED_CST)
5630 if (TREE_CODE (else_clause) == SSA_NAME)
5632 tree else_def_stmt = SSA_NAME_DEF_STMT (else_clause);
5633 if (!vect_is_simple_use (else_clause, loop_vinfo,
5634 &else_def_stmt, &def, &dt))
5637 else if (TREE_CODE (else_clause) != INTEGER_CST
5638 && TREE_CODE (else_clause) != REAL_CST
5639 && TREE_CODE (else_clause) != FIXED_CST)
5643 vec_mode = TYPE_MODE (vectype);
5647 STMT_VINFO_TYPE (stmt_info) = condition_vec_info_type;
5648 return expand_vec_cond_expr_p (op, vec_mode);
5654 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
5655 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5657 /* Handle cond expr. */
5659 vect_get_vec_def_for_operand (TREE_OPERAND (cond_expr, 0), stmt, NULL);
5661 vect_get_vec_def_for_operand (TREE_OPERAND (cond_expr, 1), stmt, NULL);
5662 vec_then_clause = vect_get_vec_def_for_operand (then_clause, stmt, NULL);
5663 vec_else_clause = vect_get_vec_def_for_operand (else_clause, stmt, NULL);
5665 /* Arguments are ready. create the new vector stmt. */
5666 vec_compare = build2 (TREE_CODE (cond_expr), vectype,
5667 vec_cond_lhs, vec_cond_rhs);
5668 vec_cond_expr = build3 (VEC_COND_EXPR, vectype,
5669 vec_compare, vec_then_clause, vec_else_clause);
5671 *vec_stmt = build_gimple_modify_stmt (vec_dest, vec_cond_expr);
5672 new_temp = make_ssa_name (vec_dest, *vec_stmt);
5673 GIMPLE_STMT_OPERAND (*vec_stmt, 0) = new_temp;
5674 vect_finish_stmt_generation (stmt, *vec_stmt, bsi);
5679 /* Function vect_transform_stmt.
5681 Create a vectorized stmt to replace STMT, and insert it at BSI. */
5684 vect_transform_stmt (tree stmt, block_stmt_iterator *bsi, bool *strided_store)
5686 bool is_store = false;
5687 tree vec_stmt = NULL_TREE;
5688 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5689 tree orig_stmt_in_pattern;
5692 switch (STMT_VINFO_TYPE (stmt_info))
5694 case type_demotion_vec_info_type:
5695 done = vectorizable_type_demotion (stmt, bsi, &vec_stmt);
5699 case type_promotion_vec_info_type:
5700 done = vectorizable_type_promotion (stmt, bsi, &vec_stmt);
5704 case type_conversion_vec_info_type:
5705 done = vectorizable_conversion (stmt, bsi, &vec_stmt);
5709 case induc_vec_info_type:
5710 done = vectorizable_induction (stmt, bsi, &vec_stmt);
5714 case op_vec_info_type:
5715 done = vectorizable_operation (stmt, bsi, &vec_stmt);
5719 case assignment_vec_info_type:
5720 done = vectorizable_assignment (stmt, bsi, &vec_stmt);
5724 case load_vec_info_type:
5725 done = vectorizable_load (stmt, bsi, &vec_stmt);
5729 case store_vec_info_type:
5730 done = vectorizable_store (stmt, bsi, &vec_stmt);
5732 if (DR_GROUP_FIRST_DR (stmt_info))
5734 /* In case of interleaving, the whole chain is vectorized when the
5735 last store in the chain is reached. Store stmts before the last
5736 one are skipped, and there vec_stmt_info shouldn't be freed
5738 *strided_store = true;
5739 if (STMT_VINFO_VEC_STMT (stmt_info))
5746 case condition_vec_info_type:
5747 done = vectorizable_condition (stmt, bsi, &vec_stmt);
5751 case call_vec_info_type:
5752 done = vectorizable_call (stmt, bsi, &vec_stmt);
5755 case reduc_vec_info_type:
5756 done = vectorizable_reduction (stmt, bsi, &vec_stmt);
5761 if (!STMT_VINFO_LIVE_P (stmt_info))
5763 if (vect_print_dump_info (REPORT_DETAILS))
5764 fprintf (vect_dump, "stmt not supported.");
5769 if (STMT_VINFO_LIVE_P (stmt_info)
5770 && STMT_VINFO_TYPE (stmt_info) != reduc_vec_info_type)
5772 done = vectorizable_live_operation (stmt, bsi, &vec_stmt);
5778 STMT_VINFO_VEC_STMT (stmt_info) = vec_stmt;
5779 orig_stmt_in_pattern = STMT_VINFO_RELATED_STMT (stmt_info);
5780 if (orig_stmt_in_pattern)
5782 stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt_in_pattern);
5783 /* STMT was inserted by the vectorizer to replace a computation idiom.
5784 ORIG_STMT_IN_PATTERN is a stmt in the original sequence that
5785 computed this idiom. We need to record a pointer to VEC_STMT in
5786 the stmt_info of ORIG_STMT_IN_PATTERN. See more details in the
5787 documentation of vect_pattern_recog. */
5788 if (STMT_VINFO_IN_PATTERN_P (stmt_vinfo))
5790 gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
5791 STMT_VINFO_VEC_STMT (stmt_vinfo) = vec_stmt;
5800 /* This function builds ni_name = number of iterations loop executes
5801 on the loop preheader. */
5804 vect_build_loop_niters (loop_vec_info loop_vinfo)
5806 tree ni_name, stmt, var;
5808 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5809 tree ni = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo));
5811 var = create_tmp_var (TREE_TYPE (ni), "niters");
5812 add_referenced_var (var);
5813 ni_name = force_gimple_operand (ni, &stmt, false, var);
5815 pe = loop_preheader_edge (loop);
5818 basic_block new_bb = bsi_insert_on_edge_immediate (pe, stmt);
5819 gcc_assert (!new_bb);
5826 /* This function generates the following statements:
5828 ni_name = number of iterations loop executes
5829 ratio = ni_name / vf
5830 ratio_mult_vf_name = ratio * vf
5832 and places them at the loop preheader edge. */
5835 vect_generate_tmps_on_preheader (loop_vec_info loop_vinfo,
5837 tree *ratio_mult_vf_name_ptr,
5838 tree *ratio_name_ptr)
5846 tree ratio_mult_vf_name;
5847 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5848 tree ni = LOOP_VINFO_NITERS (loop_vinfo);
5849 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
5852 pe = loop_preheader_edge (loop);
5854 /* Generate temporary variable that contains
5855 number of iterations loop executes. */
5857 ni_name = vect_build_loop_niters (loop_vinfo);
5858 log_vf = build_int_cst (TREE_TYPE (ni), exact_log2 (vf));
5860 /* Create: ratio = ni >> log2(vf) */
5862 ratio_name = fold_build2 (RSHIFT_EXPR, TREE_TYPE (ni_name), ni_name, log_vf);
5863 if (!is_gimple_val (ratio_name))
5865 var = create_tmp_var (TREE_TYPE (ni), "bnd");
5866 add_referenced_var (var);
5868 ratio_name = force_gimple_operand (ratio_name, &stmt, true, var);
5869 pe = loop_preheader_edge (loop);
5870 new_bb = bsi_insert_on_edge_immediate (pe, stmt);
5871 gcc_assert (!new_bb);
5874 /* Create: ratio_mult_vf = ratio << log2 (vf). */
5876 ratio_mult_vf_name = fold_build2 (LSHIFT_EXPR, TREE_TYPE (ratio_name),
5877 ratio_name, log_vf);
5878 if (!is_gimple_val (ratio_mult_vf_name))
5880 var = create_tmp_var (TREE_TYPE (ni), "ratio_mult_vf");
5881 add_referenced_var (var);
5883 ratio_mult_vf_name = force_gimple_operand (ratio_mult_vf_name, &stmt,
5885 pe = loop_preheader_edge (loop);
5886 new_bb = bsi_insert_on_edge_immediate (pe, stmt);
5887 gcc_assert (!new_bb);
5890 *ni_name_ptr = ni_name;
5891 *ratio_mult_vf_name_ptr = ratio_mult_vf_name;
5892 *ratio_name_ptr = ratio_name;
5898 /* Function vect_update_ivs_after_vectorizer.
5900 "Advance" the induction variables of LOOP to the value they should take
5901 after the execution of LOOP. This is currently necessary because the
5902 vectorizer does not handle induction variables that are used after the
5903 loop. Such a situation occurs when the last iterations of LOOP are
5905 1. We introduced new uses after LOOP for IVs that were not originally used
5906 after LOOP: the IVs of LOOP are now used by an epilog loop.
5907 2. LOOP is going to be vectorized; this means that it will iterate N/VF
5908 times, whereas the loop IVs should be bumped N times.
5911 - LOOP - a loop that is going to be vectorized. The last few iterations
5912 of LOOP were peeled.
5913 - NITERS - the number of iterations that LOOP executes (before it is
5914 vectorized). i.e, the number of times the ivs should be bumped.
5915 - UPDATE_E - a successor edge of LOOP->exit that is on the (only) path
5916 coming out from LOOP on which there are uses of the LOOP ivs
5917 (this is the path from LOOP->exit to epilog_loop->preheader).
5919 The new definitions of the ivs are placed in LOOP->exit.
5920 The phi args associated with the edge UPDATE_E in the bb
5921 UPDATE_E->dest are updated accordingly.
5923 Assumption 1: Like the rest of the vectorizer, this function assumes
5924 a single loop exit that has a single predecessor.
5926 Assumption 2: The phi nodes in the LOOP header and in update_bb are
5927 organized in the same order.
5929 Assumption 3: The access function of the ivs is simple enough (see
5930 vect_can_advance_ivs_p). This assumption will be relaxed in the future.
5932 Assumption 4: Exactly one of the successors of LOOP exit-bb is on a path
5933 coming out of LOOP on which the ivs of LOOP are used (this is the path
5934 that leads to the epilog loop; other paths skip the epilog loop). This
5935 path starts with the edge UPDATE_E, and its destination (denoted update_bb)
5936 needs to have its phis updated.
5940 vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo, tree niters,
5943 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5944 basic_block exit_bb = single_exit (loop)->dest;
5946 basic_block update_bb = update_e->dest;
5948 /* gcc_assert (vect_can_advance_ivs_p (loop_vinfo)); */
5950 /* Make sure there exists a single-predecessor exit bb: */
5951 gcc_assert (single_pred_p (exit_bb));
5953 for (phi = phi_nodes (loop->header), phi1 = phi_nodes (update_bb);
5955 phi = PHI_CHAIN (phi), phi1 = PHI_CHAIN (phi1))
5957 tree access_fn = NULL;
5958 tree evolution_part;
5961 tree var, ni, ni_name;
5962 block_stmt_iterator last_bsi;
5964 if (vect_print_dump_info (REPORT_DETAILS))
5966 fprintf (vect_dump, "vect_update_ivs_after_vectorizer: phi: ");
5967 print_generic_expr (vect_dump, phi, TDF_SLIM);
5970 /* Skip virtual phi's. */
5971 if (!is_gimple_reg (SSA_NAME_VAR (PHI_RESULT (phi))))
5973 if (vect_print_dump_info (REPORT_DETAILS))
5974 fprintf (vect_dump, "virtual phi. skip.");
5978 /* Skip reduction phis. */
5979 if (STMT_VINFO_DEF_TYPE (vinfo_for_stmt (phi)) == vect_reduction_def)
5981 if (vect_print_dump_info (REPORT_DETAILS))
5982 fprintf (vect_dump, "reduc phi. skip.");
5986 access_fn = analyze_scalar_evolution (loop, PHI_RESULT (phi));
5987 gcc_assert (access_fn);
5989 unshare_expr (evolution_part_in_loop_num (access_fn, loop->num));
5990 gcc_assert (evolution_part != NULL_TREE);
5992 /* FORNOW: We do not support IVs whose evolution function is a polynomial
5993 of degree >= 2 or exponential. */
5994 gcc_assert (!tree_is_chrec (evolution_part));
5996 step_expr = evolution_part;
5997 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn,
6000 if (POINTER_TYPE_P (TREE_TYPE (init_expr)))
6001 ni = fold_build2 (POINTER_PLUS_EXPR, TREE_TYPE (init_expr),
6003 fold_convert (sizetype,
6004 fold_build2 (MULT_EXPR, TREE_TYPE (niters),
6005 niters, step_expr)));
6007 ni = fold_build2 (PLUS_EXPR, TREE_TYPE (init_expr),
6008 fold_build2 (MULT_EXPR, TREE_TYPE (init_expr),
6009 fold_convert (TREE_TYPE (init_expr),
6016 var = create_tmp_var (TREE_TYPE (init_expr), "tmp");
6017 add_referenced_var (var);
6019 last_bsi = bsi_last (exit_bb);
6020 ni_name = force_gimple_operand_bsi (&last_bsi, ni, false, var,
6021 true, BSI_SAME_STMT);
6023 /* Fix phi expressions in the successor bb. */
6024 SET_PHI_ARG_DEF (phi1, update_e->dest_idx, ni_name);
6029 /* Function vect_do_peeling_for_loop_bound
6031 Peel the last iterations of the loop represented by LOOP_VINFO.
6032 The peeled iterations form a new epilog loop. Given that the loop now
6033 iterates NITERS times, the new epilog loop iterates
6034 NITERS % VECTORIZATION_FACTOR times.
6036 The original loop will later be made to iterate
6037 NITERS / VECTORIZATION_FACTOR times (this value is placed into RATIO). */
6040 vect_do_peeling_for_loop_bound (loop_vec_info loop_vinfo, tree *ratio)
6042 tree ni_name, ratio_mult_vf_name;
6043 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6044 struct loop *new_loop;
6046 basic_block preheader;
6049 int min_scalar_loop_bound;
6050 int min_profitable_iters;
6052 if (vect_print_dump_info (REPORT_DETAILS))
6053 fprintf (vect_dump, "=== vect_do_peeling_for_loop_bound ===");
6055 initialize_original_copy_tables ();
6057 /* Generate the following variables on the preheader of original loop:
6059 ni_name = number of iteration the original loop executes
6060 ratio = ni_name / vf
6061 ratio_mult_vf_name = ratio * vf */
6062 vect_generate_tmps_on_preheader (loop_vinfo, &ni_name,
6063 &ratio_mult_vf_name, ratio);
6065 loop_num = loop->num;
6067 /* Analyze cost to set threshhold for vectorized loop. */
6068 min_profitable_iters = LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo);
6069 min_scalar_loop_bound = (PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND))
6070 * LOOP_VINFO_VECT_FACTOR (loop_vinfo);
6072 /* Use the cost model only if it is more conservative than user specified
6075 th = (unsigned) min_scalar_loop_bound;
6076 if (min_profitable_iters
6077 && (!min_scalar_loop_bound
6078 || min_profitable_iters > min_scalar_loop_bound))
6079 th = (unsigned) min_profitable_iters;
6081 if (min_profitable_iters
6082 && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
6083 && vect_print_dump_info (REPORT_DETAILS))
6084 fprintf (vect_dump, "vectorization may not be profitable.");
6086 new_loop = slpeel_tree_peel_loop_to_edge (loop, single_exit (loop),
6087 ratio_mult_vf_name, ni_name, false,
6089 gcc_assert (new_loop);
6090 gcc_assert (loop_num == loop->num);
6091 #ifdef ENABLE_CHECKING
6092 slpeel_verify_cfg_after_peeling (loop, new_loop);
6095 /* A guard that controls whether the new_loop is to be executed or skipped
6096 is placed in LOOP->exit. LOOP->exit therefore has two successors - one
6097 is the preheader of NEW_LOOP, where the IVs from LOOP are used. The other
6098 is a bb after NEW_LOOP, where these IVs are not used. Find the edge that
6099 is on the path where the LOOP IVs are used and need to be updated. */
6101 preheader = loop_preheader_edge (new_loop)->src;
6102 if (EDGE_PRED (preheader, 0)->src == single_exit (loop)->dest)
6103 update_e = EDGE_PRED (preheader, 0);
6105 update_e = EDGE_PRED (preheader, 1);
6107 /* Update IVs of original loop as if they were advanced
6108 by ratio_mult_vf_name steps. */
6109 vect_update_ivs_after_vectorizer (loop_vinfo, ratio_mult_vf_name, update_e);
6111 /* After peeling we have to reset scalar evolution analyzer. */
6114 free_original_copy_tables ();
6118 /* Function vect_gen_niters_for_prolog_loop
6120 Set the number of iterations for the loop represented by LOOP_VINFO
6121 to the minimum between LOOP_NITERS (the original iteration count of the loop)
6122 and the misalignment of DR - the data reference recorded in
6123 LOOP_VINFO_UNALIGNED_DR (LOOP_VINFO). As a result, after the execution of
6124 this loop, the data reference DR will refer to an aligned location.
6126 The following computation is generated:
6128 If the misalignment of DR is known at compile time:
6129 addr_mis = int mis = DR_MISALIGNMENT (dr);
6130 Else, compute address misalignment in bytes:
6131 addr_mis = addr & (vectype_size - 1)
6133 prolog_niters = min ( LOOP_NITERS , (VF - addr_mis/elem_size)&(VF-1) )
6135 (elem_size = element type size; an element is the scalar element
6136 whose type is the inner type of the vectype)
6140 prolog_niters = min ( LOOP_NITERS ,
6141 (VF/group_size - addr_mis/elem_size)&(VF/group_size-1) )
6142 where group_size is the size of the interleaved group.
6144 The above formulas assume that VF == number of elements in the vector. This
6145 may not hold when there are multiple-types in the loop.
6146 In this case, for some data-references in the loop the VF does not represent
6147 the number of elements that fit in the vector. Therefore, instead of VF we
6148 use TYPE_VECTOR_SUBPARTS. */
6151 vect_gen_niters_for_prolog_loop (loop_vec_info loop_vinfo, tree loop_niters)
6153 struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
6154 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6156 tree iters, iters_name;
6159 tree dr_stmt = DR_STMT (dr);
6160 stmt_vec_info stmt_info = vinfo_for_stmt (dr_stmt);
6161 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6162 int vectype_align = TYPE_ALIGN (vectype) / BITS_PER_UNIT;
6163 tree niters_type = TREE_TYPE (loop_niters);
6165 int element_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
6166 int nelements = TYPE_VECTOR_SUBPARTS (vectype);
6168 if (DR_GROUP_FIRST_DR (stmt_info))
6170 /* For interleaved access element size must be multiplied by the size of
6171 the interleaved group. */
6172 group_size = DR_GROUP_SIZE (vinfo_for_stmt (
6173 DR_GROUP_FIRST_DR (stmt_info)));
6174 element_size *= group_size;
6177 pe = loop_preheader_edge (loop);
6179 if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
6181 int byte_misalign = LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo);
6182 int elem_misalign = byte_misalign / element_size;
6184 if (vect_print_dump_info (REPORT_DETAILS))
6185 fprintf (vect_dump, "known alignment = %d.", byte_misalign);
6186 iters = build_int_cst (niters_type,
6187 (nelements - elem_misalign)&(nelements/group_size-1));
6191 tree new_stmts = NULL_TREE;
6192 tree start_addr = vect_create_addr_base_for_vector_ref (dr_stmt,
6193 &new_stmts, NULL_TREE, loop);
6194 tree ptr_type = TREE_TYPE (start_addr);
6195 tree size = TYPE_SIZE (ptr_type);
6196 tree type = lang_hooks.types.type_for_size (tree_low_cst (size, 1), 1);
6197 tree vectype_size_minus_1 = build_int_cst (type, vectype_align - 1);
6198 tree elem_size_log =
6199 build_int_cst (type, exact_log2 (vectype_align/nelements));
6200 tree nelements_minus_1 = build_int_cst (type, nelements - 1);
6201 tree nelements_tree = build_int_cst (type, nelements);
6205 new_bb = bsi_insert_on_edge_immediate (pe, new_stmts);
6206 gcc_assert (!new_bb);
6208 /* Create: byte_misalign = addr & (vectype_size - 1) */
6210 fold_build2 (BIT_AND_EXPR, type, fold_convert (type, start_addr), vectype_size_minus_1);
6212 /* Create: elem_misalign = byte_misalign / element_size */
6214 fold_build2 (RSHIFT_EXPR, type, byte_misalign, elem_size_log);
6216 /* Create: (niters_type) (nelements - elem_misalign)&(nelements - 1) */
6217 iters = fold_build2 (MINUS_EXPR, type, nelements_tree, elem_misalign);
6218 iters = fold_build2 (BIT_AND_EXPR, type, iters, nelements_minus_1);
6219 iters = fold_convert (niters_type, iters);
6222 /* Create: prolog_loop_niters = min (iters, loop_niters) */
6223 /* If the loop bound is known at compile time we already verified that it is
6224 greater than vf; since the misalignment ('iters') is at most vf, there's
6225 no need to generate the MIN_EXPR in this case. */
6226 if (TREE_CODE (loop_niters) != INTEGER_CST)
6227 iters = fold_build2 (MIN_EXPR, niters_type, iters, loop_niters);
6229 if (vect_print_dump_info (REPORT_DETAILS))
6231 fprintf (vect_dump, "niters for prolog loop: ");
6232 print_generic_expr (vect_dump, iters, TDF_SLIM);
6235 var = create_tmp_var (niters_type, "prolog_loop_niters");
6236 add_referenced_var (var);
6237 iters_name = force_gimple_operand (iters, &stmt, false, var);
6239 /* Insert stmt on loop preheader edge. */
6242 basic_block new_bb = bsi_insert_on_edge_immediate (pe, stmt);
6243 gcc_assert (!new_bb);
6250 /* Function vect_update_init_of_dr
6252 NITERS iterations were peeled from LOOP. DR represents a data reference
6253 in LOOP. This function updates the information recorded in DR to
6254 account for the fact that the first NITERS iterations had already been
6255 executed. Specifically, it updates the OFFSET field of DR. */
6258 vect_update_init_of_dr (struct data_reference *dr, tree niters)
6260 tree offset = DR_OFFSET (dr);
6262 niters = fold_build2 (MULT_EXPR, TREE_TYPE (niters), niters, DR_STEP (dr));
6263 offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset), offset, niters);
6264 DR_OFFSET (dr) = offset;
6268 /* Function vect_update_inits_of_drs
6270 NITERS iterations were peeled from the loop represented by LOOP_VINFO.
6271 This function updates the information recorded for the data references in
6272 the loop to account for the fact that the first NITERS iterations had
6273 already been executed. Specifically, it updates the initial_condition of
6274 the access_function of all the data_references in the loop. */
6277 vect_update_inits_of_drs (loop_vec_info loop_vinfo, tree niters)
6280 VEC (data_reference_p, heap) *datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
6281 struct data_reference *dr;
6283 if (vect_print_dump_info (REPORT_DETAILS))
6284 fprintf (vect_dump, "=== vect_update_inits_of_dr ===");
6286 for (i = 0; VEC_iterate (data_reference_p, datarefs, i, dr); i++)
6287 vect_update_init_of_dr (dr, niters);
6291 /* Function vect_do_peeling_for_alignment
6293 Peel the first 'niters' iterations of the loop represented by LOOP_VINFO.
6294 'niters' is set to the misalignment of one of the data references in the
6295 loop, thereby forcing it to refer to an aligned location at the beginning
6296 of the execution of this loop. The data reference for which we are
6297 peeling is recorded in LOOP_VINFO_UNALIGNED_DR. */
6300 vect_do_peeling_for_alignment (loop_vec_info loop_vinfo)
6302 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6303 tree niters_of_prolog_loop, ni_name;
6305 struct loop *new_loop;
6307 if (vect_print_dump_info (REPORT_DETAILS))
6308 fprintf (vect_dump, "=== vect_do_peeling_for_alignment ===");
6310 initialize_original_copy_tables ();
6312 ni_name = vect_build_loop_niters (loop_vinfo);
6313 niters_of_prolog_loop = vect_gen_niters_for_prolog_loop (loop_vinfo, ni_name);
6315 /* Peel the prolog loop and iterate it niters_of_prolog_loop. */
6317 slpeel_tree_peel_loop_to_edge (loop, loop_preheader_edge (loop),
6318 niters_of_prolog_loop, ni_name, true, 0);
6319 gcc_assert (new_loop);
6320 #ifdef ENABLE_CHECKING
6321 slpeel_verify_cfg_after_peeling (new_loop, loop);
6324 /* Update number of times loop executes. */
6325 n_iters = LOOP_VINFO_NITERS (loop_vinfo);
6326 LOOP_VINFO_NITERS (loop_vinfo) = fold_build2 (MINUS_EXPR,
6327 TREE_TYPE (n_iters), n_iters, niters_of_prolog_loop);
6329 /* Update the init conditions of the access functions of all data refs. */
6330 vect_update_inits_of_drs (loop_vinfo, niters_of_prolog_loop);
6332 /* After peeling we have to reset scalar evolution analyzer. */
6335 free_original_copy_tables ();
6339 /* Function vect_create_cond_for_align_checks.
6341 Create a conditional expression that represents the alignment checks for
6342 all of data references (array element references) whose alignment must be
6346 LOOP_VINFO - two fields of the loop information are used.
6347 LOOP_VINFO_PTR_MASK is the mask used to check the alignment.
6348 LOOP_VINFO_MAY_MISALIGN_STMTS contains the refs to be checked.
6351 COND_EXPR_STMT_LIST - statements needed to construct the conditional
6353 The returned value is the conditional expression to be used in the if
6354 statement that controls which version of the loop gets executed at runtime.
6356 The algorithm makes two assumptions:
6357 1) The number of bytes "n" in a vector is a power of 2.
6358 2) An address "a" is aligned if a%n is zero and that this
6359 test can be done as a&(n-1) == 0. For example, for 16
6360 byte vectors the test is a&0xf == 0. */
6363 vect_create_cond_for_align_checks (loop_vec_info loop_vinfo,
6364 tree *cond_expr_stmt_list)
6366 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6367 VEC(tree,heap) *may_misalign_stmts
6368 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
6370 int mask = LOOP_VINFO_PTR_MASK (loop_vinfo);
6374 tree int_ptrsize_type;
6376 tree or_tmp_name = NULL_TREE;
6377 tree and_tmp, and_tmp_name, and_stmt;
6380 /* Check that mask is one less than a power of 2, i.e., mask is
6381 all zeros followed by all ones. */
6382 gcc_assert ((mask != 0) && ((mask & (mask+1)) == 0));
6384 /* CHECKME: what is the best integer or unsigned type to use to hold a
6385 cast from a pointer value? */
6386 psize = TYPE_SIZE (ptr_type_node);
6388 = lang_hooks.types.type_for_size (tree_low_cst (psize, 1), 0);
6390 /* Create expression (mask & (dr_1 || ... || dr_n)) where dr_i is the address
6391 of the first vector of the i'th data reference. */
6393 for (i = 0; VEC_iterate (tree, may_misalign_stmts, i, ref_stmt); i++)
6395 tree new_stmt_list = NULL_TREE;
6397 tree addr_tmp, addr_tmp_name, addr_stmt;
6398 tree or_tmp, new_or_tmp_name, or_stmt;
6400 /* create: addr_tmp = (int)(address_of_first_vector) */
6401 addr_base = vect_create_addr_base_for_vector_ref (ref_stmt,
6402 &new_stmt_list, NULL_TREE, loop);
6404 if (new_stmt_list != NULL_TREE)
6405 append_to_statement_list_force (new_stmt_list, cond_expr_stmt_list);
6407 sprintf (tmp_name, "%s%d", "addr2int", i);
6408 addr_tmp = create_tmp_var (int_ptrsize_type, tmp_name);
6409 add_referenced_var (addr_tmp);
6410 addr_tmp_name = make_ssa_name (addr_tmp, NULL_TREE);
6411 addr_stmt = fold_convert (int_ptrsize_type, addr_base);
6412 addr_stmt = build_gimple_modify_stmt (addr_tmp_name, addr_stmt);
6413 SSA_NAME_DEF_STMT (addr_tmp_name) = addr_stmt;
6414 append_to_statement_list_force (addr_stmt, cond_expr_stmt_list);
6416 /* The addresses are OR together. */
6418 if (or_tmp_name != NULL_TREE)
6420 /* create: or_tmp = or_tmp | addr_tmp */
6421 sprintf (tmp_name, "%s%d", "orptrs", i);
6422 or_tmp = create_tmp_var (int_ptrsize_type, tmp_name);
6423 add_referenced_var (or_tmp);
6424 new_or_tmp_name = make_ssa_name (or_tmp, NULL_TREE);
6425 tmp = build2 (BIT_IOR_EXPR, int_ptrsize_type,
6426 or_tmp_name, addr_tmp_name);
6427 or_stmt = build_gimple_modify_stmt (new_or_tmp_name, tmp);
6428 SSA_NAME_DEF_STMT (new_or_tmp_name) = or_stmt;
6429 append_to_statement_list_force (or_stmt, cond_expr_stmt_list);
6430 or_tmp_name = new_or_tmp_name;
6433 or_tmp_name = addr_tmp_name;
6437 mask_cst = build_int_cst (int_ptrsize_type, mask);
6439 /* create: and_tmp = or_tmp & mask */
6440 and_tmp = create_tmp_var (int_ptrsize_type, "andmask" );
6441 add_referenced_var (and_tmp);
6442 and_tmp_name = make_ssa_name (and_tmp, NULL_TREE);
6444 tmp = build2 (BIT_AND_EXPR, int_ptrsize_type, or_tmp_name, mask_cst);
6445 and_stmt = build_gimple_modify_stmt (and_tmp_name, tmp);
6446 SSA_NAME_DEF_STMT (and_tmp_name) = and_stmt;
6447 append_to_statement_list_force (and_stmt, cond_expr_stmt_list);
6449 /* Make and_tmp the left operand of the conditional test against zero.
6450 if and_tmp has a nonzero bit then some address is unaligned. */
6451 ptrsize_zero = build_int_cst (int_ptrsize_type, 0);
6452 return build2 (EQ_EXPR, boolean_type_node,
6453 and_tmp_name, ptrsize_zero);
6456 /* Function vect_vfa_segment_size.
6458 Create an expression that computes the size of segment
6459 that will be accessed for a data reference. The functions takes into
6460 account that realignment loads may access one more vector.
6463 DR: The data reference.
6464 VECT_FACTOR: vectorization factor.
6466 Return an expression whose value is the size of segment which will be
6470 vect_vfa_segment_size (struct data_reference *dr, tree vect_factor)
6472 tree segment_length;
6474 if (vect_supportable_dr_alignment (dr) == dr_explicit_realign_optimized)
6477 build_int_cst (integer_type_node,
6478 GET_MODE_SIZE (TYPE_MODE (STMT_VINFO_VECTYPE
6479 (vinfo_for_stmt (DR_STMT (dr))))));
6482 fold_convert (sizetype,
6483 fold_build2 (PLUS_EXPR, integer_type_node,
6484 fold_build2 (MULT_EXPR, integer_type_node, DR_STEP (dr),
6491 fold_convert (sizetype,
6492 fold_build2 (MULT_EXPR, integer_type_node, DR_STEP (dr),
6496 return segment_length;
6499 /* Function vect_create_cond_for_alias_checks.
6501 Create a conditional expression that represents the run-time checks for
6502 overlapping of address ranges represented by a list of data references
6503 relations passed as input.
6506 COND_EXPR - input conditional expression. New conditions will be chained
6507 with logical and operation.
6508 LOOP_VINFO - field LOOP_VINFO_MAY_ALIAS_STMTS contains the list of ddrs
6512 COND_EXPR - conditional expression.
6513 COND_EXPR_STMT_LIST - statements needed to construct the conditional
6515 The returned value is the conditional expression to be used in the if
6516 statement that controls which version of the loop gets executed at runtime.
6520 vect_create_cond_for_alias_checks (loop_vec_info loop_vinfo,
6522 tree * cond_expr_stmt_list)
6524 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6525 VEC (ddr_p, heap) * may_alias_ddrs =
6526 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
6528 build_int_cst (integer_type_node, LOOP_VINFO_VECT_FACTOR (loop_vinfo));
6532 tree part_cond_expr;
6534 /* Create expression
6535 ((store_ptr_0 + store_segment_length_0) < load_ptr_0)
6536 || (load_ptr_0 + load_segment_length_0) < store_ptr_0))
6540 ((store_ptr_n + store_segment_length_n) < load_ptr_n)
6541 || (load_ptr_n + load_segment_length_n) < store_ptr_n)) */
6543 if (VEC_empty (ddr_p, may_alias_ddrs))
6546 for (i = 0; VEC_iterate (ddr_p, may_alias_ddrs, i, ddr); i++)
6548 tree stmt_a = DR_STMT (DDR_A (ddr));
6549 tree stmt_b = DR_STMT (DDR_B (ddr));
6552 vect_create_addr_base_for_vector_ref (stmt_a, cond_expr_stmt_list,
6555 vect_create_addr_base_for_vector_ref (stmt_b, cond_expr_stmt_list,
6558 tree segment_length_a = vect_vfa_segment_size (DDR_A (ddr), vect_factor);
6559 tree segment_length_b = vect_vfa_segment_size (DDR_B (ddr), vect_factor);
6561 if (vect_print_dump_info (REPORT_DR_DETAILS))
6564 "create runtime check for data references ");
6565 print_generic_expr (vect_dump, DR_REF (DDR_A (ddr)), TDF_SLIM);
6566 fprintf (vect_dump, " and ");
6567 print_generic_expr (vect_dump, DR_REF (DDR_B (ddr)), TDF_SLIM);
6572 fold_build2 (TRUTH_OR_EXPR, boolean_type_node,
6573 fold_build2 (LT_EXPR, boolean_type_node,
6574 fold_build2 (POINTER_PLUS_EXPR, TREE_TYPE (addr_base_a),
6578 fold_build2 (LT_EXPR, boolean_type_node,
6579 fold_build2 (POINTER_PLUS_EXPR, TREE_TYPE (addr_base_b),
6585 *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
6586 *cond_expr, part_cond_expr);
6588 *cond_expr = part_cond_expr;
6590 if (vect_print_dump_info (REPORT_VECTORIZED_LOOPS))
6591 fprintf (vect_dump, "created %u versioning for alias checks.\n",
6592 VEC_length (ddr_p, may_alias_ddrs));
6596 /* Function vect_transform_loop.
6598 The analysis phase has determined that the loop is vectorizable.
6599 Vectorize the loop - created vectorized stmts to replace the scalar
6600 stmts in the loop, and update the loop exit condition. */
6603 vect_transform_loop (loop_vec_info loop_vinfo)
6605 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6606 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
6607 int nbbs = loop->num_nodes;
6608 block_stmt_iterator si, next_si;
6611 int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
6614 if (vect_print_dump_info (REPORT_DETAILS))
6615 fprintf (vect_dump, "=== vec_transform_loop ===");
6617 /* If the loop has data references that may or may not be aligned or/and
6618 has data reference relations whose independence was not proven then
6619 two versions of the loop need to be generated, one which is vectorized
6620 and one which isn't. A test is then generated to control which of the
6621 loops is executed. The test checks for the alignment of all of the
6622 data references that may or may not be aligned. An additional
6623 sequence of runtime tests is generated for each pairs of DDRs whose
6624 independence was not proven. The vectorized version of loop is
6625 executed only if both alias and alignment tests are passed. */
6627 if (VEC_length (tree, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo))
6628 || VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
6631 tree cond_expr = NULL_TREE;
6632 tree cond_expr_stmt_list = NULL_TREE;
6633 basic_block condition_bb;
6634 block_stmt_iterator cond_exp_bsi;
6635 basic_block merge_bb;
6636 basic_block new_exit_bb;
6638 tree orig_phi, new_phi, arg;
6639 unsigned prob = 4 * REG_BR_PROB_BASE / 5;
6640 tree gimplify_stmt_list;
6642 if (VEC_length (tree, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo)))
6644 vect_create_cond_for_align_checks (loop_vinfo, &cond_expr_stmt_list);
6646 if (VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
6647 vect_create_cond_for_alias_checks (loop_vinfo, &cond_expr,
6648 &cond_expr_stmt_list);
6651 fold_build2 (NE_EXPR, boolean_type_node, cond_expr, integer_zero_node);
6653 force_gimple_operand (cond_expr, &gimplify_stmt_list, true,
6655 append_to_statement_list (gimplify_stmt_list, &cond_expr_stmt_list);
6657 initialize_original_copy_tables ();
6658 nloop = loop_version (loop, cond_expr, &condition_bb,
6659 prob, prob, REG_BR_PROB_BASE - prob, true);
6660 free_original_copy_tables();
6662 /** Loop versioning violates an assumption we try to maintain during
6663 vectorization - that the loop exit block has a single predecessor.
6664 After versioning, the exit block of both loop versions is the same
6665 basic block (i.e. it has two predecessors). Just in order to simplify
6666 following transformations in the vectorizer, we fix this situation
6667 here by adding a new (empty) block on the exit-edge of the loop,
6668 with the proper loop-exit phis to maintain loop-closed-form. **/
6670 merge_bb = single_exit (loop)->dest;
6671 gcc_assert (EDGE_COUNT (merge_bb->preds) == 2);
6672 new_exit_bb = split_edge (single_exit (loop));
6673 new_exit_e = single_exit (loop);
6674 e = EDGE_SUCC (new_exit_bb, 0);
6676 for (orig_phi = phi_nodes (merge_bb); orig_phi;
6677 orig_phi = PHI_CHAIN (orig_phi))
6679 new_phi = create_phi_node (SSA_NAME_VAR (PHI_RESULT (orig_phi)),
6681 arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, e);
6682 add_phi_arg (new_phi, arg, new_exit_e);
6683 SET_PHI_ARG_DEF (orig_phi, e->dest_idx, PHI_RESULT (new_phi));
6686 /** end loop-exit-fixes after versioning **/
6688 update_ssa (TODO_update_ssa);
6689 cond_exp_bsi = bsi_last (condition_bb);
6690 bsi_insert_before (&cond_exp_bsi, cond_expr_stmt_list, BSI_SAME_STMT);
6693 /* CHECKME: we wouldn't need this if we called update_ssa once
6695 bitmap_zero (vect_memsyms_to_rename);
6697 /* Peel the loop if there are data refs with unknown alignment.
6698 Only one data ref with unknown store is allowed. */
6700 if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
6701 vect_do_peeling_for_alignment (loop_vinfo);
6703 /* If the loop has a symbolic number of iterations 'n' (i.e. it's not a
6704 compile time constant), or it is a constant that doesn't divide by the
6705 vectorization factor, then an epilog loop needs to be created.
6706 We therefore duplicate the loop: the original loop will be vectorized,
6707 and will compute the first (n/VF) iterations. The second copy of the loop
6708 will remain scalar and will compute the remaining (n%VF) iterations.
6709 (VF is the vectorization factor). */
6711 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
6712 || (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
6713 && LOOP_VINFO_INT_NITERS (loop_vinfo) % vectorization_factor != 0))
6714 vect_do_peeling_for_loop_bound (loop_vinfo, &ratio);
6716 ratio = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
6717 LOOP_VINFO_INT_NITERS (loop_vinfo) / vectorization_factor);
6719 /* 1) Make sure the loop header has exactly two entries
6720 2) Make sure we have a preheader basic block. */
6722 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
6724 split_edge (loop_preheader_edge (loop));
6726 /* FORNOW: the vectorizer supports only loops which body consist
6727 of one basic block (header + empty latch). When the vectorizer will
6728 support more involved loop forms, the order by which the BBs are
6729 traversed need to be reconsidered. */
6731 for (i = 0; i < nbbs; i++)
6733 basic_block bb = bbs[i];
6734 stmt_vec_info stmt_info;
6737 for (phi = phi_nodes (bb); phi; phi = PHI_CHAIN (phi))
6739 if (vect_print_dump_info (REPORT_DETAILS))
6741 fprintf (vect_dump, "------>vectorizing phi: ");
6742 print_generic_expr (vect_dump, phi, TDF_SLIM);
6744 stmt_info = vinfo_for_stmt (phi);
6747 if (!STMT_VINFO_RELEVANT_P (stmt_info)
6748 && !STMT_VINFO_LIVE_P (stmt_info))
6751 if ((TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
6752 != (unsigned HOST_WIDE_INT) vectorization_factor)
6753 && vect_print_dump_info (REPORT_DETAILS))
6754 fprintf (vect_dump, "multiple-types.");
6756 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
6758 if (vect_print_dump_info (REPORT_DETAILS))
6759 fprintf (vect_dump, "transform phi.");
6760 vect_transform_stmt (phi, NULL, NULL);
6764 for (si = bsi_start (bb); !bsi_end_p (si);)
6766 tree stmt = bsi_stmt (si);
6769 if (vect_print_dump_info (REPORT_DETAILS))
6771 fprintf (vect_dump, "------>vectorizing statement: ");
6772 print_generic_expr (vect_dump, stmt, TDF_SLIM);
6775 stmt_info = vinfo_for_stmt (stmt);
6777 /* vector stmts created in the outer-loop during vectorization of
6778 stmts in an inner-loop may not have a stmt_info, and do not
6779 need to be vectorized. */
6786 if (!STMT_VINFO_RELEVANT_P (stmt_info)
6787 && !STMT_VINFO_LIVE_P (stmt_info))
6793 gcc_assert (STMT_VINFO_VECTYPE (stmt_info));
6794 if ((TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
6795 != (unsigned HOST_WIDE_INT) vectorization_factor)
6796 && vect_print_dump_info (REPORT_DETAILS))
6797 fprintf (vect_dump, "multiple-types.");
6799 /* -------- vectorize statement ------------ */
6800 if (vect_print_dump_info (REPORT_DETAILS))
6801 fprintf (vect_dump, "transform statement.");
6803 strided_store = false;
6804 is_store = vect_transform_stmt (stmt, &si, &strided_store);
6808 if (DR_GROUP_FIRST_DR (stmt_info))
6810 /* Interleaving. If IS_STORE is TRUE, the vectorization of the
6811 interleaving chain was completed - free all the stores in
6813 tree next = DR_GROUP_FIRST_DR (stmt_info);
6815 stmt_vec_info next_stmt_info;
6819 next_si = bsi_for_stmt (next);
6820 next_stmt_info = vinfo_for_stmt (next);
6821 /* Free the attached stmt_vec_info and remove the stmt. */
6822 ann = stmt_ann (next);
6823 tmp = DR_GROUP_NEXT_DR (next_stmt_info);
6824 free (next_stmt_info);
6825 set_stmt_info (ann, NULL);
6826 bsi_remove (&next_si, true);
6829 bsi_remove (&si, true);
6834 /* Free the attached stmt_vec_info and remove the stmt. */
6835 ann = stmt_ann (stmt);
6837 set_stmt_info (ann, NULL);
6838 bsi_remove (&si, true);
6846 slpeel_make_loop_iterate_ntimes (loop, ratio);
6848 mark_set_for_renaming (vect_memsyms_to_rename);
6850 /* The memory tags and pointers in vectorized statements need to
6851 have their SSA forms updated. FIXME, why can't this be delayed
6852 until all the loops have been transformed? */
6853 update_ssa (TODO_update_ssa);
6855 if (vect_print_dump_info (REPORT_VECTORIZED_LOOPS))
6856 fprintf (vect_dump, "LOOP VECTORIZED.");
6857 if (loop->inner && vect_print_dump_info (REPORT_VECTORIZED_LOOPS))
6858 fprintf (vect_dump, "OUTER LOOP VECTORIZED.");