1 /* Transformation Utilities for Loop Vectorization.
2 Copyright (C) 2003, 2004, 2005, 2006, 2007 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
5 This file is part of GCC.
7 GCC is free software; you can redistribute it and/or modify it under
8 the terms of the GNU General Public License as published by the Free
9 Software Foundation; either version 3, or (at your option) any later
12 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
13 WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
23 #include "coretypes.h"
29 #include "basic-block.h"
30 #include "diagnostic.h"
31 #include "tree-flow.h"
32 #include "tree-dump.h"
39 #include "tree-data-ref.h"
40 #include "tree-chrec.h"
41 #include "tree-scalar-evolution.h"
42 #include "tree-vectorizer.h"
43 #include "langhooks.h"
44 #include "tree-pass.h"
48 /* Utility functions for the code transformation. */
/* NOTE(review): this extract appears to drop lines (the embedded line
   numbers jump); the prototype list may be incomplete — verify against
   the full file.  */
49 static bool vect_transform_stmt (tree, block_stmt_iterator *, bool *);
50 static tree vect_create_destination_var (tree, tree);
51 static tree vect_create_data_ref_ptr
52 (tree, block_stmt_iterator *, tree, tree *, tree *, bool, tree);
53 static tree vect_create_addr_base_for_vector_ref (tree, tree *, tree);
54 static tree vect_setup_realignment (tree, block_stmt_iterator *, tree *);
55 static tree vect_get_new_vect_var (tree, enum vect_var_kind, const char *);
56 static tree vect_get_vec_def_for_operand (tree, tree, tree *);
57 static tree vect_init_vector (tree, tree, tree);
58 static void vect_finish_stmt_generation
59 (tree stmt, tree vec_stmt, block_stmt_iterator *bsi);
60 static bool vect_is_simple_cond (tree, loop_vec_info);
61 static void vect_create_epilog_for_reduction (tree, tree, enum tree_code, tree);
62 static tree get_initial_def_for_reduction (tree, tree, tree *);
64 /* Utility functions dealing with loop peeling (not the peeling itself). */
65 static void vect_generate_tmps_on_preheader
66 (loop_vec_info, tree *, tree *, tree *);
67 static tree vect_build_loop_niters (loop_vec_info);
68 static void vect_update_ivs_after_vectorizer (loop_vec_info, tree, edge);
69 static tree vect_gen_niters_for_prolog_loop (loop_vec_info, tree);
70 static void vect_update_init_of_dr (struct data_reference *, tree niters);
71 static void vect_update_inits_of_drs (loop_vec_info, tree);
72 static int vect_min_worthwhile_factor (enum tree_code);
/* Function cost_for_stmt.

   Return the scalar cost of STMT, dispatching on the kind of vectorizable
   operation recorded in STMT's stmt_vec_info: loads and stores get their
   own target costs; all other statement kinds share TARG_SCALAR_STMT_COST.

   NOTE(review): the extract is missing lines here (return type, braces,
   and the handling of undef_vec_info_type / default case) — confirm
   against the full file.  */
76 cost_for_stmt (tree stmt)
78 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
80 switch (STMT_VINFO_TYPE (stmt_info))
82 case load_vec_info_type:
83 return TARG_SCALAR_LOAD_COST;
84 case store_vec_info_type:
85 return TARG_SCALAR_STORE_COST;
86 case op_vec_info_type:
87 case condition_vec_info_type:
88 case assignment_vec_info_type:
89 case reduc_vec_info_type:
90 case induc_vec_info_type:
91 case type_promotion_vec_info_type:
92 case type_demotion_vec_info_type:
93 case type_conversion_vec_info_type:
94 case call_vec_info_type:
95 return TARG_SCALAR_STMT_COST;
96 case undef_vec_info_type:
103 /* Function vect_estimate_min_profitable_iters
105 Return the number of iterations required for the vector version of the
106 loop to be profitable relative to the cost of the scalar version of the
109 TODO: Take profile info into account before making vectorization
110 decisions, if available. */
/* NOTE(review): the extract drops interior lines throughout this function
   (braces, else-arms, some declarations such as `i' and `byte_misalign') —
   the annotations below describe only what is visible.  */
113 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo)
116 int min_profitable_iters;
117 int peel_iters_prologue;
118 int peel_iters_epilogue;
119 int vec_inside_cost = 0;
120 int vec_outside_cost = 0;
121 int scalar_single_iter_cost = 0;
122 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
123 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
124 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
125 int nbbs = loop->num_nodes;
127 int innerloop_iters, factor;
129 /* Cost model disabled: presumably vectorize unconditionally
     (the early-return path is not visible in this extract).  */
130 if (!flag_vect_cost_model)
132 if (vect_print_dump_info (REPORT_DETAILS))
133 fprintf (vect_dump, "cost model disabled.");
137 /* Requires loop versioning tests to handle misalignment.
138 FIXME: Make cost depend on number of stmts in may_misalign list. */
140 if (VEC_length (tree, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo)))
142 vec_outside_cost += TARG_COND_BRANCH_COST;
143 if (vect_print_dump_info (REPORT_DETAILS))
144 fprintf (vect_dump, "cost model: Adding cost of checks for loop "
148 /* Count statements in scalar loop. Using this as scalar cost for a single
151 TODO: Add outer loop support.
153 TODO: Consider assigning different costs to different scalar
158 innerloop_iters = 50; /* FIXME */
160 for (i = 0; i < nbbs; i++)
162 block_stmt_iterator si;
163 basic_block bb = bbs[i];
/* Statements in an inner loop of an outer-loop vectorization are
   weighted by an assumed inner trip count (see FIXME above).  */
165 if (bb->loop_father == loop->inner)
166 factor = innerloop_iters;
170 for (si = bsi_start (bb); !bsi_end_p (si); bsi_next (&si))
172 tree stmt = bsi_stmt (si);
173 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
174 if (!STMT_VINFO_RELEVANT_P (stmt_info)
175 && !STMT_VINFO_LIVE_P (stmt_info))
177 scalar_single_iter_cost += cost_for_stmt (stmt) * factor;
178 vec_inside_cost += STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) * factor;
179 /* FIXME: for stmts in the inner-loop in outer-loop vectorization,
180 some of the "outside" costs are generated inside the outer-loop. */
181 vec_outside_cost += STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info);
185 /* Add additional cost for the peeled instructions in prologue and epilogue
188 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
189 at compile-time - we assume it's (vf-1)/2 (the worst would be vf-1).
191 TODO: Build an expression that represents peel_iters for prologue and
192 epilogue to be used in a run-time test. */
/* byte_misalign < 0 encodes "peeling amount unknown at compile time".  */
194 byte_misalign = LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo);
196 if (byte_misalign < 0)
198 peel_iters_prologue = (vf - 1)/2;
199 if (vect_print_dump_info (REPORT_DETAILS))
200 fprintf (vect_dump, "cost model: "
201 "prologue peel iters set to (vf-1)/2.");
203 /* If peeling for alignment is unknown, loop bound of main loop becomes
205 peel_iters_epilogue = (vf - 1)/2;
206 if (vect_print_dump_info (REPORT_DETAILS))
207 fprintf (vect_dump, "cost model: "
208 "epilogue peel iters set to (vf-1)/2 because "
209 "peeling for alignment is unknown .");
/* Known misalignment: compute the exact number of prologue iterations
   from the element size of the unaligned data reference.  */
215 struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
216 int element_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
217 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr)));
218 int nelements = TYPE_VECTOR_SUBPARTS (vectype);
220 peel_iters_prologue = nelements - (byte_misalign / element_size);
223 peel_iters_prologue = 0;
225 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
227 peel_iters_epilogue = (vf - 1)/2;
228 if (vect_print_dump_info (REPORT_DETAILS))
229 fprintf (vect_dump, "cost model: "
230 "epilogue peel iters set to (vf-1)/2 because "
231 "loop iterations are unknown .");
/* Known trip count: clamp the prologue to the trip count and take the
   exact remainder for the epilogue.  */
235 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
236 peel_iters_prologue = niters < peel_iters_prologue ?
237 niters : peel_iters_prologue;
238 peel_iters_epilogue = (niters - peel_iters_prologue) % vf;
242 /* Requires a prologue loop when peeling to handle misalignment. Add cost of
243 two guards, one for the peeled loop and one for the vector loop. */
245 if (peel_iters_prologue)
247 vec_outside_cost += 2 * TARG_COND_BRANCH_COST;
248 if (vect_print_dump_info (REPORT_DETAILS))
249 fprintf (vect_dump, "cost model: Adding cost of checks for "
253 /* Requires an epilogue loop to finish up remaining iterations after vector
254 loop. Add cost of two guards, one for the peeled loop and one for the
257 if (peel_iters_epilogue
258 || !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
259 || LOOP_VINFO_INT_NITERS (loop_vinfo) % vf)
261 vec_outside_cost += 2 * TARG_COND_BRANCH_COST;
262 if (vect_print_dump_info (REPORT_DETAILS))
263 fprintf (vect_dump, "cost model : Adding cost of checks for "
/* Peeled iterations execute scalar code; charge them at the scalar
   single-iteration cost.  */
267 vec_outside_cost += (peel_iters_prologue * scalar_single_iter_cost)
268 + (peel_iters_epilogue * scalar_single_iter_cost);
270 /* Allow targets to add additional (outside-of-loop) costs. FORNOW, the only
271 information we provide for the target is whether testing against the
272 threshold involves a runtime test. */
273 if (targetm.vectorize.builtin_vectorization_cost)
275 bool runtime_test = false;
277 /* If the number of iterations is unknown, or the
278 peeling-for-misalignment amount is unknown, we will have to generate
279 a runtime test to test the loop count against the threshold. */
280 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
281 || (byte_misalign < 0))
284 targetm.vectorize.builtin_vectorization_cost (runtime_test);
285 if (vect_print_dump_info (REPORT_DETAILS))
286 fprintf (vect_dump, "cost model : Adding target out-of-loop cost = %d",
287 targetm.vectorize.builtin_vectorization_cost (runtime_test));
290 /* Calculate number of iterations required to make the vector version
291 profitable, relative to the loop bodies only. The following condition
292 must hold true: ((SIC*VF)-VIC)*niters > VOC*VF, where
293 SIC = scalar iteration cost, VIC = vector iteration cost,
294 VOC = vector outside cost and VF = vectorization factor. */
296 if ((scalar_single_iter_cost * vf) > vec_inside_cost)
298 if (vec_outside_cost == 0)
299 min_profitable_iters = 1;
302 min_profitable_iters = (vec_outside_cost * vf)
303 / ((scalar_single_iter_cost * vf)
/* Integer division rounds down; bump once if the profitability
   condition is not yet strictly satisfied.  */
306 if ((scalar_single_iter_cost * vf * min_profitable_iters)
307 <= ((vec_inside_cost * min_profitable_iters)
308 + (vec_outside_cost * vf)))
309 min_profitable_iters++;
312 /* vector version will never be profitable. */
315 if (vect_print_dump_info (REPORT_DETAILS))
316 fprintf (vect_dump, "cost model: vector iteration cost = %d "
317 "is divisible by scalar iteration cost = %d by a factor "
318 "greater than or equal to the vectorization factor = %d .",
319 vec_inside_cost, scalar_single_iter_cost, vf);
323 if (vect_print_dump_info (REPORT_DETAILS))
325 fprintf (vect_dump, "Cost model analysis: \n");
326 fprintf (vect_dump, " Vector inside of loop cost: %d\n",
328 fprintf (vect_dump, " Vector outside of loop cost: %d\n",
330 fprintf (vect_dump, " Scalar cost: %d\n", scalar_single_iter_cost);
331 fprintf (vect_dump, " prologue iterations: %d\n",
332 peel_iters_prologue);
333 fprintf (vect_dump, " epilogue iterations: %d\n",
334 peel_iters_epilogue);
335 fprintf (vect_dump, " Calculated minimum iters for profitability: %d\n",
336 min_profitable_iters);
337 fprintf (vect_dump, " Actual minimum iters for profitability: %d\n",
338 min_profitable_iters < vf ? vf : min_profitable_iters);
/* Never report a threshold below VF: fewer than VF iterations cannot
   execute even one vector iteration.  */
341 min_profitable_iters =
342 min_profitable_iters < vf ? vf : min_profitable_iters;
344 /* Because the condition we create is:
345 if (niters <= min_profitable_iters)
346 then skip the vectorized loop. */
347 min_profitable_iters--;
348 return min_profitable_iters;
352 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
353 functions. Design better to avoid maintenance issues. */
355 /* Function vect_model_reduction_cost.
357 Models cost for a reduction operation, including the vector ops
358 generated within the strip-mine loop, the initial definition before
359 the loop, and the epilogue code that must be generated. */
/* NOTE(review): interior lines (declarations of reduction_op, vectype,
   orig_stmt, code, optab, outer_cost, bitsize; braces) are missing from
   this extract — verify against the full file.  */
362 vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
371 enum machine_mode mode;
372 tree operation = GIMPLE_STMT_OPERAND (STMT_VINFO_STMT (stmt_info), 1);
373 int op_type = TREE_CODE_LENGTH (TREE_CODE (operation));
375 /* Cost of reduction op inside loop. */
376 STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) += ncopies * TARG_VEC_STMT_COST;
/* The reduction variable is the last operand of the operation.  */
378 reduction_op = TREE_OPERAND (operation, op_type-1);
379 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
380 mode = TYPE_MODE (vectype);
381 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
384 orig_stmt = STMT_VINFO_STMT (stmt_info);
386 code = TREE_CODE (GIMPLE_STMT_OPERAND (orig_stmt, 1));
388 /* Add in cost for initial definition. */
389 outer_cost += TARG_SCALAR_TO_VEC_COST;
391 /* Determine cost of epilogue code.
393 We have a reduction operator that will reduce the vector in one statement.
394 Also requires scalar extract. */
396 if (reduc_code < NUM_TREE_CODES)
397 outer_cost += TARG_VEC_STMT_COST + TARG_VEC_TO_SCALAR_COST;
/* No single-statement reduction available: estimate the cost of the
   reduction epilogue from the element count of the vector type.  */
400 int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
402 TYPE_SIZE (TREE_TYPE ( GIMPLE_STMT_OPERAND (orig_stmt, 0)));
403 int element_bitsize = tree_low_cst (bitsize, 1);
404 int nelements = vec_size_in_bits / element_bitsize;
406 optab = optab_for_tree_code (code, vectype);
408 /* We have a whole vector shift available. */
409 if (VECTOR_MODE_P (mode)
410 && optab_handler (optab, mode)->insn_code != CODE_FOR_nothing
411 && optab_handler (vec_shr_optab, mode)->insn_code != CODE_FOR_nothing)
412 /* Final reduction via vector shifts and the reduction operator. Also
413 requires scalar extract. */
414 outer_cost += ((exact_log2(nelements) * 2) * TARG_VEC_STMT_COST
415 + TARG_VEC_TO_SCALAR_COST);
417 /* Use extracts and reduction op for final reduction. For N elements,
418 we have N extracts and N-1 reduction ops. */
419 outer_cost += ((nelements + nelements - 1) * TARG_VEC_STMT_COST);
422 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info) = outer_cost;
424 if (vect_print_dump_info (REPORT_DETAILS))
425 fprintf (vect_dump, "vect_model_reduction_cost: inside_cost = %d, "
426 "outside_cost = %d .", STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info),
427 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info));
431 /* Function vect_model_induction_cost.
433 Models cost for induction operations. */
/* Records NCOPIES vector statements of in-loop cost, plus two
   scalar-to-vector conversions (vec_init and vec_step) in the prologue.  */
436 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
438 /* loop cost for vec_loop. */
439 STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) = ncopies * TARG_VEC_STMT_COST;
440 /* prologue cost for vec_init and vec_step. */
441 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info) = 2 * TARG_SCALAR_TO_VEC_COST;
443 if (vect_print_dump_info (REPORT_DETAILS))
444 fprintf (vect_dump, "vect_model_induction_cost: inside_cost = %d, "
445 "outside_cost = %d .", STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info),
446 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info));
450 /* Function vect_model_simple_cost.
452 Models cost for simple operations, i.e. those that only emit ncopies of a
453 single op. Right now, this does not account for multiple insns that could
454 be generated for the single vector op. We will handle that shortly. */
457 vect_model_simple_cost (stmt_vec_info stmt_info, int ncopies, enum vect_def_type *dt)
461 STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) = ncopies * TARG_VEC_STMT_COST;
463 /* FORNOW: Assuming maximum 2 args per stmts. */
/* Each constant or loop-invariant operand needs one scalar-to-vector
   conversion in the loop prologue.  (The surrounding loop over `i' is
   not visible in this extract.)  */
466 if (dt[i] == vect_constant_def || dt[i] == vect_invariant_def)
467 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info) += TARG_SCALAR_TO_VEC_COST;
470 if (vect_print_dump_info (REPORT_DETAILS))
471 fprintf (vect_dump, "vect_model_simple_cost: inside_cost = %d, "
472 "outside_cost = %d .", STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info),
473 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info));
477 /* Function vect_cost_strided_group_size
479 For strided load or store, return the group_size only if it is the first
480 load or store of a group, else return 1. This ensures that group size is
481 only returned once per group. */
/* NOTE(review): the `return 1;' fall-through path described above is not
   visible in this extract.  */
484 vect_cost_strided_group_size (stmt_vec_info stmt_info)
486 tree first_stmt = DR_GROUP_FIRST_DR (stmt_info);
488 if (first_stmt == STMT_VINFO_STMT (stmt_info))
489 return DR_GROUP_SIZE (stmt_info);
495 /* Function vect_model_store_cost
497 Models cost for stores. In the case of strided accesses, one access
498 has the overhead of the strided access attributed to it. */
501 vect_model_store_cost (stmt_vec_info stmt_info, int ncopies, enum vect_def_type dt)
/* A constant or invariant stored value costs one scalar-to-vector
   conversion outside the loop.  */
506 if (dt == vect_constant_def || dt == vect_invariant_def)
507 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info) = TARG_SCALAR_TO_VEC_COST;
509 /* Strided access? */
510 if (DR_GROUP_FIRST_DR (stmt_info))
511 group_size = vect_cost_strided_group_size (stmt_info);
512 /* Not a strided access. */
516 /* Is this an access in a group of stores, which provide strided access?
517 If so, add in the cost of the permutes. */
520 /* Uses a high and low interleave operation for each needed permute. */
521 cost = ncopies * exact_log2(group_size) * group_size
522 * TARG_VEC_STMT_COST;
524 if (vect_print_dump_info (REPORT_DETAILS))
525 fprintf (vect_dump, "vect_model_store_cost: strided group_size = %d .",
530 /* Costs of the stores. */
531 cost += ncopies * TARG_VEC_STORE_COST;
533 STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) = cost;
535 if (vect_print_dump_info (REPORT_DETAILS))
536 fprintf (vect_dump, "vect_model_store_cost: inside_cost = %d, "
537 "outside_cost = %d .", STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info),
538 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info));
542 /* Function vect_model_load_cost
544 Models cost for loads. In the case of strided accesses, the last access
545 has the overhead of the strided access attributed to it. Since unaligned
546 accesses are supported for loads, we also account for the costs of the
547 access scheme chosen. */
550 vect_model_load_cost (stmt_vec_info stmt_info, int ncopies)
/* NOTE(review): "cheme" below is a misspelling of "scheme" carried by the
   original identifier; renaming it would touch code, so it is kept.  */
555 int alignment_support_cheme;
557 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info), *first_dr;
559 /* Strided accesses? */
560 first_stmt = DR_GROUP_FIRST_DR (stmt_info);
563 group_size = vect_cost_strided_group_size (stmt_info);
/* Alignment is always checked against the first data-ref of the group.  */
564 first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
566 /* Not a strided access. */
573 alignment_support_cheme = vect_supportable_dr_alignment (first_dr);
575 /* Is this an access in a group of loads providing strided access?
576 If so, add in the cost of the permutes. */
579 /* Uses an even and odd extract operations for each needed permute. */
580 inner_cost = ncopies * exact_log2(group_size) * group_size
581 * TARG_VEC_STMT_COST;
583 if (vect_print_dump_info (REPORT_DETAILS))
584 fprintf (vect_dump, "vect_model_load_cost: strided group_size = %d .",
589 /* The loads themselves. */
590 switch (alignment_support_cheme)
594 inner_cost += ncopies * TARG_VEC_LOAD_COST;
596 if (vect_print_dump_info (REPORT_DETAILS))
597 fprintf (vect_dump, "vect_model_load_cost: aligned.");
601 case dr_unaligned_supported:
603 /* Here, we assign an additional cost for the unaligned load. */
604 inner_cost += ncopies * TARG_VEC_UNALIGNED_LOAD_COST;
606 if (vect_print_dump_info (REPORT_DETAILS))
607 fprintf (vect_dump, "vect_model_load_cost: unaligned supported by "
612 case dr_unaligned_software_pipeline:
616 if (vect_print_dump_info (REPORT_DETAILS))
617 fprintf (vect_dump, "vect_model_load_cost: unaligned software "
620 /* Unaligned software pipeline has a load of an address, an initial
621 load, and possibly a mask operation to "prime" the loop. However,
622 if this is an access in a group of loads, which provide strided
623 access, then the above cost should only be considered for one
624 access in the group. Inside the loop, there is a load op
625 and a realignment op. */
627 if ((!DR_GROUP_FIRST_DR (stmt_info)) || group_size > 1)
629 outer_cost = 2*TARG_VEC_STMT_COST;
630 if (targetm.vectorize.builtin_mask_for_load)
631 outer_cost += TARG_VEC_STMT_COST;
634 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info) = outer_cost;
/* In-loop cost for the software pipeline: one load plus one
   realignment statement per copy.  */
636 inner_cost += ncopies * (TARG_VEC_LOAD_COST + TARG_VEC_STMT_COST);
645 STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) = inner_cost;
647 if (vect_print_dump_info (REPORT_DETAILS))
648 fprintf (vect_dump, "vect_model_load_cost: inside_cost = %d, "
649 "outside_cost = %d .", STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info),
650 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info));
655 /* Function vect_get_new_vect_var.
657 Returns a name for a new variable. The current naming scheme appends the
658 prefix "vect_" or "vect_p" (depending on the value of VAR_KIND) to
659 the name of vectorizer generated variables, and appends that to NAME if
/* NOTE(review): the prefix-assignment lines of the switch and the final
   return are not visible in this extract.  */
663 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
670 case vect_simple_var:
673 case vect_scalar_var:
676 case vect_pointer_var:
/* With a NAME, build "<prefix><name>"; otherwise fall back to the bare
   prefix as the temporary's name.  */
685 char* tmp = concat (prefix, name, NULL);
686 new_vect_var = create_tmp_var (type, tmp);
690 new_vect_var = create_tmp_var (type, prefix);
692 /* Mark vector typed variable as a gimple register variable. */
693 if (TREE_CODE (type) == VECTOR_TYPE)
694 DECL_GIMPLE_REG_P (new_vect_var) = true;
700 /* Function vect_create_addr_base_for_vector_ref.
702 Create an expression that computes the address of the first memory location
703 that will be accessed for a data reference.
706 STMT: The statement containing the data reference.
707 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
708 OFFSET: Optional. If supplied, it is to be added to the initial address.
711 1. Return an SSA_NAME whose value is the address of the memory location of
712 the first vector of the data reference.
713 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
714 these statement(s) which define the returned SSA_NAME.
716 FORNOW: We are only handling array accesses with step 1. */
719 vect_create_addr_base_for_vector_ref (tree stmt,
723 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
724 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
725 tree data_ref_base_expr = unshare_expr (DR_BASE_ADDRESS (dr));
726 tree base_name = build_fold_indirect_ref (data_ref_base_expr);
727 tree data_ref_base_var;
731 tree addr_base, addr_expr;
733 tree base_offset = unshare_expr (DR_OFFSET (dr));
734 tree init = unshare_expr (DR_INIT (dr));
735 tree vect_ptr_type, addr_expr2;
738 /* Create data_ref_base */
739 data_ref_base_var = create_tmp_var (TREE_TYPE (data_ref_base_expr), "batmp");
740 add_referenced_var (data_ref_base_var);
741 data_ref_base = force_gimple_operand (data_ref_base_expr, &new_base_stmt,
742 true, data_ref_base_var);
743 append_to_statement_list_force(new_base_stmt, new_stmt_list);
745 /* Create base_offset */
746 base_offset = size_binop (PLUS_EXPR, base_offset, init);
747 base_offset = fold_convert (sizetype, base_offset);
748 dest = create_tmp_var (TREE_TYPE (base_offset), "base_off");
749 add_referenced_var (dest);
750 base_offset = force_gimple_operand (base_offset, &new_stmt, true, dest);
751 append_to_statement_list_force (new_stmt, new_stmt_list);
/* OFFSET supplied: scale it by the step and fold it into base_offset.
   (The guard and the declaration of `offset'/`step' are not visible in
   this extract.)  */
755 tree tmp = create_tmp_var (sizetype, "offset");
758 /* For interleaved access step we divide STEP by the size of the
759 interleaving group. */
760 if (DR_GROUP_SIZE (stmt_info))
761 step = fold_build2 (TRUNC_DIV_EXPR, TREE_TYPE (offset), DR_STEP (dr),
762 build_int_cst (TREE_TYPE (offset),
763 DR_GROUP_SIZE (stmt_info)));
767 add_referenced_var (tmp);
768 offset = fold_build2 (MULT_EXPR, TREE_TYPE (offset), offset, step);
769 base_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (base_offset),
770 base_offset, offset);
771 base_offset = force_gimple_operand (base_offset, &new_stmt, false, tmp);
772 append_to_statement_list_force (new_stmt, new_stmt_list);
775 /* base + base_offset */
776 addr_base = fold_build2 (POINTER_PLUS_EXPR, TREE_TYPE (data_ref_base), data_ref_base,
779 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
781 /* addr_expr = addr_base */
782 addr_expr = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var,
783 get_name (base_name));
784 add_referenced_var (addr_expr);
785 vec_stmt = fold_convert (vect_ptr_type, addr_base);
786 addr_expr2 = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var,
787 get_name (base_name));
788 add_referenced_var (addr_expr2);
789 vec_stmt = force_gimple_operand (vec_stmt, &new_stmt, false, addr_expr2);
790 append_to_statement_list_force (new_stmt, new_stmt_list);
792 if (vect_print_dump_info (REPORT_DETAILS))
794 fprintf (vect_dump, "created ");
795 print_generic_expr (vect_dump, vec_stmt, TDF_SLIM);
801 /* Function vect_create_data_ref_ptr.
803 Create a new pointer to vector type (vp), that points to the first location
804 accessed in the loop by STMT, along with the def-use update chain to
805 appropriately advance the pointer through the loop iterations. Also set
806 aliasing information for the pointer. This vector pointer is used by the
807 callers to this function to create a memory reference expression for vector
811 1. STMT: a stmt that references memory. Expected to be of the form
812 GIMPLE_MODIFY_STMT <name, data-ref> or
813 GIMPLE_MODIFY_STMT <data-ref, name>.
814 2. BSI: block_stmt_iterator where new stmts can be added.
815 3. OFFSET (optional): an offset to be added to the initial address accessed
816 by the data-ref in STMT.
817 4. ONLY_INIT: indicate if vp is to be updated in the loop, or remain
818 pointing to the initial address.
819 5. TYPE: if not NULL indicates the required type of the data-ref
822 1. Declare a new ptr to vector_type, and have it point to the base of the
823 data reference (initial address accessed by the data reference).
824 For example, for vector of type V8HI, the following code is generated:
827 vp = (v8hi *)initial_address;
829 if OFFSET is not supplied:
830 initial_address = &a[init];
831 if OFFSET is supplied:
832 initial_address = &a[init + OFFSET];
834 Return the initial_address in INITIAL_ADDRESS.
836 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
837 update the pointer in each iteration of the loop.
839 Return the increment stmt that updates the pointer in PTR_INCR.
841 3. Return the pointer. */
844 vect_create_data_ref_ptr (tree stmt,
845 block_stmt_iterator *bsi ATTRIBUTE_UNUSED,
846 tree offset, tree *initial_address, tree *ptr_incr,
847 bool only_init, tree type)
850 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
851 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
852 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
853 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
859 tree new_stmt_list = NULL_TREE;
860 edge pe = loop_preheader_edge (loop);
863 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
865 base_name = build_fold_indirect_ref (unshare_expr (DR_BASE_ADDRESS (dr)));
867 if (vect_print_dump_info (REPORT_DETAILS))
869 tree data_ref_base = base_name;
870 fprintf (vect_dump, "create vector-pointer variable to type: ");
871 print_generic_expr (vect_dump, vectype, TDF_SLIM);
872 if (TREE_CODE (data_ref_base) == VAR_DECL)
873 fprintf (vect_dump, " vectorizing a one dimensional array ref: ");
874 else if (TREE_CODE (data_ref_base) == ARRAY_REF)
875 fprintf (vect_dump, " vectorizing a multidimensional array ref: ");
876 else if (TREE_CODE (data_ref_base) == COMPONENT_REF)
877 fprintf (vect_dump, " vectorizing a record based array ref: ");
878 else if (TREE_CODE (data_ref_base) == SSA_NAME)
879 fprintf (vect_dump, " vectorizing a pointer ref: ");
880 print_generic_expr (vect_dump, base_name, TDF_SLIM);
883 /** (1) Create the new vector-pointer variable: **/
/* Use the caller-supplied TYPE when given, otherwise the stmt's vectype.  */
885 vect_ptr_type = build_pointer_type (type);
887 vect_ptr_type = build_pointer_type (vectype);
888 vect_ptr = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var,
889 get_name (base_name));
890 add_referenced_var (vect_ptr);
892 /** (2) Add aliasing information to the new vector-pointer:
893 (The points-to info (DR_PTR_INFO) may be defined later.) **/
895 tag = DR_SYMBOL_TAG (dr);
898 /* If tag is a variable (and NOT_A_TAG) than a new symbol memory
899 tag must be created with tag added to its may alias list. */
901 new_type_alias (vect_ptr, tag, DR_REF (dr));
903 set_symbol_mem_tag (vect_ptr, tag);
905 var_ann (vect_ptr)->subvars = DR_SUBVARS (dr);
907 /** (3) Calculate the initial address the vector-pointer, and set
908 the vector-pointer to point to it before the loop: **/
910 /* Create: (&(base[init_val+offset]) in the loop preheader. */
911 new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
913 pe = loop_preheader_edge (loop);
914 new_bb = bsi_insert_on_edge_immediate (pe, new_stmt_list);
915 gcc_assert (!new_bb);
916 *initial_address = new_temp;
918 /* Create: p = (vectype *) initial_base */
919 vec_stmt = fold_convert (vect_ptr_type, new_temp);
920 vec_stmt = build_gimple_modify_stmt (vect_ptr, vec_stmt);
921 vect_ptr_init = make_ssa_name (vect_ptr, vec_stmt);
922 GIMPLE_STMT_OPERAND (vec_stmt, 0) = vect_ptr_init;
923 new_bb = bsi_insert_on_edge_immediate (pe, vec_stmt);
924 gcc_assert (!new_bb);
927 /** (4) Handle the updating of the vector-pointer inside the loop: **/
929 if (only_init) /* No update in loop is required. */
931 /* Copy the points-to information if it exists. */
932 if (DR_PTR_INFO (dr))
933 duplicate_ssa_name_ptr_info (vect_ptr_init, DR_PTR_INFO (dr));
934 return vect_ptr_init;
938 block_stmt_iterator incr_bsi;
940 tree indx_before_incr, indx_after_incr;
/* Build the induction variable that advances the pointer by one vector
   size each iteration.  */
943 standard_iv_increment_position (loop, &incr_bsi, &insert_after);
944 create_iv (vect_ptr_init,
945 fold_convert (vect_ptr_type, TYPE_SIZE_UNIT (vectype)),
946 NULL_TREE, loop, &incr_bsi, insert_after,
947 &indx_before_incr, &indx_after_incr);
948 incr = bsi_stmt (incr_bsi);
949 set_stmt_info (stmt_ann (incr),
950 new_stmt_vec_info (incr, loop_vinfo));
952 /* Copy the points-to information if it exists. */
953 if (DR_PTR_INFO (dr))
955 duplicate_ssa_name_ptr_info (indx_before_incr, DR_PTR_INFO (dr));
956 duplicate_ssa_name_ptr_info (indx_after_incr, DR_PTR_INFO (dr));
958 merge_alias_info (vect_ptr_init, indx_before_incr);
959 merge_alias_info (vect_ptr_init, indx_after_incr);
963 return indx_before_incr;
968 /* Function bump_vector_ptr
970 Increment a pointer (to a vector type) by vector-size. Connect the new
971 increment stmt to the existing def-use update-chain of the pointer.
973 The pointer def-use update-chain before this function:
974 DATAREF_PTR = phi (p_0, p_2)
976 PTR_INCR: p_2 = DATAREF_PTR + step
978 The pointer def-use update-chain after this function:
979 DATAREF_PTR = phi (p_0, p_2)
981 NEW_DATAREF_PTR = DATAREF_PTR + vector_size
983 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
986 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
988 PTR_INCR - the stmt that updates the pointer in each iteration of the loop.
989 The increment amount across iterations is also expected to be
991 BSI - location where the new update stmt is to be placed.
992 STMT - the original scalar memory-access stmt that is being vectorized.
994 Output: Return NEW_DATAREF_PTR as illustrated above.
999 bump_vector_ptr (tree dataref_ptr, tree ptr_incr, block_stmt_iterator *bsi,
1002 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1003 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
1004 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1005 tree vptr_type = TREE_TYPE (dataref_ptr);
1006 tree ptr_var = SSA_NAME_VAR (dataref_ptr);
1007 tree update = TYPE_SIZE_UNIT (vectype);
1010 use_operand_p use_p;
1011 tree new_dataref_ptr;
/* Emit: new_dataref_ptr = dataref_ptr + sizeof (vectype).  */
1013 incr_stmt = build_gimple_modify_stmt (ptr_var,
1014 build2 (POINTER_PLUS_EXPR, vptr_type,
1015 dataref_ptr, update));
1016 new_dataref_ptr = make_ssa_name (ptr_var, incr_stmt);
1017 GIMPLE_STMT_OPERAND (incr_stmt, 0) = new_dataref_ptr;
1018 vect_finish_stmt_generation (stmt, incr_stmt, bsi);
1020 /* Update the vector-pointer's cross-iteration increment.  Rewire
   PTR_INCR's use of the old pointer to the freshly bumped one; any
   other operand must be the (constant) step itself.  */
1021 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
1023 tree use = USE_FROM_PTR (use_p);
1025 if (use == dataref_ptr)
1026 SET_USE (use_p, new_dataref_ptr);
1028 gcc_assert (tree_int_cst_compare (use, update) == 0);
1031 /* Copy the points-to information if it exists. */
1032 if (DR_PTR_INFO (dr))
1033 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
1034 merge_alias_info (new_dataref_ptr, dataref_ptr);
1036 return new_dataref_ptr;
1040 /* Function vect_create_destination_var.
1042 Create a new temporary of type VECTYPE. */
/* SCALAR_DEST must be an SSA_NAME; its name seeds the new temporary's
   name.  A NULL VECTYPE requests a scalar temporary of SCALAR_DEST's
   type instead of a vector one.  */
1045 vect_create_destination_var (tree scalar_dest, tree vectype)
1048 const char *new_name;
1050 enum vect_var_kind kind;
1052 kind = vectype ? vect_simple_var : vect_scalar_var;
1053 type = vectype ? vectype : TREE_TYPE (scalar_dest);
1055 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
1057 new_name = get_name (scalar_dest);
1060 vec_dest = vect_get_new_vect_var (type, kind, new_name);
1061 add_referenced_var (vec_dest);
/* Function vect_init_vector.

   Insert a new stmt (INIT_STMT) that initializes a new vector variable with
   the vector elements of VECTOR_VAR.  Return the DEF of INIT_STMT.  It will be
   used in the vectorization of STMT.  The init stmt is placed on the loop
   preheader edge so it executes exactly once before the loop.  */

vect_init_vector (tree stmt, tree vector_var, tree vector_type)
  stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);

  /* NOTE(review): the nested-loop handling here is partially elided in this
     view — presumably it selects the enclosing (outer) loop; confirm against
     the full source.  */
  if (nested_in_vect_loop_p (loop, stmt))

  new_var = vect_get_new_vect_var (vector_type, vect_simple_var, "cst_");
  add_referenced_var (new_var);

  /* Create: new_temp = VECTOR_VAR.  */
  init_stmt = build_gimple_modify_stmt (new_var, vector_var);
  new_temp = make_ssa_name (new_var, init_stmt);
  GIMPLE_STMT_OPERAND (init_stmt, 0) = new_temp;

  /* Insert on the preheader edge; the insertion must not split the edge
     (no new basic block may be created).  */
  pe = loop_preheader_edge (loop);
  new_bb = bsi_insert_on_edge_immediate (pe, init_stmt);
  gcc_assert (!new_bb);

  if (vect_print_dump_info (REPORT_DETAILS))
      fprintf (vect_dump, "created new init_stmt: ");
      print_generic_expr (vect_dump, init_stmt, TDF_SLIM);

  vec_oprnd = GIMPLE_STMT_OPERAND (init_stmt, 0);
/* Function get_initial_def_for_induction

   Input:
   STMT - a stmt that performs an induction operation in the loop.
   IV_PHI - the initial value of the induction variable

   Output:
   Return a vector variable, initialized with the first VF values of
   the induction variable.  E.g., for an iv with IV_PHI='X' and
   evolution S, for a vector of 4 units, we want to return:
   [X, X + S, X + 2*S, X + 3*S].  */

get_initial_def_for_induction (tree iv_phi)
  stmt_vec_info stmt_vinfo = vinfo_for_stmt (iv_phi);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  tree scalar_type = TREE_TYPE (PHI_RESULT_TREE (iv_phi));
  tree vectype = get_vectype_for_scalar_type (scalar_type);
  int nunits = TYPE_VECTOR_SUBPARTS (vectype);
  edge pe = loop_preheader_edge (loop);
  struct loop *iv_loop;
  tree vec, vec_init, vec_step, t;
  tree induction_phi, induc_def, new_stmt, vec_def, vec_dest;
  tree init_expr, step_expr;
  int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
  /* Number of vector stmts needed to produce VF scalar results when a
     vector holds only NUNITS elements.  */
  int ncopies = vf / nunits;
  stmt_vec_info phi_info = vinfo_for_stmt (iv_phi);
  bool nested_in_vect_loop = false;
  imm_use_iterator imm_iter;
  use_operand_p use_p;
  block_stmt_iterator si;
  basic_block bb = bb_for_stmt (iv_phi);

  gcc_assert (phi_info);
  gcc_assert (ncopies >= 1);

  /* Find the first insertion point in the BB. */
  si = bsi_after_labels (bb);

  /* Start with a zero step of the right scalar kind (integer 0 or real 0.0);
     the real step is filled in by vect_is_simple_iv_evolution below.  */
  if (INTEGRAL_TYPE_P (scalar_type))
    step_expr = build_int_cst (scalar_type, 0);
    step_expr = build_real (scalar_type, dconst0);

  /* Is phi in an inner-loop, while vectorizing an enclosing outer-loop? */
  if (nested_in_vect_loop_p (loop, iv_phi))
      nested_in_vect_loop = true;
      iv_loop = loop->inner;
  gcc_assert (iv_loop == (bb_for_stmt (iv_phi))->loop_father);

  latch_e = loop_latch_edge (iv_loop);
  loop_arg = PHI_ARG_DEF_FROM_EDGE (iv_phi, latch_e);

  /* Recover INIT_EXPR and STEP_EXPR from the scalar evolution of the phi.  */
  access_fn = analyze_scalar_evolution (iv_loop, PHI_RESULT (iv_phi));
  gcc_assert (access_fn);
  ok = vect_is_simple_iv_evolution (iv_loop->num, access_fn,
                                    &init_expr, &step_expr);
  pe = loop_preheader_edge (iv_loop);

  /* Create the vector that holds the initial_value of the induction.  */
  if (nested_in_vect_loop)
      /* iv_loop is nested in the loop to be vectorized. init_expr had already
         been created during vectorization of previous stmts; We obtain it from
         the STMT_VINFO_VEC_STMT of the defining stmt. */
      tree iv_def = PHI_ARG_DEF_FROM_EDGE (iv_phi, loop_preheader_edge (iv_loop));
      vec_init = vect_get_vec_def_for_operand (iv_def, iv_phi, NULL);

      /* iv_loop is the loop to be vectorized. Create:
         vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
      new_var = vect_get_new_vect_var (scalar_type, vect_scalar_var, "var_");
      add_referenced_var (new_var);

      /* Gimplify INIT_EXPR; any helper stmts go on the preheader edge
         (which must not get split).  */
      new_name = force_gimple_operand (init_expr, &stmts, false, new_var);
      new_bb = bsi_insert_on_edge_immediate (pe, stmts);
      gcc_assert (!new_bb);

      t = tree_cons (NULL_TREE, init_expr, t);
      /* Build X+S, X+2*S, ... one element at a time, each on the preheader
         edge, accumulating the names in list T (in reverse order).  */
      for (i = 1; i < nunits; i++)
          /* Create: new_name_i = new_name + step_expr */
          tmp = fold_build2 (PLUS_EXPR, scalar_type, new_name, step_expr);
          init_stmt = build_gimple_modify_stmt (new_var, tmp);
          new_name = make_ssa_name (new_var, init_stmt);
          GIMPLE_STMT_OPERAND (init_stmt, 0) = new_name;
          new_bb = bsi_insert_on_edge_immediate (pe, init_stmt);
          gcc_assert (!new_bb);

          if (vect_print_dump_info (REPORT_DETAILS))
              fprintf (vect_dump, "created new init_stmt: ");
              print_generic_expr (vect_dump, init_stmt, TDF_SLIM);
          t = tree_cons (NULL_TREE, new_name, t);

      /* Create a vector from [new_name_0, new_name_1, ..., new_name_nunits-1] */
      vec = build_constructor_from_list (vectype, nreverse (t));
      vec_init = vect_init_vector (iv_phi, vec, vectype);

  /* Create the vector that holds the step of the induction.  */
  if (nested_in_vect_loop)
    /* iv_loop is nested in the loop to be vectorized. Generate:
       vec_step = [S, S, S, S] */
    new_name = step_expr;
      /* iv_loop is the loop to be vectorized. Generate:
         vec_step = [VF*S, VF*S, VF*S, VF*S] */
      expr = build_int_cst (scalar_type, vf);
      new_name = fold_build2 (MULT_EXPR, scalar_type, expr, step_expr);

  /* Splat NEW_NAME into all NUNITS elements of the step vector.  */
  for (i = 0; i < nunits; i++)
    t = tree_cons (NULL_TREE, unshare_expr (new_name), t);
  vec = build_constructor_from_list (vectype, t);
  vec_step = vect_init_vector (iv_phi, vec, vectype);

  /* Create the following def-use cycle:
     vec_iv = PHI <vec_init, vec_loop>
     vec_loop = vec_iv + vec_step;  */

  /* Create the induction-phi that defines the induction-operand.  */
  vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
  add_referenced_var (vec_dest);
  induction_phi = create_phi_node (vec_dest, iv_loop->header);
  set_stmt_info (get_stmt_ann (induction_phi),
                 new_stmt_vec_info (induction_phi, loop_vinfo));
  induc_def = PHI_RESULT (induction_phi);

  /* Create the iv update inside the loop  */
  new_stmt = build_gimple_modify_stmt (NULL_TREE,
                                       build2 (PLUS_EXPR, vectype,
                                               induc_def, vec_step));
  vec_def = make_ssa_name (vec_dest, new_stmt);
  GIMPLE_STMT_OPERAND (new_stmt, 0) = vec_def;
  bsi_insert_before (&si, new_stmt, BSI_SAME_STMT);
  set_stmt_info (get_stmt_ann (new_stmt),
                 new_stmt_vec_info (new_stmt, loop_vinfo));

  /* Set the arguments of the phi node: loop-entry gets VEC_INIT,
     back-edge gets the in-loop update VEC_DEF.  */
  add_phi_arg (induction_phi, vec_init, pe);
  add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop));

  /* In case that vectorization factor (VF) is bigger than the number
     of elements that we can fit in a vectype (nunits), we have to generate
     more than one vector stmt - i.e - we need to "unroll" the
     vector stmt by a factor VF/nunits.  For more details see documentation
     in vectorizable_operation.  */

      stmt_vec_info prev_stmt_vinfo;
      /* FORNOW. This restriction should be relaxed.  */
      gcc_assert (!nested_in_vect_loop);

      /* Create the vector that holds the step of the induction.
         Here the per-copy step is NUNITS*S (one vector's worth).  */
      expr = build_int_cst (scalar_type, nunits);
      new_name = fold_build2 (MULT_EXPR, scalar_type, expr, step_expr);
      for (i = 0; i < nunits; i++)
        t = tree_cons (NULL_TREE, unshare_expr (new_name), t);
      vec = build_constructor_from_list (vectype, t);
      vec_step = vect_init_vector (iv_phi, vec, vectype);

      /* Chain the NCOPIES-1 extra copies together through
         STMT_VINFO_RELATED_STMT, each adding VEC_STEP to the previous.  */
      vec_def = induc_def;
      prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
      for (i = 1; i < ncopies; i++)
          /* vec_i = vec_prev + vec_step  */
          tmp = build2 (PLUS_EXPR, vectype, vec_def, vec_step);
          new_stmt = build_gimple_modify_stmt (NULL_TREE, tmp);
          vec_def = make_ssa_name (vec_dest, new_stmt);
          GIMPLE_STMT_OPERAND (new_stmt, 0) = vec_def;
          bsi_insert_before (&si, new_stmt, BSI_SAME_STMT);
          set_stmt_info (get_stmt_ann (new_stmt),
                         new_stmt_vec_info (new_stmt, loop_vinfo));
          STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
          prev_stmt_vinfo = vinfo_for_stmt (new_stmt);

  if (nested_in_vect_loop)
      /* Find the loop-closed exit-phi of the induction, and record
         the final vector of induction results:  */
      FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
          if (!flow_bb_inside_loop_p (iv_loop, bb_for_stmt (USE_STMT (use_p))))
              exit_phi = USE_STMT (use_p);
          stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
          /* FORNOW. Currently not supporting the case that an inner-loop induction
             is not used in the outer-loop (i.e. only outside the outer-loop). */
          gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
                      && !STMT_VINFO_LIVE_P (stmt_vinfo));

          STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
          if (vect_print_dump_info (REPORT_DETAILS))
              fprintf (vect_dump, "vector of inductions after inner-loop:");
              print_generic_expr (vect_dump, new_stmt, TDF_SLIM);

  if (vect_print_dump_info (REPORT_DETAILS))
      fprintf (vect_dump, "transform induction: created def-use cycle:");
      print_generic_expr (vect_dump, induction_phi, TDF_SLIM);
      fprintf (vect_dump, "\n");
      print_generic_expr (vect_dump, SSA_NAME_DEF_STMT (vec_def), TDF_SLIM);

  /* Record the phi so later uses of the induction find its vectorized def.  */
  STMT_VINFO_VEC_STMT (phi_info) = induction_phi;
/* Function vect_get_vec_def_for_operand.

   OP is an operand in STMT.  This function returns a (vector) def that will be
   used in the vectorized stmt for STMT.

   In the case that OP is an SSA_NAME which is defined in the loop, then
   STMT_VINFO_VEC_STMT of the defining stmt holds the relevant def.

   In case OP is an invariant or constant, a new stmt that creates a vector def
   needs to be introduced.  */

vect_get_vec_def_for_operand (tree op, tree stmt, tree *scalar_def)
  stmt_vec_info def_stmt_info = NULL;
  stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
  tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
  int nunits = TYPE_VECTOR_SUBPARTS (vectype);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
  enum vect_def_type dt;

  if (vect_print_dump_info (REPORT_DETAILS))
      fprintf (vect_dump, "vect_get_vec_def_for_operand: ");
      print_generic_expr (vect_dump, op, TDF_SLIM);

  /* Classify OP: constant, loop-invariant, internal def, reduction
     or induction; DEF/DEF_STMT describe where it comes from.  */
  is_simple_use = vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt);
  gcc_assert (is_simple_use);
  if (vect_print_dump_info (REPORT_DETAILS))
      fprintf (vect_dump, "def =  ");
      print_generic_expr (vect_dump, def, TDF_SLIM);
      fprintf (vect_dump, "  def_stmt =  ");
      print_generic_expr (vect_dump, def_stmt, TDF_SLIM);

    /* Case 1: operand is a constant.  */
    case vect_constant_def:
        /* Create 'vect_cst_ = {cst,cst,...,cst}'  */
        if (vect_print_dump_info (REPORT_DETAILS))
          fprintf (vect_dump, "Create vector_cst. nunits = %d", nunits);

        /* Elements are consed front-to-back in reverse index order.  */
        for (i = nunits - 1; i >= 0; --i)
            t = tree_cons (NULL_TREE, op, t);
        vector_type = get_vectype_for_scalar_type (TREE_TYPE (op));
        vec_cst = build_vector (vector_type, t);

        return vect_init_vector (stmt, vec_cst, vector_type);

    /* Case 2: operand is defined outside the loop - loop invariant.  */
    case vect_invariant_def:
        /* Create 'vec_inv = {inv,inv,..,inv}'  */
        if (vect_print_dump_info (REPORT_DETAILS))
          fprintf (vect_dump, "Create vector_inv.");

        for (i = nunits - 1; i >= 0; --i)
            t = tree_cons (NULL_TREE, def, t);

        /* FIXME: use build_constructor directly.  */
        vector_type = get_vectype_for_scalar_type (TREE_TYPE (def));
        vec_inv = build_constructor_from_list (vector_type, t);

        return vect_init_vector (stmt, vec_inv, vector_type);

    /* Case 3: operand is defined inside the loop.  */
        *scalar_def = def_stmt;

        /* Get the def from the vectorized stmt.  Phi nodes yield their
           result; other stmts yield their LHS.  */
        def_stmt_info = vinfo_for_stmt (def_stmt);
        vec_stmt = STMT_VINFO_VEC_STMT (def_stmt_info);
        gcc_assert (vec_stmt);
        if (TREE_CODE (vec_stmt) == PHI_NODE)
          vec_oprnd = PHI_RESULT (vec_stmt);
          vec_oprnd = GIMPLE_STMT_OPERAND (vec_stmt, 0);

    /* Case 4: operand is defined by a loop header phi - reduction  */
    case vect_reduction_def:
        gcc_assert (TREE_CODE (def_stmt) == PHI_NODE);
        loop = (bb_for_stmt (def_stmt))->loop_father;

        /* Get the def before the loop; the epilog-generation code
           materializes the initial vector from it.  */
        op = PHI_ARG_DEF_FROM_EDGE (def_stmt, loop_preheader_edge (loop));
        return get_initial_def_for_reduction (stmt, op, scalar_def);

    /* Case 5: operand is defined by loop-header phi - induction.  */
    case vect_induction_def:
        gcc_assert (TREE_CODE (def_stmt) == PHI_NODE);

        /* Get the def from the vectorized stmt; inductions are always
           vectorized into a phi node (asserted below).  */
        def_stmt_info = vinfo_for_stmt (def_stmt);
        vec_stmt = STMT_VINFO_VEC_STMT (def_stmt_info);
        gcc_assert (vec_stmt && (TREE_CODE (vec_stmt) == PHI_NODE));
        vec_oprnd = PHI_RESULT (vec_stmt);
/* Function vect_get_vec_def_for_stmt_copy

   Return a vector-def for an operand.  This function is used when the
   vectorized stmt to be created (by the caller to this function) is a "copy"
   created in case the vectorized result cannot fit in one vector, and several
   copies of the vector-stmt are required.  In this case the vector-def is
   retrieved from the vector stmt recorded in the STMT_VINFO_RELATED_STMT field
   of the stmt that defines VEC_OPRND.
   DT is the type of the vector def VEC_OPRND.

   Context:
   In case the vectorization factor (VF) is bigger than the number
   of elements that can fit in a vectype (nunits), we have to generate
   more than one vector stmt to vectorize the scalar stmt.  This situation
   arises when there are multiple data-types operated upon in the loop; the
   smallest data-type determines the VF, and as a result, when vectorizing
   stmts operating on wider types we need to create 'VF/nunits' "copies" of the
   vector stmt (each computing a vector of 'nunits' results, and together
   computing 'VF' results in each iteration).  This function is called when
   vectorizing such a stmt (e.g. vectorizing S2 in the illustration below, in
   which VF=16 and nunits=4, so the number of copies required is 4):

   scalar stmt:         vectorized into:        STMT_VINFO_RELATED_STMT

   S1: x = load         VS1.0:  vx.0 = memref0      VS1.1
                        VS1.1:  vx.1 = memref1      VS1.2
                        VS1.2:  vx.2 = memref2      VS1.3
                        VS1.3:  vx.3 = memref3

   S2: z = x + ...      VSnew.0:  vz0 = vx.0 + ...  VSnew.1
                        VSnew.1:  vz1 = vx.1 + ...  VSnew.2
                        VSnew.2:  vz2 = vx.2 + ...  VSnew.3
                        VSnew.3:  vz3 = vx.3 + ...

   The vectorization of S1 is explained in vectorizable_load.
   The vectorization of S2:
   To create the first vector-stmt out of the 4 copies - VSnew.0 -
   the function 'vect_get_vec_def_for_operand' is called to
   get the relevant vector-def for each operand of S2.  For operand x it
   returns the vector-def 'vx.0'.

   To create the remaining copies of the vector-stmt (VSnew.j), this
   function is called to get the relevant vector-def for each operand.  It is
   obtained from the respective VS1.j stmt, which is recorded in the
   STMT_VINFO_RELATED_STMT field of the stmt that defines VEC_OPRND.

   For example, to obtain the vector-def 'vx.1' in order to create the
   vector stmt 'VSnew.1', this function is called with VEC_OPRND='vx.0'.
   Given 'vx0' we obtain the stmt that defines it ('VS1.0'); from the
   STMT_VINFO_RELATED_STMT field of 'VS1.0' we obtain the next copy - 'VS1.1',
   and return its def ('vx.1').
   Overall, to create the above sequence this function will be called 3 times:
        vx.1 = vect_get_vec_def_for_stmt_copy (dt, vx.0);
        vx.2 = vect_get_vec_def_for_stmt_copy (dt, vx.1);
        vx.3 = vect_get_vec_def_for_stmt_copy (dt, vx.2);  */

vect_get_vec_def_for_stmt_copy (enum vect_def_type dt, tree vec_oprnd)
  tree vec_stmt_for_operand;
  stmt_vec_info def_stmt_info;

  /* Do nothing; can reuse same def.  Constants and invariants are the
     same in every copy of the vector stmt.  */
  if (dt == vect_invariant_def || dt == vect_constant_def )

  /* Walk one link along the RELATED_STMT chain and return the def of
     the next copy.  */
  vec_stmt_for_operand = SSA_NAME_DEF_STMT (vec_oprnd);
  def_stmt_info = vinfo_for_stmt (vec_stmt_for_operand);
  gcc_assert (def_stmt_info);
  vec_stmt_for_operand = STMT_VINFO_RELATED_STMT (def_stmt_info);
  gcc_assert (vec_stmt_for_operand);
  vec_oprnd = GIMPLE_STMT_OPERAND (vec_stmt_for_operand, 0);
/* Function vect_finish_stmt_generation.

   Insert a new stmt: places VEC_STMT just before the scalar STMT being
   vectorized, attaches a stmt_vec_info to it, and copies STMT's source
   location onto it.  */

vect_finish_stmt_generation (tree stmt, tree vec_stmt,
                             block_stmt_iterator *bsi)
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);

  gcc_assert (stmt == bsi_stmt (*bsi));
  gcc_assert (TREE_CODE (stmt) != LABEL_EXPR);

  bsi_insert_before (bsi, vec_stmt, BSI_SAME_STMT);

  set_stmt_info (get_stmt_ann (vec_stmt),
                 new_stmt_vec_info (vec_stmt, loop_vinfo));

  if (vect_print_dump_info (REPORT_DETAILS))
      fprintf (vect_dump, "add new stmt: ");
      print_generic_expr (vect_dump, vec_stmt, TDF_SLIM);

  /* Make sure bsi points to the stmt that is being vectorized.  */
  gcc_assert (stmt == bsi_stmt (*bsi));

  /* Propagate STMT's source location so diagnostics/debug info stay sane.  */
#ifdef USE_MAPPED_LOCATION
  SET_EXPR_LOCATION (vec_stmt, EXPR_LOCATION (stmt));
  SET_EXPR_LOCUS (vec_stmt, EXPR_LOCUS (stmt));
/* Function get_initial_def_for_reduction

   Input:
   STMT - a stmt that performs a reduction operation in the loop.
   INIT_VAL - the initial value of the reduction variable

   Output:
   ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
        of the reduction (used for adjusting the epilog - see below).
   Return a vector variable, initialized according to the operation that STMT
        performs.  This vector will be used as the initial value of the
        vector of partial results.

   Option1 (adjust in epilog): Initialize the vector as follows:
     min/max:    [init_val,init_val,..,init_val,init_val]
     bit and/or: [init_val,init_val,..,init_val,init_val]
   and when necessary (e.g. add/mult case) let the caller know
   that it needs to adjust the result by init_val.

   Option2: Initialize the vector as follows:
     add:        [0,0,...,0,init_val]
     mult:       [1,1,...,1,init_val]
     min/max:    [init_val,init_val,...,init_val]
     bit and/or: [init_val,init_val,...,init_val]
   and no adjustments are needed.

   For example, for the following code:

   STMT is 's = s + a[i]', and the reduction variable is 's'.
   For a vector of 4 units, we want to return either [0,0,0,init_val],
   or [0,0,0,0] and let the caller know that it needs to adjust
   the result at the end by 'init_val'.

   FORNOW, we are using the 'adjust in epilog' scheme, because this way the
   initialization vector is simpler (same element in all entries).
   A cost model should help decide between these two schemes.  */

get_initial_def_for_reduction (tree stmt, tree init_val, tree *adjustment_def)
  stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
  int nunits = TYPE_VECTOR_SUBPARTS (vectype);
  enum tree_code code = TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 1));
  tree type = TREE_TYPE (init_val);
  bool nested_in_vect_loop = false;

  /* Only integer and floating-point reductions are handled.  */
  gcc_assert (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type));
  if (nested_in_vect_loop_p (loop, stmt))
    nested_in_vect_loop = true;
    gcc_assert (loop == (bb_for_stmt (stmt))->loop_father);

  vecdef = vect_get_vec_def_for_operand (init_val, stmt, NULL);

    case WIDEN_SUM_EXPR:
      /* For sums the epilog must be adjusted by the initial value:
         in the nested case by its vectorized def, otherwise by the
         scalar INIT_VAL itself.  */
      if (nested_in_vect_loop)
        *adjustment_def = vecdef;
        *adjustment_def = init_val;
      /* Create a vector of zeros for init_def.  */
      if (INTEGRAL_TYPE_P (type))
        def_for_init = build_int_cst (type, 0);
        def_for_init = build_real (type, dconst0);
      for (i = nunits - 1; i >= 0; --i)
        t = tree_cons (NULL_TREE, def_for_init, t);
      vector_type = get_vectype_for_scalar_type (TREE_TYPE (def_for_init));
      init_def = build_vector (vector_type, t);

      /* Other reduction codes need no epilog adjustment.  */
      *adjustment_def = NULL_TREE;
1739 /* Function vect_create_epilog_for_reduction
1741 Create code at the loop-epilog to finalize the result of a reduction
1744 VECT_DEF is a vector of partial results.
1745 REDUC_CODE is the tree-code for the epilog reduction.
1746 STMT is the scalar reduction stmt that is being vectorized.
1747 REDUCTION_PHI is the phi-node that carries the reduction computation.
1750 1. Creates the reduction def-use cycle: sets the arguments for
1752 The loop-entry argument is the vectorized initial-value of the reduction.
1753 The loop-latch argument is VECT_DEF - the vector of partial sums.
1754 2. "Reduces" the vector of partial results VECT_DEF into a single result,
1755 by applying the operation specified by REDUC_CODE if available, or by
1756 other means (whole-vector shifts or a scalar loop).
1757 The function also creates a new phi node at the loop exit to preserve
1758 loop-closed form, as illustrated below.
1760 The flow at the entry to this function:
1763 vec_def = phi <null, null> # REDUCTION_PHI
1764 VECT_DEF = vector_stmt # vectorized form of STMT
1765 s_loop = scalar_stmt # (scalar) STMT
1767 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
1771 The above is transformed by this function into:
1774 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
1775 VECT_DEF = vector_stmt # vectorized form of STMT
1776 s_loop = scalar_stmt # (scalar) STMT
1778 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
1779 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
1780 v_out2 = reduce <v_out1>
1781 s_out3 = extract_field <v_out2, 0>
1782 s_out4 = adjust_result <s_out3>
1788 vect_create_epilog_for_reduction (tree vect_def, tree stmt,
1789 enum tree_code reduc_code, tree reduction_phi)
1791 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1793 enum machine_mode mode;
1794 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1795 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1796 basic_block exit_bb;
1800 block_stmt_iterator exit_bsi;
1802 tree new_temp = NULL_TREE;
1804 tree epilog_stmt = NULL_TREE;
1805 tree new_scalar_dest, exit_phi, new_dest;
1806 tree bitsize, bitpos, bytesize;
1807 enum tree_code code = TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 1));
1808 tree adjustment_def;
1809 tree vec_initial_def;
1811 imm_use_iterator imm_iter;
1812 use_operand_p use_p;
1813 bool extract_scalar_result = false;
1814 tree reduction_op, expr;
1817 tree operation = GIMPLE_STMT_OPERAND (stmt, 1);
1818 bool nested_in_vect_loop = false;
1821 if (nested_in_vect_loop_p (loop, stmt))
1824 nested_in_vect_loop = true;
1827 op_type = TREE_OPERAND_LENGTH (operation);
1828 reduction_op = TREE_OPERAND (operation, op_type-1);
1829 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
1830 mode = TYPE_MODE (vectype);
1832 /*** 1. Create the reduction def-use cycle ***/
1834 /* 1.1 set the loop-entry arg of the reduction-phi: */
1835 /* For the case of reduction, vect_get_vec_def_for_operand returns
1836 the scalar def before the loop, that defines the initial value
1837 of the reduction variable. */
1838 vec_initial_def = vect_get_vec_def_for_operand (reduction_op, stmt,
1840 add_phi_arg (reduction_phi, vec_initial_def, loop_preheader_edge (loop));
1842 /* 1.2 set the loop-latch arg for the reduction-phi: */
1843 add_phi_arg (reduction_phi, vect_def, loop_latch_edge (loop));
1845 if (vect_print_dump_info (REPORT_DETAILS))
1847 fprintf (vect_dump, "transform reduction: created def-use cycle:");
1848 print_generic_expr (vect_dump, reduction_phi, TDF_SLIM);
1849 fprintf (vect_dump, "\n");
1850 print_generic_expr (vect_dump, SSA_NAME_DEF_STMT (vect_def), TDF_SLIM);
1854 /*** 2. Create epilog code
1855 The reduction epilog code operates across the elements of the vector
1856 of partial results computed by the vectorized loop.
1857 The reduction epilog code consists of:
1858 step 1: compute the scalar result in a vector (v_out2)
1859 step 2: extract the scalar result (s_out3) from the vector (v_out2)
1860 step 3: adjust the scalar result (s_out3) if needed.
1862 Step 1 can be accomplished using one of the following three schemes:
1863 (scheme 1) using reduc_code, if available.
1864 (scheme 2) using whole-vector shifts, if available.
1865 (scheme 3) using a scalar loop. In this case steps 1+2 above are
1868 The overall epilog code looks like this:
1870 s_out0 = phi <s_loop> # original EXIT_PHI
1871 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
1872 v_out2 = reduce <v_out1> # step 1
1873 s_out3 = extract_field <v_out2, 0> # step 2
1874 s_out4 = adjust_result <s_out3> # step 3
1876 (step 3 is optional, and steps 1 and 2 may be combined).
1877 Lastly, the uses of s_out0 are replaced by s_out4.
1881 /* 2.1 Create new loop-exit-phi to preserve loop-closed form:
1882 v_out1 = phi <v_loop> */
1884 exit_bb = single_exit (loop)->dest;
1885 new_phi = create_phi_node (SSA_NAME_VAR (vect_def), exit_bb);
1886 SET_PHI_ARG_DEF (new_phi, single_exit (loop)->dest_idx, vect_def);
1887 exit_bsi = bsi_after_labels (exit_bb);
1889 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
1890 (i.e. when reduc_code is not available) and in the final adjustment
1891 code (if needed). Also get the original scalar reduction variable as
1892 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
1893 represents a reduction pattern), the tree-code and scalar-def are
1894 taken from the original stmt that the pattern-stmt (STMT) replaces.
1895 Otherwise (it is a regular reduction) - the tree-code and scalar-def
1896 are taken from STMT. */
1898 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
1901 /* Regular reduction */
1906 /* Reduction pattern */
1907 stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
1908 gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
1909 gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
1911 code = TREE_CODE (GIMPLE_STMT_OPERAND (orig_stmt, 1));
1912 scalar_dest = GIMPLE_STMT_OPERAND (orig_stmt, 0);
1913 scalar_type = TREE_TYPE (scalar_dest);
1914 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
1915 bitsize = TYPE_SIZE (scalar_type);
1916 bytesize = TYPE_SIZE_UNIT (scalar_type);
1919 /* In case this is a reduction in an inner-loop while vectorizing an outer
1920 loop - we don't need to extract a single scalar result at the end of the
1921 inner-loop. The final vector of partial results will be used in the
1922 vectorized outer-loop, or reduced to a scalar result at the end of the
1924 if (nested_in_vect_loop)
1925 goto vect_finalize_reduction;
1927 /* 2.3 Create the reduction code, using one of the three schemes described
1930 if (reduc_code < NUM_TREE_CODES)
1934 /*** Case 1: Create:
1935 v_out2 = reduc_expr <v_out1> */
1937 if (vect_print_dump_info (REPORT_DETAILS))
1938 fprintf (vect_dump, "Reduce using direct vector reduction.");
1940 vec_dest = vect_create_destination_var (scalar_dest, vectype);
1941 tmp = build1 (reduc_code, vectype, PHI_RESULT (new_phi));
1942 epilog_stmt = build_gimple_modify_stmt (vec_dest, tmp);
1943 new_temp = make_ssa_name (vec_dest, epilog_stmt);
1944 GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_temp;
1945 bsi_insert_before (&exit_bsi, epilog_stmt, BSI_SAME_STMT);
1947 extract_scalar_result = true;
1951 enum tree_code shift_code = 0;
1952 bool have_whole_vector_shift = true;
1954 int element_bitsize = tree_low_cst (bitsize, 1);
1955 int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
1958 if (optab_handler (vec_shr_optab, mode)->insn_code != CODE_FOR_nothing)
1959 shift_code = VEC_RSHIFT_EXPR;
1961 have_whole_vector_shift = false;
1963 /* Regardless of whether we have a whole vector shift, if we're
1964 emulating the operation via tree-vect-generic, we don't want
1965 to use it. Only the first round of the reduction is likely
1966 to still be profitable via emulation. */
1967 /* ??? It might be better to emit a reduction tree code here, so that
1968 tree-vect-generic can expand the first round via bit tricks. */
1969 if (!VECTOR_MODE_P (mode))
1970 have_whole_vector_shift = false;
1973 optab optab = optab_for_tree_code (code, vectype);
1974 if (optab_handler (optab, mode)->insn_code == CODE_FOR_nothing)
1975 have_whole_vector_shift = false;
1978 if (have_whole_vector_shift)
1980 /*** Case 2: Create:
1981 for (offset = VS/2; offset >= element_size; offset/=2)
1983 Create: va' = vec_shift <va, offset>
1984 Create: va = vop <va, va'>
1987 if (vect_print_dump_info (REPORT_DETAILS))
1988 fprintf (vect_dump, "Reduce using vector shifts");
1990 vec_dest = vect_create_destination_var (scalar_dest, vectype);
1991 new_temp = PHI_RESULT (new_phi);
1993 for (bit_offset = vec_size_in_bits/2;
1994 bit_offset >= element_bitsize;
1997 tree bitpos = size_int (bit_offset);
1998 tree tmp = build2 (shift_code, vectype, new_temp, bitpos);
1999 epilog_stmt = build_gimple_modify_stmt (vec_dest, tmp);
2000 new_name = make_ssa_name (vec_dest, epilog_stmt);
2001 GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_name;
2002 bsi_insert_before (&exit_bsi, epilog_stmt, BSI_SAME_STMT);
2004 tmp = build2 (code, vectype, new_name, new_temp);
2005 epilog_stmt = build_gimple_modify_stmt (vec_dest, tmp);
2006 new_temp = make_ssa_name (vec_dest, epilog_stmt);
2007 GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_temp;
2008 bsi_insert_before (&exit_bsi, epilog_stmt, BSI_SAME_STMT);
2011 extract_scalar_result = true;
2017 /*** Case 3: Create:
2018 s = extract_field <v_out2, 0>
2019 for (offset = element_size;
2020 offset < vector_size;
2021 offset += element_size;)
2023 Create: s' = extract_field <v_out2, offset>
2024 Create: s = op <s, s'>
2027 if (vect_print_dump_info (REPORT_DETAILS))
2028 fprintf (vect_dump, "Reduce using scalar code. ");
2030 vec_temp = PHI_RESULT (new_phi);
2031 vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
2032 rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
2034 BIT_FIELD_REF_UNSIGNED (rhs) = TYPE_UNSIGNED (scalar_type);
2035 epilog_stmt = build_gimple_modify_stmt (new_scalar_dest, rhs);
2036 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
2037 GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_temp;
2038 bsi_insert_before (&exit_bsi, epilog_stmt, BSI_SAME_STMT);
2040 for (bit_offset = element_bitsize;
2041 bit_offset < vec_size_in_bits;
2042 bit_offset += element_bitsize)
2045 tree bitpos = bitsize_int (bit_offset);
2046 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
2049 BIT_FIELD_REF_UNSIGNED (rhs) = TYPE_UNSIGNED (scalar_type);
2050 epilog_stmt = build_gimple_modify_stmt (new_scalar_dest, rhs);
2051 new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
2052 GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_name;
2053 bsi_insert_before (&exit_bsi, epilog_stmt, BSI_SAME_STMT);
2055 tmp = build2 (code, scalar_type, new_name, new_temp);
2056 epilog_stmt = build_gimple_modify_stmt (new_scalar_dest, tmp);
2057 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
2058 GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_temp;
2059 bsi_insert_before (&exit_bsi, epilog_stmt, BSI_SAME_STMT);
2062 extract_scalar_result = false;
2066 /* 2.4 Extract the final scalar result. Create:
2067 s_out3 = extract_field <v_out2, bitpos> */
2069 if (extract_scalar_result)
2073 gcc_assert (!nested_in_vect_loop);
2074 if (vect_print_dump_info (REPORT_DETAILS))
2075 fprintf (vect_dump, "extract scalar result");
2077 if (BYTES_BIG_ENDIAN)
2078 bitpos = size_binop (MULT_EXPR,
2079 bitsize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1),
2080 TYPE_SIZE (scalar_type));
2082 bitpos = bitsize_zero_node;
2084 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp, bitsize, bitpos);
2085 BIT_FIELD_REF_UNSIGNED (rhs) = TYPE_UNSIGNED (scalar_type);
2086 epilog_stmt = build_gimple_modify_stmt (new_scalar_dest, rhs);
2087 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
2088 GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_temp;
2089 bsi_insert_before (&exit_bsi, epilog_stmt, BSI_SAME_STMT);
2092 vect_finalize_reduction:
2094 /* 2.5 Adjust the final result by the initial value of the reduction
2095 variable. (When such adjustment is not needed, then
2096 'adjustment_def' is zero). For example, if code is PLUS we create:
2097 new_temp = loop_exit_def + adjustment_def */
2101 if (nested_in_vect_loop)
2103 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
2104 expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
2105 new_dest = vect_create_destination_var (scalar_dest, vectype);
2109 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
2110 expr = build2 (code, scalar_type, new_temp, adjustment_def);
2111 new_dest = vect_create_destination_var (scalar_dest, scalar_type);
2113 epilog_stmt = build_gimple_modify_stmt (new_dest, expr);
2114 new_temp = make_ssa_name (new_dest, epilog_stmt);
2115 GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_temp;
2117 bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT);
2119 bsi_insert_before (&exit_bsi, epilog_stmt, BSI_SAME_STMT);
2124 /* 2.6 Handle the loop-exit phi */
2126 /* Replace uses of s_out0 with uses of s_out3:
2127 Find the loop-closed-use at the loop exit of the original scalar result.
2128 (The reduction result is expected to have two immediate uses - one at the
2129 latch block, and one at the loop exit). */
2131 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
2133 if (!flow_bb_inside_loop_p (loop, bb_for_stmt (USE_STMT (use_p))))
2135 exit_phi = USE_STMT (use_p);
2139 /* We expect to have found an exit_phi because of loop-closed-ssa form. */
2140 gcc_assert (exit_phi);
2142 if (nested_in_vect_loop)
2144 stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
2146 /* FORNOW. Currently not supporting the case that an inner-loop reduction
2147 is not used in the outer-loop (but only outside the outer-loop). */
2148 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
2149 && !STMT_VINFO_LIVE_P (stmt_vinfo));
2151 epilog_stmt = adjustment_def ? epilog_stmt : new_phi;
2152 STMT_VINFO_VEC_STMT (stmt_vinfo) = epilog_stmt;
2153 set_stmt_info (get_stmt_ann (epilog_stmt),
2154 new_stmt_vec_info (epilog_stmt, loop_vinfo));
2156 if (vect_print_dump_info (REPORT_DETAILS))
2158 fprintf (vect_dump, "vector of partial results after inner-loop:");
2159 print_generic_expr (vect_dump, epilog_stmt, TDF_SLIM);
2164 /* Replace the uses: */
2165 orig_name = PHI_RESULT (exit_phi);
2166 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
2167 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
2168 SET_USE (use_p, new_temp);
2172 /* Function vectorizable_reduction.
2174 Check if STMT performs a reduction operation that can be vectorized.
2175 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
2176 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
2177 Return FALSE if not a vectorizable STMT, TRUE otherwise.
2179 This function also handles reduction idioms (patterns) that have been
2180 recognized in advance during vect_pattern_recog. In this case, STMT may be
2182 X = pattern_expr (arg0, arg1, ..., X)
2183 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
2184 sequence that had been detected and replaced by the pattern-stmt (STMT).
2186 In some cases of reduction patterns, the type of the reduction variable X is
2187 different than the type of the other arguments of STMT.
2188 In such cases, the vectype that is used when transforming STMT into a vector
2189 stmt is different than the vectype that is used to determine the
2190 vectorization factor, because it consists of a different number of elements
2191 than the actual number of elements that are being operated upon in parallel.
2193 For example, consider an accumulation of shorts into an int accumulator.
2194 On some targets it's possible to vectorize this pattern operating on 8
2195 shorts at a time (hence, the vectype for purposes of determining the
2196 vectorization factor should be V8HI); on the other hand, the vectype that
2197 is used to create the vector form is actually V4SI (the type of the result).
2199 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
2200 indicates what is the actual level of parallelism (V8HI in the example), so
2201 that the right vectorization factor would be derived. This vectype
2202 corresponds to the type of arguments to the reduction stmt, and should *NOT*
2203 be used to create the vectorized stmt. The right vectype for the vectorized
2204 stmt is obtained from the type of the result X:
2205 get_vectype_for_scalar_type (TREE_TYPE (X))
2207 This means that, contrary to "regular" reductions (or "regular" stmts in
2208 general), the following equation:
2209 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
2210 does *NOT* necessarily hold for reduction patterns. */
2213 vectorizable_reduction (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
2218 tree loop_vec_def0 = NULL_TREE, loop_vec_def1 = NULL_TREE;
2219 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2220 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2221 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2222 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2224 enum tree_code code, orig_code, epilog_reduc_code = 0;
2225 enum machine_mode vec_mode;
2227 optab optab, reduc_optab;
2228 tree new_temp = NULL_TREE;
2230 enum vect_def_type dt;
2235 stmt_vec_info orig_stmt_info;
2236 tree expr = NULL_TREE;
/* ncopies: how many vector stmts are needed to cover the whole
   vectorization factor with vectors of NUNITS elements.  */
2238 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
2239 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
2240 stmt_vec_info prev_stmt_info;
2242 tree new_stmt = NULL_TREE;
2245 if (nested_in_vect_loop_p (loop, stmt))
2248 /* FORNOW. This restriction should be relaxed. */
2251 if (vect_print_dump_info (REPORT_DETAILS))
2252 fprintf (vect_dump, "multiple types in nested loop.");
2257 gcc_assert (ncopies >= 1);
2259 /* 1. Is vectorizable reduction? */
2261 /* Not supportable if the reduction variable is used in the loop. */
2262 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer)
2265 /* Reductions that are not used even in an enclosing outer-loop,
2266 are expected to be "live" (used out of the loop). */
2267 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_loop
2268 && !STMT_VINFO_LIVE_P (stmt_info))
2271 /* Make sure it was already recognized as a reduction computation. */
2272 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def)
2275 /* 2. Has this been recognized as a reduction pattern?
2277 Check if STMT represents a pattern that has been recognized
2278 in earlier analysis stages. For stmts that represent a pattern,
2279 the STMT_VINFO_RELATED_STMT field records the last stmt in
2280 the original sequence that constitutes the pattern. */
2282 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
/* Verify the two-way link between the pattern stmt and the original
   scalar stmt it replaced.  */
2285 orig_stmt_info = vinfo_for_stmt (orig_stmt);
2286 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt);
2287 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
2288 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
2291 /* 3. Check the operands of the operation. The first operands are defined
2292 inside the loop body. The last operand is the reduction variable,
2293 which is defined by the loop-header-phi. */
2295 gcc_assert (TREE_CODE (stmt) == GIMPLE_MODIFY_STMT);
2297 operation = GIMPLE_STMT_OPERAND (stmt, 1);
2298 code = TREE_CODE (operation);
2299 op_type = TREE_OPERAND_LENGTH (operation);
2300 if (op_type != binary_op && op_type != ternary_op)
2302 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
2303 scalar_type = TREE_TYPE (scalar_dest);
2305 /* All uses but the last are expected to be defined in the loop.
2306 The last use is the reduction variable. */
2307 for (i = 0; i < op_type-1; i++)
2309 op = TREE_OPERAND (operation, i);
2310 is_simple_use = vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt);
2311 gcc_assert (is_simple_use);
2312 if (dt != vect_loop_def
2313 && dt != vect_invariant_def
2314 && dt != vect_constant_def
2315 && dt != vect_induction_def)
/* The last operand is the reduction variable itself; it must be a
   reduction def, produced by a loop-header PHI node.  */
2319 op = TREE_OPERAND (operation, i);
2320 is_simple_use = vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt);
2321 gcc_assert (is_simple_use);
2322 gcc_assert (dt == vect_reduction_def);
2323 gcc_assert (TREE_CODE (def_stmt) == PHI_NODE);
/* Cross-check: the reduction detected for DEF_STMT's PHI must be the
   (original, for patterns) stmt we are vectorizing.  */
2325 gcc_assert (orig_stmt == vect_is_simple_reduction (loop_vinfo, def_stmt));
2327 gcc_assert (stmt == vect_is_simple_reduction (loop_vinfo, def_stmt));
2329 if (STMT_VINFO_LIVE_P (vinfo_for_stmt (def_stmt)))
2332 /* 4. Supportable by target? */
2334 /* 4.1. check support for the operation in the loop */
2335 optab = optab_for_tree_code (code, vectype);
2338 if (vect_print_dump_info (REPORT_DETAILS))
2339 fprintf (vect_dump, "no optab.");
2342 vec_mode = TYPE_MODE (vectype);
2343 if (optab_handler (optab, vec_mode)->insn_code == CODE_FOR_nothing)
2345 if (vect_print_dump_info (REPORT_DETAILS))
2346 fprintf (vect_dump, "op not supported by target.");
/* No native vector insn: only proceed in word mode, and only when the
   vectorization factor makes generic-vector code worthwhile.  */
2347 if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
2348 || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
2349 < vect_min_worthwhile_factor (code))
2351 if (vect_print_dump_info (REPORT_DETAILS))
2352 fprintf (vect_dump, "proceeding using word mode.");
2355 /* Worthwhile without SIMD support? */
2356 if (!VECTOR_MODE_P (TYPE_MODE (vectype))
2357 && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
2358 < vect_min_worthwhile_factor (code))
2360 if (vect_print_dump_info (REPORT_DETAILS))
2361 fprintf (vect_dump, "not worthwhile without SIMD support.");
2365 /* 4.2. Check support for the epilog operation.
2367 If STMT represents a reduction pattern, then the type of the
2368 reduction variable may be different than the type of the rest
2369 of the arguments. For example, consider the case of accumulation
2370 of shorts into an int accumulator; The original code:
2371 S1: int_a = (int) short_a;
2372 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
2375 STMT: int_acc = widen_sum <short_a, int_acc>
2378 1. The tree-code that is used to create the vector operation in the
2379 epilog code (that reduces the partial results) is not the
2380 tree-code of STMT, but is rather the tree-code of the original
2381 stmt from the pattern that STMT is replacing. I.e, in the example
2382 above we want to use 'widen_sum' in the loop, but 'plus' in the
2384 2. The type (mode) we use to check available target support
2385 for the vector operation to be created in the *epilog*, is
2386 determined by the type of the reduction variable (in the example
2387 above we'd check this: plus_optab[vect_int_mode]).
2388 However the type (mode) we use to check available target support
2389 for the vector operation to be created *inside the loop*, is
2390 determined by the type of the other arguments to STMT (in the
2391 example we'd check this: widen_sum_optab[vect_short_mode]).
2393 This is contrary to "regular" reductions, in which the types of all
2394 the arguments are the same as the type of the reduction variable.
2395 For "regular" reductions we can therefore use the same vector type
2396 (and also the same tree-code) when generating the epilog code and
2397 when generating the code inside the loop. */
2401 /* This is a reduction pattern: get the vectype from the type of the
2402 reduction variable, and get the tree-code from orig_stmt. */
2403 orig_code = TREE_CODE (GIMPLE_STMT_OPERAND (orig_stmt, 1));
2404 vectype = get_vectype_for_scalar_type (TREE_TYPE (def));
2405 vec_mode = TYPE_MODE (vectype);
2409 /* Regular reduction: the same vectype and tree-code used for
2410 the vector code inside the loop can also be used for the epilog code. */
/* Map the scalar code to the target's epilog reduction code.  When the
   target has no support, NUM_TREE_CODES is recorded as a sentinel;
   presumably vect_create_epilog_for_reduction then falls back to an
   element-by-element scheme -- confirm there.  */
2414 if (!reduction_code_for_scalar_code (orig_code, &epilog_reduc_code))
2416 reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype);
2419 if (vect_print_dump_info (REPORT_DETAILS))
2420 fprintf (vect_dump, "no optab for reduction.");
2421 epilog_reduc_code = NUM_TREE_CODES;
2423 if (optab_handler (reduc_optab, vec_mode)->insn_code == CODE_FOR_nothing)
2425 if (vect_print_dump_info (REPORT_DETAILS))
2426 fprintf (vect_dump, "reduc op not supported by target.");
2427 epilog_reduc_code = NUM_TREE_CODES;
2430 if (!vec_stmt) /* transformation not required. */
2432 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
2433 vect_model_reduction_cost (stmt_info, epilog_reduc_code, ncopies);
/* Transformation phase starts here.  */
2439 if (vect_print_dump_info (REPORT_DETAILS))
2440 fprintf (vect_dump, "transform reduction.");
2442 /* Create the destination vector */
2443 vec_dest = vect_create_destination_var (scalar_dest, vectype);
2445 /* Create the reduction-phi that defines the reduction-operand. */
2446 new_phi = create_phi_node (vec_dest, loop->header);
2448 /* In case the vectorization factor (VF) is bigger than the number
2449 of elements that we can fit in a vectype (nunits), we have to generate
2450 more than one vector stmt - i.e - we need to "unroll" the
2451 vector stmt by a factor VF/nunits. For more details see documentation
2452 in vectorizable_operation. */
2454 prev_stmt_info = NULL;
2455 for (j = 0; j < ncopies; j++)
/* j == 0: take the vector defs for the loop operands directly.  */
2460 op = TREE_OPERAND (operation, 0);
2461 loop_vec_def0 = vect_get_vec_def_for_operand (op, stmt, NULL);
2462 if (op_type == ternary_op)
2464 op = TREE_OPERAND (operation, 1);
2465 loop_vec_def1 = vect_get_vec_def_for_operand (op, stmt, NULL);
2468 /* Get the vector def for the reduction variable from the phi node */
2469 reduc_def = PHI_RESULT (new_phi);
/* j > 0: chain this copy to the defs created for copy j-1.  */
2473 enum vect_def_type dt = vect_unknown_def_type; /* Dummy */
2474 loop_vec_def0 = vect_get_vec_def_for_stmt_copy (dt, loop_vec_def0);
2475 if (op_type == ternary_op)
2476 loop_vec_def1 = vect_get_vec_def_for_stmt_copy (dt, loop_vec_def1);
2478 /* Get the vector def for the reduction variable from the vectorized
2479 reduction operation generated in the previous iteration (j-1) */
2480 reduc_def = GIMPLE_STMT_OPERAND (new_stmt ,0);
2483 /* Arguments are ready. create the new vector stmt. */
2484 if (op_type == binary_op)
2485 expr = build2 (code, vectype, loop_vec_def0, reduc_def)
2487 expr = build3 (code, vectype, loop_vec_def0, loop_vec_def1,
2489 new_stmt = build_gimple_modify_stmt (vec_dest, expr);
2490 new_temp = make_ssa_name (vec_dest, new_stmt);
2491 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
2492 vect_finish_stmt_generation (stmt, new_stmt, bsi);
/* Record the first copy as THE vectorized stmt; later copies are linked
   through STMT_VINFO_RELATED_STMT.  */
2495 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
2497 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
2498 prev_stmt_info = vinfo_for_stmt (new_stmt);
2501 /* Finalize the reduction-phi (set its arguments) and create the
2502 epilog reduction code. */
2503 vect_create_epilog_for_reduction (new_temp, stmt, epilog_reduc_code, new_phi);
2507 /* Checks if CALL can be vectorized in type VECTYPE. Returns
2508 a function declaration if the target has a vectorized version
2509 of the function, or NULL_TREE if the function cannot be vectorized. */
2512 vectorizable_function (tree call, tree vectype_out, tree vectype_in)
2514 tree fndecl = get_callee_fndecl (call);
2515 enum built_in_function code;
2517 /* We only handle functions that do not read or clobber memory -- i.e.
2518 const or novops ones. */
2519 if (!(call_expr_flags (call) & (ECF_CONST | ECF_NOVOPS)))
/* Only known builtins can be mapped to a vectorized counterpart.  */
2523 || TREE_CODE (fndecl) != FUNCTION_DECL
2524 || !DECL_BUILT_IN (fndecl))
/* Ask the target whether it provides a vectorized form of this builtin
   for the given input/output vector types.  */
2527 code = DECL_FUNCTION_CODE (fndecl);
2528 return targetm.vectorize.builtin_vectorized_function (code, vectype_out,
2532 /* Function vectorizable_call.
2534 Check if STMT performs a function call that can be vectorized.
2535 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
2536 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
2537 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
2540 vectorizable_call (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
2546 tree vec_oprnd0 = NULL_TREE, vec_oprnd1 = NULL_TREE;
2547 stmt_vec_info stmt_info = vinfo_for_stmt (stmt), prev_stmt_info;
2548 tree vectype_out, vectype_in;
2551 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2552 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2553 tree fndecl, rhs, new_temp, def, def_stmt, rhs_type, lhs_type;
2554 enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
2556 int ncopies, j, nargs;
2557 call_expr_arg_iterator iter;
2559 enum { NARROW, NONE, WIDEN } modifier;
2561 if (!STMT_VINFO_RELEVANT_P (stmt_info))
2564 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
2567 /* FORNOW: not yet supported. */
2568 if (STMT_VINFO_LIVE_P (stmt_info))
2570 if (vect_print_dump_info (REPORT_DETAILS))
2571 fprintf (vect_dump, "value used after loop.");
2575 /* Is STMT a vectorizable call? */
2576 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
2579 if (TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 0)) != SSA_NAME)
2582 operation = GIMPLE_STMT_OPERAND (stmt, 1);
2583 if (TREE_CODE (operation) != CALL_EXPR)
2586 /* Process function arguments. */
2587 rhs_type = NULL_TREE;
2589 FOR_EACH_CALL_EXPR_ARG (op, iter, operation)
2591 /* Bail out if the function has more than two arguments, we
2592 do not have interesting builtin functions to vectorize with
2593 more than two arguments. */
2597 /* We can only handle calls with arguments of the same type. */
2599 && rhs_type != TREE_TYPE (op))
2601 if (vect_print_dump_info (REPORT_DETAILS))
2602 fprintf (vect_dump, "argument types differ.");
2605 rhs_type = TREE_TYPE (op);
2607 if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt[nargs]))
2609 if (vect_print_dump_info (REPORT_DETAILS))
2610 fprintf (vect_dump, "use not simple.");
2617 /* No arguments is also not good. */
/* Derive the input/output vector types from the scalar argument and
   result types.  */
2621 vectype_in = get_vectype_for_scalar_type (rhs_type);
2622 nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
2624 lhs_type = TREE_TYPE (GIMPLE_STMT_OPERAND (stmt, 0));
2625 vectype_out = get_vectype_for_scalar_type (lhs_type);
2626 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
/* Classify by relative element counts: more output elements per vector
   than input => NARROW; equal => NONE; fewer => WIDEN.  */
2629 if (nunits_in == nunits_out / 2)
2631 else if (nunits_out == nunits_in)
2633 else if (nunits_out == nunits_in / 2)
2638 /* For now, we only vectorize functions if a target specific builtin
2639 is available. TODO -- in some cases, it might be profitable to
2640 insert the calls for pieces of the vector, in order to be able
2641 to vectorize other operations in the loop. */
2642 fndecl = vectorizable_function (operation, vectype_out, vectype_in);
2643 if (fndecl == NULL_TREE)
2645 if (vect_print_dump_info (REPORT_DETAILS))
2646 fprintf (vect_dump, "function is not vectorizable.");
2651 gcc_assert (ZERO_SSA_OPERANDS (stmt, SSA_OP_ALL_VIRTUALS))
/* A narrowing call needs ncopies based on the (denser) output vector.  */
2653 if (modifier == NARROW)
2654 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_out;
2656 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in;
2658 /* Sanity check: make sure that at least one copy of the vectorized stmt
2659 needs to be generated. */
2660 gcc_assert (ncopies >= 1);
2662 /* FORNOW. This restriction should be relaxed. */
2663 if (nested_in_vect_loop_p (loop, stmt) && ncopies > 1)
2665 if (vect_print_dump_info (REPORT_DETAILS))
2666 fprintf (vect_dump, "multiple types in nested loop.");
2670 if (!vec_stmt) /* transformation not required. */
2672 STMT_VINFO_TYPE (stmt_info) = call_vec_info_type;
2673 if (vect_print_dump_info (REPORT_DETAILS))
2674 fprintf (vect_dump, "=== vectorizable_call ===");
2675 vect_model_simple_cost (stmt_info, ncopies, dt);
/* Transformation phase.  */
2681 if (vect_print_dump_info (REPORT_DETAILS))
2682 fprintf (vect_dump, "transform operation.");
2684 /* FORNOW. This restriction should be relaxed. */
2685 if (nested_in_vect_loop_p (loop, stmt) && ncopies > 1)
2687 if (vect_print_dump_info (REPORT_DETAILS))
2688 fprintf (vect_dump, "multiple types in nested loop.");
2693 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
2694 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
2696 prev_stmt_info = NULL;
/* First generation loop: one vector def per scalar argument per copy
   (presumably the modifier == NONE arm -- confirm against the elided
   switch).  Copies j > 0 reuse defs via vect_get_vec_def_for_stmt_copy.  */
2700 for (j = 0; j < ncopies; ++j)
2702 /* Build argument list for the vectorized call. */
2703 /* FIXME: Rewrite this so that it doesn't
2704 construct a temporary list. */
2707 FOR_EACH_CALL_EXPR_ARG (op, iter, operation)
2711 = vect_get_vec_def_for_operand (op, stmt, NULL);
2714 = vect_get_vec_def_for_stmt_copy (dt[nargs], vec_oprnd0);
2716 vargs = tree_cons (NULL_TREE, vec_oprnd0, vargs);
2720 vargs = nreverse (vargs);
2722 rhs = build_function_call_expr (fndecl, vargs);
2723 new_stmt = build_gimple_modify_stmt (vec_dest, rhs);
2724 new_temp = make_ssa_name (vec_dest, new_stmt);
2725 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
2727 vect_finish_stmt_generation (stmt, new_stmt, bsi);
2730 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
2732 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
2734 prev_stmt_info = vinfo_for_stmt (new_stmt);
/* Second generation loop: two vector defs per argument feed each call,
   which packs them into one narrower result vector (presumably the
   modifier == NARROW arm -- confirm against the elided switch).  */
2740 for (j = 0; j < ncopies; ++j)
2742 /* Build argument list for the vectorized call. */
2743 /* FIXME: Rewrite this so that it doesn't
2744 construct a temporary list. */
2747 FOR_EACH_CALL_EXPR_ARG (op, iter, operation)
2752 = vect_get_vec_def_for_operand (op, stmt, NULL);
2754 = vect_get_vec_def_for_stmt_copy (dt[nargs], vec_oprnd0);
2759 = vect_get_vec_def_for_stmt_copy (dt[nargs], vec_oprnd1);
2761 = vect_get_vec_def_for_stmt_copy (dt[nargs], vec_oprnd0);
2764 vargs = tree_cons (NULL_TREE, vec_oprnd0, vargs);
2765 vargs = tree_cons (NULL_TREE, vec_oprnd1, vargs);
2769 vargs = nreverse (vargs);
2771 rhs = build_function_call_expr (fndecl, vargs);
2772 new_stmt = build_gimple_modify_stmt (vec_dest, rhs);
2773 new_temp = make_ssa_name (vec_dest, new_stmt);
2774 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
2776 vect_finish_stmt_generation (stmt, new_stmt, bsi);
2779 STMT_VINFO_VEC_STMT (stmt_info) = new_stmt;
2781 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
2783 prev_stmt_info = vinfo_for_stmt (new_stmt);
2786 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
2791 /* No current target implements this case. */
2795 /* The call in STMT might prevent it from being removed in dce.
2796 We however cannot remove it here, due to the way the ssa name
2797 it defines is mapped to the new definition. So just replace
2798 rhs of the statement with something harmless. */
2799 type = TREE_TYPE (scalar_dest);
2800 GIMPLE_STMT_OPERAND (stmt, 1) = fold_convert (type, integer_zero_node);
2807 /* Function vect_gen_widened_results_half
2809 Create a vector stmt whose code, type, number of arguments, and result
2810 variable are CODE, VECTYPE, OP_TYPE, and VEC_DEST, and its arguments are
2811 VEC_OPRND0 and VEC_OPRND1. The new vector stmt is to be inserted at BSI.
2812 In the case that CODE is a CALL_EXPR, this means that a call to DECL
2813 needs to be created (DECL is a function-decl of a target-builtin).
2814 STMT is the original scalar stmt that we are vectorizing. */
2817 vect_gen_widened_results_half (enum tree_code code, tree vectype, tree decl,
2818 tree vec_oprnd0, tree vec_oprnd1, int op_type,
2819 tree vec_dest, block_stmt_iterator *bsi,
2828 /* Generate half of the widened result: */
2829 if (code == CALL_EXPR)
2831 /* Target specific support */
2832 if (op_type == binary_op)
2833 expr = build_call_expr (decl, 2, vec_oprnd0, vec_oprnd1);
2835 expr = build_call_expr (decl, 1, vec_oprnd0);
2839 /* Generic support */
2840 gcc_assert (op_type == TREE_CODE_LENGTH (code));
2841 if (op_type == binary_op)
2842 expr = build2 (code, vectype, vec_oprnd0, vec_oprnd1);
2844 expr = build1 (code, vectype, vec_oprnd0);
/* Wrap the expression in an assignment to a fresh SSA name and emit
   it at BSI.  */
2846 new_stmt = build_gimple_modify_stmt (vec_dest, expr);
2847 new_temp = make_ssa_name (vec_dest, new_stmt);
2848 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
2849 vect_finish_stmt_generation (stmt, new_stmt, bsi);
/* Calls may carry virtual operands; mark their symbols so the SSA form
   gets updated.  */
2851 if (code == CALL_EXPR)
2853 FOR_EACH_SSA_TREE_OPERAND (sym, new_stmt, iter, SSA_OP_ALL_VIRTUALS)
2855 if (TREE_CODE (sym) == SSA_NAME)
2856 sym = SSA_NAME_VAR (sym);
2857 mark_sym_for_renaming (sym);
2865 /* Function vectorizable_conversion.
2867 Check if STMT performs a conversion operation, that can be vectorized.
2868 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
2869 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
2870 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
2873 vectorizable_conversion (tree stmt, block_stmt_iterator * bsi,
2880 tree vec_oprnd0 = NULL_TREE, vec_oprnd1 = NULL_TREE;
2881 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2882 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2883 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2884 enum tree_code code, code1 = ERROR_MARK, code2 = ERROR_MARK;
2885 tree decl1 = NULL_TREE, decl2 = NULL_TREE;
2888 enum vect_def_type dt0;
2890 stmt_vec_info prev_stmt_info;
2893 tree vectype_out, vectype_in;
2896 tree rhs_type, lhs_type;
2898 enum { NARROW, NONE, WIDEN } modifier;
2900 /* Is STMT a vectorizable conversion? */
2902 if (!STMT_VINFO_RELEVANT_P (stmt_info))
2905 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
2908 if (STMT_VINFO_LIVE_P (stmt_info))
2910 /* FORNOW: not yet supported. */
2911 if (vect_print_dump_info (REPORT_DETAILS))
2912 fprintf (vect_dump, "value used after loop.");
2916 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
2919 if (TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 0)) != SSA_NAME)
/* Only int<->float conversions are handled here.  */
2922 operation = GIMPLE_STMT_OPERAND (stmt, 1);
2923 code = TREE_CODE (operation);
2924 if (code != FIX_TRUNC_EXPR && code != FLOAT_EXPR)
2927 /* Check types of lhs and rhs */
2928 op0 = TREE_OPERAND (operation, 0);
2929 rhs_type = TREE_TYPE (op0);
2930 vectype_in = get_vectype_for_scalar_type (rhs_type);
2931 nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
2933 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
2934 lhs_type = TREE_TYPE (scalar_dest);
2935 vectype_out = get_vectype_for_scalar_type (lhs_type);
2936 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
/* Classify by relative element counts: more output elements per vector
   than input => NARROW; equal => NONE; fewer => WIDEN.  */
2939 if (nunits_in == nunits_out / 2)
2941 else if (nunits_out == nunits_in)
2943 else if (nunits_out == nunits_in / 2)
2948 if (modifier == NONE)
2949 gcc_assert (STMT_VINFO_VECTYPE (stmt_info) == vectype_out);
2951 /* Bail out if the types are both integral or non-integral */
2952 if ((INTEGRAL_TYPE_P (rhs_type) && INTEGRAL_TYPE_P (lhs_type))
2953 || (!INTEGRAL_TYPE_P (rhs_type) && !INTEGRAL_TYPE_P (lhs_type)))
/* A narrowing conversion needs ncopies based on the output vector.  */
2956 if (modifier == NARROW)
2957 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_out;
2959 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in;
2961 /* Sanity check: make sure that at least one copy of the vectorized stmt
2962 needs to be generated. */
2963 gcc_assert (ncopies >= 1);
2965 /* FORNOW. This restriction should be relaxed. */
2966 if (nested_in_vect_loop_p (loop, stmt) && ncopies > 1)
2968 if (vect_print_dump_info (REPORT_DETAILS))
2969 fprintf (vect_dump, "multiple types in nested loop.");
2973 /* Check the operands of the operation. */
2974 if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt0))
2976 if (vect_print_dump_info (REPORT_DETAILS))
2977 fprintf (vect_dump, "use not simple.");
2981 /* Supportable by target? */
2982 if ((modifier == NONE
2983 && !targetm.vectorize.builtin_conversion (code, vectype_in))
2984 || (modifier == WIDEN
2985 && !supportable_widening_operation (code, stmt, vectype_in,
2988 || (modifier == NARROW
2989 && !supportable_narrowing_operation (code, stmt, vectype_in,
2992 if (vect_print_dump_info (REPORT_DETAILS))
2993 fprintf (vect_dump, "op not supported by target.");
2997 if (modifier != NONE)
2998 STMT_VINFO_VECTYPE (stmt_info) = vectype_in;
3000 if (!vec_stmt) /* transformation not required. */
3002 STMT_VINFO_TYPE (stmt_info) = type_conversion_vec_info_type;
/* Transformation phase.  */
3007 if (vect_print_dump_info (REPORT_DETAILS))
3008 fprintf (vect_dump, "transform conversion.");
3011 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
3013 prev_stmt_info = NULL;
/* NONE case: convert each vector via the target's builtin conversion.  */
3017 for (j = 0; j < ncopies; j++)
3023 vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
3025 vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt0, vec_oprnd0);
3028 targetm.vectorize.builtin_conversion (code, vectype_in);
3029 new_stmt = build_call_expr (builtin_decl, 1, vec_oprnd0);
3031 /* Arguments are ready. create the new vector stmt. */
3032 new_stmt = build_gimple_modify_stmt (vec_dest, new_stmt);
3033 new_temp = make_ssa_name (vec_dest, new_stmt);
3034 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
3035 vect_finish_stmt_generation (stmt, new_stmt, bsi);
/* The builtin call may carry virtual operands; mark their symbols for
   SSA renaming.  */
3036 FOR_EACH_SSA_TREE_OPERAND (sym, new_stmt, iter, SSA_OP_ALL_VIRTUALS)
3038 if (TREE_CODE (sym) == SSA_NAME)
3039 sym = SSA_NAME_VAR (sym);
3040 mark_sym_for_renaming (sym);
3044 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
3046 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3047 prev_stmt_info = vinfo_for_stmt (new_stmt);
3052 /* In case the vectorization factor (VF) is bigger than the number
3053 of elements that we can fit in a vectype (nunits), we have to
3054 generate more than one vector stmt - i.e - we need to "unroll"
3055 the vector stmt by a factor VF/nunits. */
/* WIDEN case: each copy emits two half-width result stmts using
   code1/code2 (decl1/decl2 for target builtins) -- presumably filled in
   by supportable_widening_operation; confirm its out parameters.  */
3056 for (j = 0; j < ncopies; j++)
3059 vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
3061 vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt0, vec_oprnd0);
3063 STMT_VINFO_VECTYPE (stmt_info) = vectype_in;
3065 /* Generate first half of the widened result: */
3067 = vect_gen_widened_results_half (code1, vectype_out, decl1,
3068 vec_oprnd0, vec_oprnd1,
3069 unary_op, vec_dest, bsi, stmt);
3071 STMT_VINFO_VEC_STMT (stmt_info) = new_stmt;
3073 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3074 prev_stmt_info = vinfo_for_stmt (new_stmt);
3076 /* Generate second half of the widened result: */
3078 = vect_gen_widened_results_half (code2, vectype_out, decl2,
3079 vec_oprnd0, vec_oprnd1,
3080 unary_op, vec_dest, bsi, stmt);
3081 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3082 prev_stmt_info = vinfo_for_stmt (new_stmt);
3087 /* In case the vectorization factor (VF) is bigger than the number
3088 of elements that we can fit in a vectype (nunits), we have to
3089 generate more than one vector stmt - i.e - we need to "unroll"
3090 the vector stmt by a factor VF/nunits. */
/* NARROW case: two input vectors are combined into one output vector
   per copy.  */
3091 for (j = 0; j < ncopies; j++)
3096 vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
3097 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt0, vec_oprnd0);
3101 vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt0, vec_oprnd1);
3102 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt0, vec_oprnd0);
3105 /* Arguments are ready. Create the new vector stmt. */
3106 expr = build2 (code1, vectype_out, vec_oprnd0, vec_oprnd1);
3107 new_stmt = build_gimple_modify_stmt (vec_dest, expr);
3108 new_temp = make_ssa_name (vec_dest, new_stmt);
3109 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
3110 vect_finish_stmt_generation (stmt, new_stmt, bsi);
3113 STMT_VINFO_VEC_STMT (stmt_info) = new_stmt;
3115 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3117 prev_stmt_info = vinfo_for_stmt (new_stmt);
3120 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
3126 /* Function vectorizable_assignment.
3128 Check if STMT performs an assignment (copy) that can be vectorized.
3129 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
3130 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
3131 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
3134 vectorizable_assignment (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
3140 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3141 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3142 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3145 enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
3146 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
3147 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
3149 gcc_assert (ncopies >= 1);
/* NOTE(review): this early return presumably guards the multiple-copies
   (ncopies > 1) case -- confirm the condition in the surrounding code.  */
3151 return false; /* FORNOW */
3153 if (!STMT_VINFO_RELEVANT_P (stmt_info))
3156 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
3159 /* FORNOW: not yet supported. */
3160 if (STMT_VINFO_LIVE_P (stmt_info))
3162 if (vect_print_dump_info (REPORT_DETAILS))
3163 fprintf (vect_dump, "value used after loop.");
3167 /* Is vectorizable assignment? */
3168 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
3171 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
3172 if (TREE_CODE (scalar_dest) != SSA_NAME)
3175 op = GIMPLE_STMT_OPERAND (stmt, 1);
3176 if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt[0]))
3178 if (vect_print_dump_info (REPORT_DETAILS))
3179 fprintf (vect_dump, "use not simple.");
3183 if (!vec_stmt) /* transformation not required. */
3185 STMT_VINFO_TYPE (stmt_info) = assignment_vec_info_type;
3186 if (vect_print_dump_info (REPORT_DETAILS))
3187 fprintf (vect_dump, "=== vectorizable_assignment ===");
3188 vect_model_simple_cost (stmt_info, ncopies, dt);
/* Transformation phase.  */
3193 if (vect_print_dump_info (REPORT_DETAILS))
3194 fprintf (vect_dump, "transform assignment.");
3197 vec_dest = vect_create_destination_var (scalar_dest, vectype);
3200 op = GIMPLE_STMT_OPERAND (stmt, 1);
3201 vec_oprnd = vect_get_vec_def_for_operand (op, stmt, NULL);
/* Emit the vectorized copy: vec_dest = vec_oprnd.  */
3203 /* Arguments are ready. create the new vector stmt. */
3204 *vec_stmt = build_gimple_modify_stmt (vec_dest, vec_oprnd);
3205 new_temp = make_ssa_name (vec_dest, *vec_stmt);
3206 GIMPLE_STMT_OPERAND (*vec_stmt, 0) = new_temp;
3207 vect_finish_stmt_generation (stmt, *vec_stmt, bsi);
3213 /* Function vect_min_worthwhile_factor.
3215 For a loop where we could vectorize the operation indicated by CODE,
3216 return the minimum vectorization factor that makes it worthwhile
3217 to use generic vectors. */
/* Used by vectorizable_operation to decide between the word-mode
   fallback and giving up when the target has no SIMD support.  */
3219 vect_min_worthwhile_factor (enum tree_code code)
3240 /* Function vectorizable_induction
3242 Check if PHI performs an induction computation that can be vectorized.
3243 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
3244 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
3245 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
3248 vectorizable_induction (tree phi, block_stmt_iterator *bsi ATTRIBUTE_UNUSED,
3251 stmt_vec_info stmt_info = vinfo_for_stmt (phi);
3252 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3253 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3254 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
/* Number of vector stmts needed to cover the vectorization factor.  */
3255 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
3258 gcc_assert (ncopies >= 1);
3260 if (!STMT_VINFO_RELEVANT_P (stmt_info))
/* Only induction defs reach this function.  */
3263 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def);
3265 if (STMT_VINFO_LIVE_P (stmt_info))
3267 /* FORNOW: not yet supported. */
3268 if (vect_print_dump_info (REPORT_DETAILS))
3269 fprintf (vect_dump, "value used after loop.");
3273 if (TREE_CODE (phi) != PHI_NODE)
/* Analysis phase: record the stmt kind and its cost only.  */
3276 if (!vec_stmt) /* transformation not required. */
3278 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
3279 if (vect_print_dump_info (REPORT_DETAILS))
3280 fprintf (vect_dump, "=== vectorizable_induction ===");
3281 vect_model_induction_cost (stmt_info, ncopies);
/* Transformation phase: the actual code generation is done by
   get_initial_def_for_induction; report its defining stmt.  */
3287 if (vect_print_dump_info (REPORT_DETAILS))
3288 fprintf (vect_dump, "transform induction phi.");
3290 vec_def = get_initial_def_for_induction (phi);
3291 *vec_stmt = SSA_NAME_DEF_STMT (vec_def);
3296 /* Function vectorizable_operation.
3298 Check if STMT performs a binary or unary operation that can be vectorized.
3299 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
3300 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
3301 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
3304 vectorizable_operation (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
3309 tree op0, op1 = NULL;
3310 tree vec_oprnd0 = NULL_TREE, vec_oprnd1 = NULL_TREE;
3311 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3312 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3313 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3314 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3315 enum tree_code code;
3316 enum machine_mode vec_mode;
3321 enum machine_mode optab_op2_mode;
3323 enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
3325 stmt_vec_info prev_stmt_info;
3326 int nunits_in = TYPE_VECTOR_SUBPARTS (vectype);
/* Number of vector stmts needed to cover the vectorization factor.  */
3329 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in;
3332 gcc_assert (ncopies >= 1);
3333 /* FORNOW. This restriction should be relaxed. */
3334 if (nested_in_vect_loop_p (loop, stmt) && ncopies > 1)
3336 if (vect_print_dump_info (REPORT_DETAILS))
3337 fprintf (vect_dump, "multiple types in nested loop.");
3341 if (!STMT_VINFO_RELEVANT_P (stmt_info))
3344 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
3347 /* FORNOW: not yet supported. */
3348 if (STMT_VINFO_LIVE_P (stmt_info))
3350 if (vect_print_dump_info (REPORT_DETAILS))
3351 fprintf (vect_dump, "value used after loop.")
3355 /* Is STMT a vectorizable binary/unary operation? */
3356 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
3359 if (TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 0)) != SSA_NAME)
3362 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
3363 vectype_out = get_vectype_for_scalar_type (TREE_TYPE (scalar_dest));
3364 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
/* FORNOW: input and output vector types must have the same number of
   units (no implicit widening/narrowing here).  */
3365 if (nunits_out != nunits_in)
3368 operation = GIMPLE_STMT_OPERAND (stmt, 1);
3369 code = TREE_CODE (operation);
3371 /* For pointer addition, we should use the normal plus for
3372 the vector addition. */
3373 if (code == POINTER_PLUS_EXPR)
3376 optab = optab_for_tree_code (code, vectype);
3378 /* Support only unary or binary operations. */
3379 op_type = TREE_OPERAND_LENGTH (operation);
3380 if (op_type != unary_op && op_type != binary_op)
3382 if (vect_print_dump_info (REPORT_DETAILS))
3383 fprintf (vect_dump, "num. args = %d (not unary/binary op).", op_type);
3387 op0 = TREE_OPERAND (operation, 0);
3388 if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt[0]))
3390 if (vect_print_dump_info (REPORT_DETAILS))
3391 fprintf (vect_dump, "use not simple.");
3395 if (op_type == binary_op)
3397 op1 = TREE_OPERAND (operation, 1);
3398 if (!vect_is_simple_use (op1, loop_vinfo, &def_stmt, &def, &dt[1]))
3400 if (vect_print_dump_info (REPORT_DETAILS))
3401 fprintf (vect_dump, "use not simple.");
3406 /* Supportable by target? */
3409 if (vect_print_dump_info (REPORT_DETAILS))
3410 fprintf (vect_dump, "no optab.");
3413 vec_mode = TYPE_MODE (vectype);
3414 icode = (int) optab_handler (optab, vec_mode)->insn_code;
3415 if (icode == CODE_FOR_nothing)
3417 if (vect_print_dump_info (REPORT_DETAILS))
3418 fprintf (vect_dump, "op not supported by target.");
/* Fall back to word-mode vectors only when the vector size equals the
   word size and the vectorization factor is high enough to pay off.  */
3419 if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
3420 || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
3421 < vect_min_worthwhile_factor (code))
3423 if (vect_print_dump_info (REPORT_DETAILS))
3424 fprintf (vect_dump, "proceeding using word mode.");
3427 /* Worthwhile without SIMD support? */
3428 if (!VECTOR_MODE_P (TYPE_MODE (vectype))
3429 && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
3430 < vect_min_worthwhile_factor (code))
3432 if (vect_print_dump_info (REPORT_DETAILS))
3433 fprintf (vect_dump, "not worthwhile without SIMD support.");
3437 if (code == LSHIFT_EXPR || code == RSHIFT_EXPR)
3439 /* FORNOW: not yet supported. */
3440 if (!VECTOR_MODE_P (vec_mode))
3443 /* Invariant argument is needed for a vector shift
3444 by a scalar shift operand. */
3445 optab_op2_mode = insn_data[icode].operand[2].mode;
3446 if (! (VECTOR_MODE_P (optab_op2_mode)
3447 || dt[1] == vect_constant_def
3448 || dt[1] == vect_invariant_def))
3450 if (vect_print_dump_info (REPORT_DETAILS))
3451 fprintf (vect_dump, "operand mode requires invariant argument.");
/* Analysis phase: record the stmt kind and its cost only.  */
3456 if (!vec_stmt) /* transformation not required. */
3458 STMT_VINFO_TYPE (stmt_info) = op_vec_info_type;
3459 if (vect_print_dump_info (REPORT_DETAILS))
3460 fprintf (vect_dump, "=== vectorizable_operation ===");
3461 vect_model_simple_cost (stmt_info, ncopies, dt);
/* Transformation phase.  */
3467 if (vect_print_dump_info (REPORT_DETAILS))
3468 fprintf (vect_dump, "transform binary/unary operation.");
/* Handle def.  */
3471 vec_dest = vect_create_destination_var (scalar_dest, vectype);
3473 /* In case the vectorization factor (VF) is bigger than the number
3474 of elements that we can fit in a vectype (nunits), we have to generate
3475 more than one vector stmt - i.e - we need to "unroll" the
3476 vector stmt by a factor VF/nunits. In doing so, we record a pointer
3477 from one copy of the vector stmt to the next, in the field
3478 STMT_VINFO_RELATED_STMT. This is necessary in order to allow following
3479 stages to find the correct vector defs to be used when vectorizing
3480 stmts that use the defs of the current stmt. The example below illustrates
3481 the vectorization process when VF=16 and nunits=4 (i.e - we need to create
3482 4 vectorized stmts):
3484 before vectorization:
3485 RELATED_STMT VEC_STMT
3489 step 1: vectorize stmt S1 (done in vectorizable_load. See more details
3491 RELATED_STMT VEC_STMT
3492 VS1_0: vx0 = memref0 VS1_1 -
3493 VS1_1: vx1 = memref1 VS1_2 -
3494 VS1_2: vx2 = memref2 VS1_3 -
3495 VS1_3: vx3 = memref3 - -
3496 S1: x = load - VS1_0
3499 step2: vectorize stmt S2 (done here):
3500 To vectorize stmt S2 we first need to find the relevant vector
3501 def for the first operand 'x'. This is, as usual, obtained from
3502 the vector stmt recorded in the STMT_VINFO_VEC_STMT of the stmt
3503 that defines 'x' (S1). This way we find the stmt VS1_0, and the
3504 relevant vector def 'vx0'. Having found 'vx0' we can generate
3505 the vector stmt VS2_0, and as usual, record it in the
3506 STMT_VINFO_VEC_STMT of stmt S2.
3507 When creating the second copy (VS2_1), we obtain the relevant vector
3508 def from the vector stmt recorded in the STMT_VINFO_RELATED_STMT of
3509 stmt VS1_0. This way we find the stmt VS1_1 and the relevant
3510 vector def 'vx1'. Using 'vx1' we create stmt VS2_1 and record a
3511 pointer to it in the STMT_VINFO_RELATED_STMT of the vector stmt VS2_0.
3512 Similarly when creating stmts VS2_2 and VS2_3. This is the resulting
3513 chain of stmts and pointers:
3514 RELATED_STMT VEC_STMT
3515 VS1_0: vx0 = memref0 VS1_1 -
3516 VS1_1: vx1 = memref1 VS1_2 -
3517 VS1_2: vx2 = memref2 VS1_3 -
3518 VS1_3: vx3 = memref3 - -
3519 S1: x = load - VS1_0
3520 VS2_0: vz0 = vx0 + v1 VS2_1 -
3521 VS2_1: vz1 = vx1 + v1 VS2_2 -
3522 VS2_2: vz2 = vx2 + v1 VS2_3 -
3523 VS2_3: vz3 = vx3 + v1 - -
3524 S2: z = x + 1 - VS2_0 */
3526 prev_stmt_info = NULL;
3527 for (j = 0; j < ncopies; j++)
/* Handle uses: the first copy (j == 0) takes its defs from the scalar
   operands; later copies chain through vect_get_vec_def_for_stmt_copy.  */
3532 vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
3533 if (op_type == binary_op)
3535 if (code == LSHIFT_EXPR || code == RSHIFT_EXPR)
3537 /* Vector shl and shr insn patterns can be defined with
3538 scalar operand 2 (shift operand). In this case, use
3539 constant or loop invariant op1 directly, without
3540 extending it to vector mode first. */
3541 optab_op2_mode = insn_data[icode].operand[2].mode;
3542 if (!VECTOR_MODE_P (optab_op2_mode))
3544 if (vect_print_dump_info (REPORT_DETAILS))
3545 fprintf (vect_dump, "operand 1 using scalar mode.");
3550 vec_oprnd1 = vect_get_vec_def_for_operand (op1, stmt, NULL);
3555 vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd0);
3556 if (op_type == binary_op)
3557 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt[1], vec_oprnd1);
3560 /* Arguments are ready.  Create the new vector stmt. */
3562 if (op_type == binary_op)
3563 new_stmt = build_gimple_modify_stmt (vec_dest,
3564 build2 (code, vectype, vec_oprnd0, vec_oprnd1));
3566 new_stmt = build_gimple_modify_stmt (vec_dest,
3567 build1 (code, vectype, vec_oprnd0));
3568 new_temp = make_ssa_name (vec_dest, new_stmt);
3569 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
3570 vect_finish_stmt_generation (stmt, new_stmt, bsi);
/* First copy goes in VEC_STMT; later copies are linked via RELATED_STMT.  */
3573 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
3575 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3576 prev_stmt_info = vinfo_for_stmt (new_stmt);
3583 /* Function vectorizable_type_demotion
3585 Check if STMT performs a binary or unary operation that involves
3586 type demotion, and if it can be vectorized.
3587 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
3588 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
3589 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
3592 vectorizable_type_demotion (tree stmt, block_stmt_iterator *bsi,
3599 tree vec_oprnd0=NULL, vec_oprnd1=NULL;
3600 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3601 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3602 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3603 enum tree_code code, code1 = ERROR_MARK;
3606 enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
3608 stmt_vec_info prev_stmt_info;
3617 if (!STMT_VINFO_RELEVANT_P (stmt_info))
3620 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
3623 /* FORNOW: not yet supported. */
3624 if (STMT_VINFO_LIVE_P (stmt_info))
3626 if (vect_print_dump_info (REPORT_DETAILS))
3627 fprintf (vect_dump, "value used after loop.");
3631 /* Is STMT a vectorizable type-demotion operation? */
3632 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
3635 if (TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 0)) != SSA_NAME)
3638 operation = GIMPLE_STMT_OPERAND (stmt, 1);
3639 code = TREE_CODE (operation);
3640 if (code != NOP_EXPR && code != CONVERT_EXPR)
3643 op0 = TREE_OPERAND (operation, 0);
3644 vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op0));
3645 nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
3647 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
3648 vectype_out = get_vectype_for_scalar_type (TREE_TYPE (scalar_dest));
3649 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
/* Narrowing: each output vector holds twice as many elements as an
   input vector, so each demotion stmt consumes two input vectors.  */
3650 if (nunits_in != nunits_out / 2) /* FORNOW */
3653 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_out;
3654 gcc_assert (ncopies >= 1);
3655 /* FORNOW. This restriction should be relaxed. */
3656 if (nested_in_vect_loop_p (loop, stmt) && ncopies > 1)
3658 if (vect_print_dump_info (REPORT_DETAILS))
3659 fprintf (vect_dump, "multiple types in nested loop.");
/* Source and destination must both be integral, or both floating.  */
3663 if (! ((INTEGRAL_TYPE_P (TREE_TYPE (scalar_dest))
3664 && INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3665 || (SCALAR_FLOAT_TYPE_P (TREE_TYPE (scalar_dest))
3666 && SCALAR_FLOAT_TYPE_P (TREE_TYPE (op0))
3667 && (code == NOP_EXPR || code == CONVERT_EXPR))))
3670 /* Check the operands of the operation. */
3671 if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt[0]))
3673 if (vect_print_dump_info (REPORT_DETAILS))
3674 fprintf (vect_dump, "use not simple.");
3678 /* Supportable by target? */
3679 if (!supportable_narrowing_operation (code, stmt, vectype_in, &code1))
3682 STMT_VINFO_VECTYPE (stmt_info) = vectype_in;
/* Analysis phase: record the stmt kind and its cost only.  */
3684 if (!vec_stmt) /* transformation not required. */
3686 STMT_VINFO_TYPE (stmt_info) = type_demotion_vec_info_type;
3687 if (vect_print_dump_info (REPORT_DETAILS))
3688 fprintf (vect_dump, "=== vectorizable_demotion ===");
3689 vect_model_simple_cost (stmt_info, ncopies, dt);
/* Transformation phase.  */
3694 if (vect_print_dump_info (REPORT_DETAILS))
3695 fprintf (vect_dump, "transform type demotion operation. ncopies = %d.",
/* Handle def.  */
3699 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
3701 /* In case the vectorization factor (VF) is bigger than the number
3702 of elements that we can fit in a vectype (nunits), we have to generate
3703 more than one vector stmt - i.e - we need to "unroll" the
3704 vector stmt by a factor VF/nunits. */
3705 prev_stmt_info = NULL;
3706 for (j = 0; j < ncopies; j++)
/* Handle uses: the first copy fetches a fresh pair of input vectors;
   later copies continue the def chain from the previous pair.  */
3711 vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
3712 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd0);
3716 vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd1);
3717 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd0);
3720 /* Arguments are ready.  Create the new vector stmt that packs the
3721    two input vectors into one narrowed output vector.  */
3721 expr = build2 (code1, vectype_out, vec_oprnd0, vec_oprnd1);
3722 new_stmt = build_gimple_modify_stmt (vec_dest, expr);
3723 new_temp = make_ssa_name (vec_dest, new_stmt);
3724 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
3725 vect_finish_stmt_generation (stmt, new_stmt, bsi);
3728 STMT_VINFO_VEC_STMT (stmt_info) = new_stmt;
3730 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3732 prev_stmt_info = vinfo_for_stmt (new_stmt);
3735 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
3740 /* Function vectorizable_type_promotion
3742 Check if STMT performs a binary or unary operation that involves
3743 type promotion, and if it can be vectorized.
3744 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
3745 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
3746 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
3749 vectorizable_type_promotion (tree stmt, block_stmt_iterator *bsi,
3755 tree op0, op1 = NULL;
3756 tree vec_oprnd0=NULL, vec_oprnd1=NULL;
3757 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3758 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3759 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3760 enum tree_code code, code1 = ERROR_MARK, code2 = ERROR_MARK;
3761 tree decl1 = NULL_TREE, decl2 = NULL_TREE;
3764 enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
3766 stmt_vec_info prev_stmt_info;
3774 if (!STMT_VINFO_RELEVANT_P (stmt_info))
3777 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
3780 /* FORNOW: not yet supported. */
3781 if (STMT_VINFO_LIVE_P (stmt_info))
3783 if (vect_print_dump_info (REPORT_DETAILS))
3784 fprintf (vect_dump, "value used after loop.");
3788 /* Is STMT a vectorizable type-promotion operation? */
3789 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
3792 if (TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 0)) != SSA_NAME)
3795 operation = GIMPLE_STMT_OPERAND (stmt, 1);
3796 code = TREE_CODE (operation);
3797 if (code != NOP_EXPR && code != CONVERT_EXPR
3798 && code != WIDEN_MULT_EXPR)
3801 op0 = TREE_OPERAND (operation, 0);
3802 vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op0));
3803 nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
3805 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
3806 vectype_out = get_vectype_for_scalar_type (TREE_TYPE (scalar_dest));
3807 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
/* Widening: each output vector holds half as many elements as an input
   vector, so each copy produces two output vectors (the two halves).  */
3808 if (nunits_out != nunits_in / 2) /* FORNOW */
3811 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in;
3812 gcc_assert (ncopies >= 1);
3813 /* FORNOW. This restriction should be relaxed. */
3814 if (nested_in_vect_loop_p (loop, stmt) && ncopies > 1)
3816 if (vect_print_dump_info (REPORT_DETAILS))
3817 fprintf (vect_dump, "multiple types in nested loop.");
/* Source and destination must both be integral, or both floating.  */
3821 if (! ((INTEGRAL_TYPE_P (TREE_TYPE (scalar_dest))
3822 && INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3823 || (SCALAR_FLOAT_TYPE_P (TREE_TYPE (scalar_dest))
3824 && SCALAR_FLOAT_TYPE_P (TREE_TYPE (op0))
3825 && (code == CONVERT_EXPR || code == NOP_EXPR))))
3828 /* Check the operands of the operation. */
3829 if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt[0]))
3831 if (vect_print_dump_info (REPORT_DETAILS))
3832 fprintf (vect_dump, "use not simple.");
3836 op_type = TREE_CODE_LENGTH (code);
3837 if (op_type == binary_op)
3839 op1 = TREE_OPERAND (operation, 1);
3840 if (!vect_is_simple_use (op1, loop_vinfo, &def_stmt, &def, &dt[1]))
3842 if (vect_print_dump_info (REPORT_DETAILS))
3843 fprintf (vect_dump, "use not simple.");
3848 /* Supportable by target? */
3849 if (!supportable_widening_operation (code, stmt, vectype_in,
3850 &decl1, &decl2, &code1, &code2))
3853 STMT_VINFO_VECTYPE (stmt_info) = vectype_in;
/* Analysis phase: record the stmt kind and its cost only; cost is
   2*ncopies because each copy emits two (hi/lo) vector stmts.  */
3855 if (!vec_stmt) /* transformation not required. */
3857 STMT_VINFO_TYPE (stmt_info) = type_promotion_vec_info_type;
3858 if (vect_print_dump_info (REPORT_DETAILS))
3859 fprintf (vect_dump, "=== vectorizable_promotion ===");
3860 vect_model_simple_cost (stmt_info, 2*ncopies, dt);
/* Transformation phase.  */
3866 if (vect_print_dump_info (REPORT_DETAILS))
3867 fprintf (vect_dump, "transform type promotion operation. ncopies = %d.",
/* Handle def.  */
3871 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
3873 /* In case the vectorization factor (VF) is bigger than the number
3874 of elements that we can fit in a vectype (nunits), we have to generate
3875 more than one vector stmt - i.e - we need to "unroll" the
3876 vector stmt by a factor VF/nunits. */
3878 prev_stmt_info = NULL;
3879 for (j = 0; j < ncopies; j++)
/* Handle uses.  */
3884 vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
3885 if (op_type == binary_op)
3886 vec_oprnd1 = vect_get_vec_def_for_operand (op1, stmt, NULL);
3890 vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd0);
3891 if (op_type == binary_op)
3892 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt[1], vec_oprnd1);
3895 /* Arguments are ready.  Create the new vector stmt.  We are creating
3896 two vector defs because the widened result does not fit in one vector.
3897 The vectorized stmt can be expressed as a call to a target builtin,
3898 or by using a tree-code. */
3899 /* Generate first half of the widened result: */
3900 new_stmt = vect_gen_widened_results_half (code1, vectype_out, decl1,
3901 vec_oprnd0, vec_oprnd1, op_type, vec_dest, bsi, stmt);
3903 STMT_VINFO_VEC_STMT (stmt_info) = new_stmt;
3905 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3906 prev_stmt_info = vinfo_for_stmt (new_stmt);
3908 /* Generate second half of the widened result: */
3909 new_stmt = vect_gen_widened_results_half (code2, vectype_out, decl2,
3910 vec_oprnd0, vec_oprnd1, op_type, vec_dest, bsi, stmt);
3911 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3912 prev_stmt_info = vinfo_for_stmt (new_stmt);
3916 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
3921 /* Function vect_strided_store_supported.
3923 Returns TRUE if INTERLEAVE_HIGH and INTERLEAVE_LOW operations are supported,
3924 and FALSE otherwise. */
3927 vect_strided_store_supported (tree vectype)
3929 optab interleave_high_optab, interleave_low_optab;
3932 mode = (int) TYPE_MODE (vectype);
3934 /* Check that the operation is supported. */
/* Both interleave optabs must exist for VECTYPE's mode...  */
3935 interleave_high_optab = optab_for_tree_code (VEC_INTERLEAVE_HIGH_EXPR,
3937 interleave_low_optab = optab_for_tree_code (VEC_INTERLEAVE_LOW_EXPR,
3939 if (!interleave_high_optab || !interleave_low_optab)
3941 if (vect_print_dump_info (REPORT_DETAILS))
3942 fprintf (vect_dump, "no optab for interleave.");
/* ... and the target must provide an insn for each of them.  */
3946 if (optab_handler (interleave_high_optab, mode)->insn_code
3948 || optab_handler (interleave_low_optab, mode)->insn_code
3949 == CODE_FOR_nothing)
3951 if (vect_print_dump_info (REPORT_DETAILS))
3952 fprintf (vect_dump, "interleave op not supported by target.");
3959 /* Function vect_permute_store_chain.
3961 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
3962 a power of 2, generate interleave_high/low stmts to reorder the data
3963 correctly for the stores. Return the final references for stores in
3966 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
3967 The input is 4 vectors each containing 8 elements. We assign a number to each
3968 element, the input sequence is:
3970 1st vec: 0 1 2 3 4 5 6 7
3971 2nd vec: 8 9 10 11 12 13 14 15
3972 3rd vec: 16 17 18 19 20 21 22 23
3973 4th vec: 24 25 26 27 28 29 30 31
3975 The output sequence should be:
3977 1st vec: 0 8 16 24 1 9 17 25
3978 2nd vec: 2 10 18 26 3 11 19 27
3979 3rd vec: 4 12 20 28 5 13 21 29
3980 4th vec: 6 14 22 30 7 15 23 31
3982 i.e., we interleave the contents of the four vectors in their order.
3984 We use interleave_high/low instructions to create such output. The input of
3985 each interleave_high/low operation is two vectors:
3988 the even elements of the result vector are obtained left-to-right from the
3989 high/low elements of the first vector. The odd elements of the result are
3990 obtained left-to-right from the high/low elements of the second vector.
3991 The output of interleave_high will be: 0 4 1 5
3992 and of interleave_low: 2 6 3 7
3995 The permutation is done in log LENGTH stages. In each stage interleave_high
3996 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
3997 where the first argument is taken from the first half of DR_CHAIN and the
3998 second argument from its second half.
4001 I1: interleave_high (1st vec, 3rd vec)
4002 I2: interleave_low (1st vec, 3rd vec)
4003 I3: interleave_high (2nd vec, 4th vec)
4004 I4: interleave_low (2nd vec, 4th vec)
4006 The output for the first stage is:
4008 I1: 0 16 1 17 2 18 3 19
4009 I2: 4 20 5 21 6 22 7 23
4010 I3: 8 24 9 25 10 26 11 27
4011 I4: 12 28 13 29 14 30 15 31
4013 The output of the second stage, i.e. the final result is:
4015 I1: 0 8 16 24 1 9 17 25
4016 I2: 2 10 18 26 3 11 19 27
4017 I3: 4 12 20 28 5 13 21 29
4018 I4: 6 14 22 30 7 15 23 31. */
4021 vect_permute_store_chain (VEC(tree,heap) *dr_chain,
4022 unsigned int length,
4024 block_stmt_iterator *bsi,
4025 VEC(tree,heap) **result_chain)
4027 tree perm_dest, perm_stmt, vect1, vect2, high, low;
4028 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
4029 tree scalar_dest, tmp;
4032 VEC(tree,heap) *first, *second;
4034 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
4035 first = VEC_alloc (tree, heap, length/2);
4036 second = VEC_alloc (tree, heap, length/2);
4038 /* Check that the operation is supported. */
4039 if (!vect_strided_store_supported (vectype))
4042 *result_chain = VEC_copy (tree, heap, dr_chain);
/* log2(LENGTH) stages; each stage interleaves pairs taken from the two
   halves of the current chain.  */
4044 for (i = 0; i < exact_log2 (length); i++)
4046 for (j = 0; j < length/2; j++)
4048 vect1 = VEC_index (tree, dr_chain, j);
4049 vect2 = VEC_index (tree, dr_chain, j+length/2);
4051 /* Create interleaving stmt:
4052 in the case of big endian:
4053 high = interleave_high (vect1, vect2)
4054 and in the case of little endian:
4055 high = interleave_low (vect1, vect2). */
4056 perm_dest = create_tmp_var (vectype, "vect_inter_high");
4057 DECL_GIMPLE_REG_P (perm_dest) = 1;
4058 add_referenced_var (perm_dest);
4059 if (BYTES_BIG_ENDIAN)
4060 tmp = build2 (VEC_INTERLEAVE_HIGH_EXPR, vectype, vect1, vect2);
4062 tmp = build2 (VEC_INTERLEAVE_LOW_EXPR, vectype, vect1, vect2);
4063 perm_stmt = build_gimple_modify_stmt (perm_dest, tmp);
4064 high = make_ssa_name (perm_dest, perm_stmt);
4065 GIMPLE_STMT_OPERAND (perm_stmt, 0) = high;
4066 vect_finish_stmt_generation (stmt, perm_stmt, bsi);
4067 VEC_replace (tree, *result_chain, 2*j, high);
4069 /* Create interleaving stmt:
4070 in the case of big endian:
4071 low = interleave_low (vect1, vect2)
4072 and in the case of little endian:
4073 low = interleave_high (vect1, vect2). */
4074 perm_dest = create_tmp_var (vectype, "vect_inter_low");
4075 DECL_GIMPLE_REG_P (perm_dest) = 1;
4076 add_referenced_var (perm_dest);
4077 if (BYTES_BIG_ENDIAN)
4078 tmp = build2 (VEC_INTERLEAVE_LOW_EXPR, vectype, vect1, vect2);
4080 tmp = build2 (VEC_INTERLEAVE_HIGH_EXPR, vectype, vect1, vect2);
4081 perm_stmt = build_gimple_modify_stmt (perm_dest, tmp);
4082 low = make_ssa_name (perm_dest, perm_stmt);
4083 GIMPLE_STMT_OPERAND (perm_stmt, 0) = low;
4084 vect_finish_stmt_generation (stmt, perm_stmt, bsi);
4085 VEC_replace (tree, *result_chain, 2*j+1, low);
/* This stage's output becomes the next stage's input.  */
4087 dr_chain = VEC_copy (tree, heap, *result_chain);
4093 /* Function vectorizable_store.
4095 Check if STMT defines a non scalar data-ref (array/pointer/structure) that
4097 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
4098 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
4099 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
4102 vectorizable_store (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
4107 tree vec_oprnd = NULL_TREE;
4108 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4109 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info), *first_dr = NULL;
4110 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4111 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4112 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4113 enum machine_mode vec_mode;
4115 enum dr_alignment_support alignment_support_cheme;
4117 enum vect_def_type dt;
4118 stmt_vec_info prev_stmt_info = NULL;
4119 tree dataref_ptr = NULL_TREE;
4120 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
4121 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
4123 tree next_stmt, first_stmt;
4124 bool strided_store = false;
4125 unsigned int group_size, i;
4126 VEC(tree,heap) *dr_chain = NULL, *oprnds = NULL, *result_chain = NULL;
4127 gcc_assert (ncopies >= 1);
4128 /* FORNOW. This restriction should be relaxed. */
4129 if (nested_in_vect_loop_p (loop, stmt) && ncopies > 1)
4131 if (vect_print_dump_info (REPORT_DETAILS))
4132 fprintf (vect_dump, "multiple types in nested loop.");
4136 if (!STMT_VINFO_RELEVANT_P (stmt_info))
4139 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
4142 if (STMT_VINFO_LIVE_P (stmt_info))
4144 if (vect_print_dump_info (REPORT_DETAILS))
4145 fprintf (vect_dump, "value used after loop.");
4149 /* Is vectorizable store? */
4151 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
4154 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
4155 if (TREE_CODE (scalar_dest) != ARRAY_REF
4156 && TREE_CODE (scalar_dest) != INDIRECT_REF
4157 && !DR_GROUP_FIRST_DR (stmt_info))
4160 op = GIMPLE_STMT_OPERAND (stmt, 1);
4161 if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt))
4163 if (vect_print_dump_info (REPORT_DETAILS))
4164 fprintf (vect_dump, "use not simple.");
4168 vec_mode = TYPE_MODE (vectype);
4169 /* FORNOW. In some cases can vectorize even if data-type not supported
4170 (e.g. - array initialization with 0). */
4171 if (optab_handler (mov_optab, (int)vec_mode)->insn_code == CODE_FOR_nothing)
4174 if (!STMT_VINFO_DATA_REF (stmt_info))
4177 if (DR_GROUP_FIRST_DR (stmt_info))
4179 strided_store = true;
4180 if (!vect_strided_store_supported (vectype))
4184 if (!vec_stmt) /* transformation not required. */
4186 STMT_VINFO_TYPE (stmt_info) = store_vec_info_type;
4187 vect_model_store_cost (stmt_info, ncopies, dt);
4195 first_stmt = DR_GROUP_FIRST_DR (stmt_info);
4196 first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
4197 group_size = DR_GROUP_SIZE (vinfo_for_stmt (first_stmt));
4199 DR_GROUP_STORE_COUNT (vinfo_for_stmt (first_stmt))++;
4201 /* We vectorize all the stmts of the interleaving group when we
4202 reach the last stmt in the group. */
4203 if (DR_GROUP_STORE_COUNT (vinfo_for_stmt (first_stmt))
4204 < DR_GROUP_SIZE (vinfo_for_stmt (first_stmt)))
4206 *vec_stmt = NULL_TREE;
4217 if (vect_print_dump_info (REPORT_DETAILS))
4218 fprintf (vect_dump, "transform store. ncopies = %d",ncopies);
4220 dr_chain = VEC_alloc (tree, heap, group_size);
4221 oprnds = VEC_alloc (tree, heap, group_size);
4223 alignment_support_cheme = vect_supportable_dr_alignment (first_dr);
4224 gcc_assert (alignment_support_cheme);
4225 gcc_assert (alignment_support_cheme == dr_aligned); /* FORNOW */
4227 /* In case the vectorization factor (VF) is bigger than the number
4228 of elements that we can fit in a vectype (nunits), we have to generate
4229 more than one vector stmt - i.e - we need to "unroll" the
4230 vector stmt by a factor VF/nunits. For more details see documentation in
4231 vect_get_vec_def_for_copy_stmt. */
4233 /* In case of interleaving (non-unit strided access):
4240 We create vectorized stores starting from base address (the access of the
4241 first stmt in the chain (S2 in the above example), when the last store stmt
4242 of the chain (S4) is reached:
4245 VS2: &base + vec_size*1 = vx0
4246 VS3: &base + vec_size*2 = vx1
4247 VS4: &base + vec_size*3 = vx3
4249 Then permutation statements are generated:
4251 VS5: vx5 = VEC_INTERLEAVE_HIGH_EXPR < vx0, vx3 >
4252 VS6: vx6 = VEC_INTERLEAVE_LOW_EXPR < vx0, vx3 >
4255 And they are put in STMT_VINFO_VEC_STMT of the corresponding scalar stmts
4256 (the order of the data-refs in the output of vect_permute_store_chain
4257 corresponds to the order of scalar stmts in the interleaving chain - see
4258 the documentation of vect_permute_store_chain()).
4260 In case of both multiple types and interleaving, above vector stores and
4261 permutation stmts are created for every copy. The result vector stmts are
4262 put in STMT_VINFO_VEC_STMT for the first copy and in the corresponding
4263 STMT_VINFO_RELATED_STMT for the next copies.
4266 prev_stmt_info = NULL;
4267 for (j = 0; j < ncopies; j++)
4274 /* For interleaved stores we collect vectorized defs for all the
4275 stores in the group in DR_CHAIN and OPRNDS. DR_CHAIN is then used
4276 as an input to vect_permute_store_chain(), and OPRNDS as an input
4277 to vect_get_vec_def_for_stmt_copy() for the next copy.
4278 If the store is not strided, GROUP_SIZE is 1, and DR_CHAIN and
4279 OPRNDS are of size 1. */
4280 next_stmt = first_stmt;
4281 for (i = 0; i < group_size; i++)
4283 /* Since gaps are not supported for interleaved stores, GROUP_SIZE
4284 is the exact number of stmts in the chain. Therefore, NEXT_STMT
4285 can't be NULL_TREE. In case that there is no interleaving,
4286 GROUP_SIZE is 1, and only one iteration of the loop will be
4288 gcc_assert (next_stmt);
4289 op = GIMPLE_STMT_OPERAND (next_stmt, 1);
4290 vec_oprnd = vect_get_vec_def_for_operand (op, next_stmt, NULL);
4291 VEC_quick_push(tree, dr_chain, vec_oprnd);
4292 VEC_quick_push(tree, oprnds, vec_oprnd);
4293 next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt));
4295 dataref_ptr = vect_create_data_ref_ptr (first_stmt, bsi, NULL_TREE,
4296 &dummy, &ptr_incr, false,
4297 TREE_TYPE (vec_oprnd));
4301 /* For interleaved stores we created vectorized defs for all the
4302 defs stored in OPRNDS in the previous iteration (previous copy).
4303 DR_CHAIN is then used as an input to vect_permute_store_chain(),
4304 and OPRNDS as an input to vect_get_vec_def_for_stmt_copy() for the
4306 If the store is not strided, GROUP_SIZE is 1, and DR_CHAIN and
4307 OPRNDS are of size 1. */
4308 for (i = 0; i < group_size; i++)
4310 vec_oprnd = vect_get_vec_def_for_stmt_copy (dt,
4311 VEC_index (tree, oprnds, i));
4312 VEC_replace(tree, dr_chain, i, vec_oprnd);
4313 VEC_replace(tree, oprnds, i, vec_oprnd);
4315 dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, bsi, stmt);
4320 result_chain = VEC_alloc (tree, heap, group_size);
4322 if (!vect_permute_store_chain (dr_chain, group_size, stmt, bsi,
4327 next_stmt = first_stmt;
4328 for (i = 0; i < group_size; i++)
4330 /* For strided stores vectorized defs are interleaved in
4331 vect_permute_store_chain(). */
4333 vec_oprnd = VEC_index(tree, result_chain, i);
4335 data_ref = build_fold_indirect_ref (dataref_ptr);
4336 /* Arguments are ready. Create the new vector stmt. */
4337 new_stmt = build_gimple_modify_stmt (data_ref, vec_oprnd);
4338 vect_finish_stmt_generation (stmt, new_stmt, bsi);
4339 mark_symbols_for_renaming (new_stmt);
4342 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
4344 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
4346 prev_stmt_info = vinfo_for_stmt (new_stmt);
4347 next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt));
4350 /* Bump the vector pointer. */
4351 dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, bsi, stmt);
4359 /* Function vect_setup_realignment
4361 This function is called when vectorizing an unaligned load using
4362 the dr_unaligned_software_pipeline scheme.
4363 This function generates the following code at the loop prolog:
4366 msq_init = *(floor(p)); # prolog load
4367 realignment_token = call target_builtin;
4369 msq = phi (msq_init, ---)
4371 The code above sets up a new (vector) pointer, pointing to the first
4372 location accessed by STMT, and a "floor-aligned" load using that pointer.
4373 It also generates code to compute the "realignment-token" (if the relevant
4374 target hook was defined), and creates a phi-node at the loop-header bb
4375 whose arguments are the result of the prolog-load (created by this
4376 function) and the result of a load that takes place in the loop (to be
4377 created by the caller to this function).
4378 The caller to this function uses the phi-result (msq) to create the
4379 realignment code inside the loop, and sets up the missing phi argument,
4383 msq = phi (msq_init, lsq)
4384 lsq = *(floor(p')); # load in loop
4385 result = realign_load (msq, lsq, realignment_token);
4388 STMT - (scalar) load stmt to be vectorized. This load accesses
4389 a memory location that may be unaligned.
4390 BSI - place where new code is to be inserted.
4393 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
4394 target hook, if defined.
4395 Return value - the result of the loop-header phi node. */
4398 vect_setup_realignment (tree stmt, block_stmt_iterator *bsi,
4399 tree *realignment_token)
4401 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4402 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4403 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4404 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4405 edge pe = loop_preheader_edge (loop);
4406 tree scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
4419 /* 1. Create msq_init = *(floor(p1)) in the loop preheader */
4420 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4421 ptr = vect_create_data_ref_ptr (stmt, bsi, NULL_TREE, &init_addr, &inc, true,
4423 data_ref = build1 (ALIGN_INDIRECT_REF, vectype, ptr);
4424 new_stmt = build_gimple_modify_stmt (vec_dest, data_ref);
4425 new_temp = make_ssa_name (vec_dest, new_stmt);
4426 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
/* Inserting on the preheader edge must not split the edge: a NULL
   return from bsi_insert_on_edge_immediate means no new bb was created.  */
4427 new_bb = bsi_insert_on_edge_immediate (pe, new_stmt);
4428 gcc_assert (!new_bb);
4429 msq_init = GIMPLE_STMT_OPERAND (new_stmt, 0);
4431 /* 2. Create permutation mask, if required, in loop preheader. */
4432 if (targetm.vectorize.builtin_mask_for_load)
4436 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
4437 new_stmt = build_call_expr (builtin_decl, 1, init_addr);
4438 vec_dest = vect_create_destination_var (scalar_dest,
4439 TREE_TYPE (new_stmt));
4440 new_stmt = build_gimple_modify_stmt (vec_dest, new_stmt);
4441 new_temp = make_ssa_name (vec_dest, new_stmt);
4442 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
4443 new_bb = bsi_insert_on_edge_immediate (pe, new_stmt);
4444 gcc_assert (!new_bb);
4445 *realignment_token = GIMPLE_STMT_OPERAND (new_stmt, 0);
4447 /* The result of the CALL_EXPR to this builtin is determined from
4448 the value of the parameter and no global variables are touched
4449 which makes the builtin a "const" function. Requiring the
4450 builtin to have the "const" attribute makes it unnecessary
4451 to call mark_call_clobbered. */
4452 gcc_assert (TREE_READONLY (builtin_decl));
4455 /* 3. Create msq = phi <msq_init, lsq> in loop */
4456 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4457 msq = make_ssa_name (vec_dest, NULL_TREE);
4458 phi_stmt = create_phi_node (msq, loop->header);
4459 SSA_NAME_DEF_STMT (msq) = phi_stmt;
/* Only the preheader argument (msq_init) is attached here; the latch
   argument (lsq) is filled in by the caller once the in-loop load exists
   (see the function header comment above).  */
4460 add_phi_arg (phi_stmt, msq_init, loop_preheader_edge (loop));
4466 /* Function vect_strided_load_supported.
4468 Returns TRUE if EXTRACT_EVEN and EXTRACT_ODD operations are supported,
4469 and FALSE otherwise. */
4472 vect_strided_load_supported (tree vectype)
4474 optab perm_even_optab, perm_odd_optab;
4477 mode = (int) TYPE_MODE (vectype);
/* The target must provide an even-element extraction optab with a real
   insn for this vector mode...  */
4479 perm_even_optab = optab_for_tree_code (VEC_EXTRACT_EVEN_EXPR, vectype);
4480 if (!perm_even_optab)
4482 if (vect_print_dump_info (REPORT_DETAILS))
4483 fprintf (vect_dump, "no optab for perm_even.");
4487 if (optab_handler (perm_even_optab, mode)->insn_code == CODE_FOR_nothing)
4489 if (vect_print_dump_info (REPORT_DETAILS))
4490 fprintf (vect_dump, "perm_even op not supported by target.");
/* ...and likewise an odd-element extraction optab.  */
4494 perm_odd_optab = optab_for_tree_code (VEC_EXTRACT_ODD_EXPR, vectype);
4495 if (!perm_odd_optab)
4497 if (vect_print_dump_info (REPORT_DETAILS))
4498 fprintf (vect_dump, "no optab for perm_odd.");
4502 if (optab_handler (perm_odd_optab, mode)->insn_code == CODE_FOR_nothing)
4504 if (vect_print_dump_info (REPORT_DETAILS))
4505 fprintf (vect_dump, "perm_odd op not supported by target.");
4512 /* Function vect_permute_load_chain.
4514 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
4515 a power of 2, generate extract_even/odd stmts to reorder the input data
4516 correctly. Return the final references for loads in RESULT_CHAIN.
4518 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4519 The input is 4 vectors each containing 8 elements. We assign a number to each
4520 element, the input sequence is:
4522 1st vec: 0 1 2 3 4 5 6 7
4523 2nd vec: 8 9 10 11 12 13 14 15
4524 3rd vec: 16 17 18 19 20 21 22 23
4525 4th vec: 24 25 26 27 28 29 30 31
4527 The output sequence should be:
4529 1st vec: 0 4 8 12 16 20 24 28
4530 2nd vec: 1 5 9 13 17 21 25 29
4531 3rd vec: 2 6 10 14 18 22 26 30
4532 4th vec: 3 7 11 15 19 23 27 31
4534 i.e., the first output vector should contain the first elements of each
4535 interleaving group, etc.
4537 We use extract_even/odd instructions to create such output. The input of each
4538 extract_even/odd operation is two vectors
4542 and the output is the vector of extracted even/odd elements. The output of
4543 extract_even will be: 0 2 4 6
4544 and of extract_odd: 1 3 5 7
4547 The permutation is done in log LENGTH stages. In each stage extract_even and
4548 extract_odd stmts are created for each pair of vectors in DR_CHAIN in their
4549 order. In our example,
4551 E1: extract_even (1st vec, 2nd vec)
4552 E2: extract_odd (1st vec, 2nd vec)
4553 E3: extract_even (3rd vec, 4th vec)
4554 E4: extract_odd (3rd vec, 4th vec)
4556 The output for the first stage will be:
4558 E1: 0 2 4 6 8 10 12 14
4559 E2: 1 3 5 7 9 11 13 15
4560 E3: 16 18 20 22 24 26 28 30
4561 E4: 17 19 21 23 25 27 29 31
4563 In order to proceed and create the correct sequence for the next stage (or
4564 for the correct output, if the second stage is the last one, as in our
4565 example), we first put the output of extract_even operation and then the
4566 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
4567 The input for the second stage is:
4569 1st vec (E1): 0 2 4 6 8 10 12 14
4570 2nd vec (E3): 16 18 20 22 24 26 28 30
4571 3rd vec (E2): 1 3 5 7 9 11 13 15
4572 4th vec (E4): 17 19 21 23 25 27 29 31
4574 The output of the second stage:
4576 E1: 0 4 8 12 16 20 24 28
4577 E2: 2 6 10 14 18 22 26 30
4578 E3: 1 5 9 13 17 21 25 29
4579 E4: 3 7 11 15 19 23 27 31
4581 And RESULT_CHAIN after reordering:
4583 1st vec (E1): 0 4 8 12 16 20 24 28
4584 2nd vec (E3): 1 5 9 13 17 21 25 29
4585 3rd vec (E2): 2 6 10 14 18 22 26 30
4586 4th vec (E4): 3 7 11 15 19 23 27 31. */
4589 vect_permute_load_chain (VEC(tree,heap) *dr_chain,
4590 unsigned int length,
4592 block_stmt_iterator *bsi,
4593 VEC(tree,heap) **result_chain)
4595 tree perm_dest, perm_stmt, data_ref, first_vect, second_vect;
4596 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
4601 /* Check that the operation is supported. */
4602 if (!vect_strided_load_supported (vectype))
4605 *result_chain = VEC_copy (tree, heap, dr_chain);
/* log2(LENGTH) permutation stages; each stage pairs up adjacent vectors
   of the previous stage's output.  */
4606 for (i = 0; i < exact_log2 (length); i++)
4608 for (j = 0; j < length; j +=2)
4610 first_vect = VEC_index (tree, dr_chain, j);
4611 second_vect = VEC_index (tree, dr_chain, j+1);
4613 /* data_ref = permute_even (first_data_ref, second_data_ref); */
4614 perm_dest = create_tmp_var (vectype, "vect_perm_even");
4615 DECL_GIMPLE_REG_P (perm_dest) = 1;
4616 add_referenced_var (perm_dest);
4618 tmp = build2 (VEC_EXTRACT_EVEN_EXPR, vectype,
4619 first_vect, second_vect);
4620 perm_stmt = build_gimple_modify_stmt (perm_dest, tmp);
4622 data_ref = make_ssa_name (perm_dest, perm_stmt);
4623 GIMPLE_STMT_OPERAND (perm_stmt, 0) = data_ref;
4624 vect_finish_stmt_generation (stmt, perm_stmt, bsi);
4625 mark_symbols_for_renaming (perm_stmt);
/* The even-extracted vector goes into the first half of RESULT_CHAIN.  */
4627 VEC_replace (tree, *result_chain, j/2, data_ref);
4629 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
4630 perm_dest = create_tmp_var (vectype, "vect_perm_odd");
4631 DECL_GIMPLE_REG_P (perm_dest) = 1;
4632 add_referenced_var (perm_dest);
4634 tmp = build2 (VEC_EXTRACT_ODD_EXPR, vectype,
4635 first_vect, second_vect);
4636 perm_stmt = build_gimple_modify_stmt (perm_dest, tmp);
4637 data_ref = make_ssa_name (perm_dest, perm_stmt);
4638 GIMPLE_STMT_OPERAND (perm_stmt, 0) = data_ref;
4639 vect_finish_stmt_generation (stmt, perm_stmt, bsi);
4640 mark_symbols_for_renaming (perm_stmt);
/* The odd-extracted vector goes into the second half of RESULT_CHAIN.  */
4642 VEC_replace (tree, *result_chain, j/2+length/2, data_ref);
/* This stage's output becomes the next stage's input.  */
4644 dr_chain = VEC_copy (tree, heap, *result_chain);
4650 /* Function vect_transform_strided_load.
4652 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
4653 to perform their permutation and ascribe the result vectorized statements to
4654 the scalar statements.
4658 vect_transform_strided_load (tree stmt, VEC(tree,heap) *dr_chain, int size,
4659 block_stmt_iterator *bsi)
4661 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4662 tree first_stmt = DR_GROUP_FIRST_DR (stmt_info);
4663 tree next_stmt, new_stmt;
4664 VEC(tree,heap) *result_chain = NULL;
4665 unsigned int i, gap_count;
4668 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
4669 RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
4670 vectors, that are ready for vector computation. */
4671 result_chain = VEC_alloc (tree, heap, size);
4673 if (!vect_permute_load_chain (dr_chain, size, stmt, bsi, &result_chain))
4676 /* Put a permuted data-ref in the VECTORIZED_STMT field.
4677 Since we scan the chain starting from its first node, their order
4678 corresponds to the order of data-refs in RESULT_CHAIN. */
4679 next_stmt = first_stmt;
4681 for (i = 0; VEC_iterate (tree, result_chain, i, tmp_data_ref); i++)
4686 /* Skip the gaps. Loads created for the gaps will be removed by dead
4687 code elimination pass later.
4688 DR_GROUP_GAP is the number of steps in elements from the previous
4689 access (if there is no gap DR_GROUP_GAP is 1). We skip loads that
4690 correspond to the gaps.
4692 if (gap_count < DR_GROUP_GAP (vinfo_for_stmt (next_stmt)))
4700 new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
4701 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
4702 copies, and we put the new vector statement in the first available
4704 if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
4705 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
4708 tree prev_stmt = STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
/* Walk to the end of the RELATED_STMT chain and append NEW_STMT there.  */
4709 tree rel_stmt = STMT_VINFO_RELATED_STMT (
4710 vinfo_for_stmt (prev_stmt));
4713 prev_stmt = rel_stmt;
4714 rel_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
4716 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) = new_stmt;
4718 next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt));
4720 /* If NEXT_STMT accesses the same DR as the previous statement,
4721 put the same TMP_DATA_REF as its vectorized statement; otherwise
4722 get the next data-ref from RESULT_CHAIN. */
4723 if (!next_stmt || !DR_GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
4731 /* vectorizable_load.
4733 Check if STMT reads a non scalar data-ref (array/pointer/structure) that
4735 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
4736 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
4737 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
4740 vectorizable_load (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
4743 tree vec_dest = NULL;
4744 tree data_ref = NULL;
4746 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4747 stmt_vec_info prev_stmt_info;
4748 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4749 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4750 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info), *first_dr;
4751 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4754 tree new_stmt = NULL_TREE;
4756 enum dr_alignment_support alignment_support_cheme;
4757 tree dataref_ptr = NULL_TREE;
4759 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
4760 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
4761 int i, j, group_size;
4762 tree msq = NULL_TREE, lsq;
4763 tree offset = NULL_TREE;
4764 tree realignment_token = NULL_TREE;
4765 tree phi_stmt = NULL_TREE;
4766 VEC(tree,heap) *dr_chain = NULL;
4767 bool strided_load = false;
4770 gcc_assert (ncopies >= 1);
4771 /* FORNOW. This restriction should be relaxed. */
4772 if (nested_in_vect_loop_p (loop, stmt) && ncopies > 1)
4774 if (vect_print_dump_info (REPORT_DETAILS))
4775 fprintf (vect_dump, "multiple types in nested loop.");
4779 if (!STMT_VINFO_RELEVANT_P (stmt_info))
4782 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
4785 /* FORNOW: not yet supported. */
4786 if (STMT_VINFO_LIVE_P (stmt_info))
4788 if (vect_print_dump_info (REPORT_DETAILS))
4789 fprintf (vect_dump, "value used after loop.");
4793 /* Is vectorizable load? */
4794 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
4797 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
4798 if (TREE_CODE (scalar_dest) != SSA_NAME)
4801 op = GIMPLE_STMT_OPERAND (stmt, 1);
4802 if (TREE_CODE (op) != ARRAY_REF
4803 && TREE_CODE (op) != INDIRECT_REF
4804 && !DR_GROUP_FIRST_DR (stmt_info))
4807 if (!STMT_VINFO_DATA_REF (stmt_info))
4810 mode = (int) TYPE_MODE (vectype);
4812 /* FORNOW. In some cases can vectorize even if data-type not supported
4813 (e.g. - data copies). */
4814 if (optab_handler (mov_optab, mode)->insn_code == CODE_FOR_nothing)
4816 if (vect_print_dump_info (REPORT_DETAILS))
4817 fprintf (vect_dump, "Aligned load, but unsupported type.");
4821 /* Check if the load is a part of an interleaving chain. */
4822 if (DR_GROUP_FIRST_DR (stmt_info))
4824 strided_load = true;
4826 /* Check if interleaving is supported. */
4827 if (!vect_strided_load_supported (vectype))
4831 if (!vec_stmt) /* transformation not required. */
4833 STMT_VINFO_TYPE (stmt_info) = load_vec_info_type;
4834 vect_model_load_cost (stmt_info, ncopies);
4838 if (vect_print_dump_info (REPORT_DETAILS))
4839 fprintf (vect_dump, "transform load.");
4845 first_stmt = DR_GROUP_FIRST_DR (stmt_info);
4846 /* Check if the chain of loads is already vectorized. */
4847 if (STMT_VINFO_VEC_STMT (vinfo_for_stmt (first_stmt)))
4849 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
4852 first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
4853 group_size = DR_GROUP_SIZE (vinfo_for_stmt (first_stmt));
4854 dr_chain = VEC_alloc (tree, heap, group_size);
4863 alignment_support_cheme = vect_supportable_dr_alignment (first_dr);
4864 gcc_assert (alignment_support_cheme);
4867 /* In case the vectorization factor (VF) is bigger than the number
4868 of elements that we can fit in a vectype (nunits), we have to generate
4869 more than one vector stmt - i.e - we need to "unroll" the
4870 vector stmt by a factor VF/nunits. In doing so, we record a pointer
4871 from one copy of the vector stmt to the next, in the field
4872 STMT_VINFO_RELATED_STMT. This is necessary in order to allow following
4873 stages to find the correct vector defs to be used when vectorizing
4874 stmts that use the defs of the current stmt. The example below illustrates
4875 the vectorization process when VF=16 and nunits=4 (i.e - we need to create
4876 4 vectorized stmts):
4878 before vectorization:
4879 RELATED_STMT VEC_STMT
4883 step 1: vectorize stmt S1:
4884 We first create the vector stmt VS1_0, and, as usual, record a
4885 pointer to it in the STMT_VINFO_VEC_STMT of the scalar stmt S1.
4886 Next, we create the vector stmt VS1_1, and record a pointer to
4887 it in the STMT_VINFO_RELATED_STMT of the vector stmt VS1_0.
4888 Similarly, for VS1_2 and VS1_3. This is the resulting chain of
4890 RELATED_STMT VEC_STMT
4891 VS1_0: vx0 = memref0 VS1_1 -
4892 VS1_1: vx1 = memref1 VS1_2 -
4893 VS1_2: vx2 = memref2 VS1_3 -
4894 VS1_3: vx3 = memref3 - -
4895 S1: x = load - VS1_0
4898 See in documentation in vect_get_vec_def_for_stmt_copy for how the
4899 information we recorded in RELATED_STMT field is used to vectorize
4902 /* In case of interleaving (non-unit strided access):
4909 Vectorized loads are created in the order of memory accesses
4910 starting from the access of the first stmt of the chain:
4913 VS2: vx1 = &base + vec_size*1
4914 VS3: vx3 = &base + vec_size*2
4915 VS4: vx4 = &base + vec_size*3
4917 Then permutation statements are generated:
4919 VS5: vx5 = VEC_EXTRACT_EVEN_EXPR < vx0, vx1 >
4920 VS6: vx6 = VEC_EXTRACT_ODD_EXPR < vx0, vx1 >
4923 And they are put in STMT_VINFO_VEC_STMT of the corresponding scalar stmts
4924 (the order of the data-refs in the output of vect_permute_load_chain
4925 corresponds to the order of scalar stmts in the interleaving chain - see
4926 the documentation of vect_permute_load_chain()).
4927 The generation of permutation stmts and recording them in
4928 STMT_VINFO_VEC_STMT is done in vect_transform_strided_load().
4930 In case of both multiple types and interleaving, the vector loads and
4931 permutation stmts above are created for every copy. The result vector stmts
4932 are put in STMT_VINFO_VEC_STMT for the first copy and in the corresponding
4933 STMT_VINFO_RELATED_STMT for the next copies. */
4935 /* If the data reference is aligned (dr_aligned) or potentially unaligned
4936 on a target that supports unaligned accesses (dr_unaligned_supported)
4937 we generate the following code:
4941 p = p + indx * vectype_size;
4946 Otherwise, the data reference is potentially unaligned on a target that
4947 does not support unaligned accesses (dr_unaligned_software_pipeline) -
4948 then generate the following code, in which the data in each iteration is
4949 obtained by two vector loads, one from the previous iteration, and one
4950 from the current iteration:
4952 msq_init = *(floor(p1))
4953 p2 = initial_addr + VS - 1;
4954 realignment_token = call target_builtin;
4957 p2 = p2 + indx * vectype_size
4959 vec_dest = realign_load (msq, lsq, realignment_token)
/* The software-pipelined realignment scheme needs a prolog (built by
   vect_setup_realignment) and in-loop loads offset by VS - 1 elements.  */
4964 if (alignment_support_cheme == dr_unaligned_software_pipeline)
4966 msq = vect_setup_realignment (first_stmt, bsi, &realignment_token);
4967 phi_stmt = SSA_NAME_DEF_STMT (msq);
4968 offset = size_int (TYPE_VECTOR_SUBPARTS (vectype) - 1);
4971 prev_stmt_info = NULL;
4972 for (j = 0; j < ncopies; j++)
4974 /* 1. Create the vector pointer update chain. */
4976 dataref_ptr = vect_create_data_ref_ptr (first_stmt, bsi, offset, &dummy,
4977 &ptr_incr, false, NULL_TREE);
4979 dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, bsi, stmt);
4981 for (i = 0; i < group_size; i++)
4983 /* 2. Create the vector-load in the loop. */
4984 switch (alignment_support_cheme)
4987 gcc_assert (aligned_access_p (first_dr));
4988 data_ref = build_fold_indirect_ref (dataref_ptr);
4990 case dr_unaligned_supported:
4992 int mis = DR_MISALIGNMENT (first_dr);
4993 tree tmis = (mis == -1 ? size_zero_node : size_int (mis));
4995 gcc_assert (!aligned_access_p (first_dr));
/* DR_MISALIGNMENT is in bytes; MISALIGNED_INDIRECT_REF takes bits.  */
4996 tmis = size_binop (MULT_EXPR, tmis, size_int(BITS_PER_UNIT));
4998 build2 (MISALIGNED_INDIRECT_REF, vectype, dataref_ptr, tmis);
5001 case dr_unaligned_software_pipeline:
5002 gcc_assert (!aligned_access_p (first_dr));
5003 data_ref = build1 (ALIGN_INDIRECT_REF, vectype, dataref_ptr);
5008 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5009 new_stmt = build_gimple_modify_stmt (vec_dest, data_ref);
5010 new_temp = make_ssa_name (vec_dest, new_stmt);
5011 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
5012 vect_finish_stmt_generation (stmt, new_stmt, bsi);
5013 mark_symbols_for_renaming (new_stmt);
5015 /* 3. Handle explicit realignment if necessary/supported. */
5016 if (alignment_support_cheme == dr_unaligned_software_pipeline)
5019 <vec_dest = realign_load (msq, lsq, realignment_token)> */
5020 lsq = GIMPLE_STMT_OPERAND (new_stmt, 0);
5021 if (!realignment_token)
5022 realignment_token = dataref_ptr;
5023 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5025 build3 (REALIGN_LOAD_EXPR, vectype, msq, lsq, realignment_token);
5026 new_stmt = build_gimple_modify_stmt (vec_dest, new_stmt);
5027 new_temp = make_ssa_name (vec_dest, new_stmt);
5028 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
5029 vect_finish_stmt_generation (stmt, new_stmt, bsi);
/* On the very last generated load, close the realignment phi by adding
   the latch argument (lsq) promised in vect_setup_realignment's contract.  */
5030 if (i == group_size - 1 && j == ncopies - 1)
5031 add_phi_arg (phi_stmt, lsq, loop_latch_edge (loop));
5035 VEC_quick_push (tree, dr_chain, new_temp);
5036 if (i < group_size - 1)
5037 dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, bsi, stmt);
/* Permute the loaded vectors and attach the results to the scalar
   stmts of the interleaving chain.  */
5042 if (!vect_transform_strided_load (stmt, dr_chain, group_size, bsi))
5044 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
5045 dr_chain = VEC_alloc (tree, heap, group_size);
5050 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
5052 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
5053 prev_stmt_info = vinfo_for_stmt (new_stmt);
5061 /* Function vectorizable_live_operation.
5063 STMT computes a value that is used outside the loop. Check if
5064 it can be supported. */
5067 vectorizable_live_operation (tree stmt,
5068 block_stmt_iterator *bsi ATTRIBUTE_UNUSED,
5069 tree *vec_stmt ATTRIBUTE_UNUSED)
5072 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5073 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5074 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5079 enum vect_def_type dt;
5081 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
/* Reductions are handled by their own epilogue machinery, not here.  */
5083 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
5086 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
5089 if (TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 0)) != SSA_NAME)
5092 /* FORNOW. CHECKME. */
5093 if (nested_in_vect_loop_p (loop, stmt))
5096 operation = GIMPLE_STMT_OPERAND (stmt, 1);
5097 op_type = TREE_OPERAND_LENGTH (operation);
5099 /* FORNOW: support only if all uses are invariant. This means
5100 that the scalar operations can remain in place, unvectorized.
5101 The original last scalar value that they compute will be used. */
5103 for (i = 0; i < op_type; i++)
5105 op = TREE_OPERAND (operation, i);
5106 if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt))
5108 if (vect_print_dump_info (REPORT_DETAILS))
5109 fprintf (vect_dump, "use not simple.");
5113 if (dt != vect_invariant_def && dt != vect_constant_def)
5117 /* No transformation is required for the cases we currently support. */
5122 /* Function vect_is_simple_cond.
5125 LOOP - the loop that is being vectorized.
5126 COND - Condition that is checked for simple use.
5128 Returns whether a COND can be vectorized. Checks whether
5129 condition operands are supportable using vect_is_simple_use. */
5132 vect_is_simple_cond (tree cond, loop_vec_info loop_vinfo)
5136 enum vect_def_type dt;
/* Only plain comparisons (tcc_comparison class) are handled.  */
5138 if (!COMPARISON_CLASS_P (cond))
5141 lhs = TREE_OPERAND (cond, 0);
5142 rhs = TREE_OPERAND (cond, 1);
/* Each operand must be either a simple-use SSA name or an integer,
   real or fixed-point constant.  */
5144 if (TREE_CODE (lhs) == SSA_NAME)
5146 tree lhs_def_stmt = SSA_NAME_DEF_STMT (lhs);
5147 if (!vect_is_simple_use (lhs, loop_vinfo, &lhs_def_stmt, &def, &dt))
5150 else if (TREE_CODE (lhs) != INTEGER_CST && TREE_CODE (lhs) != REAL_CST
5151 && TREE_CODE (lhs) != FIXED_CST)
5154 if (TREE_CODE (rhs) == SSA_NAME)
5156 tree rhs_def_stmt = SSA_NAME_DEF_STMT (rhs);
5157 if (!vect_is_simple_use (rhs, loop_vinfo, &rhs_def_stmt, &def, &dt))
5160 else if (TREE_CODE (rhs) != INTEGER_CST && TREE_CODE (rhs) != REAL_CST
5161 && TREE_CODE (rhs) != FIXED_CST)
5167 /* vectorizable_condition.
5169 Check if STMT is conditional modify expression that can be vectorized.
5170 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
5171 stmt using VEC_COND_EXPR to replace it, put it in VEC_STMT, and insert it
5174 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
5177 vectorizable_condition (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
5179 tree scalar_dest = NULL_TREE;
5180 tree vec_dest = NULL_TREE;
5181 tree op = NULL_TREE;
5182 tree cond_expr, then_clause, else_clause;
5183 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5184 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5185 tree vec_cond_lhs, vec_cond_rhs, vec_then_clause, vec_else_clause;
5186 tree vec_compare, vec_cond_expr;
5188 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5189 enum machine_mode vec_mode;
5191 enum vect_def_type dt;
5192 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
5193 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
5195 gcc_assert (ncopies >= 1);
/* NOTE(review): the guard for this early return was lost in this listing;
   it appears to bail out for the multiple-copies (ncopies > 1) case —
   confirm against the upstream source before editing.  */
5197 return false; /* FORNOW */
5199 if (!STMT_VINFO_RELEVANT_P (stmt_info))
5202 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
5205 /* FORNOW: not yet supported. */
5206 if (STMT_VINFO_LIVE_P (stmt_info))
5208 if (vect_print_dump_info (REPORT_DETAILS))
5209 fprintf (vect_dump, "value used after loop.");
5213 /* Is vectorizable conditional operation? */
5214 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
5217 op = GIMPLE_STMT_OPERAND (stmt, 1);
5219 if (TREE_CODE (op) != COND_EXPR)
5222 cond_expr = TREE_OPERAND (op, 0);
5223 then_clause = TREE_OPERAND (op, 1);
5224 else_clause = TREE_OPERAND (op, 2);
5226 if (!vect_is_simple_cond (cond_expr, loop_vinfo))
5229 /* We do not handle two different vector types for the condition
5231 if (TREE_TYPE (TREE_OPERAND (cond_expr, 0)) != TREE_TYPE (vectype))
/* THEN/ELSE clauses must be simple-use SSA names or constants, just
   like the comparison operands checked in vect_is_simple_cond.  */
5234 if (TREE_CODE (then_clause) == SSA_NAME)
5236 tree then_def_stmt = SSA_NAME_DEF_STMT (then_clause);
5237 if (!vect_is_simple_use (then_clause, loop_vinfo,
5238 &then_def_stmt, &def, &dt))
5241 else if (TREE_CODE (then_clause) != INTEGER_CST
5242 && TREE_CODE (then_clause) != REAL_CST
5243 && TREE_CODE (then_clause) != FIXED_CST)
5246 if (TREE_CODE (else_clause) == SSA_NAME)
5248 tree else_def_stmt = SSA_NAME_DEF_STMT (else_clause);
5249 if (!vect_is_simple_use (else_clause, loop_vinfo,
5250 &else_def_stmt, &def, &dt))
5253 else if (TREE_CODE (else_clause) != INTEGER_CST
5254 && TREE_CODE (else_clause) != REAL_CST
5255 && TREE_CODE (else_clause) != FIXED_CST)
5259 vec_mode = TYPE_MODE (vectype);
/* Analysis phase: record the stmt type and let the target decide whether
   it can expand a VEC_COND_EXPR in this mode.  */
5263 STMT_VINFO_TYPE (stmt_info) = condition_vec_info_type;
5264 return expand_vec_cond_expr_p (op, vec_mode);
5270 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
5271 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5273 /* Handle cond expr. */
5275 vect_get_vec_def_for_operand (TREE_OPERAND (cond_expr, 0), stmt, NULL);
5277 vect_get_vec_def_for_operand (TREE_OPERAND (cond_expr, 1), stmt, NULL);
5278 vec_then_clause = vect_get_vec_def_for_operand (then_clause, stmt, NULL);
5279 vec_else_clause = vect_get_vec_def_for_operand (else_clause, stmt, NULL);
5281 /* Arguments are ready. create the new vector stmt. */
5282 vec_compare = build2 (TREE_CODE (cond_expr), vectype,
5283 vec_cond_lhs, vec_cond_rhs);
5284 vec_cond_expr = build3 (VEC_COND_EXPR, vectype,
5285 vec_compare, vec_then_clause, vec_else_clause);
5287 *vec_stmt = build_gimple_modify_stmt (vec_dest, vec_cond_expr);
5288 new_temp = make_ssa_name (vec_dest, *vec_stmt);
5289 GIMPLE_STMT_OPERAND (*vec_stmt, 0) = new_temp;
5290 vect_finish_stmt_generation (stmt, *vec_stmt, bsi);
5295 /* Function vect_transform_stmt.
5297 Create a vectorized stmt to replace STMT, and insert it at BSI. */
5300 vect_transform_stmt (tree stmt, block_stmt_iterator *bsi, bool *strided_store)
5302 bool is_store = false;
5303 tree vec_stmt = NULL_TREE;
5304 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5305 tree orig_stmt_in_pattern;
/* Dispatch on the stmt kind recorded during the analysis phase.  */
5308 switch (STMT_VINFO_TYPE (stmt_info))
5310 case type_demotion_vec_info_type:
5311 done = vectorizable_type_demotion (stmt, bsi, &vec_stmt);
5315 case type_promotion_vec_info_type:
5316 done = vectorizable_type_promotion (stmt, bsi, &vec_stmt);
5320 case type_conversion_vec_info_type:
5321 done = vectorizable_conversion (stmt, bsi, &vec_stmt);
5325 case induc_vec_info_type:
5326 done = vectorizable_induction (stmt, bsi, &vec_stmt);
5330 case op_vec_info_type:
5331 done = vectorizable_operation (stmt, bsi, &vec_stmt);
5335 case assignment_vec_info_type:
5336 done = vectorizable_assignment (stmt, bsi, &vec_stmt);
5340 case load_vec_info_type:
5341 done = vectorizable_load (stmt, bsi, &vec_stmt);
5345 case store_vec_info_type:
5346 done = vectorizable_store (stmt, bsi, &vec_stmt);
5348 if (DR_GROUP_FIRST_DR (stmt_info))
5350 /* In case of interleaving, the whole chain is vectorized when the
5351 last store in the chain is reached. Store stmts before the last
5352 one are skipped, and their vec_stmt_info shouldn't be freed
5354 *strided_store = true;
5355 if (STMT_VINFO_VEC_STMT (stmt_info))
5362 case condition_vec_info_type:
5363 done = vectorizable_condition (stmt, bsi, &vec_stmt);
5367 case call_vec_info_type:
5368 done = vectorizable_call (stmt, bsi, &vec_stmt);
5371 case reduc_vec_info_type:
5372 done = vectorizable_reduction (stmt, bsi, &vec_stmt);
5377 if (!STMT_VINFO_LIVE_P (stmt_info))
5379 if (vect_print_dump_info (REPORT_DETAILS))
5380 fprintf (vect_dump, "stmt not supported.");
/* Live (used-after-loop) stmts, except reductions, are handled by
   vectorizable_live_operation.  */
5385 if (STMT_VINFO_LIVE_P (stmt_info)
5386 && STMT_VINFO_TYPE (stmt_info) != reduc_vec_info_type
5388 done = vectorizable_live_operation (stmt, bsi, &vec_stmt);
5394 STMT_VINFO_VEC_STMT (stmt_info) = vec_stmt;
5395 orig_stmt_in_pattern = STMT_VINFO_RELATED_STMT (stmt_info);
5396 if (orig_stmt_in_pattern)
5398 stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt_in_pattern);
5399 /* STMT was inserted by the vectorizer to replace a computation idiom.
5400 ORIG_STMT_IN_PATTERN is a stmt in the original sequence that
5401 computed this idiom. We need to record a pointer to VEC_STMT in
5402 the stmt_info of ORIG_STMT_IN_PATTERN. See more details in the
5403 documentation of vect_pattern_recog. */
5404 if (STMT_VINFO_IN_PATTERN_P (stmt_vinfo))
5406 gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
5407 STMT_VINFO_VEC_STMT (stmt_vinfo) = vec_stmt;
5416 /* This function builds ni_name = number of iterations loop executes
5417 on the loop preheader. */
/* LOOP_VINFO describes the loop being vectorized.  Returns the SSA name
   holding the (possibly symbolic) iteration count.  */
5420 vect_build_loop_niters (loop_vec_info loop_vinfo)
5422 tree ni_name, stmt, var;
5424 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5425 tree ni = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo));
/* Gimplify NITERS into a temporary "niters"; any computation stmts
   produced are inserted on the preheader edge below.  */
5427 var = create_tmp_var (TREE_TYPE (ni), "niters");
5428 add_referenced_var (var);
5429 ni_name = force_gimple_operand (ni, &stmt, false, var);
5431 pe = loop_preheader_edge (loop);
/* The insertion must not split the edge into a new basic block.  */
5434 basic_block new_bb = bsi_insert_on_edge_immediate (pe, stmt);
5435 gcc_assert (!new_bb);
5442 /* This function generates the following statements:
5444 ni_name = number of iterations loop executes
5445 ratio = ni_name / vf
5446 ratio_mult_vf_name = ratio * vf
5448 and places them at the loop preheader edge. */
/* The three computed values are returned through NI_NAME_PTR,
   RATIO_MULT_VF_NAME_PTR and RATIO_NAME_PTR.  The division and
   multiplication by VF are implemented as shifts, so VF must be a
   power of two (enforced by exact_log2 below).  */
5451 vect_generate_tmps_on_preheader (loop_vec_info loop_vinfo,
5453 tree *ratio_mult_vf_name_ptr,
5454 tree *ratio_name_ptr)
5462 tree ratio_mult_vf_name;
5463 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5464 tree ni = LOOP_VINFO_NITERS (loop_vinfo);
5465 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
5468 pe = loop_preheader_edge (loop);
5470 /* Generate temporary variable that contains
5471 number of iterations loop executes. */
5473 ni_name = vect_build_loop_niters (loop_vinfo);
5474 log_vf = build_int_cst (TREE_TYPE (ni), exact_log2 (vf));
5476 /* Create: ratio = ni >> log2(vf) */
5478 ratio_name = fold_build2 (RSHIFT_EXPR, TREE_TYPE (ni_name), ni_name, log_vf);
/* If folding did not yield a GIMPLE value, gimplify it and insert the
   computation on the preheader edge.  */
5479 if (!is_gimple_val (ratio_name))
5481 var = create_tmp_var (TREE_TYPE (ni), "bnd");
5482 add_referenced_var (var);
5484 ratio_name = force_gimple_operand (ratio_name, &stmt, true, var);
5485 pe = loop_preheader_edge (loop);
5486 new_bb = bsi_insert_on_edge_immediate (pe, stmt);
5487 gcc_assert (!new_bb);
5490 /* Create: ratio_mult_vf = ratio << log2 (vf). */
5492 ratio_mult_vf_name = fold_build2 (LSHIFT_EXPR, TREE_TYPE (ratio_name),
5493 ratio_name, log_vf);
5494 if (!is_gimple_val (ratio_mult_vf_name))
5496 var = create_tmp_var (TREE_TYPE (ni), "ratio_mult_vf");
5497 add_referenced_var (var);
5499 ratio_mult_vf_name = force_gimple_operand (ratio_mult_vf_name, &stmt,
5501 pe = loop_preheader_edge (loop);
5502 new_bb = bsi_insert_on_edge_immediate (pe, stmt);
5503 gcc_assert (!new_bb);
/* Return the generated SSA names to the caller.  */
5506 *ni_name_ptr = ni_name;
5507 *ratio_mult_vf_name_ptr = ratio_mult_vf_name;
5508 *ratio_name_ptr = ratio_name;
5514 /* Function vect_update_ivs_after_vectorizer.
5516 "Advance" the induction variables of LOOP to the value they should take
5517 after the execution of LOOP. This is currently necessary because the
5518 vectorizer does not handle induction variables that are used after the
5519 loop. Such a situation occurs when the last iterations of LOOP are
5521 1. We introduced new uses after LOOP for IVs that were not originally used
5522 after LOOP: the IVs of LOOP are now used by an epilog loop.
5523 2. LOOP is going to be vectorized; this means that it will iterate N/VF
5524 times, whereas the loop IVs should be bumped N times.
5527 - LOOP - a loop that is going to be vectorized. The last few iterations
5528 of LOOP were peeled.
5529 - NITERS - the number of iterations that LOOP executes (before it is
5530 vectorized). i.e, the number of times the ivs should be bumped.
5531 - UPDATE_E - a successor edge of LOOP->exit that is on the (only) path
5532 coming out from LOOP on which there are uses of the LOOP ivs
5533 (this is the path from LOOP->exit to epilog_loop->preheader).
5535 The new definitions of the ivs are placed in LOOP->exit.
5536 The phi args associated with the edge UPDATE_E in the bb
5537 UPDATE_E->dest are updated accordingly.
5539 Assumption 1: Like the rest of the vectorizer, this function assumes
5540 a single loop exit that has a single predecessor.
5542 Assumption 2: The phi nodes in the LOOP header and in update_bb are
5543 organized in the same order.
5545 Assumption 3: The access function of the ivs is simple enough (see
5546 vect_can_advance_ivs_p). This assumption will be relaxed in the future.
5548 Assumption 4: Exactly one of the successors of LOOP exit-bb is on a path
5549 coming out of LOOP on which the ivs of LOOP are used (this is the path
5550 that leads to the epilog loop; other paths skip the epilog loop). This
5551 path starts with the edge UPDATE_E, and its destination (denoted update_bb)
5552 needs to have its phis updated.
5556 vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo, tree niters,
5559 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5560 basic_block exit_bb = single_exit (loop)->dest;
5562 basic_block update_bb = update_e->dest;
5564 /* gcc_assert (vect_can_advance_ivs_p (loop_vinfo)); */
5566 /* Make sure there exists a single-predecessor exit bb: */
5567 gcc_assert (single_pred_p (exit_bb));
/* Walk the header phis and the update_bb phis in lockstep
   (Assumption 2: both lists are in the same order).  */
5569 for (phi = phi_nodes (loop->header), phi1 = phi_nodes (update_bb);
5571 phi = PHI_CHAIN (phi), phi1 = PHI_CHAIN (phi1))
5573 tree access_fn = NULL;
5574 tree evolution_part;
5577 tree var, ni, ni_name;
5578 block_stmt_iterator last_bsi;
5580 if (vect_print_dump_info (REPORT_DETAILS))
5582 fprintf (vect_dump, "vect_update_ivs_after_vectorizer: phi: ");
5583 print_generic_expr (vect_dump, phi, TDF_SLIM);
5586 /* Skip virtual phi's. */
5587 if (!is_gimple_reg (SSA_NAME_VAR (PHI_RESULT (phi))))
5589 if (vect_print_dump_info (REPORT_DETAILS))
5590 fprintf (vect_dump, "virtual phi. skip.");
5594 /* Skip reduction phis. */
5595 if (STMT_VINFO_DEF_TYPE (vinfo_for_stmt (phi)) == vect_reduction_def)
5597 if (vect_print_dump_info (REPORT_DETAILS))
5598 fprintf (vect_dump, "reduc phi. skip.");
/* Extract the IV's step (evolution part) and initial value from its
   scalar-evolution access function.  */
5602 access_fn = analyze_scalar_evolution (loop, PHI_RESULT (phi));
5603 gcc_assert (access_fn);
5605 unshare_expr (evolution_part_in_loop_num (access_fn, loop->num));
5606 gcc_assert (evolution_part != NULL_TREE);
5608 /* FORNOW: We do not support IVs whose evolution function is a polynomial
5609 of degree >= 2 or exponential. */
5610 gcc_assert (!tree_is_chrec (evolution_part));
5612 step_expr = evolution_part;
5613 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn,
/* Build ni = init + niters * step; pointer IVs need POINTER_PLUS_EXPR
   with a sizetype offset.  */
5616 if (POINTER_TYPE_P (TREE_TYPE (init_expr)))
5617 ni = fold_build2 (POINTER_PLUS_EXPR, TREE_TYPE (init_expr),
5619 fold_convert (sizetype,
5620 fold_build2 (MULT_EXPR, TREE_TYPE (niters),
5621 niters, step_expr)));
5623 ni = fold_build2 (PLUS_EXPR, TREE_TYPE (init_expr),
5624 fold_build2 (MULT_EXPR, TREE_TYPE (init_expr),
5625 fold_convert (TREE_TYPE (init_expr),
/* Gimplify the advanced value at the end of the exit bb.  */
5632 var = create_tmp_var (TREE_TYPE (init_expr), "tmp");
5633 add_referenced_var (var);
5635 last_bsi = bsi_last (exit_bb);
5636 ni_name = force_gimple_operand_bsi (&last_bsi, ni, false, var,
5637 true, BSI_SAME_STMT);
5639 /* Fix phi expressions in the successor bb. */
5640 SET_PHI_ARG_DEF (phi1, update_e->dest_idx, ni_name);
5645 /* Function vect_do_peeling_for_loop_bound
5647 Peel the last iterations of the loop represented by LOOP_VINFO.
5648 The peeled iterations form a new epilog loop. Given that the loop now
5649 iterates NITERS times, the new epilog loop iterates
5650 NITERS % VECTORIZATION_FACTOR times.
5652 The original loop will later be made to iterate
5653 NITERS / VECTORIZATION_FACTOR times (this value is placed into RATIO). */
5656 vect_do_peeling_for_loop_bound (loop_vec_info loop_vinfo, tree *ratio)
5658 tree ni_name, ratio_mult_vf_name;
5659 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5660 struct loop *new_loop;
5662 basic_block preheader;
5665 int min_scalar_loop_bound;
5666 int min_profitable_iters;
5668 if (vect_print_dump_info (REPORT_DETAILS))
5669 fprintf (vect_dump, "=== vect_do_peeling_for_loop_bound ===");
5671 initialize_original_copy_tables ();
5673 /* Generate the following variables on the preheader of original loop:
5675 ni_name = number of iteration the original loop executes
5676 ratio = ni_name / vf
5677 ratio_mult_vf_name = ratio * vf */
5678 vect_generate_tmps_on_preheader (loop_vinfo, &ni_name,
5679 &ratio_mult_vf_name, ratio);
5681 loop_num = loop->num;
5683 /* Analyze cost to set threshhold for vectorized loop. */
5684 min_profitable_iters = LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo);
5685 min_scalar_loop_bound = (PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND))
5686 * LOOP_VINFO_VECT_FACTOR (loop_vinfo);
5688 /* Use the cost model only if it is more conservative than user specified
/* TH is the runtime threshold: take the larger of the user-specified
   minimum bound and the cost model's profitability estimate.  */
5691 th = (unsigned) min_scalar_loop_bound;
5692 if (min_profitable_iters
5693 && (!min_scalar_loop_bound
5694 || min_profitable_iters > min_scalar_loop_bound))
5695 th = (unsigned) min_profitable_iters;
5697 if (min_profitable_iters
5698 && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
5699 && vect_print_dump_info (REPORT_DETAILS))
5700 fprintf (vect_dump, "vectorization may not be profitable.");
/* Peel off the last NITERS % VF iterations into a scalar epilog loop.  */
5702 new_loop = slpeel_tree_peel_loop_to_edge (loop, single_exit (loop),
5703 ratio_mult_vf_name, ni_name, false,
5705 gcc_assert (new_loop);
5706 gcc_assert (loop_num == loop->num);
5707 #ifdef ENABLE_CHECKING
5708 slpeel_verify_cfg_after_peeling (loop, new_loop);
5711 /* A guard that controls whether the new_loop is to be executed or skipped
5712 is placed in LOOP->exit. LOOP->exit therefore has two successors - one
5713 is the preheader of NEW_LOOP, where the IVs from LOOP are used. The other
5714 is a bb after NEW_LOOP, where these IVs are not used. Find the edge that
5715 is on the path where the LOOP IVs are used and need to be updated. */
5717 preheader = loop_preheader_edge (new_loop)->src;
5718 if (EDGE_PRED (preheader, 0)->src == single_exit (loop)->dest)
5719 update_e = EDGE_PRED (preheader, 0);
5721 update_e = EDGE_PRED (preheader, 1);
5723 /* Update IVs of original loop as if they were advanced
5724 by ratio_mult_vf_name steps. */
5725 vect_update_ivs_after_vectorizer (loop_vinfo, ratio_mult_vf_name, update_e);
5727 /* After peeling we have to reset scalar evolution analyzer. */
5730 free_original_copy_tables ();
5734 /* Function vect_gen_niters_for_prolog_loop
5736 Set the number of iterations for the loop represented by LOOP_VINFO
5737 to the minimum between LOOP_NITERS (the original iteration count of the loop)
5738 and the misalignment of DR - the data reference recorded in
5739 LOOP_VINFO_UNALIGNED_DR (LOOP_VINFO). As a result, after the execution of
5740 this loop, the data reference DR will refer to an aligned location.
5742 The following computation is generated:
5744 If the misalignment of DR is known at compile time:
5745 addr_mis = int mis = DR_MISALIGNMENT (dr);
5746 Else, compute address misalignment in bytes:
5747 addr_mis = addr & (vectype_size - 1)
5749 prolog_niters = min ( LOOP_NITERS , (VF - addr_mis/elem_size)&(VF-1) )
5751 (elem_size = element type size; an element is the scalar element
5752 whose type is the inner type of the vectype)
5756 prolog_niters = min ( LOOP_NITERS ,
5757 (VF/group_size - addr_mis/elem_size)&(VF/group_size-1) )
5758 where group_size is the size of the interleaved group.
5760 The above formulas assume that VF == number of elements in the vector. This
5761 may not hold when there are multiple-types in the loop.
5762 In this case, for some data-references in the loop the VF does not represent
5763 the number of elements that fit in the vector. Therefore, instead of VF we
5764 use TYPE_VECTOR_SUBPARTS. */
5767 vect_gen_niters_for_prolog_loop (loop_vec_info loop_vinfo, tree loop_niters)
5769 struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
5770 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5772 tree iters, iters_name;
5775 tree dr_stmt = DR_STMT (dr);
5776 stmt_vec_info stmt_info = vinfo_for_stmt (dr_stmt);
5777 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5778 int vectype_align = TYPE_ALIGN (vectype) / BITS_PER_UNIT;
5779 tree niters_type = TREE_TYPE (loop_niters);
5781 int element_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
5782 int nelements = TYPE_VECTOR_SUBPARTS (vectype);
5784 if (DR_GROUP_FIRST_DR (stmt_info))
5786 /* For interleaved access element size must be multiplied by the size of
5787 the interleaved group. */
5788 group_size = DR_GROUP_SIZE (vinfo_for_stmt (
5789 DR_GROUP_FIRST_DR (stmt_info)));
5790 element_size *= group_size;
5793 pe = loop_preheader_edge (loop);
/* Case 1: misalignment known at compile time - ITERS is a constant.  */
5795 if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
5797 int byte_misalign = LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo);
5798 int elem_misalign = byte_misalign / element_size;
5800 if (vect_print_dump_info (REPORT_DETAILS))
5801 fprintf (vect_dump, "known alignment = %d.", byte_misalign);
5802 iters = build_int_cst (niters_type,
5803 (nelements - elem_misalign)&(nelements/group_size-1));
/* Case 2: misalignment unknown - compute it at runtime from the
   data-ref's start address.  */
5807 tree new_stmts = NULL_TREE;
5809 vect_create_addr_base_for_vector_ref (dr_stmt, &new_stmts, NULL_TREE);
5810 tree ptr_type = TREE_TYPE (start_addr);
5811 tree size = TYPE_SIZE (ptr_type);
5812 tree type = lang_hooks.types.type_for_size (tree_low_cst (size, 1), 1);
5813 tree vectype_size_minus_1 = build_int_cst (type, vectype_align - 1);
5814 tree elem_size_log =
5815 build_int_cst (type, exact_log2 (vectype_align/nelements));
5816 tree nelements_minus_1 = build_int_cst (type, nelements - 1);
5817 tree nelements_tree = build_int_cst (type, nelements);
5821 new_bb = bsi_insert_on_edge_immediate (pe, new_stmts);
5822 gcc_assert (!new_bb);
5824 /* Create: byte_misalign = addr & (vectype_size - 1) */
5826 fold_build2 (BIT_AND_EXPR, type, fold_convert (type, start_addr), vectype_size_minus_1);
5828 /* Create: elem_misalign = byte_misalign / element_size */
5830 fold_build2 (RSHIFT_EXPR, type, byte_misalign, elem_size_log);
5832 /* Create: (niters_type) (nelements - elem_misalign)&(nelements - 1) */
5833 iters = fold_build2 (MINUS_EXPR, type, nelements_tree, elem_misalign);
5834 iters = fold_build2 (BIT_AND_EXPR, type, iters, nelements_minus_1);
5835 iters = fold_convert (niters_type, iters);
5838 /* Create: prolog_loop_niters = min (iters, loop_niters) */
5839 /* If the loop bound is known at compile time we already verified that it is
5840 greater than vf; since the misalignment ('iters') is at most vf, there's
5841 no need to generate the MIN_EXPR in this case. */
5842 if (TREE_CODE (loop_niters) != INTEGER_CST)
5843 iters = fold_build2 (MIN_EXPR, niters_type, iters, loop_niters);
5845 if (vect_print_dump_info (REPORT_DETAILS))
5847 fprintf (vect_dump, "niters for prolog loop: ");
5848 print_generic_expr (vect_dump, iters, TDF_SLIM);
/* Gimplify ITERS and insert the computation on the preheader edge.  */
5851 var = create_tmp_var (niters_type, "prolog_loop_niters");
5852 add_referenced_var (var);
5853 iters_name = force_gimple_operand (iters, &stmt, false, var);
5855 /* Insert stmt on loop preheader edge. */
5858 basic_block new_bb = bsi_insert_on_edge_immediate (pe, stmt);
5859 gcc_assert (!new_bb);
5866 /* Function vect_update_init_of_dr
5868 NITERS iterations were peeled from LOOP. DR represents a data reference
5869 in LOOP. This function updates the information recorded in DR to
5870 account for the fact that the first NITERS iterations had already been
5871 executed. Specifically, it updates the OFFSET field of DR. */
5874 vect_update_init_of_dr (struct data_reference *dr, tree niters)
5876 tree offset = DR_OFFSET (dr);
/* New offset = old offset + NITERS * step: the peeled iterations each
   advanced the reference by DR_STEP bytes.  */
5878 niters = fold_build2 (MULT_EXPR, TREE_TYPE (niters), niters, DR_STEP (dr));
5879 offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset), offset, niters);
5880 DR_OFFSET (dr) = offset;
5884 /* Function vect_update_inits_of_drs
5886 NITERS iterations were peeled from the loop represented by LOOP_VINFO.
5887 This function updates the information recorded for the data references in
5888 the loop to account for the fact that the first NITERS iterations had
5889 already been executed. Specifically, it updates the initial_condition of
5890 the access_function of all the data_references in the loop. */
5893 vect_update_inits_of_drs (loop_vec_info loop_vinfo, tree niters)
5896 VEC (data_reference_p, heap) *datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
5897 struct data_reference *dr;
5899 if (vect_print_dump_info (REPORT_DETAILS))
5900 fprintf (vect_dump, "=== vect_update_inits_of_dr ===");
/* Apply the per-DR offset adjustment to every data reference.  */
5902 for (i = 0; VEC_iterate (data_reference_p, datarefs, i, dr); i++)
5903 vect_update_init_of_dr (dr, niters);
5907 /* Function vect_do_peeling_for_alignment
5909 Peel the first 'niters' iterations of the loop represented by LOOP_VINFO.
5910 'niters' is set to the misalignment of one of the data references in the
5911 loop, thereby forcing it to refer to an aligned location at the beginning
5912 of the execution of this loop. The data reference for which we are
5913 peeling is recorded in LOOP_VINFO_UNALIGNED_DR. */
5916 vect_do_peeling_for_alignment (loop_vec_info loop_vinfo)
5918 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5919 tree niters_of_prolog_loop, ni_name;
5921 struct loop *new_loop;
5923 if (vect_print_dump_info (REPORT_DETAILS))
5924 fprintf (vect_dump, "=== vect_do_peeling_for_alignment ===");
5926 initialize_original_copy_tables ();
/* Compute how many scalar iterations the prolog loop must run so that
   the unaligned DR becomes aligned.  */
5928 ni_name = vect_build_loop_niters (loop_vinfo);
5929 niters_of_prolog_loop = vect_gen_niters_for_prolog_loop (loop_vinfo, ni_name);
5931 /* Peel the prolog loop and iterate it niters_of_prolog_loop. */
5933 slpeel_tree_peel_loop_to_edge (loop, loop_preheader_edge (loop),
5934 niters_of_prolog_loop, ni_name, true, 0);
5935 gcc_assert (new_loop);
5936 #ifdef ENABLE_CHECKING
5937 slpeel_verify_cfg_after_peeling (new_loop, loop);
5940 /* Update number of times loop executes. */
5941 n_iters = LOOP_VINFO_NITERS (loop_vinfo);
5942 LOOP_VINFO_NITERS (loop_vinfo) = fold_build2 (MINUS_EXPR,
5943 TREE_TYPE (n_iters), n_iters, niters_of_prolog_loop);
5945 /* Update the init conditions of the access functions of all data refs. */
5946 vect_update_inits_of_drs (loop_vinfo, niters_of_prolog_loop);
5948 /* After peeling we have to reset scalar evolution analyzer. */
5951 free_original_copy_tables ();
5955 /* Function vect_create_cond_for_align_checks.
5957 Create a conditional expression that represents the alignment checks for
5958 all of data references (array element references) whose alignment must be
5962 LOOP_VINFO - two fields of the loop information are used.
5963 LOOP_VINFO_PTR_MASK is the mask used to check the alignment.
5964 LOOP_VINFO_MAY_MISALIGN_STMTS contains the refs to be checked.
5967 COND_EXPR_STMT_LIST - statements needed to construct the conditional
5969 The returned value is the conditional expression to be used in the if
5970 statement that controls which version of the loop gets executed at runtime.
5972 The algorithm makes two assumptions:
5973 1) The number of bytes "n" in a vector is a power of 2.
5974 2) An address "a" is aligned if a%n is zero and that this
5975 test can be done as a&(n-1) == 0. For example, for 16
5976 byte vectors the test is a&0xf == 0. */
5979 vect_create_cond_for_align_checks (loop_vec_info loop_vinfo,
5980 tree *cond_expr_stmt_list)
5982 VEC(tree,heap) *may_misalign_stmts
5983 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
5985 int mask = LOOP_VINFO_PTR_MASK (loop_vinfo);
5989 tree int_ptrsize_type;
5991 tree or_tmp_name = NULL_TREE;
5992 tree and_tmp, and_tmp_name, and_stmt;
5995 /* Check that mask is one less than a power of 2, i.e., mask is
5996 all zeros followed by all ones. */
5997 gcc_assert ((mask != 0) && ((mask & (mask+1)) == 0));
5999 /* CHECKME: what is the best integer or unsigned type to use to hold a
6000 cast from a pointer value? */
6001 psize = TYPE_SIZE (ptr_type_node);
6003 = lang_hooks.types.type_for_size (tree_low_cst (psize, 1), 0);
6005 /* Create expression (mask & (dr_1 || ... || dr_n)) where dr_i is the address
6006 of the first vector of the i'th data reference. */
6008 for (i = 0; VEC_iterate (tree, may_misalign_stmts, i, ref_stmt); i++)
6010 tree new_stmt_list = NULL_TREE;
6012 tree addr_tmp, addr_tmp_name, addr_stmt;
6013 tree or_tmp, new_or_tmp_name, or_stmt;
6015 /* create: addr_tmp = (int)(address_of_first_vector) */
6016 addr_base = vect_create_addr_base_for_vector_ref (ref_stmt,
6020 if (new_stmt_list != NULL_TREE)
6021 append_to_statement_list_force (new_stmt_list, cond_expr_stmt_list);
/* Build the pointer-sized integer temporary holding the address.  */
6023 sprintf (tmp_name, "%s%d", "addr2int", i);
6024 addr_tmp = create_tmp_var (int_ptrsize_type, tmp_name);
6025 add_referenced_var (addr_tmp);
6026 addr_tmp_name = make_ssa_name (addr_tmp, NULL_TREE);
6027 addr_stmt = fold_convert (int_ptrsize_type, addr_base);
6028 addr_stmt = build_gimple_modify_stmt (addr_tmp_name, addr_stmt);
6029 SSA_NAME_DEF_STMT (addr_tmp_name) = addr_stmt;
6030 append_to_statement_list_force (addr_stmt, cond_expr_stmt_list);
6032 /* The addresses are OR together. */
6034 if (or_tmp_name != NULL_TREE)
6036 /* create: or_tmp = or_tmp | addr_tmp */
6037 sprintf (tmp_name, "%s%d", "orptrs", i);
6038 or_tmp = create_tmp_var (int_ptrsize_type, tmp_name);
6039 add_referenced_var (or_tmp);
6040 new_or_tmp_name = make_ssa_name (or_tmp, NULL_TREE);
6041 tmp = build2 (BIT_IOR_EXPR, int_ptrsize_type,
6042 or_tmp_name, addr_tmp_name);
6043 or_stmt = build_gimple_modify_stmt (new_or_tmp_name, tmp);
6044 SSA_NAME_DEF_STMT (new_or_tmp_name) = or_stmt;
6045 append_to_statement_list_force (or_stmt, cond_expr_stmt_list);
6046 or_tmp_name = new_or_tmp_name;
/* First iteration: seed the OR chain with the first address.  */
6049 or_tmp_name = addr_tmp_name;
6053 mask_cst = build_int_cst (int_ptrsize_type, mask);
6055 /* create: and_tmp = or_tmp & mask */
6056 and_tmp = create_tmp_var (int_ptrsize_type, "andmask" );
6057 add_referenced_var (and_tmp);
6058 and_tmp_name = make_ssa_name (and_tmp, NULL_TREE);
6060 tmp = build2 (BIT_AND_EXPR, int_ptrsize_type, or_tmp_name, mask_cst);
6061 and_stmt = build_gimple_modify_stmt (and_tmp_name, tmp);
6062 SSA_NAME_DEF_STMT (and_tmp_name) = and_stmt;
6063 append_to_statement_list_force (and_stmt, cond_expr_stmt_list);
6065 /* Make and_tmp the left operand of the conditional test against zero.
6066 if and_tmp has a nonzero bit then some address is unaligned. */
6067 ptrsize_zero = build_int_cst (int_ptrsize_type, 0);
6068 return build2 (EQ_EXPR, boolean_type_node,
6069 and_tmp_name, ptrsize_zero);
6072 /* Function vect_vfa_segment_size.
6074 Create an expression that computes the size of segment
6075 that will be accessed for a data reference. The functions takes into
6076 account that realignment loads may access one more vector.
6079 DR: The data reference.
6080 VECT_FACTOR: vectorization factor.
6082 Return an exrpession whose value is the size of segment which will be
6086 vect_vfa_segment_size (struct data_reference *dr, tree vect_factor)
6088 tree segment_length;
/* Realignment (software-pipelined) loads read one extra vector, so add
   the vector size to DR_STEP * VECT_FACTOR in that case.  */
6090 if (vect_supportable_dr_alignment (dr) == dr_unaligned_software_pipeline)
6093 build_int_cst (integer_type_node,
6094 GET_MODE_SIZE (TYPE_MODE (STMT_VINFO_VECTYPE
6095 (vinfo_for_stmt (DR_STMT (dr))))));
6098 fold_convert (sizetype,
6099 fold_build2 (PLUS_EXPR, integer_type_node,
6100 fold_build2 (MULT_EXPR, integer_type_node, DR_STEP (dr),
/* Common case: segment length is DR_STEP * VECT_FACTOR.  */
6109 fold_convert (sizetype,
6110 fold_build2 (MULT_EXPR, integer_type_node, DR_STEP (dr),
6114 return segment_length;
6117 /* Function vect_create_cond_for_alias_checks.
6119 Create a conditional expression that represents the run-time checks for
6120 overlapping of address ranges represented by a list of data references
6121 relations passed as input.
6124 COND_EXPR - input conditional expression. New conditions will be chained
6125 with logical and operation.
6126 LOOP_VINFO - field LOOP_VINFO_MAY_ALIAS_STMTS contains the list of ddrs
6130 COND_EXPR - conditional expression.
6131 COND_EXPR_STMT_LIST - statements needed to construct the conditional
6133 The returned value is the conditional expression to be used in the if
6134 statement that controls which version of the loop gets executed at runtime.
6138 vect_create_cond_for_alias_checks (loop_vec_info loop_vinfo,
6140 tree * cond_expr_stmt_list)
6142 VEC (ddr_p, heap) * may_alias_ddrs =
6143 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
6145 build_int_cst (integer_type_node, LOOP_VINFO_VECT_FACTOR (loop_vinfo));
6149 tree part_cond_expr;
6151 /* Create expression
6152 ((store_ptr_0 + store_segment_length_0) < load_ptr_0)
6153 || (load_ptr_0 + load_segment_length_0) < store_ptr_0))
6157 ((store_ptr_n + store_segment_length_n) < load_ptr_n)
6158 || (load_ptr_n + load_segment_length_n) < store_ptr_n)) */
/* Nothing to check if no may-alias dependence relations were recorded.  */
6160 if (VEC_empty (ddr_p, may_alias_ddrs))
6163 for (i = 0; VEC_iterate (ddr_p, may_alias_ddrs, i, ddr); i++)
6165 tree stmt_a = DR_STMT (DDR_A (ddr));
6166 tree stmt_b = DR_STMT (DDR_B (ddr));
/* Compute the base addresses and segment lengths of both references.  */
6169 vect_create_addr_base_for_vector_ref (stmt_a, cond_expr_stmt_list,
6172 vect_create_addr_base_for_vector_ref (stmt_b, cond_expr_stmt_list,
6175 tree segment_length_a = vect_vfa_segment_size (DDR_A (ddr), vect_factor);
6176 tree segment_length_b = vect_vfa_segment_size (DDR_B (ddr), vect_factor);
6178 if (vect_print_dump_info (REPORT_DR_DETAILS))
6181 "create runtime check for data references ");
6182 print_generic_expr (vect_dump, DR_REF (DDR_A (ddr)), TDF_SLIM);
6183 fprintf (vect_dump, " and ");
6184 print_generic_expr (vect_dump, DR_REF (DDR_B (ddr)), TDF_SLIM);
/* Build: (a_end < b_start) || (b_end < a_start), i.e. the two segments
   do not overlap.  */
6189 fold_build2 (TRUTH_OR_EXPR, boolean_type_node,
6190 fold_build2 (LT_EXPR, boolean_type_node,
6191 fold_build2 (POINTER_PLUS_EXPR, TREE_TYPE (addr_base_a),
6195 fold_build2 (LT_EXPR, boolean_type_node,
6196 fold_build2 (POINTER_PLUS_EXPR, TREE_TYPE (addr_base_b),
/* AND the per-pair condition into the accumulated COND_EXPR.  */
6202 *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
6203 *cond_expr, part_cond_expr);
6205 *cond_expr = part_cond_expr;
6207 if (vect_print_dump_info (REPORT_VECTORIZED_LOOPS))
6208 fprintf (vect_dump, "created %u versioning for alias checks.\n",
6209 VEC_length (ddr_p, may_alias_ddrs));
6213 /* Function vect_transform_loop.
6215 The analysis phase has determined that the loop is vectorizable.
6216 Vectorize the loop - created vectorized stmts to replace the scalar
6217 stmts in the loop, and update the loop exit condition. */
6220 vect_transform_loop (loop_vec_info loop_vinfo)
6222 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6223 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
6224 int nbbs = loop->num_nodes;
6225 block_stmt_iterator si, next_si;
6228 int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
6231 if (vect_print_dump_info (REPORT_DETAILS))
6232 fprintf (vect_dump, "=== vec_transform_loop ===");
6234 /* If the loop has data references that may or may not be aligned or/and
6235 has data reference relations whose independence was not proven then
6236 two versions of the loop need to be generated, one which is vectorized
6237 and one which isn't. A test is then generated to control which of the
6238 loops is executed. The test checks for the alignment of all of the
6239 data references that may or may not be aligned. An additional
6240 sequence of runtime tests is generated for each pairs of DDRs whose
6241 independence was not proven. The vectorized version of loop is
6242 executed only if both alias and alignment tests are passed. */
6244 if (VEC_length (tree, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo))
6245 || VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
6248 tree cond_expr = NULL_TREE;
6249 tree cond_expr_stmt_list = NULL_TREE;
6250 basic_block condition_bb;
6251 block_stmt_iterator cond_exp_bsi;
6252 basic_block merge_bb;
6253 basic_block new_exit_bb;
6255 tree orig_phi, new_phi, arg;
6256 unsigned prob = 4 * REG_BR_PROB_BASE / 5;
6257 tree gimplify_stmt_list;
6259 if (VEC_length (tree, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo)))
6261 vect_create_cond_for_align_checks (loop_vinfo, &cond_expr_stmt_list);
6263 if (VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
6264 vect_create_cond_for_alias_checks (loop_vinfo, &cond_expr,
6265 &cond_expr_stmt_list);
6268 fold_build2 (NE_EXPR, boolean_type_node, cond_expr, integer_zero_node);
6270 force_gimple_operand (cond_expr, &gimplify_stmt_list, true,
6272 append_to_statement_list (gimplify_stmt_list, &cond_expr_stmt_list);
6274 initialize_original_copy_tables ();
6275 nloop = loop_version (loop, cond_expr, &condition_bb,
6276 prob, prob, REG_BR_PROB_BASE - prob, true);
6277 free_original_copy_tables();
6279 /** Loop versioning violates an assumption we try to maintain during
6280 vectorization - that the loop exit block has a single predecessor.
6281 After versioning, the exit block of both loop versions is the same
6282 basic block (i.e. it has two predecessors). Just in order to simplify
6283 following transformations in the vectorizer, we fix this situation
6284 here by adding a new (empty) block on the exit-edge of the loop,
6285 with the proper loop-exit phis to maintain loop-closed-form. **/
6287 merge_bb = single_exit (loop)->dest;
6288 gcc_assert (EDGE_COUNT (merge_bb->preds) == 2);
6289 new_exit_bb = split_edge (single_exit (loop));
6290 new_exit_e = single_exit (loop);
6291 e = EDGE_SUCC (new_exit_bb, 0);
6293 for (orig_phi = phi_nodes (merge_bb); orig_phi;
6294 orig_phi = PHI_CHAIN (orig_phi))
6296 new_phi = create_phi_node (SSA_NAME_VAR (PHI_RESULT (orig_phi)),
6298 arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, e);
6299 add_phi_arg (new_phi, arg, new_exit_e);
6300 SET_PHI_ARG_DEF (orig_phi, e->dest_idx, PHI_RESULT (new_phi));
6303 /** end loop-exit-fixes after versioning **/
6305 update_ssa (TODO_update_ssa);
6306 cond_exp_bsi = bsi_last (condition_bb);
6307 bsi_insert_before (&cond_exp_bsi, cond_expr_stmt_list, BSI_SAME_STMT);
6310 /* CHECKME: we wouldn't need this if we called update_ssa once
6312 bitmap_zero (vect_memsyms_to_rename);
6314 /* Peel the loop if there are data refs with unknown alignment.
6315 Only one data ref with unknown store is allowed. */
6317 if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
6318 vect_do_peeling_for_alignment (loop_vinfo);
/* NOTE(review): this fragment is the tail of the loop transformation
   driver (presumably vect_transform_loop); the function header, its
   local declarations, and several interior lines (braces, `else',
   `continue', the `while (next)' of the interleaving cleanup) are not
   visible here.  The comments below describe only what is visible.  */
6320 /* If the loop has a symbolic number of iterations 'n' (i.e. it's not a
6321 compile time constant), or it is a constant that doesn't divide by the
6322 vectorization factor, then an epilog loop needs to be created.
6323 We therefore duplicate the loop: the original loop will be vectorized,
6324 and will compute the first (n/VF) iterations. The second copy of the loop
6325 will remain scalar and will compute the remaining (n%VF) iterations.
6326 (VF is the vectorization factor). */
6328 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
6329 || (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
6330 && LOOP_VINFO_INT_NITERS (loop_vinfo) % vectorization_factor != 0))
6331 vect_do_peeling_for_loop_bound (loop_vinfo, &ratio);
/* NOTE(review): LOOP_VINFO_INT_NITERS is only meaningful when the
   iteration count is a compile-time constant, so this assignment
   presumably sits on an `else' branch of the test above (the `else'
   line is elided in this view) -- confirm against the full file.  */
6333 ratio = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
6334 LOOP_VINFO_INT_NITERS (loop_vinfo) / vectorization_factor);
6336 /* 1) Make sure the loop header has exactly two entries
6337 2) Make sure we have a preheader basic block. */
6339 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
6341 split_edge (loop_preheader_edge (loop));
6343 /* FORNOW: the vectorizer supports only loops which body consist
6344 of one basic block (header + empty latch). When the vectorizer will
6345 support more involved loop forms, the order by which the BBs are
6346 traversed need to be reconsidered. */
6348 for (i = 0; i < nbbs; i++)
6350 basic_block bb = bbs[i];
6351 stmt_vec_info stmt_info;
/* Pass 1: transform the PHI nodes of this block.  Only relevant/live
   phis are processed, and of those only induction defs are actually
   transformed (see the vect_induction_def test below).  */
6354 for (phi = phi_nodes (bb); phi; phi = PHI_CHAIN (phi))
6356 if (vect_print_dump_info (REPORT_DETAILS))
6358 fprintf (vect_dump, "------>vectorizing phi: ");
6359 print_generic_expr (vect_dump, phi, TDF_SLIM);
6361 stmt_info = vinfo_for_stmt (phi);
/* Skip phis that are neither relevant nor live for vectorization
   (the `continue' is elided in this view).  */
6364 if (!STMT_VINFO_RELEVANT_P (stmt_info)
6365 && !STMT_VINFO_LIVE_P (stmt_info))
/* Diagnose (dump only) a vectype whose lane count differs from the
   loop's vectorization factor.  */
6368 if ((TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
6369 != (unsigned HOST_WIDE_INT) vectorization_factor)
6370 && vect_print_dump_info (REPORT_DETAILS))
6371 fprintf (vect_dump, "multiple-types.");
6373 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
6375 if (vect_print_dump_info (REPORT_DETAILS))
6376 fprintf (vect_dump, "transform phi.");
6377 vect_transform_stmt (phi, NULL, NULL);
/* Pass 2: transform the statements of this block.  No increment in
   the for-header: SI is advanced (or the stmt removed) inside the
   body, depending on whether the scalar stmt is replaced.  */
6381 for (si = bsi_start (bb); !bsi_end_p (si);)
6383 tree stmt = bsi_stmt (si);
6386 if (vect_print_dump_info (REPORT_DETAILS))
6388 fprintf (vect_dump, "------>vectorizing statement: ");
6389 print_generic_expr (vect_dump, stmt, TDF_SLIM);
6392 stmt_info = vinfo_for_stmt (stmt);
6394 /* vector stmts created in the outer-loop during vectorization of
6395 stmts in an inner-loop may not have a stmt_info, and do not
6396 need to be vectorized. */
6403 if (!STMT_VINFO_RELEVANT_P (stmt_info)
6404 && !STMT_VINFO_LIVE_P (stmt_info))
6410 gcc_assert (STMT_VINFO_VECTYPE (stmt_info));
6411 if ((TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
6412 != (unsigned HOST_WIDE_INT) vectorization_factor)
6413 && vect_print_dump_info (REPORT_DETAILS))
6414 fprintf (vect_dump, "multiple-types.");
6416 /* -------- vectorize statement ------------ */
6417 if (vect_print_dump_info (REPORT_DETAILS))
6418 fprintf (vect_dump, "transform statement.");
6420 strided_store = false;
6421 is_store = vect_transform_stmt (stmt, &si, &strided_store);
/* Cleanup after vectorizing a member of an interleaved (strided)
   access group: walk the chain from DR_GROUP_FIRST_DR, freeing each
   member's stmt_vec_info and removing the scalar stmt (the `while'
   driving NEXT through DR_GROUP_NEXT_DR is elided in this view).  */
6425 if (DR_GROUP_FIRST_DR (stmt_info))
6427 /* Interleaving. If IS_STORE is TRUE, the vectorization of the
6428 interleaving chain was completed - free all the stores in
6430 tree next = DR_GROUP_FIRST_DR (stmt_info);
6432 stmt_vec_info next_stmt_info;
6436 next_si = bsi_for_stmt (next);
6437 next_stmt_info = vinfo_for_stmt (next);
6438 /* Free the attached stmt_vec_info and remove the stmt. */
6439 ann = stmt_ann (next);
6440 tmp = DR_GROUP_NEXT_DR (next_stmt_info);
6441 free (next_stmt_info);
6442 set_stmt_info (ann, NULL);
6443 bsi_remove (&next_si, true);
6446 bsi_remove (&si, true);
6451 /* Free the attached stmt_vec_info and remove the stmt. */
6452 ann = stmt_ann (stmt);
6454 set_stmt_info (ann, NULL);
6455 bsi_remove (&si, true);
/* The vectorized loop executes RATIO iterations: n/VF when the count
   is a compile-time constant (computed above), otherwise the value
   produced by vect_do_peeling_for_loop_bound via &ratio.  */
6463 slpeel_make_loop_iterate_ntimes (loop, ratio);
/* Queue the memory symbols touched by the new vector stmts for SSA
   renaming before the update below.  */
6465 mark_set_for_renaming (vect_memsyms_to_rename);
6467 /* The memory tags and pointers in vectorized statements need to
6468 have their SSA forms updated. FIXME, why can't this be delayed
6469 until all the loops have been transformed? */
6470 update_ssa (TODO_update_ssa);
6472 if (vect_print_dump_info (REPORT_VECTORIZED_LOOPS))
6473 fprintf (vect_dump, "LOOP VECTORIZED.");
6474 if (loop->inner && vect_print_dump_info (REPORT_VECTORIZED_LOOPS))
6475 fprintf (vect_dump, "OUTER LOOP VECTORIZED.");