1 /* Transformation Utilities for Loop Vectorization.
2 Copyright (C) 2003, 2004, 2005, 2006, 2007 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
5 This file is part of GCC.
7 GCC is free software; you can redistribute it and/or modify it under
8 the terms of the GNU General Public License as published by the Free
9 Software Foundation; either version 2, or (at your option) any later
12 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
13 WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING. If not, write to the Free
19 Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA
24 #include "coretypes.h"
30 #include "basic-block.h"
31 #include "diagnostic.h"
32 #include "tree-flow.h"
33 #include "tree-dump.h"
40 #include "tree-data-ref.h"
41 #include "tree-chrec.h"
42 #include "tree-scalar-evolution.h"
43 #include "tree-vectorizer.h"
44 #include "langhooks.h"
45 #include "tree-pass.h"
49 /* Utility functions for the code transformation. */
50 static bool vect_transform_stmt (tree, block_stmt_iterator *, bool *);
51 static tree vect_create_destination_var (tree, tree);
52 static tree vect_create_data_ref_ptr
53 (tree, block_stmt_iterator *, tree, tree *, tree *, bool, tree);
54 static tree vect_create_addr_base_for_vector_ref (tree, tree *, tree);
55 static tree vect_setup_realignment (tree, block_stmt_iterator *, tree *);
56 static tree vect_get_new_vect_var (tree, enum vect_var_kind, const char *);
57 static tree vect_get_vec_def_for_operand (tree, tree, tree *);
58 static tree vect_init_vector (tree, tree, tree);
59 static void vect_finish_stmt_generation
60 (tree stmt, tree vec_stmt, block_stmt_iterator *bsi);
61 static bool vect_is_simple_cond (tree, loop_vec_info);
62 static void update_vuses_to_preheader (tree, struct loop*);
63 static void vect_create_epilog_for_reduction (tree, tree, enum tree_code, tree);
64 static tree get_initial_def_for_reduction (tree, tree, tree *);
66 /* Utility function dealing with loop peeling (not peeling itself). */
67 static void vect_generate_tmps_on_preheader
68 (loop_vec_info, tree *, tree *, tree *);
69 static tree vect_build_loop_niters (loop_vec_info);
70 static void vect_update_ivs_after_vectorizer (loop_vec_info, tree, edge);
71 static tree vect_gen_niters_for_prolog_loop (loop_vec_info, tree);
72 static void vect_update_init_of_dr (struct data_reference *, tree niters);
73 static void vect_update_inits_of_drs (loop_vec_info, tree);
74 static int vect_min_worthwhile_factor (enum tree_code);
77 /* Function vect_estimate_min_profitable_iters
79 Return the number of iterations required for the vector version of the
80 loop to be profitable relative to the cost of the scalar version of the
loop.
LOOP_VINFO supplies the vectorization factor, the loop and its basic
blocks, the peeling-for-alignment amount and the iteration count.
On the path that reaches the final return statement the result is never
smaller than the vectorization factor: it is clamped from below to VF.
83 TODO: Take profile info into account before making vectorization
84 decisions, if available. */
87 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo)
90 int min_profitable_iters;
91 int peel_iters_prologue;
92 int peel_iters_epilogue;
93 int vec_inside_cost = 0;
94 int vec_outside_cost = 0;
95 int scalar_single_iter_cost = 0;
96 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
97 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
98 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
99 int nbbs = loop->num_nodes;
101 /* Cost model disabled. */
102 if (!flag_vect_cost_model)
104 if (vect_print_dump_info (REPORT_DETAILS))
105 fprintf (vect_dump, "cost model disabled.");
109 /* Requires loop versioning tests to handle misalignment.
110 FIXME: Make cost depend on number of stmts in may_misalign list. */
112 if (LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo))
114 vec_outside_cost += TARG_COND_BRANCH_COST;
115 if (vect_print_dump_info (REPORT_DETAILS))
116 fprintf (vect_dump, "cost model: Adding cost of checks for loop "
120 /* Requires a prologue loop when peeling to handle misalignment. Add cost of
121 two guards, one for the peeled loop and one for the vector loop. */
123 peel_iters_prologue = LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo);
124 if (peel_iters_prologue)
126 vec_outside_cost += 2 * TARG_COND_BRANCH_COST;
127 if (vect_print_dump_info (REPORT_DETAILS))
128 fprintf (vect_dump, "cost model: Adding cost of checks for "
132 /* Requires an epilogue loop to finish up remaining iterations after vector
133 loop. Add cost of two guards, one for the peeled loop and one for the
136 if ((peel_iters_prologue < 0)
137 || !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
138 || LOOP_VINFO_INT_NITERS (loop_vinfo) % vf)
140 vec_outside_cost += 2 * TARG_COND_BRANCH_COST;
141 if (vect_print_dump_info (REPORT_DETAILS))
142 fprintf (vect_dump, "cost model : Adding cost of checks for "
146 /* Count statements in scalar loop. Using this as scalar cost for a single
149 TODO: Add outer loop support.
151 TODO: Consider assigning different costs to different scalar
154 for (i = 0; i < nbbs; i++)
156 block_stmt_iterator si;
157 basic_block bb = bbs[i];
159 for (si = bsi_start (bb); !bsi_end_p (si); bsi_next (&si))
161 tree stmt = bsi_stmt (si);
162 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
163 if (!STMT_VINFO_RELEVANT_P (stmt_info)
164 && !STMT_VINFO_LIVE_P (stmt_info))
166 scalar_single_iter_cost++;
167 vec_inside_cost += STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info);
168 vec_outside_cost += STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info);
172 /* Add additional cost for the peeled instructions in prologue and epilogue
175 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
176 at compile-time - we assume the worst.
178 TODO: Build an expression that represents peel_iters for prologue and
179 epilogue to be used in a run-time test. */
/* The peeling amount is fetched a second time here; the same macro was
   already read above when the guard costs were added. */
181 peel_iters_prologue = LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo);
183 if (peel_iters_prologue < 0)
185 peel_iters_prologue = vf - 1;
186 if (vect_print_dump_info (REPORT_DETAILS))
187 fprintf (vect_dump, "cost model: "
188 "prologue peel iters set conservatively.");
190 /* If peeling for alignment is unknown, loop bound of main loop becomes
192 peel_iters_epilogue = vf - 1;
193 if (vect_print_dump_info (REPORT_DETAILS))
194 fprintf (vect_dump, "cost model: "
195 "epilogue peel iters set conservatively because "
196 "peeling for alignment is unknown .");
200 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
202 peel_iters_epilogue = vf - 1;
203 if (vect_print_dump_info (REPORT_DETAILS))
204 fprintf (vect_dump, "cost model: "
205 "epilogue peel iters set conservatively because "
206 "loop iterations are unknown .");
209 peel_iters_epilogue =
210 (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_iters_prologue)
214 vec_outside_cost += (peel_iters_prologue * scalar_single_iter_cost)
215 + (peel_iters_epilogue * scalar_single_iter_cost);
217 /* Calculate number of iterations required to make the vector version
218 profitable, relative to the loop bodies only. The following condition
219 must hold true: ((SIC*VF)-VIC)*niters > VOC*VF, where
220 SIC = scalar iteration cost, VIC = vector iteration cost,
221 VOC = vector outside cost and VF = vectorization factor. */
223 if ((scalar_single_iter_cost * vf) > vec_inside_cost)
225 if (vec_outside_cost == 0)
226 min_profitable_iters = 1;
229 min_profitable_iters = (vec_outside_cost * vf)
230 / ((scalar_single_iter_cost * vf)
/* The quotient above is an integer division, so step up by one when the
   scalar cost at that iteration count still does not exceed the vector
   cost (inside cost times iterations plus outside cost times VF). */
233 if ((scalar_single_iter_cost * vf * min_profitable_iters)
234 <= ((vec_inside_cost * min_profitable_iters)
235 + (vec_outside_cost * vf)))
236 min_profitable_iters++;
239 /* vector version will never be profitable. */
242 if (vect_print_dump_info (REPORT_DETAILS))
243 fprintf (vect_dump, "cost model: vector iteration cost = %d "
244 "is divisible by scalar iteration cost = %d by a factor "
245 "greater than or equal to the vectorization factor = %d .",
246 vec_inside_cost, scalar_single_iter_cost, vf);
250 if (vect_print_dump_info (REPORT_DETAILS))
252 fprintf (vect_dump, "Cost model analysis: \n");
253 fprintf (vect_dump, " Vector inside of loop cost: %d\n",
255 fprintf (vect_dump, " Vector outside of loop cost: %d\n",
257 fprintf (vect_dump, " Scalar cost: %d\n", scalar_single_iter_cost);
258 fprintf (vect_dump, " prologue iterations: %d\n",
259 peel_iters_prologue);
260 fprintf (vect_dump, " epilogue iterations: %d\n",
261 peel_iters_epilogue);
262 fprintf (vect_dump, " Calculated minimum iters for profitability: %d\n",
263 min_profitable_iters);
264 fprintf (vect_dump, " Actual minimum iters for profitability: %d\n",
265 min_profitable_iters < vf ? vf : min_profitable_iters);
268 return min_profitable_iters < vf ? vf : min_profitable_iters;
272 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
273 functions. Design better to avoid maintenance issues. */
275 /* Function vect_model_reduction_cost.
277 Models cost for a reduction operation, including the vector ops
278 generated within the strip-mine loop, the initial definition before
279 the loop, and the epilogue code that must be generated.
STMT_INFO: the stmt_vec_info whose cost fields are updated.  Its
inside-of-loop cost is increased by NCOPIES vector statements and its
outside-of-loop cost is assigned from the local outer_cost accumulator,
to which the initial-definition and epilogue costs are added.
REDUC_CODE: selects the epilogue costing.  A value below NUM_TREE_CODES
is costed as one reduction statement plus one vector-to-scalar extract;
otherwise the cost depends on the number of vector elements. */
282 vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
291 enum machine_mode mode;
292 tree operation = GIMPLE_STMT_OPERAND (STMT_VINFO_STMT (stmt_info), 1);
293 int op_type = TREE_CODE_LENGTH (TREE_CODE (operation));
295 /* Cost of reduction op inside loop.
   NOTE(review): this accumulates with += whereas the other
   vect_model_*_cost functions in this file assign the inside cost with =;
   confirm the accumulation is intended. */
296 STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) += ncopies * TARG_VEC_STMT_COST;
298 reduction_op = TREE_OPERAND (operation, op_type-1);
299 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
300 mode = TYPE_MODE (vectype);
301 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
304 orig_stmt = STMT_VINFO_STMT (stmt_info);
306 code = TREE_CODE (GIMPLE_STMT_OPERAND (orig_stmt, 1));
308 /* Add in cost for initial definition. */
309 outer_cost += TARG_VEC_STMT_COST;
311 /* Determine cost of epilogue code.
313 We have a reduction operator that will reduce the vector in one statement.
314 Also requires scalar extract. */
316 if (reduc_code < NUM_TREE_CODES)
317 outer_cost += TARG_VEC_STMT_COST + TARG_VEC_TO_SCALAR_COST;
320 int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
322 TYPE_SIZE (TREE_TYPE ( GIMPLE_STMT_OPERAND (orig_stmt, 0)));
323 int element_bitsize = tree_low_cst (bitsize, 1);
324 int nelements = vec_size_in_bits / element_bitsize;
326 optab = optab_for_tree_code (code, vectype);
328 /* We have a whole vector shift available.
   NOTE(review): the condition below is true when MODE is not a vector mode
   or when the optab has no handler for MODE, which reads as the opposite of
   "a whole vector shift is available"; yet the branch it guards is the one
   costed as "via vector shifts".  Confirm the two branches are not swapped. */
329 if (!VECTOR_MODE_P (mode)
330 || optab->handlers[mode].insn_code == CODE_FOR_nothing)
331 /* Final reduction via vector shifts and the reduction operator. Also
332 requires scalar extract. */
333 outer_cost += ((exact_log2(nelements) * 2 + 1) * TARG_VEC_STMT_COST);
335 /* Use extracts and reduction op for final reduction. For N elements,
336 we have N extracts and N-1 reduction ops. */
337 outer_cost += ((nelements + nelements - 1) * TARG_VEC_STMT_COST);
340 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info) = outer_cost;
342 if (vect_print_dump_info (REPORT_DETAILS))
343 fprintf (vect_dump, "vect_model_reduction_cost: inside_cost = %d, "
344 "outside_cost = %d .", STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info),
345 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info));
349 /* Function vect_model_induction_cost.
351 Models cost for induction operations.
STMT_INFO: the stmt_vec_info whose two cost fields are overwritten.
NCOPIES: number of vector statements charged inside the loop.
The outside-of-loop cost is fixed at two vector statements.  Both values
are printed to vect_dump when REPORT_DETAILS dumping is enabled. */
354 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
356 /* loop cost for vec_loop. */
357 STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) = ncopies * TARG_VEC_STMT_COST;
358 /* prologue cost for vec_init and vec_step. */
359 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info) = 2 * TARG_VEC_STMT_COST;
361 if (vect_print_dump_info (REPORT_DETAILS))
362 fprintf (vect_dump, "vect_model_induction_cost: inside_cost = %d, "
363 "outside_cost = %d .", STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info),
364 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info));
368 /* Function vect_model_simple_cost.
370 Models cost for simple operations, i.e. those that only emit ncopies of a
371 single op. Right now, this does not account for multiple insns that could
372 be generated for the single vector op. We will handle that shortly.
STMT_INFO: the stmt_vec_info whose inside-of-loop cost is overwritten.
NCOPIES: number of vector statements charged inside the loop.
Only the inside-of-loop cost is written here; the outside-of-loop cost is
printed as found. */
375 vect_model_simple_cost (stmt_vec_info stmt_info, int ncopies)
377 STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) = ncopies * TARG_VEC_STMT_COST;
379 if (vect_print_dump_info (REPORT_DETAILS))
380 fprintf (vect_dump, "vect_model_simple_cost: inside_cost = %d, "
381 "outside_cost = %d .", STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info),
382 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info));
386 /* Function vect_cost_strided_group_size
388 For strided load or store, return the group_size only if it is the first
389 load or store of a group, else return 1. This ensures that group size is
390 only returned once per group.
"First" is decided by comparing the statement recorded in STMT_INFO
(STMT_VINFO_STMT) with DR_GROUP_FIRST_DR (STMT_INFO). */
393 vect_cost_strided_group_size (stmt_vec_info stmt_info)
395 tree first_stmt = DR_GROUP_FIRST_DR (stmt_info);
397 if (first_stmt == STMT_VINFO_STMT (stmt_info))
398 return DR_GROUP_SIZE (stmt_info);
404 /* Function vect_model_store_cost
406 Models cost for stores. In the case of strided accesses, one access
407 has the overhead of the strided access attributed to it.
STMT_INFO: the stmt_vec_info whose inside-of-loop cost is overwritten with
the permute cost (if any) plus the cost of the stores themselves.
NCOPIES: number of vector stores charged. */
410 vect_model_store_cost (stmt_vec_info stmt_info, int ncopies)
415 /* Strided access? */
416 if (DR_GROUP_FIRST_DR (stmt_info))
417 group_size = vect_cost_strided_group_size (stmt_info);
418 /* Not a strided access. */
422 /* Is this an access in a group of stores, which provide strided access?
423 If so, add in the cost of the permutes. */
426 /* Uses a high and low interleave operation for each needed permute.
   The charge is NCOPIES * log2 (group_size) * group_size vector
   statements. */
427 cost = ncopies * exact_log2(group_size) * group_size
428 * TARG_VEC_STMT_COST;
430 if (vect_print_dump_info (REPORT_DETAILS))
431 fprintf (vect_dump, "vect_model_store_cost: strided group_size = %d .",
436 /* Costs of the stores. */
437 cost += ncopies * TARG_VEC_STORE_COST;
439 STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) = cost;
441 if (vect_print_dump_info (REPORT_DETAILS))
442 fprintf (vect_dump, "vect_model_store_cost: inside_cost = %d, "
443 "outside_cost = %d .", STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info),
444 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info));
448 /* Function vect_model_load_cost
450 Models cost for loads. In the case of strided accesses, the last access
451 has the overhead of the strided access attributed to it. Since unaligned
452 accesses are supported for loads, we also account for the costs of the
453 access scheme chosen.
STMT_INFO: the stmt_vec_info whose inside-of-loop cost is overwritten; its
outside-of-loop cost is written only in the software-pipeline case.
NCOPIES: number of vector loads charged.
The access scheme is the value returned by vect_supportable_dr_alignment
for the data reference selected below. */
456 vect_model_load_cost (stmt_vec_info stmt_info, int ncopies)
461 int alignment_support_cheme;
463 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info), *first_dr;
465 /* Strided accesses? */
466 first_stmt = DR_GROUP_FIRST_DR (stmt_info);
469 group_size = vect_cost_strided_group_size (stmt_info);
470 first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
472 /* Not a strided access. */
479 alignment_support_cheme = vect_supportable_dr_alignment (first_dr);
481 /* Is this an access in a group of loads providing strided access?
482 If so, add in the cost of the permutes. */
485 /* Uses even and odd extract operations for each needed permute.
   The charge is NCOPIES * log2 (group_size) * group_size vector
   statements. */
486 inner_cost = ncopies * exact_log2(group_size) * group_size
487 * TARG_VEC_STMT_COST;
489 if (vect_print_dump_info (REPORT_DETAILS))
490 fprintf (vect_dump, "vect_model_load_cost: strided group_size = %d .",
495 /* The loads themselves. */
496 switch (alignment_support_cheme)
500 inner_cost += ncopies * TARG_VEC_LOAD_COST;
502 if (vect_print_dump_info (REPORT_DETAILS))
503 fprintf (vect_dump, "vect_model_load_cost: aligned.");
507 case dr_unaligned_supported:
509 /* Here, we assign an additional cost for the unaligned load. */
510 inner_cost += ncopies * TARG_VEC_UNALIGNED_LOAD_COST;
512 if (vect_print_dump_info (REPORT_DETAILS))
513 fprintf (vect_dump, "vect_model_load_cost: unaligned supported by "
518 case dr_unaligned_software_pipeline:
522 if (vect_print_dump_info (REPORT_DETAILS))
523 fprintf (vect_dump, "vect_model_load_cost: unaligned software "
526 /* Unaligned software pipeline has a load of an address, an initial
527 load, and possibly a mask operation to "prime" the loop. However,
528 if this is an access in a group of loads, which provide strided
529 access, then the above cost should only be considered for one
530 access in the group. Inside the loop, there is a load op
531 and a realignment op. */
533 if ((!DR_GROUP_FIRST_DR (stmt_info)) || group_size > 1)
535 outer_cost = 2*TARG_VEC_STMT_COST;
536 if (targetm.vectorize.builtin_mask_for_load)
537 outer_cost += TARG_VEC_STMT_COST;
540 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info) = outer_cost;
542 inner_cost += ncopies * (TARG_VEC_LOAD_COST + TARG_VEC_STMT_COST);
551 STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) = inner_cost;
553 if (vect_print_dump_info (REPORT_DETAILS))
554 fprintf (vect_dump, "vect_model_load_cost: inside_cost = %d, "
555 "outside_cost = %d .", STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info),
556 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info));
561 /* Function vect_get_new_vect_var.
563 Returns a name for a new variable. The current naming scheme appends the
564 prefix "vect_" or "vect_p" (depending on the value of VAR_KIND) to
565 the name of vectorizer generated variables, and appends that to NAME if
569 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
576 case vect_simple_var:
579 case vect_scalar_var:
582 case vect_pointer_var:
590 new_vect_var = create_tmp_var (type, concat (prefix, name, NULL));
592 new_vect_var = create_tmp_var (type, prefix);
594 /* Mark vector typed variable as a gimple register variable. */
/* NOTE(review): where this function calls concat, the resulting string is
   handed directly to create_tmp_var and no pointer to it is kept here, so
   nothing in this function can release it.  Confirm whether create_tmp_var
   copies the name; if it does, the concatenated string should be freed
   after the call. */
595 if (TREE_CODE (type) == VECTOR_TYPE)
596 DECL_GIMPLE_REG_P (new_vect_var) = true;
602 /* Function vect_create_addr_base_for_vector_ref.
604 Create an expression that computes the address of the first memory location
605 that will be accessed for a data reference.
608 STMT: The statement containing the data reference.
609 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
610 OFFSET: Optional. If supplied, it is added to the initial address.
613 1. Return an SSA_NAME whose value is the address of the memory location of
614 the first vector of the data reference.
615 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
616 these statement(s) which define the returned SSA_NAME.
618 FORNOW: We are only handling array accesses with step 1. */
621 vect_create_addr_base_for_vector_ref (tree stmt,
625 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
626 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
627 tree data_ref_base = unshare_expr (DR_BASE_ADDRESS (dr));
628 tree base_name = build_fold_indirect_ref (data_ref_base);
630 tree addr_base, addr_expr;
632 tree base_offset = unshare_expr (DR_OFFSET (dr));
633 tree init = unshare_expr (DR_INIT (dr));
634 tree vect_ptr_type, addr_expr2;
636 /* Create base_offset: DR_OFFSET + DR_INIT, gimplified into a temporary;
   the statements produced are appended to NEW_STMT_LIST. */
637 base_offset = size_binop (PLUS_EXPR, base_offset, init);
638 dest = create_tmp_var (TREE_TYPE (base_offset), "base_off");
639 add_referenced_var (dest);
640 base_offset = force_gimple_operand (base_offset, &new_stmt, false, dest);
641 append_to_statement_list_force (new_stmt, new_stmt_list);
645 tree tmp = create_tmp_var (TREE_TYPE (base_offset), "offset");
648 /* For interleaved access step we divide STEP by the size of the
649 interleaving group. */
650 if (DR_GROUP_SIZE (stmt_info))
651 step = fold_build2 (TRUNC_DIV_EXPR, TREE_TYPE (offset), DR_STEP (dr),
652 build_int_cst (TREE_TYPE (offset),
653 DR_GROUP_SIZE (stmt_info)));
657 add_referenced_var (tmp);
658 offset = fold_build2 (MULT_EXPR, TREE_TYPE (offset), offset, step);
659 base_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (base_offset),
660 base_offset, offset);
661 base_offset = force_gimple_operand (base_offset, &new_stmt, false, tmp);
662 append_to_statement_list_force (new_stmt, new_stmt_list);
665 /* base + base_offset */
666 addr_base = fold_build2 (PLUS_EXPR, TREE_TYPE (data_ref_base), data_ref_base,
669 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
671 /* addr_expr = addr_base
   NOTE(review): addr_expr is created and registered as a referenced
   variable here, but the code visible below gimplifies the converted
   address into addr_expr2 instead.  Confirm addr_expr is still needed. */
672 addr_expr = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var,
673 get_name (base_name));
674 add_referenced_var (addr_expr);
675 vec_stmt = fold_convert (vect_ptr_type, addr_base);
676 addr_expr2 = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var,
677 get_name (base_name));
678 add_referenced_var (addr_expr2);
679 vec_stmt = force_gimple_operand (vec_stmt, &new_stmt, false, addr_expr2);
680 append_to_statement_list_force (new_stmt, new_stmt_list);
682 if (vect_print_dump_info (REPORT_DETAILS))
684 fprintf (vect_dump, "created ");
685 print_generic_expr (vect_dump, vec_stmt, TDF_SLIM);
691 /* Function vect_create_data_ref_ptr.
693 Create a new pointer to vector type (vp), that points to the first location
694 accessed in the loop by STMT, along with the def-use update chain to
695 appropriately advance the pointer through the loop iterations. Also set
696 aliasing information for the pointer. This vector pointer is used by the
697 callers to this function to create a memory reference expression for vector
701 1. STMT: a stmt that references memory. Expected to be of the form
702 GIMPLE_MODIFY_STMT <name, data-ref> or
703 GIMPLE_MODIFY_STMT <data-ref, name>.
704 2. BSI: block_stmt_iterator where new stmts can be added.
(The parameter is marked ATTRIBUTE_UNUSED in the signature; the
statements created here are inserted on the loop preheader edge.)
705 3. OFFSET (optional): an offset to be added to the initial address accessed
706 by the data-ref in STMT.
707 4. ONLY_INIT: indicate if vp is to be updated in the loop, or remain
708 pointing to the initial address.
709 5. TYPE: if not NULL indicates the required type of the data-ref
712 1. Declare a new ptr to vector_type, and have it point to the base of the
713 data reference (initial address accessed by the data reference).
714 For example, for vector of type V8HI, the following code is generated:
717 vp = (v8hi *)initial_address;
719 if OFFSET is not supplied:
720 initial_address = &a[init];
721 if OFFSET is supplied:
722 initial_address = &a[init + OFFSET];
724 Return the initial_address in INITIAL_ADDRESS.
726 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
727 update the pointer in each iteration of the loop.
729 Return the increment stmt that updates the pointer in PTR_INCR.
731 3. Return the pointer. */
734 vect_create_data_ref_ptr (tree stmt,
735 block_stmt_iterator *bsi ATTRIBUTE_UNUSED,
736 tree offset, tree *initial_address, tree *ptr_incr,
737 bool only_init, tree type)
740 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
741 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
742 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
743 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
749 tree new_stmt_list = NULL_TREE;
750 edge pe = loop_preheader_edge (loop);
753 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
755 base_name = build_fold_indirect_ref (unshare_expr (DR_BASE_ADDRESS (dr)));
757 if (vect_print_dump_info (REPORT_DETAILS))
759 tree data_ref_base = base_name;
760 fprintf (vect_dump, "create vector-pointer variable to type: ");
761 print_generic_expr (vect_dump, vectype, TDF_SLIM);
762 if (TREE_CODE (data_ref_base) == VAR_DECL)
763 fprintf (vect_dump, " vectorizing a one dimensional array ref: ");
764 else if (TREE_CODE (data_ref_base) == ARRAY_REF)
765 fprintf (vect_dump, " vectorizing a multidimensional array ref: ");
766 else if (TREE_CODE (data_ref_base) == COMPONENT_REF)
767 fprintf (vect_dump, " vectorizing a record based array ref: ");
768 else if (TREE_CODE (data_ref_base) == SSA_NAME)
769 fprintf (vect_dump, " vectorizing a pointer ref: ");
770 print_generic_expr (vect_dump, base_name, TDF_SLIM);
773 /** (1) Create the new vector-pointer variable: **/
775 vect_ptr_type = build_pointer_type (type);
777 vect_ptr_type = build_pointer_type (vectype);
778 vect_ptr = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var,
779 get_name (base_name));
780 add_referenced_var (vect_ptr);
782 /** (2) Add aliasing information to the new vector-pointer:
783 (The points-to info (DR_PTR_INFO) may be defined later.) **/
785 tag = DR_SYMBOL_TAG (dr);
788 /* If tag is a variable (and NOT_A_TAG) then a new symbol memory
789 tag must be created with tag added to its may alias list. */
791 new_type_alias (vect_ptr, tag, DR_REF (dr));
793 set_symbol_mem_tag (vect_ptr, tag);
795 var_ann (vect_ptr)->subvars = DR_SUBVARS (dr);
797 /** (3) Calculate the initial address of the vector-pointer, and set
798 the vector-pointer to point to it before the loop: **/
800 /* Create: (&(base[init_val+offset]) in the loop preheader.
   The preheader edge is fetched again below although PE was already
   initialized from the same call in its declaration. */
801 new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
803 pe = loop_preheader_edge (loop);
804 new_bb = bsi_insert_on_edge_immediate (pe, new_stmt_list);
805 gcc_assert (!new_bb);
806 *initial_address = new_temp;
808 /* Create: p = (vectype *) initial_base */
809 vec_stmt = fold_convert (vect_ptr_type, new_temp);
810 vec_stmt = build_gimple_modify_stmt (vect_ptr, vec_stmt);
811 vect_ptr_init = make_ssa_name (vect_ptr, vec_stmt);
812 GIMPLE_STMT_OPERAND (vec_stmt, 0) = vect_ptr_init;
813 new_bb = bsi_insert_on_edge_immediate (pe, vec_stmt);
814 gcc_assert (!new_bb);
817 /** (4) Handle the updating of the vector-pointer inside the loop: **/
819 if (only_init) /* No update in loop is required. */
821 /* Copy the points-to information if it exists. */
822 if (DR_PTR_INFO (dr))
823 duplicate_ssa_name_ptr_info (vect_ptr_init, DR_PTR_INFO (dr));
824 return vect_ptr_init;
828 block_stmt_iterator incr_bsi;
830 tree indx_before_incr, indx_after_incr;
/* Create an induction variable that starts at the initial pointer and
   advances by the size in bytes of one vector (TYPE_SIZE_UNIT of VECTYPE)
   per iteration; the increment statement gets its own stmt_vec_info. */
833 standard_iv_increment_position (loop, &incr_bsi, &insert_after);
834 create_iv (vect_ptr_init,
835 fold_convert (vect_ptr_type, TYPE_SIZE_UNIT (vectype)),
836 NULL_TREE, loop, &incr_bsi, insert_after,
837 &indx_before_incr, &indx_after_incr);
838 incr = bsi_stmt (incr_bsi);
839 set_stmt_info (stmt_ann (incr),
840 new_stmt_vec_info (incr, loop_vinfo));
842 /* Copy the points-to information if it exists. */
843 if (DR_PTR_INFO (dr))
845 duplicate_ssa_name_ptr_info (indx_before_incr, DR_PTR_INFO (dr));
846 duplicate_ssa_name_ptr_info (indx_after_incr, DR_PTR_INFO (dr));
848 merge_alias_info (vect_ptr_init, indx_before_incr);
849 merge_alias_info (vect_ptr_init, indx_after_incr);
853 return indx_before_incr;
858 /* Function bump_vector_ptr
860 Increment a pointer (to a vector type) by vector-size. Connect the new
861 increment stmt to the existing def-use update-chain of the pointer.
863 The pointer def-use update-chain before this function:
864 DATAREF_PTR = phi (p_0, p_2)
866 PTR_INCR: p_2 = DATAREF_PTR + step
868 The pointer def-use update-chain after this function:
869 DATAREF_PTR = phi (p_0, p_2)
871 NEW_DATAREF_PTR = DATAREF_PTR + vector_size
873 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
876 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
878 PTR_INCR - the stmt that updates the pointer in each iteration of the loop.
879 The increment amount across iterations is also expected to be
881 BSI - location where the new update stmt is to be placed.
882 STMT - the original scalar memory-access stmt that is being vectorized.
884 Output: Return NEW_DATAREF_PTR as illustrated above.
889 bump_vector_ptr (tree dataref_ptr, tree ptr_incr, block_stmt_iterator *bsi,
892 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
893 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
894 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
895 tree vptr_type = TREE_TYPE (dataref_ptr);
896 tree ptr_var = SSA_NAME_VAR (dataref_ptr);
897 tree update = fold_convert (vptr_type, TYPE_SIZE_UNIT (vectype));
901 tree new_dataref_ptr;
903 incr_stmt = build_gimple_modify_stmt (ptr_var,
904 build2 (PLUS_EXPR, vptr_type,
905 dataref_ptr, update));
906 new_dataref_ptr = make_ssa_name (ptr_var, incr_stmt);
907 GIMPLE_STMT_OPERAND (incr_stmt, 0) = new_dataref_ptr;
908 vect_finish_stmt_generation (stmt, incr_stmt, bsi);
910 /* Update the vector-pointer's cross-iteration increment. */
911 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
913 tree use = USE_FROM_PTR (use_p);
/* Redirect the use of DATAREF_PTR in PTR_INCR to the bumped pointer. */
915 if (use == dataref_ptr)
916 SET_USE (use_p, new_dataref_ptr);
/* NOTE(review): this check is expected to apply only to an operand that
   is not DATAREF_PTR, i.e. the step, which must equal the vector size in
   bytes -- confirm against the complete source. */
918 gcc_assert (tree_int_cst_compare (use, update) == 0);
921 /* Copy the points-to information if it exists. */
922 if (DR_PTR_INFO (dr))
923 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
924 merge_alias_info (new_dataref_ptr, dataref_ptr);
926 return new_dataref_ptr;
930 /* Function vect_create_destination_var.
932 Create a new temporary of type VECTYPE.
SCALAR_DEST: must be an SSA_NAME (asserted); its name, as returned by
get_name, is passed on as the name for the new variable.
VECTYPE: when NULL, the temporary is created with the type of SCALAR_DEST
and kind vect_scalar_var; otherwise with type VECTYPE and kind
vect_simple_var.
The new variable is registered with add_referenced_var. */
935 vect_create_destination_var (tree scalar_dest, tree vectype)
938 const char *new_name;
940 enum vect_var_kind kind;
942 kind = vectype ? vect_simple_var : vect_scalar_var;
943 type = vectype ? vectype : TREE_TYPE (scalar_dest);
945 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
947 new_name = get_name (scalar_dest);
950 vec_dest = vect_get_new_vect_var (type, kind, new_name);
951 add_referenced_var (vec_dest);
957 /* Function vect_init_vector.
959 Insert a new stmt (INIT_STMT) that initializes a new vector variable with
960 the vector elements of VECTOR_VAR. Return the DEF of INIT_STMT. It will be
961 used in the vectorization of STMT.
STMT: used to find the loop being vectorized (through its stmt_vec_info).
VECTOR_VAR: the value assigned to the new variable.
VECTOR_TYPE: the type of the new variable, which is named with the
"cst_" prefix.
INIT_STMT is inserted on the preheader edge of that loop; the insertion
must not create a new basic block (asserted). */
964 vect_init_vector (tree stmt, tree vector_var, tree vector_type)
966 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
967 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
968 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
976 new_var = vect_get_new_vect_var (vector_type, vect_simple_var, "cst_");
977 add_referenced_var (new_var);
979 init_stmt = build_gimple_modify_stmt (new_var, vector_var);
980 new_temp = make_ssa_name (new_var, init_stmt);
981 GIMPLE_STMT_OPERAND (init_stmt, 0) = new_temp;
983 pe = loop_preheader_edge (loop);
984 new_bb = bsi_insert_on_edge_immediate (pe, init_stmt);
985 gcc_assert (!new_bb);
987 if (vect_print_dump_info (REPORT_DETAILS))
989 fprintf (vect_dump, "created new init_stmt: ");
990 print_generic_expr (vect_dump, init_stmt, TDF_SLIM);
993 vec_oprnd = GIMPLE_STMT_OPERAND (init_stmt, 0);
998 /* Function get_initial_def_for_induction
1001 IV_PHI - the initial value of the induction variable
1004 Return a vector variable, initialized with the first VF values of
1005 the induction variable. E.g., for an iv with IV_PHI='X' and
1006 evolution S, for a vector of 4 units, we want to return:
1007 [X, X + S, X + 2*S, X + 3*S]. */
/* IV_PHI is used below as the loop-header phi node of the induction: its
   stmt_vec_info, PHI_RESULT and basic block are queried, and the initial
   value and step are taken from the scalar evolution of its result.  Near
   the end, STMT_VINFO_VEC_STMT of IV_PHI is set to the newly created
   vector phi.  */
1010 get_initial_def_for_induction (tree iv_phi)
1012 stmt_vec_info stmt_vinfo = vinfo_for_stmt (iv_phi);
1013 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
1014 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1015 tree scalar_type = TREE_TYPE (PHI_RESULT_TREE (iv_phi));
1016 tree vectype = get_vectype_for_scalar_type (scalar_type);
1017 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
1018 edge pe = loop_preheader_edge (loop);
1020 block_stmt_iterator bsi;
1021 tree vec, vec_init, vec_step, t;
1026 tree induction_phi, induc_def, new_stmt, vec_def, vec_dest;
1027 tree init_expr, step_expr;
1028 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
/* Number of vector stmts generated for the induction: VF / nunits.  */
1031 int ncopies = vf / nunits;
1033 stmt_vec_info phi_info = vinfo_for_stmt (iv_phi);
1035 tree stmt = NULL_TREE;
1036 block_stmt_iterator si;
1037 basic_block bb = bb_for_stmt (iv_phi);
1039 gcc_assert (phi_info);
1040 gcc_assert (ncopies >= 1);
1042 /* Find the first insertion point in the BB. */
1043 si = bsi_after_labels (bb);
/* STMT is the stmt at that point; it is handed to vect_init_vector and to
   vect_finish_stmt_generation below as the reference stmt.  */
1044 stmt = bsi_stmt (si);
/* The scalar evolution of the phi result supplies INIT_EXPR and STEP_EXPR.  */
1046 access_fn = analyze_scalar_evolution (loop, PHI_RESULT (iv_phi));
1047 gcc_assert (access_fn);
1048 ok = vect_is_simple_iv_evolution (loop->num, access_fn,
1049 &init_expr, &step_expr);
1052 /* Create the vector that holds the initial_value of the induction. */
1053 new_var = vect_get_new_vect_var (scalar_type, vect_scalar_var, "var_");
1054 add_referenced_var (new_var);
/* Gimplify INIT_EXPR into NEW_NAME; STMTS receives whatever stmts that
   requires, and they go on the preheader edge PE.  The assert checks that
   the insertion did not hand back a new basic block.  */
1056 new_name = force_gimple_operand (init_expr, &stmts, false, new_var);
1059 new_bb = bsi_insert_on_edge_immediate (pe, stmts);
1060 gcc_assert (!new_bb);
/* The first element of the initial vector is the gimplified initial
   value.  */
1064 t = tree_cons (NULL_TREE, new_name, t);
/* Each further element is the previous one plus STEP_EXPR, computed by a
   stmt that is also inserted on the preheader edge.  */
1065 for (i = 1; i < nunits; i++)
1069 /* Create: new_name = new_name + step_expr */
1070 tmp = fold_build2 (PLUS_EXPR, scalar_type, new_name, step_expr);
1071 init_stmt = build_gimple_modify_stmt (new_var, tmp);
1072 new_name = make_ssa_name (new_var, init_stmt);
1073 GIMPLE_STMT_OPERAND (init_stmt, 0) = new_name;
1075 new_bb = bsi_insert_on_edge_immediate (pe, init_stmt);
1076 gcc_assert (!new_bb);
1078 if (vect_print_dump_info (REPORT_DETAILS))
1080 fprintf (vect_dump, "created new init_stmt: ");
1081 print_generic_expr (vect_dump, init_stmt, TDF_SLIM);
1083 t = tree_cons (NULL_TREE, new_name, t);
/* tree_cons prepends, so the list is in reverse element order here; nreverse
   restores it.  */
1085 vec = build_constructor_from_list (vectype, nreverse (t));
1086 vec_init = vect_init_vector (stmt, vec, vectype);
1089 /* Create the vector that holds the step of the induction. */
/* Every element of this step vector is VF * STEP_EXPR.  */
1090 expr = build_int_cst (scalar_type, vf);
1091 new_name = fold_build2 (MULT_EXPR, scalar_type, expr, step_expr);
1093 for (i = 0; i < nunits; i++)
1094 t = tree_cons (NULL_TREE, unshare_expr (new_name), t);
1095 vec = build_constructor_from_list (vectype, t);
1096 vec_step = vect_init_vector (stmt, vec, vectype);
1099 /* Create the following def-use cycle:
1101 vec_init = [X, X+S, X+2*S, X+3*S]
1102 vec_step = [VF*S, VF*S, VF*S, VF*S]
1104 vec_iv = PHI <vec_init, vec_loop>
1108 vec_loop = vec_iv + vec_step; */
1110 /* Create the induction-phi that defines the induction-operand. */
1111 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
1112 add_referenced_var (vec_dest);
1113 induction_phi = create_phi_node (vec_dest, loop->header);
/* The new phi gets its own stmt_vec_info.  */
1114 set_stmt_info (get_stmt_ann (induction_phi),
1115 new_stmt_vec_info (induction_phi, loop_vinfo));
1116 induc_def = PHI_RESULT (induction_phi);
1118 /* Create the iv update inside the loop */
1119 new_stmt = build_gimple_modify_stmt (NULL_TREE,
1120 build2 (PLUS_EXPR, vectype,
1121 induc_def, vec_step));
1122 vec_def = make_ssa_name (vec_dest, new_stmt);
1123 GIMPLE_STMT_OPERAND (new_stmt, 0) = vec_def;
/* The update is emitted at STMT's position.  */
1124 bsi = bsi_for_stmt (stmt);
1125 vect_finish_stmt_generation (stmt, new_stmt, &bsi);
1127 /* Set the arguments of the phi node: */
1128 add_phi_arg (induction_phi, vec_init, loop_preheader_edge (loop));
1129 add_phi_arg (induction_phi, vec_def, loop_latch_edge (loop));
1132 /* In case the vectorization factor (VF) is bigger than the number
1133 of elements that we can fit in a vectype (nunits), we have to generate
1134 more than one vector stmt - i.e - we need to "unroll" the
1135 vector stmt by a factor VF/nunits. For more details see documentation
1136 in vectorizable_operation. */
1140 stmt_vec_info prev_stmt_vinfo;
1142 /* Create the vector that holds the step of the induction. */
/* Between consecutive copies the step is nunits * STEP_EXPR.  */
1143 expr = build_int_cst (scalar_type, nunits);
1144 new_name = fold_build2 (MULT_EXPR, scalar_type, expr, step_expr);
1146 for (i = 0; i < nunits; i++)
1147 t = tree_cons (NULL_TREE, unshare_expr (new_name), t);
1148 vec = build_constructor_from_list (vectype, t);
1149 vec_step = vect_init_vector (stmt, vec, vectype);
/* Starting from the phi result, each copy is the previous def plus VEC_STEP;
   every new stmt is linked from its predecessor through
   STMT_VINFO_RELATED_STMT.  */
1151 vec_def = induc_def;
1152 prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
1153 for (i = 1; i < ncopies; i++)
1157 /* vec_i = vec_prev + vec_{step*nunits} */
1158 tmp = build2 (PLUS_EXPR, vectype, vec_def, vec_step);
1159 new_stmt = build_gimple_modify_stmt (NULL_TREE, tmp);
1160 vec_def = make_ssa_name (vec_dest, new_stmt);
1161 GIMPLE_STMT_OPERAND (new_stmt, 0) = vec_def;
1162 bsi = bsi_for_stmt (stmt);
1163 vect_finish_stmt_generation (stmt, new_stmt, &bsi);
1165 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
1166 prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
1170 if (vect_print_dump_info (REPORT_DETAILS))
1172 fprintf (vect_dump, "transform induction: created def-use cycle:");
1173 print_generic_expr (vect_dump, induction_phi, TDF_SLIM);
1174 fprintf (vect_dump, "\n");
1175 print_generic_expr (vect_dump, SSA_NAME_DEF_STMT (vec_def), TDF_SLIM);
/* Record the vector phi as the vectorized form of IV_PHI.  */
1178 STMT_VINFO_VEC_STMT (phi_info) = induction_phi;
1183 /* Function vect_get_vec_def_for_operand.
1185 OP is an operand in STMT. This function returns a (vector) def that will be
1186 used in the vectorized stmt for STMT.
1188 In the case that OP is an SSA_NAME which is defined in the loop, then
1189 STMT_VINFO_VEC_STMT of the defining stmt holds the relevant def.
1191 In case OP is an invariant or constant, a new stmt that creates a vector def
1192 needs to be introduced. */
/* SCALAR_DEF: when OP is defined inside the loop (case 3 below) the scalar
   defining stmt is stored through it; for a reduction it is forwarded to
   get_initial_def_for_reduction.
   NOTE(review): a caller in this file passes NULL for SCALAR_DEF; the guard
   around the store is not visible in this excerpt - confirm.  */
1195 vect_get_vec_def_for_operand (tree op, tree stmt, tree *scalar_def)
1200 stmt_vec_info def_stmt_info = NULL;
1201 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
1202 tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
1203 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
1204 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
1205 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1211 enum vect_def_type dt;
1215 if (vect_print_dump_info (REPORT_DETAILS))
1217 fprintf (vect_dump, "vect_get_vec_def_for_operand: ");
1218 print_generic_expr (vect_dump, op, TDF_SLIM);
/* Classify OP.  DT receives the kind of definition that selects one of the
   cases below; DEF_STMT and DEF receive the defining stmt and the def.  The
   operand is required to be a simple use.  */
1221 is_simple_use = vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt);
1222 gcc_assert (is_simple_use);
1223 if (vect_print_dump_info (REPORT_DETAILS))
1227 fprintf (vect_dump, "def = ");
1228 print_generic_expr (vect_dump, def, TDF_SLIM);
1232 fprintf (vect_dump, " def_stmt = ");
1233 print_generic_expr (vect_dump, def_stmt, TDF_SLIM);
1239 /* Case 1: operand is a constant. */
1240 case vect_constant_def:
1245 /* Create 'vect_cst_ = {cst,cst,...,cst}' */
1246 if (vect_print_dump_info (REPORT_DETAILS))
1247 fprintf (vect_dump, "Create vector_cst. nunits = %d", nunits);
/* All NUNITS elements are the constant OP itself.  */
1249 for (i = nunits - 1; i >= 0; --i)
1251 t = tree_cons (NULL_TREE, op, t);
/* The vector type is derived from the type of OP, not taken from
   STMT_VINFO_VECTYPE of STMT.  */
1253 vector_type = get_vectype_for_scalar_type (TREE_TYPE (op));
1254 vec_cst = build_vector (vector_type, t);
1256 return vect_init_vector (stmt, vec_cst, vector_type);
1259 /* Case 2: operand is defined outside the loop - loop invariant. */
1260 case vect_invariant_def:
1265 /* Create 'vec_inv = {inv,inv,..,inv}' */
1266 if (vect_print_dump_info (REPORT_DETAILS))
1267 fprintf (vect_dump, "Create vector_inv.");
/* All NUNITS elements are the invariant DEF.  */
1269 for (i = nunits - 1; i >= 0; --i)
1271 t = tree_cons (NULL_TREE, def, t);
1274 /* FIXME: use build_constructor directly. */
1275 vector_type = get_vectype_for_scalar_type (TREE_TYPE (def));
1276 vec_inv = build_constructor_from_list (vector_type, t);
1277 return vect_init_vector (stmt, vec_inv, vector_type);
1280 /* Case 3: operand is defined inside the loop. */
1284 *scalar_def = def_stmt;
1286 /* Get the def from the vectorized stmt. */
/* The defining stmt must already have a vectorized form recorded; the
   vector def is operand 0 of that vector stmt.  */
1287 def_stmt_info = vinfo_for_stmt (def_stmt);
1288 vec_stmt = STMT_VINFO_VEC_STMT (def_stmt_info);
1289 gcc_assert (vec_stmt);
1290 vec_oprnd = GIMPLE_STMT_OPERAND (vec_stmt, 0);
1294 /* Case 4: operand is defined by a loop header phi - reduction */
1295 case vect_reduction_def:
1297 gcc_assert (TREE_CODE (def_stmt) == PHI_NODE);
1299 /* Get the def before the loop */
1300 op = PHI_ARG_DEF_FROM_EDGE (def_stmt, loop_preheader_edge (loop));
1301 return get_initial_def_for_reduction (stmt, op, scalar_def);
1304 /* Case 5: operand is defined by loop-header phi - induction. */
1305 case vect_induction_def:
1307 gcc_assert (TREE_CODE (def_stmt) == PHI_NODE);
1309 /* Get the def before the loop */
1310 return get_initial_def_for_induction (def_stmt);
1319 /* Function vect_get_vec_def_for_stmt_copy
1321 Return a vector-def for an operand. This function is used when the
1322 vectorized stmt to be created (by the caller to this function) is a "copy"
1323 created in case the vectorized result cannot fit in one vector, and several
1324 copies of the vector-stmt are required. In this case the vector-def is
1325 retrieved from the vector stmt recorded in the STMT_VINFO_RELATED_STMT field
1326 of the stmt that defines VEC_OPRND.
1327 DT is the type of the vector def VEC_OPRND.
1330 In case the vectorization factor (VF) is bigger than the number
1331 of elements that can fit in a vectype (nunits), we have to generate
1332 more than one vector stmt to vectorize the scalar stmt. This situation
1333 arises when there are multiple data-types operated upon in the loop; the
1334 smallest data-type determines the VF, and as a result, when vectorizing
1335 stmts operating on wider types we need to create 'VF/nunits' "copies" of the
1336 vector stmt (each computing a vector of 'nunits' results, and together
1337 computing 'VF' results in each iteration). This function is called when
1338 vectorizing such a stmt (e.g. vectorizing S2 in the illustration below, in
1339 which VF=16 and nunits=4, so the number of copies required is 4):
1341 scalar stmt: vectorized into: STMT_VINFO_RELATED_STMT
1343 S1: x = load VS1.0: vx.0 = memref0 VS1.1
1344 VS1.1: vx.1 = memref1 VS1.2
1345 VS1.2: vx.2 = memref2 VS1.3
1346 VS1.3: vx.3 = memref3
1348 S2: z = x + ... VSnew.0: vz0 = vx.0 + ... VSnew.1
1349 VSnew.1: vz1 = vx.1 + ... VSnew.2
1350 VSnew.2: vz2 = vx.2 + ... VSnew.3
1351 VSnew.3: vz3 = vx.3 + ...
1353 The vectorization of S1 is explained in vectorizable_load.
1354 The vectorization of S2:
1355 To create the first vector-stmt out of the 4 copies - VSnew.0 -
1356 the function 'vect_get_vec_def_for_operand' is called to
1357 get the relevant vector-def for each operand of S2. For operand x it
1358 returns the vector-def 'vx.0'.
1360 To create the remaining copies of the vector-stmt (VSnew.j), this
1361 function is called to get the relevant vector-def for each operand. It is
1362 obtained from the respective VS1.j stmt, which is recorded in the
1363 STMT_VINFO_RELATED_STMT field of the stmt that defines VEC_OPRND.
1365 For example, to obtain the vector-def 'vx.1' in order to create the
1366 vector stmt 'VSnew.1', this function is called with VEC_OPRND='vx.0'.
1367 Given 'vx.0' we obtain the stmt that defines it ('VS1.0'); from the
1368 STMT_VINFO_RELATED_STMT field of 'VS1.0' we obtain the next copy - 'VS1.1',
1369 and return its def ('vx.1').
1370 Overall, to create the above sequence this function will be called 3 times:
1371 vx.1 = vect_get_vec_def_for_stmt_copy (dt, vx.0);
1372 vx.2 = vect_get_vec_def_for_stmt_copy (dt, vx.1);
1373 vx.3 = vect_get_vec_def_for_stmt_copy (dt, vx.2); */
1376 vect_get_vec_def_for_stmt_copy (enum vect_def_type dt, tree vec_oprnd)
1378 tree vec_stmt_for_operand;
1379 stmt_vec_info def_stmt_info;
1381 /* Do nothing; can reuse same def. */
1382 if (dt == vect_invariant_def || dt == vect_constant_def )
/* Otherwise walk one step along the copy chain: take the stmt that defines
   VEC_OPRND, follow its STMT_VINFO_RELATED_STMT link to the next copy, and
   use operand 0 (the lhs) of that copy as the new def.  Both the stmt info
   and the link are asserted to exist.  */
1385 vec_stmt_for_operand = SSA_NAME_DEF_STMT (vec_oprnd);
1386 def_stmt_info = vinfo_for_stmt (vec_stmt_for_operand);
1387 gcc_assert (def_stmt_info);
1388 vec_stmt_for_operand = STMT_VINFO_RELATED_STMT (def_stmt_info);
1389 gcc_assert (vec_stmt_for_operand);
1390 vec_oprnd = GIMPLE_STMT_OPERAND (vec_stmt_for_operand, 0);
1396 /* Function vect_finish_stmt_generation.
1398 Insert a new stmt.
   VEC_STMT is inserted before the stmt BSI points at, is given a fresh
   stmt_vec_info belonging to the loop_vec_info of STMT, and receives the
   source location of STMT.  BSI is expected to point at STMT (asserted
   after the insertion).  */
1401 vect_finish_stmt_generation (tree stmt, tree vec_stmt,
1402 block_stmt_iterator *bsi)
1404 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1405 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
/* BSI_SAME_STMT is passed so that the assert further down can still find
   STMT at *BSI after the insertion.  */
1407 bsi_insert_before (bsi, vec_stmt, BSI_SAME_STMT);
1408 set_stmt_info (get_stmt_ann (vec_stmt),
1409 new_stmt_vec_info (vec_stmt, loop_vinfo));
1411 if (vect_print_dump_info (REPORT_DETAILS))
1413 fprintf (vect_dump, "add new stmt: ");
1414 print_generic_expr (vect_dump, vec_stmt, TDF_SLIM);
1417 /* Make sure bsi points to the stmt that is being vectorized. */
1418 gcc_assert (stmt == bsi_stmt (*bsi));
/* Copy the source location of STMT to the new stmt; which of the two macros
   applies depends on USE_MAPPED_LOCATION.  */
1420 #ifdef USE_MAPPED_LOCATION
1421 SET_EXPR_LOCATION (vec_stmt, EXPR_LOCATION (stmt));
1423 SET_EXPR_LOCUS (vec_stmt, EXPR_LOCUS (stmt));
1428 /* Function get_initial_def_for_reduction
1431 STMT - a stmt that performs a reduction operation in the loop.
1432 INIT_VAL - the initial value of the reduction variable
1435 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
1436 of the reduction (used for adjusting the epilog - see below).
1437 Return a vector variable, initialized according to the operation that STMT
1438 performs. This vector will be used as the initial value of the
1439 vector of partial results.
1441 Option1 (adjust in epilog): Initialize the vector as follows:
   add: [0,0,...,0,0]
1444 min/max: [init_val,init_val,..,init_val,init_val]
1445 bit and/or: [init_val,init_val,..,init_val,init_val]
1446 and when necessary (e.g. add/mult case) let the caller know
1447 that it needs to adjust the result by init_val.
1449 Option2: Initialize the vector as follows:
1450 add: [0,0,...,0,init_val]
1451 mult: [1,1,...,1,init_val]
1452 min/max: [init_val,init_val,...,init_val]
1453 bit and/or: [init_val,init_val,...,init_val]
1454 and no adjustments are needed.
1456 For example, for the following code:
1462 STMT is 's = s + a[i]', and the reduction variable is 's'.
1463 For a vector of 4 units, we want to return either [0,0,0,init_val],
1464 or [0,0,0,0] and let the caller know that it needs to adjust
1465 the result at the end by 'init_val'.
1467 FORNOW, we are using the 'adjust in epilog' scheme, because this way the
1468 initialization vector is simpler (same element in all entries).
1469 A cost model should help decide between these two schemes. */
1472 get_initial_def_for_reduction (tree stmt, tree init_val, tree *adjustment_def)
1474 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
1475 tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
1476 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
/* The reduction operation is the tree code of the rhs of STMT.  */
1477 enum tree_code code = TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 1));
1478 tree type = TREE_TYPE (init_val);
/* Only integral and scalar floating-point initial values are handled.  */
1486 gcc_assert (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type));
/* Vector form of INIT_VAL itself; NULL is passed because no scalar def is
   requested here.  */
1487 vecdef = vect_get_vec_def_for_operand (init_val, stmt, NULL);
/* Additive case: the initial vector is all zeros of TYPE, and INIT_VAL is
   stored through ADJUSTMENT_DEF so the caller can apply it afterwards.  */
1491 case WIDEN_SUM_EXPR:
1494 *adjustment_def = init_val;
1495 /* Create a vector of zeros for init_def. */
1496 if (INTEGRAL_TYPE_P (type))
1497 def_for_init = build_int_cst (type, 0);
1499 def_for_init = build_real (type, dconst0);
1500 for (i = nunits - 1; i >= 0; --i)
1501 t = tree_cons (NULL_TREE, def_for_init, t);
1502 vector_type = get_vectype_for_scalar_type (TREE_TYPE (def_for_init));
1503 init_def = build_vector (vector_type, t);
/* On this path no adjustment is requested from the caller.  */
1508 *adjustment_def = NULL_TREE;
1520 /* Function vect_create_epilog_for_reduction
1522 Create code at the loop-epilog to finalize the result of a reduction
1525 VECT_DEF is a vector of partial results.
1526 REDUC_CODE is the tree-code for the epilog reduction.
1527 STMT is the scalar reduction stmt that is being vectorized.
1528 REDUCTION_PHI is the phi-node that carries the reduction computation.
1531 1. Creates the reduction def-use cycle: sets the arguments for
1533 The loop-entry argument is the vectorized initial-value of the reduction.
1534 The loop-latch argument is VECT_DEF - the vector of partial sums.
1535 2. "Reduces" the vector of partial results VECT_DEF into a single result,
1536 by applying the operation specified by REDUC_CODE if available, or by
1537 other means (whole-vector shifts or a scalar loop).
1538 The function also creates a new phi node at the loop exit to preserve
1539 loop-closed form, as illustrated below.
1541 The flow at the entry to this function:
1544 vec_def = phi <null, null> # REDUCTION_PHI
1545 VECT_DEF = vector_stmt # vectorized form of STMT
1546 s_loop = scalar_stmt # (scalar) STMT
1548 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
1552 The above is transformed by this function into:
1555 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
1556 VECT_DEF = vector_stmt # vectorized form of STMT
1557 s_loop = scalar_stmt # (scalar) STMT
1559 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
1560 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
1561 v_out2 = reduce <v_out1>
1562 s_out3 = extract_field <v_out2, 0>
1563 s_out4 = adjust_result <s_out3> */
/* Completes REDUCTION_PHI with its preheader and latch arguments, then emits,
   in the destination block of the loop's single exit, the code that reduces
   VECT_DEF to one scalar, and finally redirects the uses of the original
   scalar exit phi to that scalar.  */
1569 vect_create_epilog_for_reduction (tree vect_def, tree stmt,
1570 enum tree_code reduc_code, tree reduction_phi)
1572 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1574 enum machine_mode mode;
1575 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1576 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1577 basic_block exit_bb;
1581 block_stmt_iterator exit_bsi;
1586 tree new_scalar_dest, exit_phi;
1587 tree bitsize, bitpos, bytesize;
1588 enum tree_code code = TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 1));
1589 tree scalar_initial_def;
1590 tree vec_initial_def;
1592 imm_use_iterator imm_iter;
1593 use_operand_p use_p;
1594 bool extract_scalar_result;
1598 tree operation = GIMPLE_STMT_OPERAND (stmt, 1);
/* The reduction variable is the last operand of the operation; the vector
   type used here is derived from its scalar type.  */
1601 op_type = TREE_OPERAND_LENGTH (operation);
1602 reduction_op = TREE_OPERAND (operation, op_type-1);
1603 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
1604 mode = TYPE_MODE (vectype);
1606 /*** 1. Create the reduction def-use cycle ***/
1608 /* 1.1 set the loop-entry arg of the reduction-phi: */
1609 /* For the case of reduction, vect_get_vec_def_for_operand returns
1610 the scalar def before the loop, that defines the initial value
1611 of the reduction variable. */
1612 vec_initial_def = vect_get_vec_def_for_operand (reduction_op, stmt,
1613 &scalar_initial_def);
1614 add_phi_arg (reduction_phi, vec_initial_def, loop_preheader_edge (loop));
1616 /* 1.2 set the loop-latch arg for the reduction-phi: */
1617 add_phi_arg (reduction_phi, vect_def, loop_latch_edge (loop));
1619 if (vect_print_dump_info (REPORT_DETAILS))
1621 fprintf (vect_dump, "transform reduction: created def-use cycle:");
1622 print_generic_expr (vect_dump, reduction_phi, TDF_SLIM);
1623 fprintf (vect_dump, "\n");
1624 print_generic_expr (vect_dump, SSA_NAME_DEF_STMT (vect_def), TDF_SLIM);
1628 /*** 2. Create epilog code
1629 The reduction epilog code operates across the elements of the vector
1630 of partial results computed by the vectorized loop.
1631 The reduction epilog code consists of:
1632 step 1: compute the scalar result in a vector (v_out2)
1633 step 2: extract the scalar result (s_out3) from the vector (v_out2)
1634 step 3: adjust the scalar result (s_out3) if needed.
1636 Step 1 can be accomplished using one of the following three schemes:
1637 (scheme 1) using reduc_code, if available.
1638 (scheme 2) using whole-vector shifts, if available.
1639 (scheme 3) using a scalar loop. In this case steps 1+2 above are
1642 The overall epilog code looks like this:
1644 s_out0 = phi <s_loop> # original EXIT_PHI
1645 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
1646 v_out2 = reduce <v_out1> # step 1
1647 s_out3 = extract_field <v_out2, 0> # step 2
1648 s_out4 = adjust_result <s_out3> # step 3
1650 (step 3 is optional, and steps 1 and 2 may be combined).
1651 Lastly, the uses of s_out0 are replaced by s_out4.
1655 /* 2.1 Create new loop-exit-phi to preserve loop-closed form:
1656 v_out1 = phi <v_loop> */
/* The new phi is created in the destination block of the loop's single exit,
   and all epilog stmts are inserted after the labels of that block.  */
1658 exit_bb = single_exit (loop)->dest;
1659 new_phi = create_phi_node (SSA_NAME_VAR (vect_def), exit_bb);
1660 SET_PHI_ARG_DEF (new_phi, single_exit (loop)->dest_idx, vect_def);
1661 exit_bsi = bsi_after_labels (exit_bb);
1663 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
1664 (i.e. when reduc_code is not available) and in the final adjustment
1665 code (if needed). Also get the original scalar reduction variable as
1666 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
1667 represents a reduction pattern), the tree-code and scalar-def are
1668 taken from the original stmt that the pattern-stmt (STMT) replaces.
1669 Otherwise (it is a regular reduction) - the tree-code and scalar-def
1670 are taken from STMT. */
1672 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
1675 /* Regular reduction */
1680 /* Reduction pattern */
1681 stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
1682 gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
1683 gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
/* From here on CODE, the scalar destination and its type describe ORIG_STMT;
   BITSIZE and BYTESIZE are the element size of the scalar type.  */
1685 code = TREE_CODE (GIMPLE_STMT_OPERAND (orig_stmt, 1));
1686 scalar_dest = GIMPLE_STMT_OPERAND (orig_stmt, 0);
1687 scalar_type = TREE_TYPE (scalar_dest);
1688 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
1689 bitsize = TYPE_SIZE (scalar_type);
1690 bytesize = TYPE_SIZE_UNIT (scalar_type);
1692 /* 2.3 Create the reduction code, using one of the three schemes described
1695 if (reduc_code < NUM_TREE_CODES)
1699 /*** Case 1: Create:
1700 v_out2 = reduc_expr <v_out1> */
1702 if (vect_print_dump_info (REPORT_DETAILS))
1703 fprintf (vect_dump, "Reduce using direct vector reduction.");
1705 vec_dest = vect_create_destination_var (scalar_dest, vectype);
1706 tmp = build1 (reduc_code, vectype, PHI_RESULT (new_phi));
1707 epilog_stmt = build_gimple_modify_stmt (vec_dest, tmp);
1708 new_temp = make_ssa_name (vec_dest, epilog_stmt);
1709 GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_temp;
1710 bsi_insert_before (&exit_bsi, epilog_stmt, BSI_SAME_STMT);
1712 extract_scalar_result = true;
1716 enum tree_code shift_code = 0;
1717 bool have_whole_vector_shift = true;
1719 int element_bitsize = tree_low_cst (bitsize, 1);
1720 int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
/* A whole-vector right shift is usable only if the target has a vec_shr
   handler for MODE.  */
1723 if (vec_shr_optab->handlers[mode].insn_code != CODE_FOR_nothing)
1724 shift_code = VEC_RSHIFT_EXPR;
1726 have_whole_vector_shift = false;
1728 /* Regardless of whether we have a whole vector shift, if we're
1729 emulating the operation via tree-vect-generic, we don't want
1730 to use it. Only the first round of the reduction is likely
1731 to still be profitable via emulation. */
1732 /* ??? It might be better to emit a reduction tree code here, so that
1733 tree-vect-generic can expand the first round via bit tricks. */
1734 if (!VECTOR_MODE_P (mode))
1735 have_whole_vector_shift = false;
/* The combining operation CODE must also have a handler for MODE.  */
1738 optab optab = optab_for_tree_code (code, vectype);
1739 if (optab->handlers[mode].insn_code == CODE_FOR_nothing)
1740 have_whole_vector_shift = false;
1743 if (have_whole_vector_shift)
1745 /*** Case 2: Create:
1746 for (offset = VS/2; offset >= element_size; offset/=2)
1748 Create: va' = vec_shift <va, offset>
1749 Create: va = vop <va, va'>
1752 if (vect_print_dump_info (REPORT_DETAILS))
1753 fprintf (vect_dump, "Reduce using vector shifts");
1755 vec_dest = vect_create_destination_var (scalar_dest, vectype);
1756 new_temp = PHI_RESULT (new_phi);
1758 for (bit_offset = vec_size_in_bits/2;
1759 bit_offset >= element_bitsize;
1762 tree bitpos = size_int (bit_offset);
1763 tree tmp = build2 (shift_code, vectype, new_temp, bitpos);
1764 epilog_stmt = build_gimple_modify_stmt (vec_dest, tmp);
1765 new_name = make_ssa_name (vec_dest, epilog_stmt);
1766 GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_name;
1767 bsi_insert_before (&exit_bsi, epilog_stmt, BSI_SAME_STMT);
1769 tmp = build2 (code, vectype, new_name, new_temp);
1770 epilog_stmt = build_gimple_modify_stmt (vec_dest, tmp);
1771 new_temp = make_ssa_name (vec_dest, epilog_stmt);
1772 GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_temp;
1773 bsi_insert_before (&exit_bsi, epilog_stmt, BSI_SAME_STMT);
1776 extract_scalar_result = true;
1782 /*** Case 3: Create:
1783 s = extract_field <v_out2, 0>
1784 for (offset = element_size;
1785 offset < vector_size;
1786 offset += element_size;)
1788 Create: s' = extract_field <v_out2, offset>
1789 Create: s = op <s, s'>
1792 if (vect_print_dump_info (REPORT_DETAILS))
1793 fprintf (vect_dump, "Reduce using scalar code. ");
1795 vec_temp = PHI_RESULT (new_phi);
1796 vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
1797 rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
1799 BIT_FIELD_REF_UNSIGNED (rhs) = TYPE_UNSIGNED (scalar_type);
1800 epilog_stmt = build_gimple_modify_stmt (new_scalar_dest, rhs);
1801 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
1802 GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_temp;
1803 bsi_insert_before (&exit_bsi, epilog_stmt, BSI_SAME_STMT);
1805 for (bit_offset = element_bitsize;
1806 bit_offset < vec_size_in_bits;
1807 bit_offset += element_bitsize)
1810 tree bitpos = bitsize_int (bit_offset);
1811 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
1814 BIT_FIELD_REF_UNSIGNED (rhs) = TYPE_UNSIGNED (scalar_type);
1815 epilog_stmt = build_gimple_modify_stmt (new_scalar_dest, rhs);
1816 new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
1817 GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_name;
1818 bsi_insert_before (&exit_bsi, epilog_stmt, BSI_SAME_STMT);
1820 tmp = build2 (code, scalar_type, new_name, new_temp);
1821 epilog_stmt = build_gimple_modify_stmt (new_scalar_dest, tmp);
1822 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
1823 GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_temp;
1824 bsi_insert_before (&exit_bsi, epilog_stmt, BSI_SAME_STMT);
1827 extract_scalar_result = false;
1831 /* 2.4 Extract the final scalar result. Create:
1832 s_out3 = extract_field <v_out2, bitpos> */
1834 if (extract_scalar_result)
1838 if (vect_print_dump_info (REPORT_DETAILS))
1839 fprintf (vect_dump, "extract scalar result");
/* On big-endian targets the result is read from the last element of the
   vector, i.e. at bit position (nunits - 1) * element size; otherwise from
   bit position 0.  */
1841 if (BYTES_BIG_ENDIAN)
1842 bitpos = size_binop (MULT_EXPR,
1843 bitsize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1),
1844 TYPE_SIZE (scalar_type));
1846 bitpos = bitsize_zero_node;
1848 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp, bitsize, bitpos);
1849 BIT_FIELD_REF_UNSIGNED (rhs) = TYPE_UNSIGNED (scalar_type);
1850 epilog_stmt = build_gimple_modify_stmt (new_scalar_dest, rhs);
1851 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
1852 GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_temp;
1853 bsi_insert_before (&exit_bsi, epilog_stmt, BSI_SAME_STMT);
1856 /* 2.5 Adjust the final result by the initial value of the reduction
1857 variable. (When such adjustment is not needed, then
1858 'scalar_initial_def' is null).
1861 s_out4 = scalar_expr <s_out3, scalar_initial_def> */
1863 if (scalar_initial_def)
1865 tree tmp = build2 (code, scalar_type, new_temp, scalar_initial_def);
1866 epilog_stmt = build_gimple_modify_stmt (new_scalar_dest, tmp);
1867 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
1868 GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_temp;
1869 bsi_insert_before (&exit_bsi, epilog_stmt, BSI_SAME_STMT);
1872 /* 2.6 Replace uses of s_out0 with uses of the final scalar result
   (s_out4, or s_out3 when no adjustment stmt was emitted) */
1874 /* Find the loop-closed-use at the loop exit of the original scalar result.
1875 (The reduction result is expected to have two immediate uses - one at the
1876 latch block, and one at the loop exit). */
1878 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
1880 if (!flow_bb_inside_loop_p (loop, bb_for_stmt (USE_STMT (use_p))))
1882 exit_phi = USE_STMT (use_p);
1886 /* We expect to have found an exit_phi because of loop-closed-ssa form. */
1887 gcc_assert (exit_phi);
1888 /* Replace the uses: */
/* Every use of the exit phi's result is redirected to NEW_TEMP, the last
   scalar produced above.  */
1889 orig_name = PHI_RESULT (exit_phi);
1890 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
1891 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
1892 SET_USE (use_p, new_temp);
1896 /* Function vectorizable_reduction.
1898 Check if STMT performs a reduction operation that can be vectorized.
1899 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
1900 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
1901 Return FALSE if not a vectorizable STMT, TRUE otherwise.
1903 This function also handles reduction idioms (patterns) that have been
1904 recognized in advance during vect_pattern_recog. In this case, STMT may be
1906 X = pattern_expr (arg0, arg1, ..., X)
1907 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
1908 sequence that had been detected and replaced by the pattern-stmt (STMT).
1910 In some cases of reduction patterns, the type of the reduction variable X is
1911 different than the type of the other arguments of STMT.
1912 In such cases, the vectype that is used when transforming STMT into a vector
1913 stmt is different than the vectype that is used to determine the
1914 vectorization factor, because it consists of a different number of elements
1915 than the actual number of elements that are being operated upon in parallel.
1917 For example, consider an accumulation of shorts into an int accumulator.
1918 On some targets it's possible to vectorize this pattern operating on 8
1919 shorts at a time (hence, the vectype for purposes of determining the
1920 vectorization factor should be V8HI); on the other hand, the vectype that
1921 is used to create the vector form is actually V4SI (the type of the result).
1923 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
1924 indicates what is the actual level of parallelism (V8HI in the example), so
1925 that the right vectorization factor would be derived. This vectype
1926 corresponds to the type of arguments to the reduction stmt, and should *NOT*
1927 be used to create the vectorized stmt. The right vectype for the vectorized
1928 stmt is obtained from the type of the result X:
1929 get_vectype_for_scalar_type (TREE_TYPE (X))
1931 This means that, contrary to "regular" reductions (or "regular" stmts in
1932 general), the following equation:
1933 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
1934 does *NOT* necessarily hold for reduction patterns. */
1937 vectorizable_reduction (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
1942 tree loop_vec_def0 = NULL_TREE, loop_vec_def1 = NULL_TREE;
1943 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1944 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1945 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1946 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1948 enum tree_code code, orig_code, epilog_reduc_code = 0;
1949 enum machine_mode vec_mode;
1951 optab optab, reduc_optab;
1952 tree new_temp = NULL_TREE;
1954 enum vect_def_type dt;
1959 stmt_vec_info orig_stmt_info;
1960 tree expr = NULL_TREE;
1962 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
1963 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
1964 stmt_vec_info prev_stmt_info;
1966 tree new_stmt = NULL_TREE;
1969 gcc_assert (ncopies >= 1);
1971 /* 1. Is vectorizable reduction? */
1973 /* Not supportable if the reduction variable is used in the loop. */
1974 if (STMT_VINFO_RELEVANT_P (stmt_info))
1977 if (!STMT_VINFO_LIVE_P (stmt_info))
1980 /* Make sure it was already recognized as a reduction computation. */
1981 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def)
1984 /* 2. Has this been recognized as a reduction pattern?
1986 Check if STMT represents a pattern that has been recognized
1987 in earlier analysis stages. For stmts that represent a pattern,
1988 the STMT_VINFO_RELATED_STMT field records the last stmt in
1989 the original sequence that constitutes the pattern. */
1991 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
1994 orig_stmt_info = vinfo_for_stmt (orig_stmt);
1995 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt);
1996 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
1997 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
2000 /* 3. Check the operands of the operation. The first operands are defined
2001 inside the loop body. The last operand is the reduction variable,
2002 which is defined by the loop-header-phi. */
2004 gcc_assert (TREE_CODE (stmt) == GIMPLE_MODIFY_STMT);
2006 operation = GIMPLE_STMT_OPERAND (stmt, 1);
2007 code = TREE_CODE (operation);
2008 op_type = TREE_OPERAND_LENGTH (operation);
2009 if (op_type != binary_op && op_type != ternary_op)
2011 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
2012 scalar_type = TREE_TYPE (scalar_dest);
2014 /* All uses but the last are expected to be defined in the loop.
2015 The last use is the reduction variable. */
2016 for (i = 0; i < op_type-1; i++)
2018 op = TREE_OPERAND (operation, i);
2019 is_simple_use = vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt);
2020 gcc_assert (is_simple_use);
2021 if (dt != vect_loop_def
2022 && dt != vect_invariant_def
2023 && dt != vect_constant_def
2024 && dt != vect_induction_def)
2028 op = TREE_OPERAND (operation, i);
2029 is_simple_use = vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt);
2030 gcc_assert (is_simple_use);
2031 gcc_assert (dt == vect_reduction_def);
2032 gcc_assert (TREE_CODE (def_stmt) == PHI_NODE);
2034 gcc_assert (orig_stmt == vect_is_simple_reduction (loop, def_stmt));
2036 gcc_assert (stmt == vect_is_simple_reduction (loop, def_stmt));
2038 if (STMT_VINFO_LIVE_P (vinfo_for_stmt (def_stmt)))
2041 /* 4. Supportable by target? */
2043 /* 4.1. check support for the operation in the loop */
2044 optab = optab_for_tree_code (code, vectype);
2047 if (vect_print_dump_info (REPORT_DETAILS))
2048 fprintf (vect_dump, "no optab.");
2051 vec_mode = TYPE_MODE (vectype);
2052 if (optab->handlers[(int) vec_mode].insn_code == CODE_FOR_nothing)
2054 if (vect_print_dump_info (REPORT_DETAILS))
2055 fprintf (vect_dump, "op not supported by target.");
2056 if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
2057 || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
2058 < vect_min_worthwhile_factor (code))
2060 if (vect_print_dump_info (REPORT_DETAILS))
2061 fprintf (vect_dump, "proceeding using word mode.");
2064 /* Worthwhile without SIMD support? */
2065 if (!VECTOR_MODE_P (TYPE_MODE (vectype))
2066 && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
2067 < vect_min_worthwhile_factor (code))
2069 if (vect_print_dump_info (REPORT_DETAILS))
2070 fprintf (vect_dump, "not worthwhile without SIMD support.");
2074 /* 4.2. Check support for the epilog operation.
2076 If STMT represents a reduction pattern, then the type of the
2077 reduction variable may be different than the type of the rest
2078 of the arguments. For example, consider the case of accumulation
2079 of shorts into an int accumulator; The original code:
2080 S1: int_a = (int) short_a;
2081 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
2084 STMT: int_acc = widen_sum <short_a, int_acc>
2087 1. The tree-code that is used to create the vector operation in the
2088 epilog code (that reduces the partial results) is not the
2089 tree-code of STMT, but is rather the tree-code of the original
2090 stmt from the pattern that STMT is replacing. I.e, in the example
2091 above we want to use 'widen_sum' in the loop, but 'plus' in the
2093 2. The type (mode) we use to check available target support
2094 for the vector operation to be created in the *epilog*, is
2095 determined by the type of the reduction variable (in the example
2096 above we'd check this: plus_optab[vect_int_mode]).
2097 However the type (mode) we use to check available target support
2098 for the vector operation to be created *inside the loop*, is
2099 determined by the type of the other arguments to STMT (in the
2100 example we'd check this: widen_sum_optab[vect_short_mode]).
2102 This is contrary to "regular" reductions, in which the types of all
2103 the arguments are the same as the type of the reduction variable.
2104 For "regular" reductions we can therefore use the same vector type
2105 (and also the same tree-code) when generating the epilog code and
2106 when generating the code inside the loop. */
2110 /* This is a reduction pattern: get the vectype from the type of the
2111 reduction variable, and get the tree-code from orig_stmt. */
2112 orig_code = TREE_CODE (GIMPLE_STMT_OPERAND (orig_stmt, 1));
2113 vectype = get_vectype_for_scalar_type (TREE_TYPE (def));
2114 vec_mode = TYPE_MODE (vectype);
2118 /* Regular reduction: the same vectype and tree-code as used for
2119 the vector code inside the loop can be used for the epilog code. */
2123 if (!reduction_code_for_scalar_code (orig_code, &epilog_reduc_code))
2125 reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype);
2128 if (vect_print_dump_info (REPORT_DETAILS))
2129 fprintf (vect_dump, "no optab for reduction.");
2130 epilog_reduc_code = NUM_TREE_CODES;
2132 if (reduc_optab->handlers[(int) vec_mode].insn_code == CODE_FOR_nothing)
2134 if (vect_print_dump_info (REPORT_DETAILS))
2135 fprintf (vect_dump, "reduc op not supported by target.");
2136 epilog_reduc_code = NUM_TREE_CODES;
2139 if (!vec_stmt) /* transformation not required. */
2141 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
2142 vect_model_reduction_cost (stmt_info, epilog_reduc_code, ncopies);
2148 if (vect_print_dump_info (REPORT_DETAILS))
2149 fprintf (vect_dump, "transform reduction.");
2151 /* Create the destination vector */
2152 vec_dest = vect_create_destination_var (scalar_dest, vectype);
2154 /* Create the reduction-phi that defines the reduction-operand. */
2155 new_phi = create_phi_node (vec_dest, loop->header);
2157 /* In case the vectorization factor (VF) is bigger than the number
2158 of elements that we can fit in a vectype (nunits), we have to generate
2159 more than one vector stmt - i.e - we need to "unroll" the
2160 vector stmt by a factor VF/nunits. For more details see documentation
2161 in vectorizable_operation. */
2163 prev_stmt_info = NULL;
2164 for (j = 0; j < ncopies; j++)
2169 op = TREE_OPERAND (operation, 0);
2170 loop_vec_def0 = vect_get_vec_def_for_operand (op, stmt, NULL);
2171 if (op_type == ternary_op)
2173 op = TREE_OPERAND (operation, 1);
2174 loop_vec_def1 = vect_get_vec_def_for_operand (op, stmt, NULL);
2177 /* Get the vector def for the reduction variable from the phi node */
2178 reduc_def = PHI_RESULT (new_phi);
2182 enum vect_def_type dt = vect_unknown_def_type; /* Dummy */
2183 loop_vec_def0 = vect_get_vec_def_for_stmt_copy (dt, loop_vec_def0);
2184 if (op_type == ternary_op)
2185 loop_vec_def1 = vect_get_vec_def_for_stmt_copy (dt, loop_vec_def1);
2187 /* Get the vector def for the reduction variable from the vectorized
2188 reduction operation generated in the previous iteration (j-1) */
2189 reduc_def = GIMPLE_STMT_OPERAND (new_stmt ,0);
2192 /* Arguments are ready. create the new vector stmt. */
2193 if (op_type == binary_op)
2194 expr = build2 (code, vectype, loop_vec_def0, reduc_def);
2196 expr = build3 (code, vectype, loop_vec_def0, loop_vec_def1,
2198 new_stmt = build_gimple_modify_stmt (vec_dest, expr);
2199 new_temp = make_ssa_name (vec_dest, new_stmt);
2200 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
2201 vect_finish_stmt_generation (stmt, new_stmt, bsi);
2204 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
2206 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
2207 prev_stmt_info = vinfo_for_stmt (new_stmt);
2210 /* Finalize the reduction-phi (set its arguments) and create the
2211 epilog reduction code. */
2212 vect_create_epilog_for_reduction (new_temp, stmt, epilog_reduc_code, new_phi);
2216 /* Checks if CALL can be vectorized with result vector type VECTYPE_OUT
2217 and argument vector type VECTYPE_IN. Returns a function declaration if
2218 the target has a vectorized version of the function, or NULL_TREE if the
function cannot be vectorized.

The callee of CALL is obtained with get_callee_fndecl. Its built-in
function code is handed, together with VECTYPE_OUT and VECTYPE_IN, to the
target hook targetm.vectorize.builtin_vectorized_function, and the answer
of that hook is what this function returns. */
2221 vectorizable_function (tree call, tree vectype_out, tree vectype_in)
2223 tree fndecl = get_callee_fndecl (call);
2224 enum built_in_function code;
2226 /* We only handle functions that do not read or clobber memory -- i.e.
2227 const or novops ones. */
2228 if (!(call_expr_flags (call) & (ECF_CONST | ECF_NOVOPS)))
/* Only built-in FUNCTION_DECLs qualify: the target hook below is keyed on
DECL_FUNCTION_CODE, which is read from FNDECL right after this test.
NOTE(review): the first clause of this test is on a line that is not
visible in this listing. */
2232 || TREE_CODE (fndecl) != FUNCTION_DECL
2233 || !DECL_BUILT_IN (fndecl))
2236 code = DECL_FUNCTION_CODE (fndecl);
2237 return targetm.vectorize.builtin_vectorized_function (code, vectype_out,
2241 /* Function vectorizable_call.
2243 Check if STMT performs a function call that can be vectorized.
2244 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
2245 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
2246 Return FALSE if not a vectorizable STMT, TRUE otherwise.

STMT is tested for being relevant, of def type vect_loop_def and not live
after the loop, and for being a GIMPLE_MODIFY_STMT that assigns a CALL_EXPR
to an SSA_NAME. All call arguments must have the same type and be simple
uses, the input and output vector types must have the same number of
elements, and vectorizable_function must find a target builtin.

When VEC_STMT is NULL only the analysis is done: the stmt is tagged
call_vec_info_type and its cost is modelled. Otherwise one vector call is
emitted per copy (the vectorization factor divided by the number of
elements of the output vector type), and the copies are chained through
STMT_VINFO_RELATED_STMT. Finally the right-hand side of the scalar STMT is
replaced by a zero of the scalar destination's type. */
2249 vectorizable_call (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
2255 stmt_vec_info stmt_info = vinfo_for_stmt (stmt), prev_stmt_info;
2256 tree vectype_out, vectype_in;
2257 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2258 tree fndecl, rhs, new_temp, def, def_stmt, rhs_type, lhs_type;
2259 enum vect_def_type dt[2];
2260 int ncopies, j, nargs;
2261 call_expr_arg_iterator iter;
2263 if (!STMT_VINFO_RELEVANT_P (stmt_info))
2266 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
2269 /* FORNOW: not yet supported. */
2270 if (STMT_VINFO_LIVE_P (stmt_info))
2272 if (vect_print_dump_info (REPORT_DETAILS))
2273 fprintf (vect_dump, "value used after loop.");
2277 /* Is STMT a vectorizable call? */
2278 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
2281 if (TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 0)) != SSA_NAME)
2284 operation = GIMPLE_STMT_OPERAND (stmt, 1);
2285 if (TREE_CODE (operation) != CALL_EXPR)
2288 /* Process function arguments. */
2289 rhs_type = NULL_TREE;
2291 FOR_EACH_CALL_EXPR_ARG (op, iter, operation)
2295 /* Bail out if the function has more than two arguments, we
2296 do not have interesting builtin functions to vectorize with
2297 more than two arguments. */
2301 /* We can only handle calls with arguments of the same type. */
2303 && rhs_type != TREE_TYPE (op))
2305 if (vect_print_dump_info (REPORT_DETAILS))
2306 fprintf (vect_dump, "argument types differ.");
2309 rhs_type = TREE_TYPE (op);
/* The def type of every argument is kept in DT so that the later copies
can be derived with vect_get_vec_def_for_stmt_copy. NOTE(review): NARGS
is presumably incremented on a line not visible in this listing, which
would explain the nargs-1 index -- confirm. */
2311 if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt[nargs-1]))
2313 if (vect_print_dump_info (REPORT_DETAILS))
2314 fprintf (vect_dump, "use not simple.");
2319 /* No arguments is also not good. */
2323 vectype_in = get_vectype_for_scalar_type (rhs_type);
2325 lhs_type = TREE_TYPE (GIMPLE_STMT_OPERAND (stmt, 0));
2326 vectype_out = get_vectype_for_scalar_type (lhs_type);
2328 /* Only handle the case of vectors with the same number of elements.
2329 FIXME: We need a way to handle for example the SSE2 cvtpd2dq
2330 instruction which converts V2DFmode to V4SImode but only
2331 using the lower half of the V4SImode result. */
2332 if (TYPE_VECTOR_SUBPARTS (vectype_in) != TYPE_VECTOR_SUBPARTS (vectype_out))
2335 /* For now, we only vectorize functions if a target specific builtin
2336 is available. TODO -- in some cases, it might be profitable to
2337 insert the calls for pieces of the vector, in order to be able
2338 to vectorize other operations in the loop. */
2339 fndecl = vectorizable_function (operation, vectype_out, vectype_in);
2340 if (fndecl == NULL_TREE)
2342 if (vect_print_dump_info (REPORT_DETAILS))
2343 fprintf (vect_dump, "function is not vectorizable.");
/* The scalar call is expected to carry no virtual operands, which is in
line with vectorizable_function accepting only ECF_CONST / ECF_NOVOPS
calls. */
2348 gcc_assert (ZERO_SSA_OPERANDS (stmt, SSA_OP_ALL_VIRTUALS));
/* Number of vector stmts to generate for this scalar stmt. */
2350 ncopies = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
2351 / TYPE_VECTOR_SUBPARTS (vectype_out));
2353 if (!vec_stmt) /* transformation not required. */
2355 STMT_VINFO_TYPE (stmt_info) = call_vec_info_type;
2356 if (vect_print_dump_info (REPORT_DETAILS))
2357 fprintf (vect_dump, "=== vectorizable_call ===");
2358 vect_model_simple_cost (stmt_info, ncopies);
2364 if (vect_print_dump_info (REPORT_DETAILS))
2365 fprintf (vect_dump, "transform operation.");
2367 gcc_assert (ncopies >= 1);
2370 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
2371 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
2373 prev_stmt_info = NULL;
2374 for (j = 0; j < ncopies; ++j)
2376 tree new_stmt, vargs;
2380 /* Build argument list for the vectorized call. */
2381 /* FIXME: Rewrite this so that it doesn't construct a temporary
2385 FOR_EACH_CALL_EXPR_ARG (op, iter, operation)
2390 vec_oprnd[n] = vect_get_vec_def_for_operand (op, stmt, NULL);
2392 vec_oprnd[n] = vect_get_vec_def_for_stmt_copy (dt[n], vec_oprnd[n]);
2394 vargs = tree_cons (NULL_TREE, vec_oprnd[n], vargs);
2396 vargs = nreverse (vargs);
2398 rhs = build_function_call_expr (fndecl, vargs);
2399 new_stmt = build_gimple_modify_stmt (vec_dest, rhs);
2400 new_temp = make_ssa_name (vec_dest, new_stmt);
2401 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
2403 vect_finish_stmt_generation (stmt, new_stmt, bsi);
2406 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
2408 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
2409 prev_stmt_info = vinfo_for_stmt (new_stmt);
2412 /* The call in STMT might prevent it from being removed in dce. We however
2413 cannot remove it here, due to the way the ssa name it defines is mapped
2414 to the new definition. So just replace rhs of the statement with something
2416 type = TREE_TYPE (scalar_dest);
2417 GIMPLE_STMT_OPERAND (stmt, 1) = fold_convert (type, integer_zero_node);
2424 /* Function vect_gen_widened_results_half
2426 Create a vector stmt whose code, type, number of arguments, and result
2427 variable are CODE, VECTYPE, OP_TYPE, and VEC_DEST, and its arguments are
2428 VEC_OPRND0 and VEC_OPRND1. The new vector stmt is to be inserted at BSI.
2429 In the case that CODE is a CALL_EXPR, this means that a call to DECL
2430 needs to be created (DECL is a function-decl of a target-builtin).
2431 STMT is the original scalar stmt that we are vectorizing.

VEC_OPRND1 is only used when OP_TYPE is binary_op. The value computed by
the new stmt is assigned to a fresh SSA name made from VEC_DEST.
NOTE(review): the return statement is not visible in this listing; the
callers in vectorizable_conversion use the result as the new stmt, so
NEW_STMT is presumably what is returned -- confirm. */
2434 vect_gen_widened_results_half (enum tree_code code, tree vectype, tree decl,
2435 tree vec_oprnd0, tree vec_oprnd1, int op_type,
2436 tree vec_dest, block_stmt_iterator *bsi,
2445 /* Generate half of the widened result: */
/* CODE == CALL_EXPR is the marker for "call the target builtin DECL";
any other CODE is used directly as the tree code of the expression. */
2446 if (code == CALL_EXPR)
2448 /* Target specific support */
2449 if (op_type == binary_op)
2450 expr = build_call_expr (decl, 2, vec_oprnd0, vec_oprnd1);
2452 expr = build_call_expr (decl, 1, vec_oprnd0);
2456 /* Generic support */
/* The requested operand count must agree with the arity of CODE. */
2457 gcc_assert (op_type == TREE_CODE_LENGTH (code));
2458 if (op_type == binary_op)
2459 expr = build2 (code, vectype, vec_oprnd0, vec_oprnd1);
2461 expr = build1 (code, vectype, vec_oprnd0);
/* Wrap the expression in an assignment to a new SSA name and emit it. */
2463 new_stmt = build_gimple_modify_stmt (vec_dest, expr);
2464 new_temp = make_ssa_name (vec_dest, new_stmt);
2465 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
2466 vect_finish_stmt_generation (stmt, new_stmt, bsi);
/* For a builtin call, every virtual operand found on the new stmt has its
underlying symbol marked for renaming. */
2468 if (code == CALL_EXPR)
2470 FOR_EACH_SSA_TREE_OPERAND (sym, new_stmt, iter, SSA_OP_ALL_VIRTUALS)
2472 if (TREE_CODE (sym) == SSA_NAME)
2473 sym = SSA_NAME_VAR (sym);
2474 mark_sym_for_renaming (sym);
2482 /* Function vectorizable_conversion.
2484 Check if STMT performs a conversion operation, that can be vectorized.
2485 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
2486 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
2487 Return FALSE if not a vectorizable STMT, TRUE otherwise.

Only FIX_TRUNC_EXPR and FLOAT_EXPR are considered, and exactly one of the
two scalar types involved must be integral. The relation between the
number of elements of the input and output vector types selects one of
three strategies (MODIFIER): same count, widening, or narrowing; each has
its own target-support query and its own generation loop below.

When VEC_STMT is NULL only the analysis is done and the stmt is tagged
type_conversion_vec_info_type. */
2490 vectorizable_conversion (tree stmt, block_stmt_iterator * bsi,
2497 tree vec_oprnd0 = NULL_TREE, vec_oprnd1 = NULL_TREE;
2498 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2499 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2500 enum tree_code code, code1 = ERROR_MARK, code2 = ERROR_MARK;
2501 tree decl1 = NULL_TREE, decl2 = NULL_TREE;
2504 enum vect_def_type dt0;
2506 stmt_vec_info prev_stmt_info;
2509 tree vectype_out, vectype_in;
2512 tree rhs_type, lhs_type;
2514 enum { NARROW, NONE, WIDEN } modifier;
2516 /* Is STMT a vectorizable conversion? */
2518 if (!STMT_VINFO_RELEVANT_P (stmt_info))
2521 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
2524 if (STMT_VINFO_LIVE_P (stmt_info))
2526 /* FORNOW: not yet supported. */
2527 if (vect_print_dump_info (REPORT_DETAILS))
2528 fprintf (vect_dump, "value used after loop.");
2532 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
2535 if (TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 0)) != SSA_NAME)
2538 operation = GIMPLE_STMT_OPERAND (stmt, 1);
2539 code = TREE_CODE (operation);
2540 if (code != FIX_TRUNC_EXPR && code != FLOAT_EXPR)
2543 /* Check types of lhs and rhs */
2544 op0 = TREE_OPERAND (operation, 0);
2545 rhs_type = TREE_TYPE (op0);
2546 vectype_in = get_vectype_for_scalar_type (rhs_type);
2547 nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
2549 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
2550 lhs_type = TREE_TYPE (scalar_dest);
2551 vectype_out = get_vectype_for_scalar_type (lhs_type);
2552 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
/* Pick MODIFIER from the element counts of the two vector types.
NOTE(review): the assignments sit on lines not visible in this listing;
presumably NARROW, NONE and WIDEN in this order -- confirm. */
2555 if (nunits_in == nunits_out / 2)
2557 else if (nunits_out == nunits_in)
2559 else if (nunits_out == nunits_in / 2)
2564 if (modifier == NONE)
2565 gcc_assert (STMT_VINFO_VECTYPE (stmt_info) == vectype_out);
2567 /* Bail out if the types are both integral or non-integral */
2568 if ((INTEGRAL_TYPE_P (rhs_type) && INTEGRAL_TYPE_P (lhs_type))
2569 || (!INTEGRAL_TYPE_P (rhs_type) && !INTEGRAL_TYPE_P (lhs_type)))
/* The number of copies is derived from the output element count when
narrowing and from the input element count otherwise. */
2572 if (modifier == NARROW)
2573 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_out;
2575 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in;
2577 /* Sanity check: make sure that at least one copy of the vectorized stmt
2578 needs to be generated. */
2579 gcc_assert (ncopies >= 1);
2581 /* Check the operands of the operation. */
2582 if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt0))
2584 if (vect_print_dump_info (REPORT_DETAILS))
2585 fprintf (vect_dump, "use not simple.");
2589 /* Supportable by target? */
/* Each strategy has its own query: the builtin_conversion target hook for
NONE, supportable_widening_operation for WIDEN and
supportable_narrowing_operation for NARROW. */
2590 if ((modifier == NONE
2591 && !targetm.vectorize.builtin_conversion (code, vectype_in))
2592 || (modifier == WIDEN
2593 && !supportable_widening_operation (code, stmt, vectype_in,
2596 || (modifier == NARROW
2597 && !supportable_narrowing_operation (code, stmt, vectype_in,
2600 if (vect_print_dump_info (REPORT_DETAILS))
2601 fprintf (vect_dump, "op not supported by target.");
/* For widening and narrowing the vector type recorded for the stmt is
switched to the input vector type. */
2605 if (modifier != NONE)
2606 STMT_VINFO_VECTYPE (stmt_info) = vectype_in;
2608 if (!vec_stmt) /* transformation not required. */
2610 STMT_VINFO_TYPE (stmt_info) = type_conversion_vec_info_type;
2615 if (vect_print_dump_info (REPORT_DETAILS))
2616 fprintf (vect_dump, "transform conversion.");
2619 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
2621 prev_stmt_info = NULL;
/* Builtin-call strategy: every copy is one call to the decl supplied by
the builtin_conversion target hook. NOTE(review): the lines dispatching
on MODIFIER are not visible here; this is presumably the NONE case. */
2625 for (j = 0; j < ncopies; j++)
2631 vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
2633 vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt0, vec_oprnd0);
2636 targetm.vectorize.builtin_conversion (code, vectype_in);
2637 new_stmt = build_call_expr (builtin_decl, 1, vec_oprnd0);
2639 /* Arguments are ready. create the new vector stmt. */
2640 new_stmt = build_gimple_modify_stmt (vec_dest, new_stmt);
2641 new_temp = make_ssa_name (vec_dest, new_stmt);
2642 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
2643 vect_finish_stmt_generation (stmt, new_stmt, bsi);
2644 FOR_EACH_SSA_TREE_OPERAND (sym, new_stmt, iter, SSA_OP_ALL_VIRTUALS)
2646 if (TREE_CODE (sym) == SSA_NAME)
2647 sym = SSA_NAME_VAR (sym);
2648 mark_sym_for_renaming (sym);
2652 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
2654 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
2655 prev_stmt_info = vinfo_for_stmt (new_stmt);
/* Two-halves strategy: every input vector yields two result stmts, one
built from CODE1/DECL1 and one from CODE2/DECL2, both linked into the
RELATED_STMT chain. NOTE(review): presumably the WIDEN case. */
2660 /* In case the vectorization factor (VF) is bigger than the number
2661 of elements that we can fit in a vectype (nunits), we have to
2662 generate more than one vector stmt - i.e - we need to "unroll"
2663 the vector stmt by a factor VF/nunits. */
2664 for (j = 0; j < ncopies; j++)
2667 vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
2669 vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt0, vec_oprnd0);
2671 STMT_VINFO_VECTYPE (stmt_info) = vectype_in;
2673 /* Generate first half of the widened result: */
2675 = vect_gen_widened_results_half (code1, vectype_out, decl1,
2676 vec_oprnd0, vec_oprnd1,
2677 unary_op, vec_dest, bsi, stmt);
2679 STMT_VINFO_VEC_STMT (stmt_info) = new_stmt;
2681 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
2682 prev_stmt_info = vinfo_for_stmt (new_stmt);
2684 /* Generate second half of the widened result: */
2686 = vect_gen_widened_results_half (code2, vectype_out, decl2,
2687 vec_oprnd0, vec_oprnd1,
2688 unary_op, vec_dest, bsi, stmt);
2689 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
2690 prev_stmt_info = vinfo_for_stmt (new_stmt);
/* Two-inputs strategy: every result stmt combines two consecutive vector
defs of OP0 with CODE1. NOTE(review): presumably the NARROW case. */
2695 /* In case the vectorization factor (VF) is bigger than the number
2696 of elements that we can fit in a vectype (nunits), we have to
2697 generate more than one vector stmt - i.e - we need to "unroll"
2698 the vector stmt by a factor VF/nunits. */
2699 for (j = 0; j < ncopies; j++)
/* First copy: the def of OP0 and the copy that follows it. */
2704 vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
2705 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt0, vec_oprnd0);
/* Later copies: continue from the second operand of the previous copy. */
2709 vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt0, vec_oprnd1);
2710 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt0, vec_oprnd0);
2713 /* Arguments are ready. Create the new vector stmt. */
2714 expr = build2 (code1, vectype_out, vec_oprnd0, vec_oprnd1);
2715 new_stmt = build_gimple_modify_stmt (vec_dest, expr);
2716 new_temp = make_ssa_name (vec_dest, new_stmt);
2717 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
2718 vect_finish_stmt_generation (stmt, new_stmt, bsi);
2721 STMT_VINFO_VEC_STMT (stmt_info) = new_stmt;
2723 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
2725 prev_stmt_info = vinfo_for_stmt (new_stmt);
/* Hand the recorded vector stmt of STMT back to the caller. */
2728 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
2734 /* Function vectorizable_assignment.
2736 Check if STMT performs an assignment (copy) that can be vectorized.
2737 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
2738 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
2739 Return FALSE if not a vectorizable STMT, TRUE otherwise.

STMT is tested for being relevant, of def type vect_loop_def, not live
after the loop, and for being a GIMPLE_MODIFY_STMT whose left-hand side is
an SSA_NAME and whose right-hand side is a simple use.

When VEC_STMT is NULL only the analysis is done: the stmt is tagged
assignment_vec_info_type and its cost is modelled. Otherwise a single
vector copy from the vector def of the right-hand side is emitted. */
2742 vectorizable_assignment (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
2748 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2749 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2750 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2753 enum vect_def_type dt;
2754 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
2755 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
2757 gcc_assert (ncopies >= 1);
/* NOTE(review): the condition guarding this early exit is on a line not
visible in this listing. The transformation below emits exactly one
vector stmt, so it is presumably a test for more than one copy --
confirm. */
2759 return false; /* FORNOW */
2761 if (!STMT_VINFO_RELEVANT_P (stmt_info))
2764 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
2767 /* FORNOW: not yet supported. */
2768 if (STMT_VINFO_LIVE_P (stmt_info))
2770 if (vect_print_dump_info (REPORT_DETAILS))
2771 fprintf (vect_dump, "value used after loop.");
2775 /* Is vectorizable assignment? */
2776 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
2779 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
2780 if (TREE_CODE (scalar_dest) != SSA_NAME)
2783 op = GIMPLE_STMT_OPERAND (stmt, 1);
2784 if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt))
2786 if (vect_print_dump_info (REPORT_DETAILS))
2787 fprintf (vect_dump, "use not simple.");
2791 if (!vec_stmt) /* transformation not required. */
2793 STMT_VINFO_TYPE (stmt_info) = assignment_vec_info_type;
2794 if (vect_print_dump_info (REPORT_DETAILS))
2795 fprintf (vect_dump, "=== vectorizable_assignment ===");
2796 vect_model_simple_cost (stmt_info, ncopies);
2801 if (vect_print_dump_info (REPORT_DETAILS))
2802 fprintf (vect_dump, "transform assignment.");
/* Destination: a vector variable derived from the scalar destination. */
2805 vec_dest = vect_create_destination_var (scalar_dest, vectype);
/* Source: the vector def that corresponds to the scalar right-hand side. */
2808 op = GIMPLE_STMT_OPERAND (stmt, 1);
2809 vec_oprnd = vect_get_vec_def_for_operand (op, stmt, NULL);
2811 /* Arguments are ready. create the new vector stmt. */
2812 *vec_stmt = build_gimple_modify_stmt (vec_dest, vec_oprnd);
2813 new_temp = make_ssa_name (vec_dest, *vec_stmt);
2814 GIMPLE_STMT_OPERAND (*vec_stmt, 0) = new_temp;
2815 vect_finish_stmt_generation (stmt, *vec_stmt, bsi);
2821 /* Function vect_min_worthwhile_factor.
2823 For a loop where we could vectorize the operation indicated by CODE,
2824 return the minimum vectorization factor that makes it worthwhile
2825 to use generic vectors.

CODE is the tree code of the scalar operation. Callers in this file
compare the result against LOOP_VINFO_VECT_FACTOR and give up on
vectorizing without SIMD support when the loop's factor is smaller.
NOTE(review): the body of this function is not visible in this listing,
so the per-code values are not documented here. */
2827 vect_min_worthwhile_factor (enum tree_code code)
2848 /* Function vectorizable_induction
2850 Check if PHI performs an induction computation that can be vectorized.
2851 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
2852 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
2853 Return FALSE if not a vectorizable STMT, TRUE otherwise.

PHI is tested for being relevant, not live after the loop and a PHI_NODE;
its def type is asserted to be vect_induction_def. BSI is unused.

When VEC_STMT is NULL only the analysis is done: the phi is tagged
induc_vec_info_type and its cost is modelled. Otherwise the vector def
is obtained from get_initial_def_for_induction and the stmt defining it
is stored in VEC_STMT. */
2856 vectorizable_induction (tree phi, block_stmt_iterator *bsi ATTRIBUTE_UNUSED,
2859 stmt_vec_info stmt_info = vinfo_for_stmt (phi);
2860 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2861 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2862 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
2863 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
2866 gcc_assert (ncopies >= 1);
2868 if (!STMT_VINFO_RELEVANT_P (stmt_info))
/* Unlike the other vectorizable_* routines here, a wrong def type is
treated as an internal error rather than as "not vectorizable". */
2871 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def);
2873 if (STMT_VINFO_LIVE_P (stmt_info))
2875 /* FORNOW: not yet supported. */
2876 if (vect_print_dump_info (REPORT_DETAILS))
2877 fprintf (vect_dump, "value used after loop.");
2881 if (TREE_CODE (phi) != PHI_NODE)
2884 if (!vec_stmt) /* transformation not required. */
2886 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
2887 if (vect_print_dump_info (REPORT_DETAILS))
2888 fprintf (vect_dump, "=== vectorizable_induction ===");
2889 vect_model_induction_cost (stmt_info, ncopies);
2895 if (vect_print_dump_info (REPORT_DETAILS))
2896 fprintf (vect_dump, "transform induction phi.");
/* The helper produces the vector def; the stmt that defines that SSA name
is what gets reported back through VEC_STMT. */
2898 vec_def = get_initial_def_for_induction (phi);
2899 *vec_stmt = SSA_NAME_DEF_STMT (vec_def);
2904 /* Function vectorizable_operation.
2906 Check if STMT performs a binary or unary operation that can be vectorized.
2907 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
2908 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
2909 Return FALSE if not a vectorizable STMT, TRUE otherwise.

STMT is tested for being relevant, of def type vect_loop_def, not live
after the loop, and for being a GIMPLE_MODIFY_STMT with an SSA_NAME on the
left-hand side. The vector type of the result must have as many elements
as the vector type recorded for STMT, the operands must be simple uses,
and an optab must exist for the operation. Shifts get extra checks on
the mode of the shift-count operand.

When VEC_STMT is NULL only the analysis is done: the stmt is tagged
op_vec_info_type and its cost is modelled. Otherwise one vector stmt is
emitted per copy and the copies are chained through
STMT_VINFO_RELATED_STMT. */
2912 vectorizable_operation (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
2917 tree op0, op1 = NULL;
2918 tree vec_oprnd0 = NULL_TREE, vec_oprnd1 = NULL_TREE;
2919 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2920 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2921 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2922 enum tree_code code;
2923 enum machine_mode vec_mode;
2928 enum machine_mode optab_op2_mode;
2930 enum vect_def_type dt0, dt1;
2932 stmt_vec_info prev_stmt_info;
2933 int nunits_in = TYPE_VECTOR_SUBPARTS (vectype);
2936 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in;
2939 gcc_assert (ncopies >= 1);
2941 if (!STMT_VINFO_RELEVANT_P (stmt_info))
2944 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
2947 /* FORNOW: not yet supported. */
2948 if (STMT_VINFO_LIVE_P (stmt_info))
2950 if (vect_print_dump_info (REPORT_DETAILS))
2951 fprintf (vect_dump, "value used after loop.");
2955 /* Is STMT a vectorizable binary/unary operation? */
2956 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
2959 if (TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 0)) != SSA_NAME)
/* The vector type of the result must hold as many elements as the vector
type recorded for the stmt. */
2962 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
2963 vectype_out = get_vectype_for_scalar_type (TREE_TYPE (scalar_dest));
2964 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
2965 if (nunits_out != nunits_in)
2968 operation = GIMPLE_STMT_OPERAND (stmt, 1);
2969 code = TREE_CODE (operation);
2970 optab = optab_for_tree_code (code, vectype);
2972 /* Support only unary or binary operations. */
2973 op_type = TREE_OPERAND_LENGTH (operation);
2974 if (op_type != unary_op && op_type != binary_op)
2976 if (vect_print_dump_info (REPORT_DETAILS))
2977 fprintf (vect_dump, "num. args = %d (not unary/binary op).", op_type);
/* The def types found here (DT0, DT1) are reused when the later copies of
the operands are fetched in the transformation loop. */
2981 op0 = TREE_OPERAND (operation, 0);
2982 if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt0))
2984 if (vect_print_dump_info (REPORT_DETAILS))
2985 fprintf (vect_dump, "use not simple.");
2989 if (op_type == binary_op)
2991 op1 = TREE_OPERAND (operation, 1);
2992 if (!vect_is_simple_use (op1, loop_vinfo, &def_stmt, &def, &dt1))
2994 if (vect_print_dump_info (REPORT_DETAILS))
2995 fprintf (vect_dump, "use not simple.");
3000 /* Supportable by target? */
3003 if (vect_print_dump_info (REPORT_DETAILS))
3004 fprintf (vect_dump, "no optab.");
3007 vec_mode = TYPE_MODE (vectype);
3008 icode = (int) optab->handlers[(int) vec_mode].insn_code;
3009 if (icode == CODE_FOR_nothing)
3011 if (vect_print_dump_info (REPORT_DETAILS))
3012 fprintf (vect_dump, "op not supported by target.");
/* Without a target insn, word-mode arithmetic is the fallback; it is
considered only when the vector mode is exactly one word wide and the
vectorization factor reaches vect_min_worthwhile_factor.
NOTE(review): the statement executed when this test holds is not visible
in this listing; presumably the stmt is rejected -- confirm. */
3013 if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
3014 || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
3015 < vect_min_worthwhile_factor (code))
3017 if (vect_print_dump_info (REPORT_DETAILS))
3018 fprintf (vect_dump, "proceeding using word mode.");
3021 /* Worthwhile without SIMD support? */
3022 if (!VECTOR_MODE_P (TYPE_MODE (vectype))
3023 && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
3024 < vect_min_worthwhile_factor (code))
3026 if (vect_print_dump_info (REPORT_DETAILS))
3027 fprintf (vect_dump, "not worthwhile without SIMD support.");
3031 if (code == LSHIFT_EXPR || code == RSHIFT_EXPR)
3033 /* FORNOW: not yet supported. */
3034 if (!VECTOR_MODE_P (vec_mode))
3037 /* Invariant argument is needed for a vector shift
3038 by a scalar shift operand. */
3039 optab_op2_mode = insn_data[icode].operand[2].mode;
3040 if (! (VECTOR_MODE_P (optab_op2_mode)
3041 || dt1 == vect_constant_def
3042 || dt1 == vect_invariant_def))
3044 if (vect_print_dump_info (REPORT_DETAILS))
3045 fprintf (vect_dump, "operand mode requires invariant argument.");
3050 if (!vec_stmt) /* transformation not required. */
3052 STMT_VINFO_TYPE (stmt_info) = op_vec_info_type;
3053 if (vect_print_dump_info (REPORT_DETAILS))
3054 fprintf (vect_dump, "=== vectorizable_operation ===");
3055 vect_model_simple_cost (stmt_info, ncopies);
3061 if (vect_print_dump_info (REPORT_DETAILS))
3062 fprintf (vect_dump, "transform binary/unary operation.");
3065 vec_dest = vect_create_destination_var (scalar_dest, vectype);
3067 /* In case the vectorization factor (VF) is bigger than the number
3068 of elements that we can fit in a vectype (nunits), we have to generate
3069 more than one vector stmt - i.e - we need to "unroll" the
3070 vector stmt by a factor VF/nunits. In doing so, we record a pointer
3071 from one copy of the vector stmt to the next, in the field
3072 STMT_VINFO_RELATED_STMT. This is necessary in order to allow following
3073 stages to find the correct vector defs to be used when vectorizing
3074 stmts that use the defs of the current stmt. The example below illustrates
3075 the vectorization process when VF=16 and nunits=4 (i.e - we need to create
3076 4 vectorized stmts):
3078 before vectorization:
3079 RELATED_STMT VEC_STMT
3083 step 1: vectorize stmt S1 (done in vectorizable_load. See more details
3085 RELATED_STMT VEC_STMT
3086 VS1_0: vx0 = memref0 VS1_1 -
3087 VS1_1: vx1 = memref1 VS1_2 -
3088 VS1_2: vx2 = memref2 VS1_3 -
3089 VS1_3: vx3 = memref3 - -
3090 S1: x = load - VS1_0
3093 step2: vectorize stmt S2 (done here):
3094 To vectorize stmt S2 we first need to find the relevant vector
3095 def for the first operand 'x'. This is, as usual, obtained from
3096 the vector stmt recorded in the STMT_VINFO_VEC_STMT of the stmt
3097 that defines 'x' (S1). This way we find the stmt VS1_0, and the
3098 relevant vector def 'vx0'. Having found 'vx0' we can generate
3099 the vector stmt VS2_0, and as usual, record it in the
3100 STMT_VINFO_VEC_STMT of stmt S2.
3101 When creating the second copy (VS2_1), we obtain the relevant vector
3102 def from the vector stmt recorded in the STMT_VINFO_RELATED_STMT of
3103 stmt VS1_0. This way we find the stmt VS1_1 and the relevant
3104 vector def 'vx1'. Using 'vx1' we create stmt VS2_1 and record a
3105 pointer to it in the STMT_VINFO_RELATED_STMT of the vector stmt VS2_0.
3106 Similarly when creating stmts VS2_2 and VS2_3. This is the resulting
3107 chain of stmts and pointers:
3108 RELATED_STMT VEC_STMT
3109 VS1_0: vx0 = memref0 VS1_1 -
3110 VS1_1: vx1 = memref1 VS1_2 -
3111 VS1_2: vx2 = memref2 VS1_3 -
3112 VS1_3: vx3 = memref3 - -
3113 S1: x = load - VS1_0
3114 VS2_0: vz0 = vx0 + v1 VS2_1 -
3115 VS2_1: vz1 = vx1 + v1 VS2_2 -
3116 VS2_2: vz2 = vx2 + v1 VS2_3 -
3117 VS2_3: vz3 = vx3 + v1 - -
3118 S2: z = x + 1 - VS2_0 */
3120 prev_stmt_info = NULL;
3121 for (j = 0; j < ncopies; j++)
/* First copy: fetch the vector defs straight from the scalar operands. */
3126 vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
3127 if (op_type == binary_op)
3129 if (code == LSHIFT_EXPR || code == RSHIFT_EXPR)
3131 /* Vector shl and shr insn patterns can be defined with
3132 scalar operand 2 (shift operand). In this case, use
3133 constant or loop invariant op1 directly, without
3134 extending it to vector mode first. */
3135 optab_op2_mode = insn_data[icode].operand[2].mode;
3136 if (!VECTOR_MODE_P (optab_op2_mode))
3138 if (vect_print_dump_info (REPORT_DETAILS))
3139 fprintf (vect_dump, "operand 1 using scalar mode.");
3144 vec_oprnd1 = vect_get_vec_def_for_operand (op1, stmt, NULL);
/* Later copies: step each operand to the vector def of the next copy,
using the def types recorded during the analysis above. */
3149 vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt0, vec_oprnd0);
3150 if (op_type == binary_op)
3151 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt1, vec_oprnd1);
3154 /* Arguments are ready. create the new vector stmt. */
3156 if (op_type == binary_op)
3157 new_stmt = build_gimple_modify_stmt (vec_dest,
3158 build2 (code, vectype, vec_oprnd0, vec_oprnd1));
3160 new_stmt = build_gimple_modify_stmt (vec_dest,
3161 build1 (code, vectype, vec_oprnd0));
3162 new_temp = make_ssa_name (vec_dest, new_stmt);
3163 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
3164 vect_finish_stmt_generation (stmt, new_stmt, bsi);
/* Record the stmt on STMT and return it through VEC_STMT, or link it after
the previous copy. NOTE(review): the test choosing between the two
statements below is on a line not visible in this listing. */
3167 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
3169 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3170 prev_stmt_info = vinfo_for_stmt (new_stmt);
3177 /* Function vectorizable_type_demotion
3179 Check if STMT performs a binary or unary operation that involves
3180 type demotion, and if it can be vectorized.
3181 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
3182 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
3183 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
/* Summary: analysis and transformation of a narrowing conversion.  Only
   NOP_EXPR and CONVERT_EXPR are considered, and only when one output vector
   holds exactly twice as many elements as one input vector.  When VEC_STMT
   is null just the analysis part runs (the stmt kind is recorded and the
   cost is modelled); otherwise the vector stmts are generated.  */
3186 vectorizable_type_demotion (tree stmt, block_stmt_iterator *bsi,
3193 tree vec_oprnd0=NULL, vec_oprnd1=NULL;
3194 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3195 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3196 enum tree_code code, code1 = ERROR_MARK;
3199 enum vect_def_type dt0;
3201 stmt_vec_info prev_stmt_info;
/* Applicability checks on STMT itself: relevance, def type, liveness.  */
3210 if (!STMT_VINFO_RELEVANT_P (stmt_info))
3213 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
3216 /* FORNOW: not yet supported. */
3217 if (STMT_VINFO_LIVE_P (stmt_info))
3219 if (vect_print_dump_info (REPORT_DETAILS))
3220 fprintf (vect_dump, "value used after loop.");
3224 /* Is STMT a vectorizable type-demotion operation? */
3225 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
3228 if (TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 0)) != SSA_NAME)
3231 operation = GIMPLE_STMT_OPERAND (stmt, 1);
3232 code = TREE_CODE (operation);
3233 if (code != NOP_EXPR && code != CONVERT_EXPR)
/* Input vector type comes from the operand's scalar type, output vector
   type from the scalar destination's type.  */
3236 op0 = TREE_OPERAND (operation, 0);
3237 vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op0));
3238 nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
3240 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
3241 vectype_out = get_vectype_for_scalar_type (TREE_TYPE (scalar_dest));
3242 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
/* Only a 2:1 element-count ratio between output and input is handled.  */
3243 if (nunits_in != nunits_out / 2) /* FORNOW */
/* The number of copies is computed from the output vector type.  */
3246 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_out;
3247 gcc_assert (ncopies >= 1);
/* Accept integral-to-integral or float-to-float conversions only.  */
3249 if (! ((INTEGRAL_TYPE_P (TREE_TYPE (scalar_dest))
3250 && INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3251 || (SCALAR_FLOAT_TYPE_P (TREE_TYPE (scalar_dest))
3252 && SCALAR_FLOAT_TYPE_P (TREE_TYPE (op0))
3253 && (code == NOP_EXPR || code == CONVERT_EXPR))))
3256 /* Check the operands of the operation. */
3257 if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt0))
3259 if (vect_print_dump_info (REPORT_DETAILS))
3260 fprintf (vect_dump, "use not simple.");
3264 /* Supportable by target? */
3265 if (!supportable_narrowing_operation (code, stmt, vectype_in, &code1))
/* Note that the vector type recorded for STMT is the input vector type.  */
3268 STMT_VINFO_VECTYPE (stmt_info) = vectype_in;
3270 if (!vec_stmt) /* transformation not required. */
3272 STMT_VINFO_TYPE (stmt_info) = type_demotion_vec_info_type;
3273 if (vect_print_dump_info (REPORT_DETAILS))
3274 fprintf (vect_dump, "=== vectorizable_demotion ===");
3275 vect_model_simple_cost (stmt_info, ncopies);
/* Transformation part.  */
3280 if (vect_print_dump_info (REPORT_DETAILS))
3281 fprintf (vect_dump, "transform type demotion operation. ncopies = %d.",
3285 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
3287 /* In case the vectorization factor (VF) is bigger than the number
3288 of elements that we can fit in a vectype (nunits), we have to generate
3289 more than one vector stmt - i.e - we need to "unroll" the
3290 vector stmt by a factor VF/nunits. */
3291 prev_stmt_info = NULL;
3292 for (j = 0; j < ncopies; j++)
/* Every new stmt combines two input vector defs.  The first pair below is
   derived from OP0 (its def and that def's next copy); the second pair
   continues from the previous VEC_OPRND1.
   NOTE(review): the condition choosing between the two pairs is not
   visible in this listing -- presumably "j == 0" versus else; confirm.  */
3297 vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
3298 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt0, vec_oprnd0);
3302 vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt0, vec_oprnd1);
3303 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt0, vec_oprnd0);
3306 /* Arguments are ready. Create the new vector stmt. */
3307 expr = build2 (code1, vectype_out, vec_oprnd0, vec_oprnd1);
3308 new_stmt = build_gimple_modify_stmt (vec_dest, expr);
3309 new_temp = make_ssa_name (vec_dest, new_stmt);
3310 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
3311 vect_finish_stmt_generation (stmt, new_stmt, bsi);
/* The new stmt is recorded either as the vectorized stmt of STMT or as the
   RELATED_STMT of the previously generated copy, forming a chain.  */
3314 STMT_VINFO_VEC_STMT (stmt_info) = new_stmt;
3316 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3318 prev_stmt_info = vinfo_for_stmt (new_stmt);
/* Hand the head of the chain back to the caller.  */
3321 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
3326 /* Function vectorizable_type_promotion
3328 Check if STMT performs a binary or unary operation that involves
3329 type promotion, and if it can be vectorized.
3330 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
3331 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
3332 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
/* Summary: analysis and transformation of a widening operation.  NOP_EXPR,
   CONVERT_EXPR and WIDEN_MULT_EXPR are considered, and only when one output
   vector holds exactly half as many elements as one input vector.  Each
   copy therefore produces two vector stmts (one per half of the widened
   result).  When VEC_STMT is null just the analysis part runs.  */
3335 vectorizable_type_promotion (tree stmt, block_stmt_iterator *bsi,
3341 tree op0, op1 = NULL;
3342 tree vec_oprnd0=NULL, vec_oprnd1=NULL;
3343 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3344 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3345 enum tree_code code, code1 = ERROR_MARK, code2 = ERROR_MARK;
3346 tree decl1 = NULL_TREE, decl2 = NULL_TREE;
3349 enum vect_def_type dt0, dt1;
3351 stmt_vec_info prev_stmt_info;
/* Applicability checks on STMT itself: relevance, def type, liveness.  */
3359 if (!STMT_VINFO_RELEVANT_P (stmt_info))
3362 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
3365 /* FORNOW: not yet supported. */
3366 if (STMT_VINFO_LIVE_P (stmt_info))
3368 if (vect_print_dump_info (REPORT_DETAILS))
3369 fprintf (vect_dump, "value used after loop.");
3373 /* Is STMT a vectorizable type-promotion operation? */
3374 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
3377 if (TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 0)) != SSA_NAME)
3380 operation = GIMPLE_STMT_OPERAND (stmt, 1);
3381 code = TREE_CODE (operation);
3382 if (code != NOP_EXPR && code != CONVERT_EXPR
3383 && code != WIDEN_MULT_EXPR)
/* Input vector type comes from the first operand's scalar type, output
   vector type from the scalar destination's type.  */
3386 op0 = TREE_OPERAND (operation, 0);
3387 vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op0));
3388 nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
3390 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
3391 vectype_out = get_vectype_for_scalar_type (TREE_TYPE (scalar_dest));
3392 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
/* Only a 1:2 element-count ratio between output and input is handled.  */
3393 if (nunits_out != nunits_in / 2) /* FORNOW */
/* The number of copies is computed from the input vector type.  */
3396 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in;
3397 gcc_assert (ncopies >= 1);
/* Accept integral-to-integral operations, or float-to-float conversions.  */
3399 if (! ((INTEGRAL_TYPE_P (TREE_TYPE (scalar_dest))
3400 && INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3401 || (SCALAR_FLOAT_TYPE_P (TREE_TYPE (scalar_dest))
3402 && SCALAR_FLOAT_TYPE_P (TREE_TYPE (op0))
3403 && (code == CONVERT_EXPR || code == NOP_EXPR))))
3406 /* Check the operands of the operation. */
3407 if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt0))
3409 if (vect_print_dump_info (REPORT_DETAILS))
3410 fprintf (vect_dump, "use not simple.");
/* OP_TYPE tells a binary CODE from a unary one; the second operand is only
   inspected in the binary case.  */
3414 op_type = TREE_CODE_LENGTH (code);
3415 if (op_type == binary_op)
3417 op1 = TREE_OPERAND (operation, 1);
3418 if (!vect_is_simple_use (op1, loop_vinfo, &def_stmt, &def, &dt1))
3420 if (vect_print_dump_info (REPORT_DETAILS))
3421 fprintf (vect_dump, "use not simple.");
3426 /* Supportable by target? */
3427 if (!supportable_widening_operation (code, stmt, vectype_in,
3428 &decl1, &decl2, &code1, &code2))
/* Note that the vector type recorded for STMT is the input vector type.  */
3431 STMT_VINFO_VECTYPE (stmt_info) = vectype_in;
3433 if (!vec_stmt) /* transformation not required. */
3435 STMT_VINFO_TYPE (stmt_info) = type_promotion_vec_info_type;
3436 if (vect_print_dump_info (REPORT_DETAILS))
3437 fprintf (vect_dump, "=== vectorizable_promotion ===");
/* Two vector stmts are generated per copy, hence 2*ncopies.  */
3438 vect_model_simple_cost (stmt_info, 2*ncopies);
/* Transformation part.  */
3444 if (vect_print_dump_info (REPORT_DETAILS))
3445 fprintf (vect_dump, "transform type promotion operation. ncopies = %d.",
3449 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
3451 /* In case the vectorization factor (VF) is bigger than the number
3452 of elements that we can fit in a vectype (nunits), we have to generate
3453 more than one vector stmt - i.e - we need to "unroll" the
3454 vector stmt by a factor VF/nunits. */
3456 prev_stmt_info = NULL;
3457 for (j = 0; j < ncopies; j++)
/* Operand defs come either from the scalar operands directly or from the
   next copy of the previous defs.
   NOTE(review): the condition choosing between the two groups below is not
   visible in this listing -- presumably "j == 0" versus else; confirm.  */
3462 vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
3463 if (op_type == binary_op)
3464 vec_oprnd1 = vect_get_vec_def_for_operand (op1, stmt, NULL);
3468 vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt0, vec_oprnd0);
3469 if (op_type == binary_op)
3470 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt1, vec_oprnd1);
3473 /* Arguments are ready. Create the new vector stmt. We are creating
3474 two vector defs because the widened result does not fit in one vector.
3475 The vectorized stmt can be expressed as a call to a target builtin,
3476 or using a tree-code. */
3477 /* Generate first half of the widened result: */
3478 new_stmt = vect_gen_widened_results_half (code1, vectype_out, decl1,
3479 vec_oprnd0, vec_oprnd1, op_type, vec_dest, bsi, stmt);
/* The first half is recorded either as the vectorized stmt of STMT or as the
   RELATED_STMT of the previously generated stmt.  */
3481 STMT_VINFO_VEC_STMT (stmt_info) = new_stmt;
3483 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3484 prev_stmt_info = vinfo_for_stmt (new_stmt);
3486 /* Generate second half of the widened result: */
3487 new_stmt = vect_gen_widened_results_half (code2, vectype_out, decl2,
3488 vec_oprnd0, vec_oprnd1, op_type, vec_dest, bsi, stmt);
/* The second half always follows the first half in the chain.  */
3489 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3490 prev_stmt_info = vinfo_for_stmt (new_stmt);
/* Hand the head of the chain back to the caller.  */
3494 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
3499 /* Function vect_strided_store_supported.
3501 Returns TRUE if INTERLEAVE_HIGH and INTERLEAVE_LOW operations are supported,
3502 and FALSE otherwise. */
/* Summary: query whether vectors of type VECTYPE can be interleaved.  Two
   things are checked for both VEC_INTERLEAVE_HIGH_EXPR and
   VEC_INTERLEAVE_LOW_EXPR: that an optab exists for the tree code, and that
   the optab has an insn for VECTYPE's mode.  A message is written to
   vect_dump (under REPORT_DETAILS) when a check fails.  */
3505 vect_strided_store_supported (tree vectype)
3507 optab interleave_high_optab, interleave_low_optab;
3510 mode = (int) TYPE_MODE (vectype);
3512 /* Check that the operation is supported. */
3513 interleave_high_optab = optab_for_tree_code (VEC_INTERLEAVE_HIGH_EXPR,
3515 interleave_low_optab = optab_for_tree_code (VEC_INTERLEAVE_LOW_EXPR,
/* First requirement: both optabs exist.  */
3517 if (!interleave_high_optab || !interleave_low_optab)
3519 if (vect_print_dump_info (REPORT_DETAILS))
3520 fprintf (vect_dump, "no optab for interleave.");
/* Second requirement: neither optab maps MODE to CODE_FOR_nothing.  */
3524 if (interleave_high_optab->handlers[(int) mode].insn_code
3526 || interleave_low_optab->handlers[(int) mode].insn_code
3527 == CODE_FOR_nothing)
3529 if (vect_print_dump_info (REPORT_DETAILS))
3530 fprintf (vect_dump, "interleave op not supported by target.");
3537 /* Function vect_permute_store_chain.
3539 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
3540 a power of 2, generate interleave_high/low stmts to reorder the data
3541 correctly for the stores. Return the final references for stores in
3544 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
3545 The input is 4 vectors each containing 8 elements. We assign a number to each
3546 element, the input sequence is:
3548 1st vec: 0 1 2 3 4 5 6 7
3549 2nd vec: 8 9 10 11 12 13 14 15
3550 3rd vec: 16 17 18 19 20 21 22 23
3551 4th vec: 24 25 26 27 28 29 30 31
3553 The output sequence should be:
3555 1st vec: 0 8 16 24 1 9 17 25
3556 2nd vec: 2 10 18 26 3 11 19 27
3557 3rd vec: 4 12 20 28 5 13 21 29
3558 4th vec: 6 14 22 30 7 15 23 31
3560 i.e., we interleave the contents of the four vectors in their order.
3562 We use interleave_high/low instructions to create such output. The input of
3563 each interleave_high/low operation is two vectors:
3566 the even elements of the result vector are obtained left-to-right from the
3567 high/low elements of the first vector. The odd elements of the result are
3568 obtained left-to-right from the high/low elements of the second vector.
3569 The output of interleave_high will be: 0 4 1 5
3570 and of interleave_low: 2 6 3 7
3573 The permutation is done in log LENGTH stages. In each stage interleave_high
3574 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
3575 where the first argument is taken from the first half of DR_CHAIN and the
3576 second argument from its second half.
3579 I1: interleave_high (1st vec, 3rd vec)
3580 I2: interleave_low (1st vec, 3rd vec)
3581 I3: interleave_high (2nd vec, 4th vec)
3582 I4: interleave_low (2nd vec, 4th vec)
3584 The output for the first stage is:
3586 I1: 0 16 1 17 2 18 3 19
3587 I2: 4 20 5 21 6 22 7 23
3588 I3: 8 24 9 25 10 26 11 27
3589 I4: 12 28 13 29 14 30 15 31
3591 The output of the second stage, i.e. the final result is:
3593 I1: 0 8 16 24 1 9 17 25
3594 I2: 2 10 18 26 3 11 19 27
3595 I3: 4 12 20 28 5 13 21 29
3596 I4: 6 14 22 30 7 15 23 31. */
/* Summary: emit exact_log2 (LENGTH) stages of interleave stmts over the
   vectors in DR_CHAIN.  In every stage element J of the first half is
   paired with element J of the second half, and the two results are written
   to slots 2*J and 2*J+1 of *RESULT_CHAIN.  The new stmts are emitted
   through vect_finish_stmt_generation (STMT, ..., BSI).  */
3599 vect_permute_store_chain (VEC(tree,heap) *dr_chain,
3600 unsigned int length,
3602 block_stmt_iterator *bsi,
3603 VEC(tree,heap) **result_chain)
3605 tree perm_dest, perm_stmt, vect1, vect2, high, low;
3606 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
3607 tree scalar_dest, tmp;
3610 VEC(tree,heap) *first, *second;
/* NOTE(review): SCALAR_DEST, FIRST and SECOND are assigned here but are not
   referenced again in the visible body -- confirm whether the two
   VEC_alloc calls are needed at all (they look like leaked allocations).  */
3612 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
3613 first = VEC_alloc (tree, heap, length/2);
3614 second = VEC_alloc (tree, heap, length/2);
3616 /* Check that the operation is supported. */
3617 if (!vect_strided_store_supported (vectype))
/* Start from a copy of the input so that every slot written by VEC_replace
   below already exists.  */
3620 *result_chain = VEC_copy (tree, heap, dr_chain);
/* One pass per stage; LENGTH is documented to be a power of 2.  */
3622 for (i = 0; i < exact_log2 (length); i++)
3624 for (j = 0; j < length/2; j++)
/* Pair element J of the first half with element J of the second half.  */
3626 vect1 = VEC_index (tree, dr_chain, j);
3627 vect2 = VEC_index (tree, dr_chain, j+length/2);
3629 /* Create interleaving stmt:
3630 in the case of big endian:
3631 high = interleave_high (vect1, vect2)
3632 and in the case of little endian:
3633 high = interleave_low (vect1, vect2). */
3634 perm_dest = create_tmp_var (vectype, "vect_inter_high");
3635 DECL_GIMPLE_REG_P (perm_dest) = 1;
3636 add_referenced_var (perm_dest);
3637 if (BYTES_BIG_ENDIAN)
3638 tmp = build2 (VEC_INTERLEAVE_HIGH_EXPR, vectype, vect1, vect2);
3640 tmp = build2 (VEC_INTERLEAVE_LOW_EXPR, vectype, vect1, vect2);
3641 perm_stmt = build_gimple_modify_stmt (perm_dest, tmp);
3642 high = make_ssa_name (perm_dest, perm_stmt);
3643 GIMPLE_STMT_OPERAND (perm_stmt, 0) = high;
3644 vect_finish_stmt_generation (stmt, perm_stmt, bsi);
/* The "high" result goes to the even output slot.  */
3645 VEC_replace (tree, *result_chain, 2*j, high);
3647 /* Create interleaving stmt:
3648 in the case of big endian:
3649 low = interleave_low (vect1, vect2)
3650 and in the case of little endian:
3651 low = interleave_high (vect1, vect2). */
3652 perm_dest = create_tmp_var (vectype, "vect_inter_low");
3653 DECL_GIMPLE_REG_P (perm_dest) = 1;
3654 add_referenced_var (perm_dest);
3655 if (BYTES_BIG_ENDIAN)
3656 tmp = build2 (VEC_INTERLEAVE_LOW_EXPR, vectype, vect1, vect2);
3658 tmp = build2 (VEC_INTERLEAVE_HIGH_EXPR, vectype, vect1, vect2);
3659 perm_stmt = build_gimple_modify_stmt (perm_dest, tmp);
3660 low = make_ssa_name (perm_dest, perm_stmt);
3661 GIMPLE_STMT_OPERAND (perm_stmt, 0) = low;
3662 vect_finish_stmt_generation (stmt, perm_stmt, bsi);
/* The "low" result goes to the odd output slot.  */
3663 VEC_replace (tree, *result_chain, 2*j+1, low);
/* The output of this stage is the input of the next one.
   NOTE(review): a fresh copy is made per stage and no VEC_free is visible
   here -- confirm ownership of the intermediate copies.  */
3665 dr_chain = VEC_copy (tree, heap, *result_chain);
3671 /* Function vectorizable_store.
3673 Check if STMT defines a non scalar data-ref (array/pointer/structure) that
3675 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
3676 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
3677 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
/* Summary: analysis and transformation of a store STMT.  The stored value
   must be a simple use and the vector mode must have a mov_optab handler;
   members of an interleaving group (DR_GROUP_FIRST_DR set) additionally
   need vect_strided_store_supported.  When VEC_STMT is null only the
   analysis part runs.  In the transformation, the stores of an interleaving
   group are emitted together once DR_GROUP_STORE_COUNT of the first stmt
   has reached DR_GROUP_SIZE.  */
3680 vectorizable_store (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
3685 tree vec_oprnd = NULL_TREE;
3686 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3687 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info), *first_dr = NULL;
3688 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3689 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3690 enum machine_mode vec_mode;
3692 enum dr_alignment_support alignment_support_cheme;
3694 def_operand_p def_p;
3696 enum vect_def_type dt;
3697 stmt_vec_info prev_stmt_info = NULL;
3698 tree dataref_ptr = NULL_TREE;
3699 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
3700 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
3702 tree next_stmt, first_stmt;
3703 bool strided_store = false;
3704 unsigned int group_size, i;
3705 VEC(tree,heap) *dr_chain = NULL, *oprnds = NULL, *result_chain = NULL;
3706 gcc_assert (ncopies >= 1);
/* Applicability checks on STMT itself: relevance, def type, liveness.  */
3708 if (!STMT_VINFO_RELEVANT_P (stmt_info))
3711 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
3714 if (STMT_VINFO_LIVE_P (stmt_info))
3716 if (vect_print_dump_info (REPORT_DETAILS))
3717 fprintf (vect_dump, "value used after loop.");
3721 /* Is vectorizable store? */
3723 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
/* The destination has to be an ARRAY_REF or INDIRECT_REF, unless STMT
   belongs to an interleaving group.  */
3726 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
3727 if (TREE_CODE (scalar_dest) != ARRAY_REF
3728 && TREE_CODE (scalar_dest) != INDIRECT_REF
3729 && !DR_GROUP_FIRST_DR (stmt_info))
/* The stored value is operand 1 of STMT.  */
3732 op = GIMPLE_STMT_OPERAND (stmt, 1);
3733 if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt))
3735 if (vect_print_dump_info (REPORT_DETAILS))
3736 fprintf (vect_dump, "use not simple.");
3740 vec_mode = TYPE_MODE (vectype);
3741 /* FORNOW. In some cases can vectorize even if data-type not supported
3742 (e.g. - array initialization with 0). */
3743 if (mov_optab->handlers[(int)vec_mode].insn_code == CODE_FOR_nothing)
3746 if (!STMT_VINFO_DATA_REF (stmt_info))
/* A non-null DR_GROUP_FIRST_DR marks STMT as part of an interleaving group;
   such stores additionally need interleave support from the target.  */
3749 if (DR_GROUP_FIRST_DR (stmt_info))
3751 strided_store = true;
3752 if (!vect_strided_store_supported (vectype))
3756 if (!vec_stmt) /* transformation not required. */
3758 STMT_VINFO_TYPE (stmt_info) = store_vec_info_type;
3759 vect_model_store_cost (stmt_info, ncopies);
/* Transformation part.  For an interleaving group, the first stmt of the
   group supplies the data reference and the group size.  */
3767 first_stmt = DR_GROUP_FIRST_DR (stmt_info);
3768 first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
3769 group_size = DR_GROUP_SIZE (vinfo_for_stmt (first_stmt));
/* Count this store as visited.  */
3771 DR_GROUP_STORE_COUNT (vinfo_for_stmt (first_stmt))++;
3773 /* We vectorize all the stmts of the interleaving group when we
3774 reach the last stmt in the group. */
3775 if (DR_GROUP_STORE_COUNT (vinfo_for_stmt (first_stmt))
3776 < DR_GROUP_SIZE (vinfo_for_stmt (first_stmt)))
3778 *vec_stmt = NULL_TREE;
3789 if (vect_print_dump_info (REPORT_DETAILS))
3790 fprintf (vect_dump, "transform store. ncopies = %d",ncopies);
/* Both vectors get one slot per store in the group.  */
3792 dr_chain = VEC_alloc (tree, heap, group_size);
3793 oprnds = VEC_alloc (tree, heap, group_size);
/* Only aligned stores are handled here (see the FORNOW assert).  */
3795 alignment_support_cheme = vect_supportable_dr_alignment (first_dr);
3796 gcc_assert (alignment_support_cheme);
3797 gcc_assert (alignment_support_cheme == dr_aligned); /* FORNOW */
3799 /* In case the vectorization factor (VF) is bigger than the number
3800 of elements that we can fit in a vectype (nunits), we have to generate
3801 more than one vector stmt - i.e - we need to "unroll" the
3802 vector stmt by a factor VF/nunits. For more details see documentation in
3803 vect_get_vec_def_for_stmt_copy. */
3805 /* In case of interleaving (non-unit strided access):
3812 We create vectorized stores starting from base address (the access of the
3813 first stmt in the chain (S2 in the above example), when the last store stmt
3814 of the chain (S4) is reached:
3817 VS2: &base + vec_size*1 = vx0
3818 VS3: &base + vec_size*2 = vx1
3819 VS4: &base + vec_size*3 = vx3
3821 Then permutation statements are generated:
3823 VS5: vx5 = VEC_INTERLEAVE_HIGH_EXPR < vx0, vx3 >
3824 VS6: vx6 = VEC_INTERLEAVE_LOW_EXPR < vx0, vx3 >
3827 And they are put in STMT_VINFO_VEC_STMT of the corresponding scalar stmts
3828 (the order of the data-refs in the output of vect_permute_store_chain
3829 corresponds to the order of scalar stmts in the interleaving chain - see
3830 the documentation of vect_permute_store_chain()).
3832 In case of both multiple types and interleaving, above vector stores and
3833 permutation stmts are created for every copy. The result vector stmts are
3834 put in STMT_VINFO_VEC_STMT for the first copy and in the corresponding
3835 STMT_VINFO_RELATED_STMT for the next copies.
3838 prev_stmt_info = NULL;
3839 for (j = 0; j < ncopies; j++)
3846 /* For interleaved stores we collect vectorized defs for all the
3847 stores in the group in DR_CHAIN and OPRNDS. DR_CHAIN is then used
3848 as an input to vect_permute_store_chain(), and OPRNDS as an input
3849 to vect_get_vec_def_for_stmt_copy() for the next copy.
3850 If the store is not strided, GROUP_SIZE is 1, and DR_CHAIN and
3851 OPRNDS are of size 1. */
3852 next_stmt = first_stmt;
3853 for (i = 0; i < group_size; i++)
3855 /* Since gaps are not supported for interleaved stores, GROUP_SIZE
3856 is the exact number of stmts in the chain. Therefore, NEXT_STMT
3857 can't be NULL_TREE. In case that there is no interleaving,
3858 GROUP_SIZE is 1, and only one iteration of the loop will be
3860 gcc_assert (next_stmt);
3861 op = GIMPLE_STMT_OPERAND (next_stmt, 1);
3862 vec_oprnd = vect_get_vec_def_for_operand (op, next_stmt, NULL);
3863 VEC_quick_push(tree, dr_chain, vec_oprnd);
3864 VEC_quick_push(tree, oprnds, vec_oprnd);
3865 next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt));
3867 dataref_ptr = vect_create_data_ref_ptr (first_stmt, bsi, NULL_TREE,
3868 &dummy, &ptr_incr, false,
3869 TREE_TYPE (vec_oprnd));
3873 /* For interleaved stores we created vectorized defs for all the
3874 defs stored in OPRNDS in the previous iteration (previous copy).
3875 DR_CHAIN is then used as an input to vect_permute_store_chain(),
3876 and OPRNDS as an input to vect_get_vec_def_for_stmt_copy() for the
3878 If the store is not strided, GROUP_SIZE is 1, and DR_CHAIN and
3879 OPRNDS are of size 1. */
/* Advance every operand def to its next copy, then step the pointer.  */
3880 for (i = 0; i < group_size; i++)
3882 vec_oprnd = vect_get_vec_def_for_stmt_copy (dt,
3883 VEC_index (tree, oprnds, i));
3884 VEC_replace(tree, dr_chain, i, vec_oprnd);
3885 VEC_replace(tree, oprnds, i, vec_oprnd);
3887 dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, bsi, stmt);
/* RESULT_CHAIN receives the interleaved vectors built from DR_CHAIN.  */
3892 result_chain = VEC_alloc (tree, heap, group_size);
3894 if (!vect_permute_store_chain (dr_chain, group_size, stmt, bsi,
/* Emit one vector store per stmt of the group, walking the group from its
   first stmt.  */
3899 next_stmt = first_stmt;
3900 for (i = 0; i < group_size; i++)
3902 /* For strided stores vectorized defs are interleaved in
3903 vect_permute_store_chain(). */
3905 vec_oprnd = VEC_index(tree, result_chain, i);
/* The store goes through the current value of the vector pointer.  */
3907 data_ref = build_fold_indirect_ref (dataref_ptr);
3908 /* Arguments are ready. Create the new vector stmt. */
3909 new_stmt = build_gimple_modify_stmt (data_ref, vec_oprnd);
3910 vect_finish_stmt_generation (stmt, new_stmt, bsi);
3912 /* Set the VDEFs for the vector pointer. If this virtual def
3913 has a use outside the loop and a loop peel is performed
3914 then the def may be renamed by the peel. Mark it for
3915 renaming so the later use will also be renamed. */
3916 copy_virtual_operands (new_stmt, next_stmt);
3919 /* The original store is deleted so the same SSA_NAMEs
3921 FOR_EACH_SSA_TREE_OPERAND (def, next_stmt, iter, SSA_OP_VDEF)
3923 SSA_NAME_DEF_STMT (def) = new_stmt;
3924 mark_sym_for_renaming (SSA_NAME_VAR (def));
3927 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
3931 /* Create new names for all the definitions created by COPY and
3932 add replacement mappings for each new name. */
3933 FOR_EACH_SSA_DEF_OPERAND (def_p, new_stmt, iter, SSA_OP_VDEF)
3935 create_new_def_for (DEF_FROM_PTR (def_p), new_stmt, def_p);
3936 mark_sym_for_renaming (SSA_NAME_VAR (DEF_FROM_PTR (def_p)));
/* Chain this store after the previously generated one.  */
3939 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3942 prev_stmt_info = vinfo_for_stmt (new_stmt);
3943 next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt));
3946 /* Bump the vector pointer. */
3947 dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, bsi, stmt);
3955 /* Function vect_setup_realignment
3957 This function is called when vectorizing an unaligned load using
3958 the dr_unaligned_software_pipeline scheme.
3959 This function generates the following code at the loop prolog:
3962 msq_init = *(floor(p)); # prolog load
3963 realignment_token = call target_builtin;
3965 msq = phi (msq_init, ---)
3967 The code above sets up a new (vector) pointer, pointing to the first
3968 location accessed by STMT, and a "floor-aligned" load using that pointer.
3969 It also generates code to compute the "realignment-token" (if the relevant
3970 target hook was defined), and creates a phi-node at the loop-header bb
3971 whose arguments are the result of the prolog-load (created by this
3972 function) and the result of a load that takes place in the loop (to be
3973 created by the caller to this function).
3974 The caller to this function uses the phi-result (msq) to create the
3975 realignment code inside the loop, and sets up the missing phi argument,
3979 msq = phi (msq_init, lsq)
3980 lsq = *(floor(p')); # load in loop
3981 result = realign_load (msq, lsq, realignment_token);
3984 STMT - (scalar) load stmt to be vectorized. This load accesses
3985 a memory location that may be unaligned.
3986 BSI - place where new code is to be inserted.
3989 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
3990 target hook, if defined.
3991 Return value - the result of the loop-header phi node. */
/* Summary: emit, on the loop preheader edge, the "floor-aligned" load
   MSQ_INIT and (when the target defines builtin_mask_for_load) the
   computation of *REALIGNMENT_TOKEN; then create the loop-header phi MSQ
   with MSQ_INIT as its preheader argument.  The phi's other argument is not
   added here.  */
3994 vect_setup_realignment (tree stmt, block_stmt_iterator *bsi,
3995 tree *realignment_token)
3997 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3998 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3999 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4000 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4001 edge pe = loop_preheader_edge (loop);
4002 tree scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
4015 /* 1. Create msq_init = *(floor(p1)) in the loop preheader */
4016 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4017 ptr = vect_create_data_ref_ptr (stmt, bsi, NULL_TREE, &init_addr, &inc, true,
4019 data_ref = build1 (ALIGN_INDIRECT_REF, vectype, ptr);
4020 new_stmt = build_gimple_modify_stmt (vec_dest, data_ref);
4021 new_temp = make_ssa_name (vec_dest, new_stmt);
4022 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
/* The insertion on the preheader edge is asserted not to have created a new
   basic block.  */
4023 new_bb = bsi_insert_on_edge_immediate (pe, new_stmt);
4024 gcc_assert (!new_bb);
4025 msq_init = GIMPLE_STMT_OPERAND (new_stmt, 0);
/* The prolog load takes its virtual operands from STMT; they are then
   adjusted for the preheader position.  */
4026 copy_virtual_operands (new_stmt, stmt);
4027 update_vuses_to_preheader (new_stmt, loop);
4029 /* 2. Create permutation mask, if required, in loop preheader. */
4030 if (targetm.vectorize.builtin_mask_for_load)
/* The builtin is called on INIT_ADDR, which was filled in by
   vect_create_data_ref_ptr above.  */
4034 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
4035 new_stmt = build_call_expr (builtin_decl, 1, init_addr);
4036 vec_dest = vect_create_destination_var (scalar_dest,
4037 TREE_TYPE (new_stmt));
4038 new_stmt = build_gimple_modify_stmt (vec_dest, new_stmt);
4039 new_temp = make_ssa_name (vec_dest, new_stmt);
4040 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
4041 new_bb = bsi_insert_on_edge_immediate (pe, new_stmt);
4042 gcc_assert (!new_bb);
4043 *realignment_token = GIMPLE_STMT_OPERAND (new_stmt, 0);
4045 /* The result of the CALL_EXPR to this builtin is determined from
4046 the value of the parameter and no global variables are touched
4047 which makes the builtin a "const" function. Requiring the
4048 builtin to have the "const" attribute makes it unnecessary
4049 to call mark_call_clobbered. */
4050 gcc_assert (TREE_READONLY (builtin_decl));
4053 /* 3. Create msq = phi <msq_init, lsq> in loop */
4054 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4055 msq = make_ssa_name (vec_dest, NULL_TREE);
4056 phi_stmt = create_phi_node (msq, loop->header);
4057 SSA_NAME_DEF_STMT (msq) = phi_stmt;
/* Only the preheader argument is set here.  */
4058 add_phi_arg (phi_stmt, msq_init, loop_preheader_edge (loop));
4064 /* Function vect_strided_load_supported.
4066 Returns TRUE if EXTRACT_EVEN and EXTRACT_ODD operations are supported,
4067 and FALSE otherwise. */
/* Summary: query whether vectors of type VECTYPE can be de-interleaved.
   For VEC_EXTRACT_EVEN_EXPR and then VEC_EXTRACT_ODD_EXPR, an optab must
   exist for the tree code and must have an insn for VECTYPE's mode.  A
   message is written to vect_dump (under REPORT_DETAILS) for whichever
   check fails.  */
4070 vect_strided_load_supported (tree vectype)
4072 optab perm_even_optab, perm_odd_optab;
4075 mode = (int) TYPE_MODE (vectype);
/* Even extraction: optab present, then insn available for MODE.  */
4077 perm_even_optab = optab_for_tree_code (VEC_EXTRACT_EVEN_EXPR, vectype);
4078 if (!perm_even_optab)
4080 if (vect_print_dump_info (REPORT_DETAILS))
4081 fprintf (vect_dump, "no optab for perm_even.");
4085 if (perm_even_optab->handlers[mode].insn_code == CODE_FOR_nothing)
4087 if (vect_print_dump_info (REPORT_DETAILS))
4088 fprintf (vect_dump, "perm_even op not supported by target.");
/* Odd extraction: same two checks.  */
4092 perm_odd_optab = optab_for_tree_code (VEC_EXTRACT_ODD_EXPR, vectype);
4093 if (!perm_odd_optab)
4095 if (vect_print_dump_info (REPORT_DETAILS))
4096 fprintf (vect_dump, "no optab for perm_odd.");
4100 if (perm_odd_optab->handlers[mode].insn_code == CODE_FOR_nothing)
4102 if (vect_print_dump_info (REPORT_DETAILS))
4103 fprintf (vect_dump, "perm_odd op not supported by target.");
4110 /* Function vect_permute_load_chain.
4112 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
4113 a power of 2, generate extract_even/odd stmts to reorder the input data
4114 correctly. Return the final references for loads in RESULT_CHAIN.
4116 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4117 The input is 4 vectors each containing 8 elements. We assign a number to each
4118 element, the input sequence is:
4120 1st vec: 0 1 2 3 4 5 6 7
4121 2nd vec: 8 9 10 11 12 13 14 15
4122 3rd vec: 16 17 18 19 20 21 22 23
4123 4th vec: 24 25 26 27 28 29 30 31
4125 The output sequence should be:
4127 1st vec: 0 4 8 12 16 20 24 28
4128 2nd vec: 1 5 9 13 17 21 25 29
4129 3rd vec: 2 6 10 14 18 22 26 30
4130 4th vec: 3 7 11 15 19 23 27 31
4132 i.e., the first output vector should contain the first elements of each
4133 interleaving group, etc.
4135 We use extract_even/odd instructions to create such output. The input of each
4136 extract_even/odd operation is two vectors
4140 and the output is the vector of extracted even/odd elements. The output of
4141 extract_even will be: 0 2 4 6
4142 and of extract_odd: 1 3 5 7
4145 The permutation is done in log LENGTH stages. In each stage extract_even and
4146 extract_odd stmts are created for each pair of vectors in DR_CHAIN in their
4147 order. In our example,
4149 E1: extract_even (1st vec, 2nd vec)
4150 E2: extract_odd (1st vec, 2nd vec)
4151 E3: extract_even (3rd vec, 4th vec)
4152 E4: extract_odd (3rd vec, 4th vec)
4154 The output for the first stage will be:
4156 E1: 0 2 4 6 8 10 12 14
4157 E2: 1 3 5 7 9 11 13 15
4158 E3: 16 18 20 22 24 26 28 30
4159 E4: 17 19 21 23 25 27 29 31
4161 In order to proceed and create the correct sequence for the next stage (or
4162 for the correct output, if the second stage is the last one, as in our
4163 example), we first put the output of extract_even operation and then the
4164 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
4165 The input for the second stage is:
4167 1st vec (E1): 0 2 4 6 8 10 12 14
4168 2nd vec (E3): 16 18 20 22 24 26 28 30
4169 3rd vec (E2): 1 3 5 7 9 11 13 15
4170 4th vec (E4): 17 19 21 23 25 27 29 31
4172 The output of the second stage:
4174 E1: 0 4 8 12 16 20 24 28
4175 E2: 2 6 10 14 18 22 26 30
4176 E3: 1 5 9 13 17 21 25 29
4177 E4: 3 7 11 15 19 23 27 31
4179 And RESULT_CHAIN after reordering:
4181 1st vec (E1): 0 4 8 12 16 20 24 28
4182 2nd vec (E3): 1 5 9 13 17 21 25 29
4183 3rd vec (E2): 2 6 10 14 18 22 26 30
4184 4th vec (E4): 3 7 11 15 19 23 27 31. */
/* Summary: emit exact_log2 (LENGTH) stages of extract-even / extract-odd
   stmts over the vectors in DR_CHAIN.  In every stage adjacent vectors
   (J, J+1) are paired; the even result is written to slot J/2 of
   *RESULT_CHAIN and the odd result to slot J/2 + LENGTH/2.  The new stmts
   are emitted through vect_finish_stmt_generation (STMT, ..., BSI).  */
4187 vect_permute_load_chain (VEC(tree,heap) *dr_chain,
4188 unsigned int length,
4190 block_stmt_iterator *bsi,
4191 VEC(tree,heap) **result_chain)
4193 tree perm_dest, perm_stmt, data_ref, first_vect, second_vect;
4194 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
4199 /* Check that the operation is supported. */
4200 if (!vect_strided_load_supported (vectype))
/* Start from a copy of the input so that every slot written by VEC_replace
   below already exists.  */
4203 *result_chain = VEC_copy (tree, heap, dr_chain);
/* One pass per stage; LENGTH is documented to be a power of 2.  */
4204 for (i = 0; i < exact_log2 (length); i++)
4206 for (j = 0; j < length; j +=2)
/* Pair each vector with its immediate successor.  */
4208 first_vect = VEC_index (tree, dr_chain, j);
4209 second_vect = VEC_index (tree, dr_chain, j+1);
4211 /* data_ref = permute_even (first_data_ref, second_data_ref); */
4212 perm_dest = create_tmp_var (vectype, "vect_perm_even");
4213 DECL_GIMPLE_REG_P (perm_dest) = 1;
4214 add_referenced_var (perm_dest);
4216 tmp = build2 (VEC_EXTRACT_EVEN_EXPR, vectype,
4217 first_vect, second_vect);
4218 perm_stmt = build_gimple_modify_stmt (perm_dest, tmp);
4220 data_ref = make_ssa_name (perm_dest, perm_stmt);
4221 GIMPLE_STMT_OPERAND (perm_stmt, 0) = data_ref;
4222 vect_finish_stmt_generation (stmt, perm_stmt, bsi);
4223 mark_symbols_for_renaming (perm_stmt);
/* Even results fill the first half of the output.  */
4225 VEC_replace (tree, *result_chain, j/2, data_ref);
4227 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
4228 perm_dest = create_tmp_var (vectype, "vect_perm_odd");
4229 DECL_GIMPLE_REG_P (perm_dest) = 1;
4230 add_referenced_var (perm_dest);
4232 tmp = build2 (VEC_EXTRACT_ODD_EXPR, vectype,
4233 first_vect, second_vect);
4234 perm_stmt = build_gimple_modify_stmt (perm_dest, tmp);
4235 data_ref = make_ssa_name (perm_dest, perm_stmt);
4236 GIMPLE_STMT_OPERAND (perm_stmt, 0) = data_ref;
4237 vect_finish_stmt_generation (stmt, perm_stmt, bsi);
4238 mark_symbols_for_renaming (perm_stmt);
/* Odd results fill the second half of the output.  */
4240 VEC_replace (tree, *result_chain, j/2+length/2, data_ref);
/* The output of this stage is the input of the next one.
   NOTE(review): a fresh copy is made per stage and no VEC_free is visible
   here -- confirm ownership of the intermediate copies.  */
4242 dr_chain = VEC_copy (tree, heap, *result_chain);
4248 /* Function vect_transform_strided_load.
4250 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
4251 to perform their permutation and ascribe the result vectorized statements to
4252 the scalar statements.
STMT is a load of the interleaving group, SIZE is the chain length handed to
vect_permute_load_chain, and BSI is where the generated statements are
inserted.
4256 vect_transform_strided_load (tree stmt, VEC(tree,heap) *dr_chain, int size,
4257 block_stmt_iterator *bsi)
4259 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4260 tree first_stmt = DR_GROUP_FIRST_DR (stmt_info);
4261 tree next_stmt, new_stmt;
4262 VEC(tree,heap) *result_chain = NULL;
4263 unsigned int i, gap_count;
4266 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
4267 RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
4268 vectors, that are ready for vector computation. */
4269 result_chain = VEC_alloc (tree, heap, size);
4271 if (!vect_permute_load_chain (dr_chain, size, stmt, bsi, &result_chain))
4274 /* Put a permuted data-ref in the VECTORIZED_STMT field.
4275 Since we scan the chain starting from its first node, their order
4276 corresponds to the order of data-refs in RESULT_CHAIN. */
4277 next_stmt = first_stmt;
4279 for (i = 0; VEC_iterate(tree, result_chain, i, tmp_data_ref); i++)
4284 /* Skip the gaps. Loads created for the gaps will be removed by dead
4285 code elimination pass later.
4286 DR_GROUP_GAP is the number of steps in elements from the previous
4287 access (if there is no gap DR_GROUP_GAP is 1). We skip loads that
4288 correspond to the gaps.
4290 if (gap_count < DR_GROUP_GAP (vinfo_for_stmt (next_stmt)))
4298 new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
4299 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
4300 copies, and we put the new vector statement in the first available
RELATED_STMT.
4302 if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
4303 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
4306 tree prev_stmt = STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
4307 tree rel_stmt = STMT_VINFO_RELATED_STMT (
4308 vinfo_for_stmt (prev_stmt));
4311 prev_stmt = rel_stmt;
4312 rel_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
4314 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) = new_stmt;
4316 next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt));
4318 /* If NEXT_STMT accesses the same DR as the previous statement,
4319 put the same TMP_DATA_REF as its vectorized statement; otherwise
4320 get the next data-ref from RESULT_CHAIN. */
4321 if (!next_stmt || !DR_GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
4329 /* vectorizable_load.
4331 Check if STMT reads a non scalar data-ref (array/pointer/structure) that
can be vectorized.
4333 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
4334 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
4335 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
4338 vectorizable_load (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
4341 tree vec_dest = NULL;
4342 tree data_ref = NULL;
4344 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4345 stmt_vec_info prev_stmt_info;
4346 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4347 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4348 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info), *first_dr;
4349 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4352 tree new_stmt = NULL_TREE;
/* NOTE(review): "alignment_support_cheme" is presumably a misspelling of
   "alignment_support_scheme"; renaming it is a code change and is not done
   here. */
4354 enum dr_alignment_support alignment_support_cheme;
4355 tree dataref_ptr = NULL_TREE;
4357 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
/* Number of vector stmts to generate per scalar stmt: VF / nunits. */
4358 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
4359 int i, j, group_size;
4360 tree msq = NULL_TREE, lsq;
4361 tree offset = NULL_TREE;
4362 tree realignment_token = NULL_TREE;
4363 tree phi_stmt = NULL_TREE;
4364 VEC(tree,heap) *dr_chain = NULL;
4365 bool strided_load = false;
4368 if (!STMT_VINFO_RELEVANT_P (stmt_info))
4371 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
4374 /* FORNOW: not yet supported. */
4375 if (STMT_VINFO_LIVE_P (stmt_info))
4377 if (vect_print_dump_info (REPORT_DETAILS))
4378 fprintf (vect_dump, "value used after loop.");
4382 /* Is vectorizable load? */
4383 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
4386 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
4387 if (TREE_CODE (scalar_dest) != SSA_NAME)
4390 op = GIMPLE_STMT_OPERAND (stmt, 1);
4391 if (TREE_CODE (op) != ARRAY_REF
4392 && TREE_CODE (op) != INDIRECT_REF
4393 && !DR_GROUP_FIRST_DR (stmt_info))
4396 if (!STMT_VINFO_DATA_REF (stmt_info))
4399 mode = (int) TYPE_MODE (vectype);
4401 /* FORNOW. In some cases can vectorize even if data-type not supported
4402 (e.g. - data copies). */
4403 if (mov_optab->handlers[mode].insn_code == CODE_FOR_nothing)
4405 if (vect_print_dump_info (REPORT_DETAILS))
4406 fprintf (vect_dump, "Aligned load, but unsupported type.");
4410 /* Check if the load is a part of an interleaving chain. */
4411 if (DR_GROUP_FIRST_DR (stmt_info))
4413 strided_load = true;
4415 /* Check if interleaving is supported. */
4416 if (!vect_strided_load_supported (vectype))
4420 if (!vec_stmt) /* transformation not required. */
4422 STMT_VINFO_TYPE (stmt_info) = load_vec_info_type;
4423 vect_model_load_cost (stmt_info, ncopies);
4427 if (vect_print_dump_info (REPORT_DETAILS))
4428 fprintf (vect_dump, "transform load.");
4434 first_stmt = DR_GROUP_FIRST_DR (stmt_info);
4435 /* Check if the chain of loads is already vectorized. */
4436 if (STMT_VINFO_VEC_STMT (vinfo_for_stmt (first_stmt)))
4438 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
4441 first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
4442 group_size = DR_GROUP_SIZE (vinfo_for_stmt (first_stmt));
4443 dr_chain = VEC_alloc (tree, heap, group_size);
/* The alignment handling below is chosen from the first data-ref. */
4452 alignment_support_cheme = vect_supportable_dr_alignment (first_dr);
4453 gcc_assert (alignment_support_cheme);
4456 /* In case the vectorization factor (VF) is bigger than the number
4457 of elements that we can fit in a vectype (nunits), we have to generate
4458 more than one vector stmt - i.e - we need to "unroll" the
4459 vector stmt by a factor VF/nunits. In doing so, we record a pointer
4460 from one copy of the vector stmt to the next, in the field
4461 STMT_VINFO_RELATED_STMT. This is necessary in order to allow following
4462 stages to find the correct vector defs to be used when vectorizing
4463 stmts that use the defs of the current stmt. The example below illustrates
4464 the vectorization process when VF=16 and nunits=4 (i.e - we need to create
4465 4 vectorized stmts):
4467 before vectorization:
4468 RELATED_STMT VEC_STMT
4472 step 1: vectorize stmt S1:
4473 We first create the vector stmt VS1_0, and, as usual, record a
4474 pointer to it in the STMT_VINFO_VEC_STMT of the scalar stmt S1.
4475 Next, we create the vector stmt VS1_1, and record a pointer to
4476 it in the STMT_VINFO_RELATED_STMT of the vector stmt VS1_0.
4477 Similarly, for VS1_2 and VS1_3. This is the resulting chain of
4479 RELATED_STMT VEC_STMT
4480 VS1_0: vx0 = memref0 VS1_1 -
4481 VS1_1: vx1 = memref1 VS1_2 -
4482 VS1_2: vx2 = memref2 VS1_3 -
4483 VS1_3: vx3 = memref3 - -
4484 S1: x = load - VS1_0
4487 See the documentation in vect_get_vec_def_for_stmt_copy for how the
4488 information we recorded in RELATED_STMT field is used to vectorize
stmts that use the defs of the current stmt.
4491 /* In case of interleaving (non-unit strided access):
4498 Vectorized loads are created in the order of memory accesses
4499 starting from the access of the first stmt of the chain:
4502 VS2: vx1 = &base + vec_size*1
4503 VS3: vx3 = &base + vec_size*2
4504 VS4: vx4 = &base + vec_size*3
4506 Then permutation statements are generated:
4508 VS5: vx5 = VEC_EXTRACT_EVEN_EXPR < vx0, vx1 >
4509 VS6: vx6 = VEC_EXTRACT_ODD_EXPR < vx0, vx1 >
4512 And they are put in STMT_VINFO_VEC_STMT of the corresponding scalar stmts
4513 (the order of the data-refs in the output of vect_permute_load_chain
4514 corresponds to the order of scalar stmts in the interleaving chain - see
4515 the documentation of vect_permute_load_chain()).
4516 The generation of permutation stmts and recording them in
4517 STMT_VINFO_VEC_STMT is done in vect_transform_strided_load().
4519 In case of both multiple types and interleaving, the vector loads and
4520 permutation stmts above are created for every copy. The result vector stmts
4521 are put in STMT_VINFO_VEC_STMT for the first copy and in the corresponding
4522 STMT_VINFO_RELATED_STMT for the next copies. */
4524 /* If the data reference is aligned (dr_aligned) or potentially unaligned
4525 on a target that supports unaligned accesses (dr_unaligned_supported)
4526 we generate the following code:
4530 p = p + indx * vectype_size;
4535 Otherwise, the data reference is potentially unaligned on a target that
4536 does not support unaligned accesses (dr_unaligned_software_pipeline) -
4537 then generate the following code, in which the data in each iteration is
4538 obtained by two vector loads, one from the previous iteration, and one
4539 from the current iteration:
4541 msq_init = *(floor(p1))
4542 p2 = initial_addr + VS - 1;
4543 realignment_token = call target_builtin;
4546 p2 = p2 + indx * vectype_size
4548 vec_dest = realign_load (msq, lsq, realignment_token)
4553 if (alignment_support_cheme == dr_unaligned_software_pipeline)
4555 msq = vect_setup_realignment (first_stmt, bsi, &realignment_token);
4556 phi_stmt = SSA_NAME_DEF_STMT (msq);
4557 offset = size_int (TYPE_VECTOR_SUBPARTS (vectype) - 1);
4560 prev_stmt_info = NULL;
4561 for (j = 0; j < ncopies; j++)
4563 /* 1. Create the vector pointer update chain. */
4565 dataref_ptr = vect_create_data_ref_ptr (first_stmt, bsi, offset, &dummy,
4566 &ptr_incr, false, NULL_TREE);
4568 dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, bsi, stmt);
4570 for (i = 0; i < group_size; i++)
4572 /* 2. Create the vector-load in the loop. */
4573 switch (alignment_support_cheme)
4576 gcc_assert (aligned_access_p (first_dr));
4577 data_ref = build_fold_indirect_ref (dataref_ptr);
4579 case dr_unaligned_supported:
4581 int mis = DR_MISALIGNMENT (first_dr);
/* A misalignment of -1 is mapped to zero here. */
4582 tree tmis = (mis == -1 ? size_zero_node : size_int (mis));
4584 gcc_assert (!aligned_access_p (first_dr));
/* Scale the misalignment by BITS_PER_UNIT. */
4585 tmis = size_binop (MULT_EXPR, tmis, size_int(BITS_PER_UNIT));
4587 build2 (MISALIGNED_INDIRECT_REF, vectype, dataref_ptr, tmis);
4590 case dr_unaligned_software_pipeline:
4591 gcc_assert (!aligned_access_p (first_dr));
4592 data_ref = build1 (ALIGN_INDIRECT_REF, vectype, dataref_ptr);
4597 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4598 new_stmt = build_gimple_modify_stmt (vec_dest, data_ref);
4599 new_temp = make_ssa_name (vec_dest, new_stmt);
4600 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
4601 vect_finish_stmt_generation (stmt, new_stmt, bsi);
4602 copy_virtual_operands (new_stmt, stmt);
4603 mark_symbols_for_renaming (new_stmt);
4605 /* 3. Handle explicit realignment if necessary/supported. */
4606 if (alignment_support_cheme == dr_unaligned_software_pipeline)
4609 <vec_dest = realign_load (msq, lsq, realignment_token)> */
4610 lsq = GIMPLE_STMT_OPERAND (new_stmt, 0);
4611 if (!realignment_token)
4612 realignment_token = dataref_ptr;
4613 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4615 build3 (REALIGN_LOAD_EXPR, vectype, msq, lsq, realignment_token);
4616 new_stmt = build_gimple_modify_stmt (vec_dest, new_stmt);
4617 new_temp = make_ssa_name (vec_dest, new_stmt);
4618 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
4619 vect_finish_stmt_generation (stmt, new_stmt, bsi);
/* On the last load of the last copy, feed LSQ into the realignment PHI
   over the loop latch edge. */
4620 if (i == group_size - 1 && j == ncopies - 1)
4621 add_phi_arg (phi_stmt, lsq, loop_latch_edge (loop));
/* Record the loaded vector in DR_CHAIN, which is later passed to
   vect_transform_strided_load. */
4625 VEC_quick_push (tree, dr_chain, new_temp);
4626 if (i < group_size - 1)
4627 dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, bsi, stmt);
4632 if (!vect_transform_strided_load (stmt, dr_chain, group_size, bsi))
4634 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
/* Start a fresh chain of GROUP_SIZE slots. */
4635 dr_chain = VEC_alloc (tree, heap, group_size);
4640 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
4642 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
4643 prev_stmt_info = vinfo_for_stmt (new_stmt);
4651 /* Function vectorizable_live_operation.
4653 STMT computes a value that is used outside the loop. Check if
4654 it can be supported. BSI and VEC_STMT are marked ATTRIBUTE_UNUSED. */
4657 vectorizable_live_operation (tree stmt,
4658 block_stmt_iterator *bsi ATTRIBUTE_UNUSED,
4659 tree *vec_stmt ATTRIBUTE_UNUSED)
4662 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4663 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4668 enum vect_def_type dt;
/* Callers must only pass statements that are marked live. */
4670 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
4672 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
4675 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
4678 if (TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 0)) != SSA_NAME)
4681 operation = GIMPLE_STMT_OPERAND (stmt, 1);
4682 op_type = TREE_OPERAND_LENGTH (operation);
4684 /* FORNOW: support only if all uses are invariant. This means
4685 that the scalar operations can remain in place, unvectorized.
4686 The original last scalar value that they compute will be used. */
4688 for (i = 0; i < op_type; i++)
4690 op = TREE_OPERAND (operation, i);
4691 if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt))
4693 if (vect_print_dump_info (REPORT_DETAILS))
4694 fprintf (vect_dump, "use not simple.");
/* Each operand is tested for being loop-invariant or constant. */
4698 if (dt != vect_invariant_def && dt != vect_constant_def)
4702 /* No transformation is required for the cases we currently support. */
4707 /* Function vect_is_simple_cond.
4710 LOOP_VINFO - the loop_vec_info of the loop that is being vectorized.
4711 COND - Condition that is checked for simple use.
4713 Returns whether a COND can be vectorized. Checks whether
4714 condition operands are supportable using vect_is_simple_use. */
4717 vect_is_simple_cond (tree cond, loop_vec_info loop_vinfo)
4721 enum vect_def_type dt;
/* Only comparison expressions are considered. */
4723 if (!COMPARISON_CLASS_P (cond))
4726 lhs = TREE_OPERAND (cond, 0);
4727 rhs = TREE_OPERAND (cond, 1);
/* Left operand: an SSA name is passed to vect_is_simple_use; anything else
   is tested for being an INTEGER_CST or REAL_CST. */
4729 if (TREE_CODE (lhs) == SSA_NAME)
4731 tree lhs_def_stmt = SSA_NAME_DEF_STMT (lhs);
4732 if (!vect_is_simple_use (lhs, loop_vinfo, &lhs_def_stmt, &def, &dt))
4735 else if (TREE_CODE (lhs) != INTEGER_CST && TREE_CODE (lhs) != REAL_CST)
/* Right operand: same tests as for the left operand. */
4738 if (TREE_CODE (rhs) == SSA_NAME)
4740 tree rhs_def_stmt = SSA_NAME_DEF_STMT (rhs);
4741 if (!vect_is_simple_use (rhs, loop_vinfo, &rhs_def_stmt, &def, &dt))
4744 else if (TREE_CODE (rhs) != INTEGER_CST && TREE_CODE (rhs) != REAL_CST)
4750 /* vectorizable_condition.
4752 Check if STMT is conditional modify expression that can be vectorized.
4753 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
4754 stmt using VEC_COND_EXPR to replace it, put it in VEC_STMT, and insert it
at BSI.
4757 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
4760 vectorizable_condition (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
4762 tree scalar_dest = NULL_TREE;
4763 tree vec_dest = NULL_TREE;
4764 tree op = NULL_TREE;
4765 tree cond_expr, then_clause, else_clause;
4766 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4767 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4768 tree vec_cond_lhs, vec_cond_rhs, vec_then_clause, vec_else_clause;
4769 tree vec_compare, vec_cond_expr;
4771 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4772 enum machine_mode vec_mode;
4774 enum vect_def_type dt;
4775 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
/* Number of vector stmts needed per scalar stmt: VF / nunits. */
4776 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
4778 gcc_assert (ncopies >= 1);
4780 return false; /* FORNOW */
4782 if (!STMT_VINFO_RELEVANT_P (stmt_info))
4785 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
4788 /* FORNOW: not yet supported. */
4789 if (STMT_VINFO_LIVE_P (stmt_info))
4791 if (vect_print_dump_info (REPORT_DETAILS))
4792 fprintf (vect_dump, "value used after loop.");
4796 /* Is vectorizable conditional operation? */
4797 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
4800 op = GIMPLE_STMT_OPERAND (stmt, 1);
4802 if (TREE_CODE (op) != COND_EXPR)
/* Split the COND_EXPR into its condition and its two value operands. */
4805 cond_expr = TREE_OPERAND (op, 0);
4806 then_clause = TREE_OPERAND (op, 1);
4807 else_clause = TREE_OPERAND (op, 2);
4809 if (!vect_is_simple_cond (cond_expr, loop_vinfo))
4812 /* We do not handle two different vector types for the condition
and the values.
4814 if (TREE_TYPE (TREE_OPERAND (cond_expr, 0)) != TREE_TYPE (vectype))
4817 if (TREE_CODE (then_clause) == SSA_NAME)
4819 tree then_def_stmt = SSA_NAME_DEF_STMT (then_clause);
4820 if (!vect_is_simple_use (then_clause, loop_vinfo,
4821 &then_def_stmt, &def, &dt))
4824 else if (TREE_CODE (then_clause) != INTEGER_CST
4825 && TREE_CODE (then_clause) != REAL_CST)
4828 if (TREE_CODE (else_clause) == SSA_NAME)
4830 tree else_def_stmt = SSA_NAME_DEF_STMT (else_clause);
4831 if (!vect_is_simple_use (else_clause, loop_vinfo,
4832 &else_def_stmt, &def, &dt))
4835 else if (TREE_CODE (else_clause) != INTEGER_CST
4836 && TREE_CODE (else_clause) != REAL_CST)
4840 vec_mode = TYPE_MODE (vectype);
4844 STMT_VINFO_TYPE (stmt_info) = condition_vec_info_type;
4845 return expand_vec_cond_expr_p (op, vec_mode);
4851 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
4852 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4854 /* Handle cond expr. */
4856 vect_get_vec_def_for_operand (TREE_OPERAND (cond_expr, 0), stmt, NULL);
4858 vect_get_vec_def_for_operand (TREE_OPERAND (cond_expr, 1), stmt, NULL);
4859 vec_then_clause = vect_get_vec_def_for_operand (then_clause, stmt, NULL);
4860 vec_else_clause = vect_get_vec_def_for_operand (else_clause, stmt, NULL);
4862 /* Arguments are ready. create the new vector stmt. */
4863 vec_compare = build2 (TREE_CODE (cond_expr), vectype,
4864 vec_cond_lhs, vec_cond_rhs);
4865 vec_cond_expr = build3 (VEC_COND_EXPR, vectype,
4866 vec_compare, vec_then_clause, vec_else_clause);
/* Assign the VEC_COND_EXPR to a fresh SSA name and emit the statement
   through vect_finish_stmt_generation at BSI. */
4868 *vec_stmt = build_gimple_modify_stmt (vec_dest, vec_cond_expr);
4869 new_temp = make_ssa_name (vec_dest, *vec_stmt);
4870 GIMPLE_STMT_OPERAND (*vec_stmt, 0) = new_temp;
4871 vect_finish_stmt_generation (stmt, *vec_stmt, bsi);
4876 /* Function vect_transform_stmt.
4878 Create a vectorized stmt to replace STMT, and insert it at BSI.
*STRIDED_STORE is set to true when STMT is a store that belongs to an
interleaving group (DR_GROUP_FIRST_DR is set). */
4881 vect_transform_stmt (tree stmt, block_stmt_iterator *bsi, bool *strided_store)
4883 bool is_store = false;
4884 tree vec_stmt = NULL_TREE;
4885 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4886 tree orig_stmt_in_pattern;
/* Dispatch on the statement kind recorded in STMT_VINFO_TYPE. */
4889 switch (STMT_VINFO_TYPE (stmt_info))
4891 case type_demotion_vec_info_type:
4892 done = vectorizable_type_demotion (stmt, bsi, &vec_stmt);
4896 case type_promotion_vec_info_type:
4897 done = vectorizable_type_promotion (stmt, bsi, &vec_stmt);
4901 case type_conversion_vec_info_type:
4902 done = vectorizable_conversion (stmt, bsi, &vec_stmt);
4906 case induc_vec_info_type:
4907 done = vectorizable_induction (stmt, bsi, &vec_stmt);
4911 case op_vec_info_type:
4912 done = vectorizable_operation (stmt, bsi, &vec_stmt);
4916 case assignment_vec_info_type:
4917 done = vectorizable_assignment (stmt, bsi, &vec_stmt);
4921 case load_vec_info_type:
4922 done = vectorizable_load (stmt, bsi, &vec_stmt);
4926 case store_vec_info_type:
4927 done = vectorizable_store (stmt, bsi, &vec_stmt);
4929 if (DR_GROUP_FIRST_DR (stmt_info))
4931 /* In case of interleaving, the whole chain is vectorized when the
4932 last store in the chain is reached. Store stmts before the last
4933 one are skipped, and their vec_stmt_info shouldn't be freed
4935 *strided_store = true;
4936 if (STMT_VINFO_VEC_STMT (stmt_info))
4943 case condition_vec_info_type:
4944 done = vectorizable_condition (stmt, bsi, &vec_stmt);
4948 case call_vec_info_type:
4949 done = vectorizable_call (stmt, bsi, &vec_stmt);
4952 case reduc_vec_info_type:
4953 done = vectorizable_reduction (stmt, bsi, &vec_stmt);
4958 if (!STMT_VINFO_LIVE_P (stmt_info))
4960 if (vect_print_dump_info (REPORT_DETAILS))
4961 fprintf (vect_dump, "stmt not supported.");
4966 if (STMT_VINFO_LIVE_P (stmt_info)
4967 && STMT_VINFO_TYPE (stmt_info) != reduc_vec_info_type)
4969 done = vectorizable_live_operation (stmt, bsi, &vec_stmt);
4975 STMT_VINFO_VEC_STMT (stmt_info) = vec_stmt;
4976 orig_stmt_in_pattern = STMT_VINFO_RELATED_STMT (stmt_info);
4977 if (orig_stmt_in_pattern)
4979 stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt_in_pattern);
4980 /* STMT was inserted by the vectorizer to replace a computation idiom.
4981 ORIG_STMT_IN_PATTERN is a stmt in the original sequence that
4982 computed this idiom. We need to record a pointer to VEC_STMT in
4983 the stmt_info of ORIG_STMT_IN_PATTERN. See more details in the
4984 documentation of vect_pattern_recog. */
4985 if (STMT_VINFO_IN_PATTERN_P (stmt_vinfo))
/* The pattern link must point back at STMT. */
4987 gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
4988 STMT_VINFO_VEC_STMT (stmt_vinfo) = vec_stmt;
4997 /* This function builds ni_name = number of iterations loop executes
4998 on the loop preheader. LOOP_VINFO supplies both the loop and the
niters expression (LOOP_VINFO_NITERS). */
5001 vect_build_loop_niters (loop_vec_info loop_vinfo)
5003 tree ni_name, stmt, var;
5005 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
/* Work on an unshared copy of the niters expression. */
5006 tree ni = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo));
5008 var = create_tmp_var (TREE_TYPE (ni), "niters");
5009 add_referenced_var (var);
/* Turn NI into a gimple operand; STMT receives whatever
   force_gimple_operand hands back for insertion. */
5010 ni_name = force_gimple_operand (ni, &stmt, false, var);
5012 pe = loop_preheader_edge (loop);
/* Inserting on the preheader edge must not create a new basic block. */
5015 basic_block new_bb = bsi_insert_on_edge_immediate (pe, stmt);
5016 gcc_assert (!new_bb);
5023 /* This function generates the following statements:
5025 ni_name = number of iterations loop executes
5026 ratio = ni_name / vf
5027 ratio_mult_vf_name = ratio * vf
5029 and places them at the loop preheader edge. The three names are
returned through NI_NAME_PTR, RATIO_MULT_VF_NAME_PTR and RATIO_NAME_PTR. */
5032 vect_generate_tmps_on_preheader (loop_vec_info loop_vinfo,
5034 tree *ratio_mult_vf_name_ptr,
5035 tree *ratio_name_ptr)
5043 tree ratio_mult_vf_name;
5044 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5045 tree ni = LOOP_VINFO_NITERS (loop_vinfo);
5046 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
5049 pe = loop_preheader_edge (loop);
5051 /* Generate temporary variable that contains
5052 number of iterations loop executes. */
5054 ni_name = vect_build_loop_niters (loop_vinfo);
/* The shift amount exact_log2 (vf), built in the type of NI and shared by
   both shifts below. */
5055 log_vf = build_int_cst (TREE_TYPE (ni), exact_log2 (vf));
5057 /* Create: ratio = ni >> log2(vf) */
5059 ratio_name = fold_build2 (RSHIFT_EXPR, TREE_TYPE (ni_name), ni_name, log_vf);
/* If folding did not already yield a gimple value, gimplify the expression
   and insert the resulting statement on the preheader edge. */
5060 if (!is_gimple_val (ratio_name))
5062 var = create_tmp_var (TREE_TYPE (ni), "bnd");
5063 add_referenced_var (var);
5065 ratio_name = force_gimple_operand (ratio_name, &stmt, true, var);
5066 pe = loop_preheader_edge (loop);
5067 new_bb = bsi_insert_on_edge_immediate (pe, stmt);
5068 gcc_assert (!new_bb);
5071 /* Create: ratio_mult_vf = ratio << log2 (vf). */
5073 ratio_mult_vf_name = fold_build2 (LSHIFT_EXPR, TREE_TYPE (ratio_name),
5074 ratio_name, log_vf);
/* Same treatment as for RATIO_NAME above. */
5075 if (!is_gimple_val (ratio_mult_vf_name))
5077 var = create_tmp_var (TREE_TYPE (ni), "ratio_mult_vf");
5078 add_referenced_var (var);
5080 ratio_mult_vf_name = force_gimple_operand (ratio_mult_vf_name, &stmt,
5082 pe = loop_preheader_edge (loop);
5083 new_bb = bsi_insert_on_edge_immediate (pe, stmt);
5084 gcc_assert (!new_bb);
/* Hand the three names back through the output pointers. */
5087 *ni_name_ptr = ni_name;
5088 *ratio_mult_vf_name_ptr = ratio_mult_vf_name;
5089 *ratio_name_ptr = ratio_name;
5095 /* Function update_vuses_to_preheader.
5098 STMT - a statement with potential VUSEs.
5099 LOOP - the loop whose preheader will contain STMT.
5101 It's possible to vectorize a loop even though an SSA_NAME from a VUSE
5102 appears to be defined in a VDEF in another statement in a loop.
5103 One such case is when the VUSE is at the dereference of a __restricted__
5104 pointer in a load and the VDEF is at the dereference of a different
5105 __restricted__ pointer in a store. Vectorization may result in
5106 copy_virtual_uses being called to copy the problematic VUSE to a new
5107 statement that is being inserted in the loop preheader. This procedure
5108 is called to change the SSA_NAME in the new statement's VUSE from the
5109 SSA_NAME updated in the loop to the related SSA_NAME available on the
5110 path entering the loop.
5112 When this function is called, we have the following situation:
5117 # name1 = phi < name0 , name2>
5122 # name2 = vdef <name1>
5127 Stmt S1 was created in the loop preheader block as part of misaligned-load
5128 handling. This function fixes the name of the vuse of S1 from 'name1' to
'name0'.
5132 update_vuses_to_preheader (tree stmt, struct loop *loop)
5134 basic_block header_bb = loop->header;
5135 edge preheader_e = loop_preheader_edge (loop);
5137 use_operand_p use_p;
5139 FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_VUSE)
5141 tree ssa_name = USE_FROM_PTR (use_p);
5142 tree def_stmt = SSA_NAME_DEF_STMT (ssa_name);
5143 tree name_var = SSA_NAME_VAR (ssa_name);
5144 basic_block bb = bb_for_stmt (def_stmt);
5146 /* For a use before any definitions, def_stmt is a NOP_EXPR. */
5147 if (!IS_EMPTY_STMT (def_stmt)
5148 && flow_bb_inside_loop_p (loop, bb))
5150 /* If the block containing the statement defining the SSA_NAME
5151 is in the loop then it's necessary to find the definition
5152 outside the loop using the PHI nodes of the header. */
5154 bool updated = false;
/* Look through the header PHIs for the one whose result has the same
   underlying variable as the VUSE. */
5156 for (phi = phi_nodes (header_bb); phi; phi = PHI_CHAIN (phi))
5158 if (SSA_NAME_VAR (PHI_RESULT (phi)) == name_var)
/* Replace the use with the PHI argument that arrives over the preheader
   edge. */
5160 SET_USE (use_p, PHI_ARG_DEF (phi, preheader_e->dest_idx));
/* UPDATED must be true at this point. */
5165 gcc_assert (updated);
5171 /* Function vect_update_ivs_after_vectorizer.
5173 "Advance" the induction variables of LOOP to the value they should take
5174 after the execution of LOOP. This is currently necessary because the
5175 vectorizer does not handle induction variables that are used after the
5176 loop. Such a situation occurs when the last iterations of LOOP are
peeled, because:
5178 1. We introduced new uses after LOOP for IVs that were not originally used
5179 after LOOP: the IVs of LOOP are now used by an epilog loop.
5180 2. LOOP is going to be vectorized; this means that it will iterate N/VF
5181 times, whereas the loop IVs should be bumped N times.
5184 - LOOP - a loop that is going to be vectorized. The last few iterations
5185 of LOOP were peeled.
5186 - NITERS - the number of iterations that LOOP executes (before it is
5187 vectorized). i.e, the number of times the ivs should be bumped.
5188 - UPDATE_E - a successor edge of LOOP->exit that is on the (only) path
5189 coming out from LOOP on which there are uses of the LOOP ivs
5190 (this is the path from LOOP->exit to epilog_loop->preheader).
5192 The new definitions of the ivs are placed in LOOP->exit.
5193 The phi args associated with the edge UPDATE_E in the bb
5194 UPDATE_E->dest are updated accordingly.
5196 Assumption 1: Like the rest of the vectorizer, this function assumes
5197 a single loop exit that has a single predecessor.
5199 Assumption 2: The phi nodes in the LOOP header and in update_bb are
5200 organized in the same order.
5202 Assumption 3: The access function of the ivs is simple enough (see
5203 vect_can_advance_ivs_p). This assumption will be relaxed in the future.
5205 Assumption 4: Exactly one of the successors of LOOP exit-bb is on a path
5206 coming out of LOOP on which the ivs of LOOP are used (this is the path
5207 that leads to the epilog loop; other paths skip the epilog loop). This
5208 path starts with the edge UPDATE_E, and its destination (denoted update_bb)
5209 needs to have its phis updated.
5213 vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo, tree niters,
5216 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5217 basic_block exit_bb = single_exit (loop)->dest;
5219 basic_block update_bb = update_e->dest;
5221 /* gcc_assert (vect_can_advance_ivs_p (loop_vinfo)); */
5223 /* Make sure there exists a single-predecessor exit bb: */
5224 gcc_assert (single_pred_p (exit_bb));
/* Walk the PHIs of the loop header and of UPDATE_BB in lockstep; this
   relies on Assumption 2 (same order in both blocks). */
5226 for (phi = phi_nodes (loop->header), phi1 = phi_nodes (update_bb);
5228 phi = PHI_CHAIN (phi), phi1 = PHI_CHAIN (phi1))
5230 tree access_fn = NULL;
5231 tree evolution_part;
5234 tree var, stmt, ni, ni_name;
5235 block_stmt_iterator last_bsi;
5237 if (vect_print_dump_info (REPORT_DETAILS))
5239 fprintf (vect_dump, "vect_update_ivs_after_vectorizer: phi: ");
5240 print_generic_expr (vect_dump, phi, TDF_SLIM);
5243 /* Skip virtual phi's. */
5244 if (!is_gimple_reg (SSA_NAME_VAR (PHI_RESULT (phi))))
5246 if (vect_print_dump_info (REPORT_DETAILS))
5247 fprintf (vect_dump, "virtual phi. skip.");
5251 /* Skip reduction phis. */
5252 if (STMT_VINFO_DEF_TYPE (vinfo_for_stmt (phi)) == vect_reduction_def)
5254 if (vect_print_dump_info (REPORT_DETAILS))
5255 fprintf (vect_dump, "reduc phi. skip.");
/* Obtain the scalar evolution of the IV in LOOP; its evolution part is used
   as the step and its initial condition as the start value. */
5259 access_fn = analyze_scalar_evolution (loop, PHI_RESULT (phi));
5260 gcc_assert (access_fn);
5262 unshare_expr (evolution_part_in_loop_num (access_fn, loop->num));
5263 gcc_assert (evolution_part != NULL_TREE);
5265 /* FORNOW: We do not support IVs whose evolution function is a polynomial
5266 of degree >= 2 or exponential. */
5267 gcc_assert (!tree_is_chrec (evolution_part));
5269 step_expr = evolution_part;
5270 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn,
/* NOTE(review): presumably ni = init_expr + NITERS * step_expr, computed in
   the type of INIT_EXPR; the remaining operands are not visible here -
   confirm. */
5273 ni = fold_build2 (PLUS_EXPR, TREE_TYPE (init_expr),
5274 fold_build2 (MULT_EXPR, TREE_TYPE (init_expr),
5275 fold_convert (TREE_TYPE (init_expr),
5280 var = create_tmp_var (TREE_TYPE (init_expr), "tmp");
5281 add_referenced_var (var);
5283 ni_name = force_gimple_operand (ni, &stmt, false, var);
5285 /* Insert stmt into exit_bb. */
5286 last_bsi = bsi_last (exit_bb);
5288 bsi_insert_before (&last_bsi, stmt, BSI_SAME_STMT);
5290 /* Fix phi expressions in the successor bb. */
5291 SET_PHI_ARG_DEF (phi1, update_e->dest_idx, ni_name);
5296 /* Function vect_do_peeling_for_loop_bound
5298 Peel the last iterations of the loop represented by LOOP_VINFO.
5299 The peeled iterations form a new epilog loop. Given that the loop now
5300 iterates NITERS times, the new epilog loop iterates
5301 NITERS % VECTORIZATION_FACTOR times.
5303 The original loop will later be made to iterate
5304 NITERS / VECTORIZATION_FACTOR times (this value is placed into RATIO). */
5307 vect_do_peeling_for_loop_bound (loop_vec_info loop_vinfo, tree *ratio)
5309 tree ni_name, ratio_mult_vf_name;
5310 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5311 struct loop *new_loop;
5313 basic_block preheader;
5316 int min_scalar_loop_bound;
5317 int min_profitable_iters;
5319 if (vect_print_dump_info (REPORT_DETAILS))
5320 fprintf (vect_dump, "=== vect_do_peeling_for_loop_bound ===");
5322 initialize_original_copy_tables ();
5324 /* Generate the following variables on the preheader of original loop:
5326 ni_name = number of iteration the original loop executes
5327 ratio = ni_name / vf
5328 ratio_mult_vf_name = ratio * vf */
5329 vect_generate_tmps_on_preheader (loop_vinfo, &ni_name,
5330 &ratio_mult_vf_name, ratio);
/* Remember the loop number; it is asserted to be unchanged after peeling. */
5332 loop_num = loop->num;
5334 /* Analyze cost to set threshold for vectorized loop. */
5335 min_profitable_iters = vect_estimate_min_profitable_iters (loop_vinfo);
/* The PARAM_MIN_VECT_LOOP_BOUND parameter value, scaled by the
   vectorization factor. */
5337 min_scalar_loop_bound = (PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND))
5338 * LOOP_VINFO_VECT_FACTOR (loop_vinfo);
5340 /* Use the cost model only if it is more conservative than user specified
threshold.
5343 th = (unsigned) min_scalar_loop_bound;
5344 if (min_profitable_iters
5345 && (!min_scalar_loop_bound
5346 || min_profitable_iters > min_scalar_loop_bound))
5347 th = (unsigned) min_profitable_iters;
5349 if (vect_print_dump_info (REPORT_DETAILS))
5350 fprintf (vect_dump, "vectorization may not be profitable.");
5352 new_loop = slpeel_tree_peel_loop_to_edge (loop, single_exit (loop),
5353 ratio_mult_vf_name, ni_name, false,
5355 gcc_assert (new_loop);
5356 gcc_assert (loop_num == loop->num);
5357 #ifdef ENABLE_CHECKING
5358 slpeel_verify_cfg_after_peeling (loop, new_loop);
5361 /* A guard that controls whether the new_loop is to be executed or skipped
5362 is placed in LOOP->exit. LOOP->exit therefore has two successors - one
5363 is the preheader of NEW_LOOP, where the IVs from LOOP are used. The other
5364 is a bb after NEW_LOOP, where these IVs are not used. Find the edge that
5365 is on the path where the LOOP IVs are used and need to be updated. */
5367 preheader = loop_preheader_edge (new_loop)->src;
5368 if (EDGE_PRED (preheader, 0)->src == single_exit (loop)->dest)
5369 update_e = EDGE_PRED (preheader, 0);
5371 update_e = EDGE_PRED (preheader, 1);
5373 /* Update IVs of original loop as if they were advanced
5374 by ratio_mult_vf_name steps. */
5375 vect_update_ivs_after_vectorizer (loop_vinfo, ratio_mult_vf_name, update_e);
5377 /* After peeling we have to reset scalar evolution analyzer. */
5380 free_original_copy_tables ();
5384 /* Function vect_gen_niters_for_prolog_loop
5386 Set the number of iterations for the loop represented by LOOP_VINFO
5387 to the minimum between LOOP_NITERS (the original iteration count of the loop)
5388 and the misalignment of DR - the data reference recorded in
5389 LOOP_VINFO_UNALIGNED_DR (LOOP_VINFO). As a result, after the execution of
5390 this loop, the data reference DR will refer to an aligned location.
5392 The following computation is generated:
5394 If the misalignment of DR is known at compile time:
5395 addr_mis = int mis = DR_MISALIGNMENT (dr);
   (the code below reads this value from LOOP_PEELING_FOR_ALIGNMENT (LOOP_VINFO))
5396 Else, compute address misalignment in bytes:
5397 addr_mis = addr & (vectype_size - 1)
5399 prolog_niters = min ( LOOP_NITERS , (VF - addr_mis/elem_size)&(VF-1) )
5401 (elem_size = element type size; an element is the scalar element
5402 whose type is the inner type of the vectype)
   For an interleaved access (DR_GROUP_FIRST_DR is set for the statement):
5406 prolog_niters = min ( LOOP_NITERS ,
5407 (VF/group_size - addr_mis/elem_size)&(VF/group_size-1) )
5408 where group_size is the size of the interleaved group.
5410 The above formulas assume that VF == number of elements in the vector. This
5411 may not hold when there are multiple-types in the loop.
5412 In this case, for some data-references in the loop the VF does not represent
5413 the number of elements that fit in the vector. Therefore, instead of VF we
5414 use TYPE_VECTOR_SUBPARTS.

   LOOP_NITERS is a tree; its type (NITERS_TYPE) is the type of the generated
   iteration count.  The statements needed to compute the count are inserted
   on the preheader edge of the loop, and both insertions assert that no new
   basic block had to be created.  The count is gimplified into a temporary
   named "prolog_loop_niters" (ITERS_NAME); the caller in this file assigns
   the function's result to a tree, so ITERS_NAME is presumably what is
   returned - the return statement is not visible in this excerpt.  */
5417 vect_gen_niters_for_prolog_loop (loop_vec_info loop_vinfo, tree loop_niters)
5419 struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
5420 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5422 tree iters, iters_name;
5425 tree dr_stmt = DR_STMT (dr);
5426 stmt_vec_info stmt_info = vinfo_for_stmt (dr_stmt);
5427 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5428 int vectype_align = TYPE_ALIGN (vectype) / BITS_PER_UNIT;
5429 tree niters_type = TREE_TYPE (loop_niters);
5431 int element_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
5432 int nelements = TYPE_VECTOR_SUBPARTS (vectype);
5434 if (DR_GROUP_FIRST_DR (stmt_info))
5436 /* For interleaved access element size must be multiplied by the size of
5437 the interleaved group. */
5438 group_size = DR_GROUP_SIZE (vinfo_for_stmt (
5439 DR_GROUP_FIRST_DR (stmt_info)));
5440 element_size *= group_size;
5443 pe = loop_preheader_edge (loop);
/* A positive LOOP_PEELING_FOR_ALIGNMENT is used directly as the byte
   misalignment, so the peel count is built as an integer constant of
   NITERS_TYPE.  */
5445 if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
5447 int byte_misalign = LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo);
5448 int elem_misalign = byte_misalign / element_size;
5450 if (vect_print_dump_info (REPORT_DETAILS))
5451 fprintf (vect_dump, "known alignment = %d.", byte_misalign);
/* NOTE(review): the header comment documents
   (VF/group_size - addr_mis/elem_size)&(VF/group_size-1), but the minuend
   here is NELEMENTS, not NELEMENTS/GROUP_SIZE - confirm which is intended.  */
5452 iters = build_int_cst (niters_type,
5453 (nelements - elem_misalign)&(nelements/group_size-1));
/* Otherwise (the `else' line is not visible in this excerpt) the
   misalignment is computed by generated code from the address produced by
   vect_create_addr_base_for_vector_ref.  TYPE is an integer type with the
   same size as the pointer type of that address.
   NOTE(review): this path shifts by log2 (vectype_align/nelements) and masks
   with NELEMENTS - 1; GROUP_SIZE is not used here - confirm this is intended
   for interleaved accesses.  */
5457 tree new_stmts = NULL_TREE;
5459 vect_create_addr_base_for_vector_ref (dr_stmt, &new_stmts, NULL_TREE);
5460 tree ptr_type = TREE_TYPE (start_addr);
5461 tree size = TYPE_SIZE (ptr_type);
5462 tree type = lang_hooks.types.type_for_size (tree_low_cst (size, 1), 1);
5463 tree vectype_size_minus_1 = build_int_cst (type, vectype_align - 1);
5464 tree elem_size_log =
5465 build_int_cst (type, exact_log2 (vectype_align/nelements));
5466 tree nelements_minus_1 = build_int_cst (type, nelements - 1);
5467 tree nelements_tree = build_int_cst (type, nelements);
/* Emit the address computation on the preheader edge; this must not require
   a new basic block.  */
5471 new_bb = bsi_insert_on_edge_immediate (pe, new_stmts);
5472 gcc_assert (!new_bb);
5474 /* Create: byte_misalign = addr & (vectype_size - 1) */
5476 fold_build2 (BIT_AND_EXPR, type, start_addr, vectype_size_minus_1);
5478 /* Create: elem_misalign = byte_misalign / element_size
   (implemented as a right shift by log2 of the per-element size).  */
5480 fold_build2 (RSHIFT_EXPR, type, byte_misalign, elem_size_log);
5482 /* Create: (niters_type) (nelements - elem_misalign)&(nelements - 1) */
5483 iters = fold_build2 (MINUS_EXPR, type, nelements_tree, elem_misalign);
5484 iters = fold_build2 (BIT_AND_EXPR, type, iters, nelements_minus_1);
5485 iters = fold_convert (niters_type, iters);
5488 /* Create: prolog_loop_niters = min (iters, loop_niters) */
5489 /* If the loop bound is known at compile time we already verified that it is
5490 greater than vf; since the misalignment ('iters') is at most vf, there's
5491 no need to generate the MIN_EXPR in this case. */
5492 if (TREE_CODE (loop_niters) != INTEGER_CST)
5493 iters = fold_build2 (MIN_EXPR, niters_type, iters, loop_niters);
5495 if (vect_print_dump_info (REPORT_DETAILS))
5497 fprintf (vect_dump, "niters for prolog loop: ");
5498 print_generic_expr (vect_dump, iters, TDF_SLIM);
/* Gimplify ITERS into the temporary "prolog_loop_niters"; any statements
   needed to compute it are collected in STMT.  */
5501 var = create_tmp_var (niters_type, "prolog_loop_niters");
5502 add_referenced_var (var);
5503 iters_name = force_gimple_operand (iters, &stmt, false, var);
5505 /* Insert stmt on loop preheader edge. */
5508 basic_block new_bb = bsi_insert_on_edge_immediate (pe, stmt);
5509 gcc_assert (!new_bb);
5516 /* Function vect_update_init_of_dr
5518 NITERS iterations were peeled from LOOP. DR represents a data reference
5519 in LOOP. This function updates the information recorded in DR to
5520 account for the fact that the first NITERS iterations had already been
5521 executed. Specifically, it updates the OFFSET field of DR:

     DR_OFFSET (dr) = DR_OFFSET (dr) + NITERS * DR_STEP (dr)

   Both expressions are built with fold_build2; nothing is emitted into the
   instruction stream by this function.  */
5524 vect_update_init_of_dr (struct data_reference *dr, tree niters)
5526 tree offset = DR_OFFSET (dr);
/* Distance covered by the peeled iterations, built in the type of NITERS.  */
5528 niters = fold_build2 (MULT_EXPR, TREE_TYPE (niters), niters, DR_STEP (dr));
/* Advance the recorded offset by that distance, built in the type of the
   existing offset.  */
5529 offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset), offset, niters);
5530 DR_OFFSET (dr) = offset;
5534 /* Function vect_update_inits_of_drs
5536 NITERS iterations were peeled from the loop represented by LOOP_VINFO.
5537 This function updates the information recorded for the data references in
5538 the loop to account for the fact that the first NITERS iterations had
5539 already been executed. Specifically, it calls vect_update_init_of_dr on
5540 every data reference in LOOP_VINFO_DATAREFS (LOOP_VINFO), which advances
   the DR_OFFSET of each reference by NITERS steps.  */
5543 vect_update_inits_of_drs (loop_vec_info loop_vinfo, tree niters)
5546 VEC (data_reference_p, heap) *datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
5547 struct data_reference *dr;
5549 if (vect_print_dump_info (REPORT_DETAILS))
5550 fprintf (vect_dump, "=== vect_update_inits_of_dr ===");
/* Walk every recorded data reference; VEC_iterate ends the loop once the
   vector is exhausted.  */
5552 for (i = 0; VEC_iterate (data_reference_p, datarefs, i, dr); i++)
5553 vect_update_init_of_dr (dr, niters);
5557 /* Function vect_do_peeling_for_alignment
5559 Peel the first 'niters' iterations of the loop represented by LOOP_VINFO.
5560 'niters' is set to the misalignment of one of the data references in the
5561 loop, thereby forcing it to refer to an aligned location at the beginning
5562 of the execution of this loop. The data reference for which we are
5563 peeling is recorded in LOOP_VINFO_UNALIGNED_DR.

   Besides creating the prolog loop, this function subtracts the prolog
   iteration count from LOOP_VINFO_NITERS (LOOP_VINFO) and updates the
   recorded data references through vect_update_inits_of_drs.  The work is
   bracketed by initialize_original_copy_tables and
   free_original_copy_tables.  */
5566 vect_do_peeling_for_alignment (loop_vec_info loop_vinfo)
5568 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5569 tree niters_of_prolog_loop, ni_name;
5571 struct loop *new_loop;
5573 if (vect_print_dump_info (REPORT_DETAILS))
5574 fprintf (vect_dump, "=== vect_do_peeling_for_alignment ===");
5576 initialize_original_copy_tables ();
/* NI_NAME holds the iteration count of the loop; the prolog count is derived
   from it and from the misalignment of the recorded data reference.  */
5578 ni_name = vect_build_loop_niters (loop_vinfo);
5579 niters_of_prolog_loop = vect_gen_niters_for_prolog_loop (loop_vinfo, ni_name);
5581 /* Peel the prolog loop and iterate it niters_of_prolog_loop. */
5583 slpeel_tree_peel_loop_to_edge (loop, loop_preheader_edge (loop),
5584 niters_of_prolog_loop, ni_name, true, 0);
5585 gcc_assert (new_loop);
/* With checking enabled, verify the CFG; NEW_LOOP (the prolog) is passed as
   the first loop and LOOP as the second.  */
5586 #ifdef ENABLE_CHECKING
5587 slpeel_verify_cfg_after_peeling (new_loop, loop);
5590 /* Update number of times loop executes. */
5591 n_iters = LOOP_VINFO_NITERS (loop_vinfo);
/* The remaining loop runs the original count minus the prolog iterations.  */
5592 LOOP_VINFO_NITERS (loop_vinfo) = fold_build2 (MINUS_EXPR,
5593 TREE_TYPE (n_iters), n_iters, niters_of_prolog_loop);
5595 /* Update the init conditions of the access functions of all data refs. */
5596 vect_update_inits_of_drs (loop_vinfo, niters_of_prolog_loop);
5598 /* After peeling we have to reset scalar evolution analyzer. */
5601 free_original_copy_tables ();
5605 /* Function vect_create_cond_for_align_checks.
5607 Create a conditional expression that represents the alignment checks for
5608 all of data references (array element references) whose alignment must be
   checked, i.e. those listed in LOOP_VINFO_MAY_MISALIGN_STMTS.

   Input:
5612 LOOP_VINFO - two fields of the loop information are used.
5613 LOOP_VINFO_PTR_MASK is the mask used to check the alignment.
5614 LOOP_VINFO_MAY_MISALIGN_STMTS contains the refs to be checked.

   Output:
5617 COND_EXPR_STMT_LIST - statements needed to construct the conditional
   expression are appended to this list.
5619 The returned value is the conditional expression to be used in the if
5620 statement that controls which version of the loop gets executed at runtime.
   It has the form (and_tmp == 0) of type boolean_type_node, where and_tmp
   is the bitwise OR of all checked addresses, ANDed with the mask.
5622 The algorithm makes two assumptions:
5623 1) The number of bytes "n" in a vector is a power of 2.
5624 2) An address "a" is aligned if a%n is zero and that this
5625 test can be done as a&(n-1) == 0. For example, for 16
5626 byte vectors the test is a&0xf == 0. */
5629 vect_create_cond_for_align_checks (loop_vec_info loop_vinfo,
5630 tree *cond_expr_stmt_list)
5632 VEC(tree,heap) *may_misalign_stmts
5633 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
5635 int mask = LOOP_VINFO_PTR_MASK (loop_vinfo);
5639 tree int_ptrsize_type;
5641 tree or_tmp_name = NULL_TREE;
5642 tree and_tmp, and_tmp_name, and_stmt;
5645 /* Check that mask is one less than a power of 2, i.e., mask is
5646 all zeros followed by all ones. */
5647 gcc_assert ((mask != 0) && ((mask & (mask+1)) == 0));
5649 /* CHECKME: what is the best integer or unsigned type to use to hold a
5650 cast from a pointer value? */
5651 psize = TYPE_SIZE (ptr_type_node);
5653 = lang_hooks.types.type_for_size (tree_low_cst (psize, 1), 0);
5655 /* Create expression (mask & (dr_1 | ... | dr_n)) where dr_i is the address
5656 of the first vector of the i'th data reference. */
5658 for (i = 0; VEC_iterate (tree, may_misalign_stmts, i, ref_stmt); i++)
5660 tree new_stmt_list = NULL_TREE;
5662 tree addr_tmp, addr_tmp_name, addr_stmt;
5663 tree or_tmp, new_or_tmp_name, or_stmt;
5665 /* create: addr_tmp = (int)(address_of_first_vector) */
5666 addr_base = vect_create_addr_base_for_vector_ref (ref_stmt,
5670 if (new_stmt_list != NULL_TREE)
5671 append_to_statement_list_force (new_stmt_list, cond_expr_stmt_list);
/* NOTE(review): the declaration of TMP_NAME is not visible in this excerpt;
   confirm the buffer can hold "addr2int" / "orptrs" plus the decimal index,
   since sprintf does not bound the write.  */
5673 sprintf (tmp_name, "%s%d", "addr2int", i);
5674 addr_tmp = create_tmp_var (int_ptrsize_type, tmp_name);
5675 add_referenced_var (addr_tmp);
5676 addr_tmp_name = make_ssa_name (addr_tmp, NULL_TREE);
5677 addr_stmt = fold_convert (int_ptrsize_type, addr_base);
5678 addr_stmt = build_gimple_modify_stmt (addr_tmp_name, addr_stmt);
5679 SSA_NAME_DEF_STMT (addr_tmp_name) = addr_stmt;
5680 append_to_statement_list_force (addr_stmt, cond_expr_stmt_list);
5682 /* The addresses are ORed together. */
5684 if (or_tmp_name != NULL_TREE)
5686 /* create: or_tmp = or_tmp | addr_tmp */
5687 sprintf (tmp_name, "%s%d", "orptrs", i);
5688 or_tmp = create_tmp_var (int_ptrsize_type, tmp_name);
5689 add_referenced_var (or_tmp);
5690 new_or_tmp_name = make_ssa_name (or_tmp, NULL_TREE);
5691 tmp = build2 (BIT_IOR_EXPR, int_ptrsize_type,
5692 or_tmp_name, addr_tmp_name);
5693 or_stmt = build_gimple_modify_stmt (new_or_tmp_name, tmp);
5694 SSA_NAME_DEF_STMT (new_or_tmp_name) = or_stmt;
5695 append_to_statement_list_force (or_stmt, cond_expr_stmt_list);
5696 or_tmp_name = new_or_tmp_name;
/* No running OR value yet (OR_TMP_NAME was NULL_TREE): the converted address
   itself becomes the running value.  The `else' line is not visible in this
   excerpt.  */
5699 or_tmp_name = addr_tmp_name;
5703 mask_cst = build_int_cst (int_ptrsize_type, mask);
5705 /* create: and_tmp = or_tmp & mask */
5706 and_tmp = create_tmp_var (int_ptrsize_type, "andmask" );
5707 add_referenced_var (and_tmp);
5708 and_tmp_name = make_ssa_name (and_tmp, NULL_TREE);
5710 tmp = build2 (BIT_AND_EXPR, int_ptrsize_type, or_tmp_name, mask_cst);
5711 and_stmt = build_gimple_modify_stmt (and_tmp_name, tmp);
5712 SSA_NAME_DEF_STMT (and_tmp_name) = and_stmt;
5713 append_to_statement_list_force (and_stmt, cond_expr_stmt_list);
5715 /* Make and_tmp the left operand of the conditional test against zero.
5716 If and_tmp has a nonzero bit then some address is unaligned. */
5717 ptrsize_zero = build_int_cst (int_ptrsize_type, 0);
5718 return build2 (EQ_EXPR, boolean_type_node,
5719 and_tmp_name, ptrsize_zero);
5723 /* Function vect_transform_loop.
5725 The analysis phase has determined that the loop is vectorizable.
5726 Vectorize the loop - create vectorized stmts to replace the scalar
5727 stmts in the loop, and update the loop exit condition.

   Steps performed on the loop described by LOOP_VINFO, in order:
   - if LOOP_VINFO_MAY_MISALIGN_STMTS is non-empty, version the loop under a
     run-time alignment test (vect_create_cond_for_align_checks and
     loop_version) and give the loop its own exit block again;
   - if LOOP_PEELING_FOR_ALIGNMENT is nonzero, peel a prolog loop
     (vect_do_peeling_for_alignment);
   - peel an epilog loop when the iteration count is unknown or is not a
     multiple of the vectorization factor; RATIO receives the iteration
     count of the vectorized loop;
   - split the preheader edge, then walk the phis and statements of every
     basic block and call vect_transform_stmt on the ones to vectorize;
   - make the loop iterate RATIO times, mark vect_memsyms_to_rename for
     renaming and update the SSA form.  */
5730 vect_transform_loop (loop_vec_info loop_vinfo)
5732 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5733 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
5734 int nbbs = loop->num_nodes;
5735 block_stmt_iterator si, next_si;
5738 int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
5741 if (vect_print_dump_info (REPORT_DETAILS))
5742 fprintf (vect_dump, "=== vec_transform_loop ===");
5744 /* If the loop has data references that may or may not be aligned then
5745 two versions of the loop need to be generated, one which is vectorized
5746 and one which isn't. A test is then generated to control which of the
5747 loops is executed. The test checks for the alignment of all of the
5748 data references that may or may not be aligned. */
5750 if (VEC_length (tree, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo)))
5754 tree cond_expr_stmt_list = NULL_TREE;
5755 basic_block condition_bb;
5756 block_stmt_iterator cond_exp_bsi;
5757 basic_block merge_bb;
5758 basic_block new_exit_bb;
5760 tree orig_phi, new_phi, arg;
/* PROB is 4/5 of REG_BR_PROB_BASE; it and its complement are passed to
   loop_version below.  */
5761 unsigned prob = 4 * REG_BR_PROB_BASE / 5;
5763 cond_expr = vect_create_cond_for_align_checks (loop_vinfo,
5764 &cond_expr_stmt_list);
5765 initialize_original_copy_tables ();
5766 nloop = loop_version (loop, cond_expr, &condition_bb,
5767 prob, prob, REG_BR_PROB_BASE - prob, true);
5768 free_original_copy_tables();
5770 /** Loop versioning violates an assumption we try to maintain during
5771 vectorization - that the loop exit block has a single predecessor.
5772 After versioning, the exit block of both loop versions is the same
5773 basic block (i.e. it has two predecessors). Just in order to simplify
5774 following transformations in the vectorizer, we fix this situation
5775 here by adding a new (empty) block on the exit-edge of the loop,
5776 with the proper loop-exit phis to maintain loop-closed-form. **/
5778 merge_bb = single_exit (loop)->dest;
5779 gcc_assert (EDGE_COUNT (merge_bb->preds) == 2);
5780 new_exit_bb = split_edge (single_exit (loop));
5781 new_exit_e = single_exit (loop);
5782 e = EDGE_SUCC (new_exit_bb, 0);
/* For every phi in MERGE_BB create a phi for the new exit block: the value
   that reached MERGE_BB over E now enters the new phi over NEW_EXIT_E, and
   the original phi takes the new phi's result on E instead.  */
5784 for (orig_phi = phi_nodes (merge_bb); orig_phi;
5785 orig_phi = PHI_CHAIN (orig_phi))
5787 new_phi = create_phi_node (SSA_NAME_VAR (PHI_RESULT (orig_phi)),
5789 arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, e);
5790 add_phi_arg (new_phi, arg, new_exit_e);
5791 SET_PHI_ARG_DEF (orig_phi, e->dest_idx, PHI_RESULT (new_phi));
5794 /** end loop-exit-fixes after versioning **/
5796 update_ssa (TODO_update_ssa);
/* Place the statements that compute the alignment condition before the last
   statement of CONDITION_BB.  */
5797 cond_exp_bsi = bsi_last (condition_bb);
5798 bsi_insert_before (&cond_exp_bsi, cond_expr_stmt_list, BSI_SAME_STMT);
5801 /* CHECKME: we wouldn't need this if we called update_ssa once
5803 bitmap_zero (vect_memsyms_to_rename);
5805 /* Peel the loop if there are data refs with unknown alignment.
5806 Only one data ref with unknown store is allowed. */
5808 if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
5809 vect_do_peeling_for_alignment (loop_vinfo);
5811 /* If the loop has a symbolic number of iterations 'n' (i.e. it's not a
5812 compile time constant), or it is a constant that doesn't divide by the
5813 vectorization factor, then an epilog loop needs to be created.
5814 We therefore duplicate the loop: the original loop will be vectorized,
5815 and will compute the first (n/VF) iterations. The second copy of the loop
5816 will remain scalar and will compute the remaining (n%VF) iterations.
5817 (VF is the vectorization factor). */
5819 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
5820 || (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
5821 && LOOP_VINFO_INT_NITERS (loop_vinfo) % vectorization_factor != 0))
5822 vect_do_peeling_for_loop_bound (loop_vinfo, &ratio);
/* Otherwise the known iteration count is a multiple of the vectorization
   factor and RATIO is simply their quotient.  The `else' line is not
   visible in this excerpt.  */
5824 ratio = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
5825 LOOP_VINFO_INT_NITERS (loop_vinfo) / vectorization_factor);
5827 /* 1) Make sure the loop header has exactly two entries
5828 2) Make sure we have a preheader basic block. */
5830 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
5832 split_edge (loop_preheader_edge (loop));
5834 /* FORNOW: the vectorizer supports only loops whose body consists
5835 of one basic block (header + empty latch). When the vectorizer will
5836 support more involved loop forms, the order by which the BBs are
5837 traversed needs to be reconsidered. */
5839 for (i = 0; i < nbbs; i++)
5841 basic_block bb = bbs[i];
5842 stmt_vec_info stmt_info;
/* First pass over the block: its phi nodes.  Only phis whose def type is
   vect_induction_def are handed to vect_transform_stmt.  */
5845 for (phi = phi_nodes (bb); phi; phi = PHI_CHAIN (phi))
5847 if (vect_print_dump_info (REPORT_DETAILS))
5849 fprintf (vect_dump, "------>vectorizing phi: ");
5850 print_generic_expr (vect_dump, phi, TDF_SLIM);
5852 stmt_info = vinfo_for_stmt (phi);
5855 if (!STMT_VINFO_RELEVANT_P (stmt_info)
5856 && !STMT_VINFO_LIVE_P (stmt_info))
/* A vector type whose element count differs from the vectorization factor is
   only reported in the dump.  */
5859 if ((TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
5860 != (unsigned HOST_WIDE_INT) vectorization_factor)
5861 && vect_print_dump_info (REPORT_DETAILS))
5862 fprintf (vect_dump, "multiple-types.");
5864 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
5866 if (vect_print_dump_info (REPORT_DETAILS))
5867 fprintf (vect_dump, "transform phi.");
5868 vect_transform_stmt (phi, NULL, NULL);
/* Second pass over the block: its statements.  The for header has no
   increment; SI is advanced inside the body (the advancing statements are
   not all visible in this excerpt).  */
5872 for (si = bsi_start (bb); !bsi_end_p (si);)
5874 tree stmt = bsi_stmt (si);
5877 if (vect_print_dump_info (REPORT_DETAILS))
5879 fprintf (vect_dump, "------>vectorizing statement: ");
5880 print_generic_expr (vect_dump, stmt, TDF_SLIM);
5882 stmt_info = vinfo_for_stmt (stmt);
5883 gcc_assert (stmt_info);
5884 if (!STMT_VINFO_RELEVANT_P (stmt_info)
5885 && !STMT_VINFO_LIVE_P (stmt_info))
5891 gcc_assert (STMT_VINFO_VECTYPE (stmt_info));
5892 if ((TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
5893 != (unsigned HOST_WIDE_INT) vectorization_factor)
5894 && vect_print_dump_info (REPORT_DETAILS))
5895 fprintf (vect_dump, "multiple-types.");
5897 /* -------- vectorize statement ------------ */
5898 if (vect_print_dump_info (REPORT_DETAILS))
5899 fprintf (vect_dump, "transform statement.");
/* vect_transform_stmt's result is kept in IS_STORE; STRIDED_STORE is reset
   before the call and passed by address.  */
5901 strided_store = false;
5902 is_store = vect_transform_stmt (stmt, &si, &strided_store);
5906 if (DR_GROUP_FIRST_DR (stmt_info))
5908 /* Interleaving. If IS_STORE is TRUE, the vectorization of the
5909 interleaving chain was completed - free all the stores in
5911 tree next = DR_GROUP_FIRST_DR (stmt_info);
5913 stmt_vec_info next_stmt_info;
5917 next_si = bsi_for_stmt (next);
5918 next_stmt_info = vinfo_for_stmt (next);
5919 /* Free the attached stmt_vec_info and remove the stmt. */
5920 ann = stmt_ann (next);
5921 tmp = DR_GROUP_NEXT_DR (next_stmt_info);
5922 free (next_stmt_info);
5923 set_stmt_info (ann, NULL);
5924 bsi_remove (&next_si, true);
5927 bsi_remove (&si, true);
5932 /* Free the attached stmt_vec_info and remove the stmt. */
5933 ann = stmt_ann (stmt);
5935 set_stmt_info (ann, NULL);
5936 bsi_remove (&si, true);
/* Set the iteration count of the transformed loop to RATIO.  */
5944 slpeel_make_loop_iterate_ntimes (loop, ratio);
5946 mark_set_for_renaming (vect_memsyms_to_rename);
5948 /* The memory tags and pointers in vectorized statements need to
5949 have their SSA forms updated. FIXME, why can't this be delayed
5950 until all the loops have been transformed? */
5951 update_ssa (TODO_update_ssa);
5953 if (vect_print_dump_info (REPORT_VECTORIZED_LOOPS))
5954 fprintf (vect_dump, "LOOP VECTORIZED.");