1 /* Transformation Utilities for Loop Vectorization.
2 Copyright (C) 2003,2004,2005,2006 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
5 This file is part of GCC.
7 GCC is free software; you can redistribute it and/or modify it under
8 the terms of the GNU General Public License as published by the Free
9 Software Foundation; either version 2, or (at your option) any later
12 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
13 WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING. If not, write to the Free
19 Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA
24 #include "coretypes.h"
30 #include "basic-block.h"
31 #include "diagnostic.h"
32 #include "tree-flow.h"
33 #include "tree-dump.h"
39 #include "tree-data-ref.h"
40 #include "tree-chrec.h"
41 #include "tree-scalar-evolution.h"
42 #include "tree-vectorizer.h"
43 #include "langhooks.h"
44 #include "tree-pass.h"
48 /* Utility functions for the code transformation. */
/* Forward declarations of the static helpers defined later in this file.
   NOTE(review): the numeric prefix on each line is an artifact of this
   listing; the prefixes are non-contiguous, so some declarations may have
   been elided between the visible lines.  */
49 static bool vect_transform_stmt (tree, block_stmt_iterator *, bool *);
50 static tree vect_create_destination_var (tree, tree);
51 static tree vect_create_data_ref_ptr
52 (tree, block_stmt_iterator *, tree, tree *, tree *, bool);
53 static tree vect_create_addr_base_for_vector_ref (tree, tree *, tree);
54 static tree vect_setup_realignment (tree, block_stmt_iterator *, tree *);
55 static tree vect_get_new_vect_var (tree, enum vect_var_kind, const char *);
56 static tree vect_get_vec_def_for_operand (tree, tree, tree *);
57 static tree vect_init_vector (tree, tree);
58 static void vect_finish_stmt_generation
59 (tree stmt, tree vec_stmt, block_stmt_iterator *bsi);
60 static bool vect_is_simple_cond (tree, loop_vec_info);
61 static void update_vuses_to_preheader (tree, struct loop*);
62 static void vect_create_epilog_for_reduction (tree, tree, enum tree_code, tree);
63 static tree get_initial_def_for_reduction (tree, tree, tree *);
65 /* Utility functions dealing with loop peeling (not peeling itself). */
66 static void vect_generate_tmps_on_preheader
67 (loop_vec_info, tree *, tree *, tree *);
68 static tree vect_build_loop_niters (loop_vec_info);
69 static void vect_update_ivs_after_vectorizer (loop_vec_info, tree, edge);
70 static tree vect_gen_niters_for_prolog_loop (loop_vec_info, tree);
71 static void vect_update_init_of_dr (struct data_reference *, tree niters);
72 static void vect_update_inits_of_drs (loop_vec_info, tree);
73 static int vect_min_worthwhile_factor (enum tree_code);
76 /* Function vect_get_new_vect_var.
78 Returns a name for a new variable. The current naming scheme appends the
79 prefix "vect_" or "vect_p" (depending on the value of VAR_KIND) to
80 the name of vectorizer generated variables, and appends that to NAME if
/* NOTE(review): this listing is incomplete -- the original line numbers skip
   between the visible lines, so the prefix selection logic and the function's
   local declarations/return are elided here.  Do not treat the lines below
   as the full body.  */
84 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
97 case vect_pointer_var:
/* When a NAME is supplied, the new temporary is named "<prefix><name>";
   otherwise the bare prefix is used (grounded by the two calls below).  */
105 new_vect_var = create_tmp_var (type, concat (prefix, name, NULL));
107 new_vect_var = create_tmp_var (type, prefix);
113 /* Function vect_create_addr_base_for_vector_ref.
115 Create an expression that computes the address of the first memory location
116 that will be accessed for a data reference.
119 STMT: The statement containing the data reference.
120 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
121 OFFSET: Optional. If supplied, it is to be added to the initial address.
124 1. Return an SSA_NAME whose value is the address of the memory location of
125 the first vector of the data reference.
126 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
127 these statement(s) which define the returned SSA_NAME.
129 FORNOW: We are only handling array accesses with step 1. */
/* NOTE(review): original line numbers skip below -- several local
   declarations and statements are elided from this listing; the visible
   lines are not the complete function body.  */
132 vect_create_addr_base_for_vector_ref (tree stmt,
136 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
137 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
138 tree data_ref_base = unshare_expr (DR_BASE_ADDRESS (dr));
139 tree base_name = build_fold_indirect_ref (data_ref_base);
140 tree ref = DR_REF (dr);
141 tree scalar_type = TREE_TYPE (ref);
142 tree scalar_ptr_type = build_pointer_type (scalar_type);
145 tree addr_base, addr_expr;
147 tree base_offset = unshare_expr (DR_OFFSET (dr));
148 tree init = unshare_expr (DR_INIT (dr));
150 /* Create base_offset = DR_OFFSET + DR_INIT, gimplified into DEST.  */
151 base_offset = size_binop (PLUS_EXPR, base_offset, init);
152 dest = create_tmp_var (TREE_TYPE (base_offset), "base_off");
153 add_referenced_var (dest);
154 base_offset = force_gimple_operand (base_offset, &new_stmt, false, dest);
155 append_to_statement_list_force (new_stmt, new_stmt_list);
159 tree tmp = create_tmp_var (TREE_TYPE (base_offset), "offset");
162 /* For interleaved access step we divide STEP by the size of the
163 interleaving group. */
164 if (DR_GROUP_SIZE (stmt_info))
165 step = fold_build2 (TRUNC_DIV_EXPR, TREE_TYPE (offset), DR_STEP (dr),
166 build_int_cst (TREE_TYPE (offset),
167 DR_GROUP_SIZE (stmt_info)));
/* Fold OFFSET * STEP into BASE_OFFSET and gimplify the sum into TMP.  */
171 add_referenced_var (tmp);
172 offset = fold_build2 (MULT_EXPR, TREE_TYPE (offset), offset, step);
173 base_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (base_offset),
174 base_offset, offset);
175 base_offset = force_gimple_operand (base_offset, &new_stmt, false, tmp);
176 append_to_statement_list_force (new_stmt, new_stmt_list);
179 /* base + base_offset */
180 addr_base = fold_build2 (PLUS_EXPR, TREE_TYPE (data_ref_base), data_ref_base,
183 /* addr_expr = addr_base: build the MODIFY_EXPR, give its LHS a fresh
184 SSA name, and queue it on NEW_STMT_LIST for the caller to insert. */
184 addr_expr = vect_get_new_vect_var (scalar_ptr_type, vect_pointer_var,
185 get_name (base_name));
186 add_referenced_var (addr_expr);
187 vec_stmt = build2 (MODIFY_EXPR, void_type_node, addr_expr, addr_base);
188 new_temp = make_ssa_name (addr_expr, vec_stmt);
189 TREE_OPERAND (vec_stmt, 0) = new_temp;
190 append_to_statement_list_force (vec_stmt, new_stmt_list);
192 if (vect_print_dump_info (REPORT_DETAILS))
194 fprintf (vect_dump, "created ");
195 print_generic_expr (vect_dump, vec_stmt, TDF_SLIM);
201 /* Function vect_create_data_ref_ptr.
203 Create a new pointer to vector type (vp), that points to the first location
204 accessed in the loop by STMT, along with the def-use update chain to
205 appropriately advance the pointer through the loop iterations. Also set
206 aliasing information for the pointer. This vector pointer is used by the
207 callers to this function to create a memory reference expression for vector
211 1. STMT: a stmt that references memory. Expected to be of the form
212 MODIFY_EXPR <name, data-ref> or MODIFY_EXPR <data-ref, name>.
213 2. BSI: block_stmt_iterator where new stmts can be added.
214 3. OFFSET (optional): an offset to be added to the initial address accessed
215 by the data-ref in STMT.
216 4. ONLY_INIT: indicate if vp is to be updated in the loop, or remain
217 pointing to the initial address.
220 1. Declare a new ptr to vector_type, and have it point to the base of the
221 data reference (initial address accessed by the data reference).
222 For example, for vector of type V8HI, the following code is generated:
225 vp = (v8hi *)initial_address;
227 if OFFSET is not supplied:
228 initial_address = &a[init];
229 if OFFSET is supplied:
230 initial_address = &a[init + OFFSET];
232 Return the initial_address in INITIAL_ADDRESS.
234 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
235 update the pointer in each iteration of the loop.
237 Return the increment stmt that updates the pointer in PTR_INCR.
239 3. Return the pointer. */
/* NOTE(review): this listing is non-contiguous -- local declarations and
   some control-flow lines (braces, else-branches) are elided below.  */
242 vect_create_data_ref_ptr (tree stmt,
243 block_stmt_iterator *bsi ATTRIBUTE_UNUSED,
244 tree offset, tree *initial_address, tree *ptr_incr,
248 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
249 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
250 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
251 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
257 tree new_stmt_list = NULL_TREE;
258 edge pe = loop_preheader_edge (loop);
261 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
263 base_name = build_fold_indirect_ref (unshare_expr (DR_BASE_ADDRESS (dr)));
265 if (vect_print_dump_info (REPORT_DETAILS))
267 tree data_ref_base = base_name;
268 fprintf (vect_dump, "create vector-pointer variable to type: ");
269 print_generic_expr (vect_dump, vectype, TDF_SLIM);
270 if (TREE_CODE (data_ref_base) == VAR_DECL)
271 fprintf (vect_dump, " vectorizing a one dimensional array ref: ");
272 else if (TREE_CODE (data_ref_base) == ARRAY_REF)
273 fprintf (vect_dump, " vectorizing a multidimensional array ref: ");
274 else if (TREE_CODE (data_ref_base) == COMPONENT_REF)
275 fprintf (vect_dump, " vectorizing a record based array ref: ");
276 else if (TREE_CODE (data_ref_base) == SSA_NAME)
277 fprintf (vect_dump, " vectorizing a pointer ref: ");
278 print_generic_expr (vect_dump, base_name, TDF_SLIM);
281 /** (1) Create the new vector-pointer variable: **/
283 vect_ptr_type = build_pointer_type (vectype);
284 vect_ptr = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var,
285 get_name (base_name));
286 add_referenced_var (vect_ptr);
289 /** (2) Add aliasing information to the new vector-pointer:
290 (The points-to info (DR_PTR_INFO) may be defined later.) **/
292 tag = DR_MEMTAG (dr);
295 /* If tag is a variable (and NOT_A_TAG) then a new symbol memory
296 tag must be created with tag added to its may alias list. */
298 new_type_alias (vect_ptr, tag, DR_REF (dr));
300 var_ann (vect_ptr)->symbol_mem_tag = tag;
302 var_ann (vect_ptr)->subvars = DR_SUBVARS (dr);
304 /** (3) Calculate the initial address of the vector-pointer, and set
305 the vector-pointer to point to it before the loop: **/
307 /* Create: (&(base[init_val+offset]) in the loop preheader. */
308 new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
310 pe = loop_preheader_edge (loop);
311 new_bb = bsi_insert_on_edge_immediate (pe, new_stmt_list);
312 gcc_assert (!new_bb);
313 *initial_address = new_temp;
315 /* Create: p = (vectype *) initial_base */
316 vec_stmt = fold_convert (vect_ptr_type, new_temp);
317 vec_stmt = build2 (MODIFY_EXPR, void_type_node, vect_ptr, vec_stmt);
318 vect_ptr_init = make_ssa_name (vect_ptr, vec_stmt);
319 TREE_OPERAND (vec_stmt, 0) = vect_ptr_init;
320 new_bb = bsi_insert_on_edge_immediate (pe, vec_stmt);
321 gcc_assert (!new_bb);
324 /** (4) Handle the updating of the vector-pointer inside the loop: **/
326 if (only_init) /* No update in loop is required. */
328 /* Copy the points-to information if it exists. */
329 if (DR_PTR_INFO (dr))
330 duplicate_ssa_name_ptr_info (vect_ptr_init, DR_PTR_INFO (dr));
331 return vect_ptr_init;
335 block_stmt_iterator incr_bsi;
337 tree indx_before_incr, indx_after_incr;
/* Build the IV that advances the pointer by the vector size each
   iteration, and record vectorizer stmt-info on the increment stmt.  */
340 standard_iv_increment_position (loop, &incr_bsi, &insert_after);
341 create_iv (vect_ptr_init,
342 fold_convert (vect_ptr_type, TYPE_SIZE_UNIT (vectype)),
343 NULL_TREE, loop, &incr_bsi, insert_after,
344 &indx_before_incr, &indx_after_incr);
345 incr = bsi_stmt (incr_bsi);
346 set_stmt_info (stmt_ann (incr),
347 new_stmt_vec_info (incr, loop_vinfo));
349 /* Copy the points-to information if it exists. */
350 if (DR_PTR_INFO (dr))
352 duplicate_ssa_name_ptr_info (indx_before_incr, DR_PTR_INFO (dr));
353 duplicate_ssa_name_ptr_info (indx_after_incr, DR_PTR_INFO (dr));
355 merge_alias_info (vect_ptr_init, indx_before_incr);
356 merge_alias_info (vect_ptr_init, indx_after_incr);
360 return indx_before_incr;
365 /* Function bump_vector_ptr
367 Increment a pointer (to a vector type) by vector-size. Connect the new
368 increment stmt to the existing def-use update-chain of the pointer.
370 The pointer def-use update-chain before this function:
371 DATAREF_PTR = phi (p_0, p_2)
373 PTR_INCR: p_2 = DATAREF_PTR + step
375 The pointer def-use update-chain after this function:
376 DATAREF_PTR = phi (p_0, p_2)
378 NEW_DATAREF_PTR = DATAREF_PTR + vector_size
380 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
383 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
385 PTR_INCR - the stmt that updates the pointer in each iteration of the loop.
386 The increment amount across iterations is also expected to be
388 BSI - location where the new update stmt is to be placed.
389 STMT - the original scalar memory-access stmt that is being vectorized.
391 Output: Return NEW_DATAREF_PTR as illustrated above.
/* NOTE(review): line numbers skip in this listing; some declarations and
   braces are elided below.  */
396 bump_vector_ptr (tree dataref_ptr, tree ptr_incr, block_stmt_iterator *bsi,
399 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
400 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
401 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
402 tree vptr_type = TREE_TYPE (dataref_ptr);
403 tree ptr_var = SSA_NAME_VAR (dataref_ptr);
404 tree update = fold_convert (vptr_type, TYPE_SIZE_UNIT (vectype));
408 tree new_dataref_ptr;
/* Emit NEW_DATAREF_PTR = DATAREF_PTR + vector_size before *BSI.  */
410 incr_stmt = build2 (MODIFY_EXPR, void_type_node, ptr_var,
411 build2 (PLUS_EXPR, vptr_type, dataref_ptr, update));
412 new_dataref_ptr = make_ssa_name (ptr_var, incr_stmt);
413 TREE_OPERAND (incr_stmt, 0) = new_dataref_ptr;
414 vect_finish_stmt_generation (stmt, incr_stmt, bsi);
416 /* Update the vector-pointer's cross-iteration increment. */
417 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
419 tree use = USE_FROM_PTR (use_p);
421 if (use == dataref_ptr)
422 SET_USE (use_p, new_dataref_ptr);
424 gcc_assert (tree_int_cst_compare (use, update) == 0);
427 /* Copy the points-to information if it exists. */
428 if (DR_PTR_INFO (dr))
429 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
430 merge_alias_info (new_dataref_ptr, dataref_ptr);
432 return new_dataref_ptr;
436 /* Function vect_create_destination_var.
438 Create a new temporary of type VECTYPE. */
441 vect_create_destination_var (tree scalar_dest, tree vectype)
444 const char *new_name;
446 enum vect_var_kind kind;
448 kind = vectype ? vect_simple_var : vect_scalar_var;
449 type = vectype ? vectype : TREE_TYPE (scalar_dest);
451 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
453 new_name = get_name (scalar_dest);
456 vec_dest = vect_get_new_vect_var (type, vect_simple_var, new_name);
457 add_referenced_var (vec_dest);
463 /* Function vect_init_vector.
465 Insert a new stmt (INIT_STMT) that initializes a new vector variable with
466 the vector elements of VECTOR_VAR. Return the DEF of INIT_STMT. It will be
467 used in the vectorization of STMT. */
/* NOTE(review): listing is non-contiguous; some declarations and the
   function's return statement are elided below.  The init stmt is placed
   on the loop-preheader edge (see bsi_insert_on_edge_immediate).  */
470 vect_init_vector (tree stmt, tree vector_var)
472 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
473 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
474 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
477 tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
483 new_var = vect_get_new_vect_var (vectype, vect_simple_var, "cst_");
484 add_referenced_var (new_var);
/* Build new_var = vector_var as a MODIFY_EXPR and give it an SSA name.  */
486 init_stmt = build2 (MODIFY_EXPR, vectype, new_var, vector_var);
487 new_temp = make_ssa_name (new_var, init_stmt);
488 TREE_OPERAND (init_stmt, 0) = new_temp;
490 pe = loop_preheader_edge (loop);
491 new_bb = bsi_insert_on_edge_immediate (pe, init_stmt);
492 gcc_assert (!new_bb);
494 if (vect_print_dump_info (REPORT_DETAILS))
496 fprintf (vect_dump, "created new init_stmt: ");
497 print_generic_expr (vect_dump, init_stmt, TDF_SLIM);
500 vec_oprnd = TREE_OPERAND (init_stmt, 0);
505 /* Function vect_get_vec_def_for_operand.
507 OP is an operand in STMT. This function returns a (vector) def that will be
508 used in the vectorized stmt for STMT.
510 In the case that OP is an SSA_NAME which is defined in the loop, then
511 STMT_VINFO_VEC_STMT of the defining stmt holds the relevant def.
513 In case OP is an invariant or constant, a new stmt that creates a vector def
514 needs to be introduced. */
/* NOTE(review): this listing elides the switch statement header, braces and
   several declarations -- the case labels below belong to a switch on DT
   (the vect_def_type returned by vect_is_simple_use); do not assume the
   visible lines are complete.  */
517 vect_get_vec_def_for_operand (tree op, tree stmt, tree *scalar_def)
522 stmt_vec_info def_stmt_info = NULL;
523 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
524 tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
525 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
526 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
527 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
533 enum vect_def_type dt;
536 if (vect_print_dump_info (REPORT_DETAILS))
538 fprintf (vect_dump, "vect_get_vec_def_for_operand: ");
539 print_generic_expr (vect_dump, op, TDF_SLIM);
542 is_simple_use = vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt);
543 gcc_assert (is_simple_use);
544 if (vect_print_dump_info (REPORT_DETAILS))
548 fprintf (vect_dump, "def = ");
549 print_generic_expr (vect_dump, def, TDF_SLIM);
553 fprintf (vect_dump, " def_stmt = ");
554 print_generic_expr (vect_dump, def_stmt, TDF_SLIM);
560 /* Case 1: operand is a constant. */
561 case vect_constant_def:
566 /* Create 'vect_cst_ = {cst,cst,...,cst}' */
567 if (vect_print_dump_info (REPORT_DETAILS))
568 fprintf (vect_dump, "Create vector_cst. nunits = %d", nunits);
570 for (i = nunits - 1; i >= 0; --i)
572 t = tree_cons (NULL_TREE, op, t);
574 vec_cst = build_vector (vectype, t);
575 return vect_init_vector (stmt, vec_cst);
578 /* Case 2: operand is defined outside the loop - loop invariant. */
579 case vect_invariant_def:
584 /* Create 'vec_inv = {inv,inv,..,inv}' */
585 if (vect_print_dump_info (REPORT_DETAILS))
586 fprintf (vect_dump, "Create vector_inv.");
588 for (i = nunits - 1; i >= 0; --i)
590 t = tree_cons (NULL_TREE, def, t);
593 /* FIXME: use build_constructor directly. */
594 vec_inv = build_constructor_from_list (vectype, t);
595 return vect_init_vector (stmt, vec_inv);
598 /* Case 3: operand is defined inside the loop. */
602 *scalar_def = def_stmt;
604 /* Get the def from the vectorized stmt. */
605 def_stmt_info = vinfo_for_stmt (def_stmt);
606 vec_stmt = STMT_VINFO_VEC_STMT (def_stmt_info);
607 gcc_assert (vec_stmt);
608 vec_oprnd = TREE_OPERAND (vec_stmt, 0);
612 /* Case 4: operand is defined by a loop header phi - reduction */
613 case vect_reduction_def:
615 gcc_assert (TREE_CODE (def_stmt) == PHI_NODE);
617 /* Get the def before the loop */
618 op = PHI_ARG_DEF_FROM_EDGE (def_stmt, loop_preheader_edge (loop));
619 return get_initial_def_for_reduction (stmt, op, scalar_def);
622 /* Case 5: operand is defined by loop-header phi - induction. */
623 case vect_induction_def:
625 if (vect_print_dump_info (REPORT_DETAILS))
626 fprintf (vect_dump, "induction - unsupported.");
627 internal_error ("no support for induction"); /* FORNOW */
636 /* Function vect_get_vec_def_for_stmt_copy
638 Return a vector-def for an operand. This function is used when the
639 vectorized stmt to be created (by the caller to this function) is a "copy"
640 created in case the vectorized result cannot fit in one vector, and several
641 copies of the vector-stmt are required. In this case the vector-def is
642 retrieved from the vector stmt recorded in the STMT_VINFO_RELATED_STMT field
643 of the stmt that defines VEC_OPRND.
644 DT is the type of the vector def VEC_OPRND.
647 In case the vectorization factor (VF) is bigger than the number
648 of elements that can fit in a vectype (nunits), we have to generate
649 more than one vector stmt to vectorize the scalar stmt. This situation
650 arises when there are multiple data-types operated upon in the loop; the
651 smallest data-type determines the VF, and as a result, when vectorizing
652 stmts operating on wider types we need to create 'VF/nunits' "copies" of the
653 vector stmt (each computing a vector of 'nunits' results, and together
654 computing 'VF' results in each iteration). This function is called when
655 vectorizing such a stmt (e.g. vectorizing S2 in the illustration below, in
656 which VF=16 and nunits=4, so the number of copies required is 4):
658 scalar stmt: vectorized into: STMT_VINFO_RELATED_STMT
660 S1: x = load VS1.0: vx.0 = memref0 VS1.1
661 VS1.1: vx.1 = memref1 VS1.2
662 VS1.2: vx.2 = memref2 VS1.3
663 VS1.3: vx.3 = memref3
665 S2: z = x + ... VSnew.0: vz0 = vx.0 + ... VSnew.1
666 VSnew.1: vz1 = vx.1 + ... VSnew.2
667 VSnew.2: vz2 = vx.2 + ... VSnew.3
668 VSnew.3: vz3 = vx.3 + ...
670 The vectorization of S1 is explained in vectorizable_load.
671 The vectorization of S2:
672 To create the first vector-stmt out of the 4 copies - VSnew.0 -
673 the function 'vect_get_vec_def_for_operand' is called to
674 get the relevant vector-def for each operand of S2. For operand x it
675 returns the vector-def 'vx.0'.
677 To create the remaining copies of the vector-stmt (VSnew.j), this
678 function is called to get the relevant vector-def for each operand. It is
679 obtained from the respective VS1.j stmt, which is recorded in the
680 STMT_VINFO_RELATED_STMT field of the stmt that defines VEC_OPRND.
682 For example, to obtain the vector-def 'vx.1' in order to create the
683 vector stmt 'VSnew.1', this function is called with VEC_OPRND='vx.0'.
684 Given 'vx0' we obtain the stmt that defines it ('VS1.0'); from the
685 STMT_VINFO_RELATED_STMT field of 'VS1.0' we obtain the next copy - 'VS1.1',
686 and return its def ('vx.1').
687 Overall, to create the above sequence this function will be called 3 times:
688 vx.1 = vect_get_vec_def_for_stmt_copy (dt, vx.0);
689 vx.2 = vect_get_vec_def_for_stmt_copy (dt, vx.1);
690 vx.3 = vect_get_vec_def_for_stmt_copy (dt, vx.2); */
/* NOTE(review): listing is non-contiguous; braces, else-branch and the
   return statement are elided below.  */
693 vect_get_vec_def_for_stmt_copy (enum vect_def_type dt, tree vec_oprnd)
695 tree vec_stmt_for_operand;
696 stmt_vec_info def_stmt_info;
698 if (dt == vect_invariant_def || dt == vect_constant_def)
700 /* Do nothing; can reuse same def. */ ;
/* Otherwise: follow SSA_NAME_DEF_STMT -> STMT_VINFO_RELATED_STMT to the
   next copy of the defining vector stmt and take its LHS.  */
704 vec_stmt_for_operand = SSA_NAME_DEF_STMT (vec_oprnd);
705 def_stmt_info = vinfo_for_stmt (vec_stmt_for_operand);
706 gcc_assert (def_stmt_info);
707 vec_stmt_for_operand = STMT_VINFO_RELATED_STMT (def_stmt_info);
708 gcc_assert (vec_stmt_for_operand);
709 vec_oprnd = TREE_OPERAND (vec_stmt_for_operand, 0);
715 /* Function vect_finish_stmt_generation.
717 Insert a new stmt before *BSI, attach vectorizer stmt-info to it, and
copy STMT's source location onto it. */
720 vect_finish_stmt_generation (tree stmt, tree vec_stmt,
721 block_stmt_iterator *bsi)
723 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
724 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
726 bsi_insert_before (bsi, vec_stmt, BSI_SAME_STMT);
727 set_stmt_info (get_stmt_ann (vec_stmt),
728 new_stmt_vec_info (vec_stmt, loop_vinfo));
730 if (vect_print_dump_info (REPORT_DETAILS))
732 fprintf (vect_dump, "add new stmt: ");
733 print_generic_expr (vect_dump, vec_stmt, TDF_SLIM);
736 /* Make sure bsi points to the stmt that is being vectorized. */
737 gcc_assert (stmt == bsi_stmt (*bsi));
/* NOTE(review): the #else/#endif lines pairing with this #ifdef are elided
   from this listing (original line numbers skip here).  */
739 #ifdef USE_MAPPED_LOCATION
740 SET_EXPR_LOCATION (vec_stmt, EXPR_LOCATION (stmt));
742 SET_EXPR_LOCUS (vec_stmt, EXPR_LOCUS (stmt));
747 #define ADJUST_IN_EPILOG 1
749 /* Function get_initial_def_for_reduction
752 STMT - a stmt that performs a reduction operation in the loop.
753 INIT_VAL - the initial value of the reduction variable
756 SCALAR_DEF - a tree that holds a value to be added to the final result
757 of the reduction (used for "ADJUST_IN_EPILOG" - see below).
758 Return a vector variable, initialized according to the operation that STMT
759 performs. This vector will be used as the initial value of the
760 vector of partial results.
762 Option1 ("ADJUST_IN_EPILOG"): Initialize the vector as follows:
765 min/max: [init_val,init_val,..,init_val,init_val]
766 bit and/or: [init_val,init_val,..,init_val,init_val]
767 and when necessary (e.g. add/mult case) let the caller know
768 that it needs to adjust the result by init_val.
770 Option2: Initialize the vector as follows:
771 add: [0,0,...,0,init_val]
772 mult: [1,1,...,1,init_val]
773 min/max: [init_val,init_val,...,init_val]
774 bit and/or: [init_val,init_val,...,init_val]
775 and no adjustments are needed.
777 For example, for the following code:
783 STMT is 's = s + a[i]', and the reduction variable is 's'.
784 For a vector of 4 units, we want to return either [0,0,0,init_val],
785 or [0,0,0,0] and let the caller know that it needs to adjust
786 the result at the end by 'init_val'.
788 FORNOW: We use the "ADJUST_IN_EPILOG" scheme.
789 TODO: Use some cost-model to estimate which scheme is more profitable.
/* NOTE(review): this listing is non-contiguous -- the switch over the
   reduction CODE and several branches are elided; the visible lines are
   fragments of the larger function body.  */
793 get_initial_def_for_reduction (tree stmt, tree init_val, tree *scalar_def)
795 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
796 tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
797 int nunits = GET_MODE_NUNITS (TYPE_MODE (vectype));
799 enum tree_code code = TREE_CODE (TREE_OPERAND (stmt, 1));
800 tree type = TREE_TYPE (init_val);
802 tree vec, t = NULL_TREE;
803 bool need_epilog_adjust;
806 gcc_assert (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type));
/* The neutral element: integer 0 or floating-point 0.0.  */
813 if (INTEGRAL_TYPE_P (type))
814 def = build_int_cst (type, 0);
816 def = build_real (type, dconst0);
818 #ifdef ADJUST_IN_EPILOG
819 /* All the 'nunits' elements are set to 0. The final result will be
820 adjusted by 'init_val' at the loop epilog. */
822 need_epilog_adjust = true;
824 /* 'nunits - 1' elements are set to 0; The last element is set to
825 'init_val'. No further adjustments at the epilog are needed. */
826 nelements = nunits - 1;
827 need_epilog_adjust = false;
835 need_epilog_adjust = false;
/* Build the element list back-to-front, then append INIT_VAL as the
   last element when only nunits-1 neutral elements were emitted.  */
842 for (i = nelements - 1; i >= 0; --i)
843 t = tree_cons (NULL_TREE, def, t);
845 if (nelements == nunits - 1)
847 /* Set the last element of the vector. */
848 t = tree_cons (NULL_TREE, init_val, t);
851 gcc_assert (nelements == nunits);
853 if (TREE_CODE (init_val) == INTEGER_CST || TREE_CODE (init_val) == REAL_CST)
854 vec = build_vector (vectype, t);
856 vec = build_constructor_from_list (vectype, t);
858 if (!need_epilog_adjust)
859 *scalar_def = NULL_TREE;
861 *scalar_def = init_val;
863 return vect_init_vector (stmt, vec);
867 /* Function vect_create_epilog_for_reduction
869 Create code at the loop-epilog to finalize the result of a reduction
872 VECT_DEF is a vector of partial results.
873 REDUC_CODE is the tree-code for the epilog reduction.
874 STMT is the scalar reduction stmt that is being vectorized.
875 REDUCTION_PHI is the phi-node that carries the reduction computation.
878 1. Creates the reduction def-use cycle: sets the the arguments for
880 The loop-entry argument is the vectorized initial-value of the reduction.
881 The loop-latch argument is VECT_DEF - the vector of partial sums.
882 2. "Reduces" the vector of partial results VECT_DEF into a single result,
883 by applying the operation specified by REDUC_CODE if available, or by
884 other means (whole-vector shifts or a scalar loop).
885 The function also creates a new phi node at the loop exit to preserve
886 loop-closed form, as illustrated below.
888 The flow at the entry to this function:
891 vec_def = phi <null, null> # REDUCTION_PHI
892 VECT_DEF = vector_stmt # vectorized form of STMT
893 s_loop = scalar_stmt # (scalar) STMT
895 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
899 The above is transformed by this function into:
902 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
903 VECT_DEF = vector_stmt # vectorized form of STMT
904 s_loop = scalar_stmt # (scalar) STMT
906 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
907 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
908 v_out2 = reduce <v_out1>
909 s_out3 = extract_field <v_out2, 0>
910 s_out4 = adjust_result <s_out3>
916 vect_create_epilog_for_reduction (tree vect_def, tree stmt,
917 enum tree_code reduc_code, tree reduction_phi)
919 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
921 enum machine_mode mode;
922 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
923 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
928 block_stmt_iterator exit_bsi;
933 tree new_scalar_dest, exit_phi;
934 tree bitsize, bitpos, bytesize;
935 enum tree_code code = TREE_CODE (TREE_OPERAND (stmt, 1));
936 tree scalar_initial_def;
937 tree vec_initial_def;
939 imm_use_iterator imm_iter;
941 bool extract_scalar_result;
945 tree operation = TREE_OPERAND (stmt, 1);
948 op_type = TREE_CODE_LENGTH (TREE_CODE (operation));
949 reduction_op = TREE_OPERAND (operation, op_type-1);
950 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
951 mode = TYPE_MODE (vectype);
953 /*** 1. Create the reduction def-use cycle ***/
955 /* 1.1 set the loop-entry arg of the reduction-phi: */
956 /* For the case of reduction, vect_get_vec_def_for_operand returns
957 the scalar def before the loop, that defines the initial value
958 of the reduction variable. */
959 vec_initial_def = vect_get_vec_def_for_operand (reduction_op, stmt,
960 &scalar_initial_def);
961 add_phi_arg (reduction_phi, vec_initial_def, loop_preheader_edge (loop));
963 /* 1.2 set the loop-latch arg for the reduction-phi: */
964 add_phi_arg (reduction_phi, vect_def, loop_latch_edge (loop));
966 if (vect_print_dump_info (REPORT_DETAILS))
968 fprintf (vect_dump, "transform reduction: created def-use cycle:");
969 print_generic_expr (vect_dump, reduction_phi, TDF_SLIM);
970 fprintf (vect_dump, "\n");
971 print_generic_expr (vect_dump, SSA_NAME_DEF_STMT (vect_def), TDF_SLIM);
975 /*** 2. Create epilog code
976 The reduction epilog code operates across the elements of the vector
977 of partial results computed by the vectorized loop.
978 The reduction epilog code consists of:
979 step 1: compute the scalar result in a vector (v_out2)
980 step 2: extract the scalar result (s_out3) from the vector (v_out2)
981 step 3: adjust the scalar result (s_out3) if needed.
983 Step 1 can be accomplished using one the following three schemes:
984 (scheme 1) using reduc_code, if available.
985 (scheme 2) using whole-vector shifts, if available.
986 (scheme 3) using a scalar loop. In this case steps 1+2 above are
989 The overall epilog code looks like this:
991 s_out0 = phi <s_loop> # original EXIT_PHI
992 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
993 v_out2 = reduce <v_out1> # step 1
994 s_out3 = extract_field <v_out2, 0> # step 2
995 s_out4 = adjust_result <s_out3> # step 3
997 (step 3 is optional, and step2 1 and 2 may be combined).
998 Lastly, the uses of s_out0 are replaced by s_out4.
1002 /* 2.1 Create new loop-exit-phi to preserve loop-closed form:
1003 v_out1 = phi <v_loop> */
1005 exit_bb = single_exit (loop)->dest;
1006 new_phi = create_phi_node (SSA_NAME_VAR (vect_def), exit_bb);
1007 SET_PHI_ARG_DEF (new_phi, single_exit (loop)->dest_idx, vect_def);
1008 exit_bsi = bsi_start (exit_bb);
1010 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
1011 (i.e. when reduc_code is not available) and in the final adjustment code
1012 (if needed). Also get the original scalar reduction variable as
1013 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
1014 represents a reduction pattern), the tree-code and scalar-def are
1015 taken from the original stmt that the pattern-stmt (STMT) replaces.
1016 Otherwise (it is a regular reduction) - the tree-code and scalar-def
1017 are taken from STMT. */
1019 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
1022 /* Regular reduction */
1027 /* Reduction pattern */
1028 stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
1029 gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
1030 gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
1032 code = TREE_CODE (TREE_OPERAND (orig_stmt, 1));
1033 scalar_dest = TREE_OPERAND (orig_stmt, 0);
1034 scalar_type = TREE_TYPE (scalar_dest);
1035 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
1036 bitsize = TYPE_SIZE (scalar_type);
1037 bytesize = TYPE_SIZE_UNIT (scalar_type);
1039 /* 2.3 Create the reduction code, using one of the three schemes described
1042 if (reduc_code < NUM_TREE_CODES)
1044 /*** Case 1: Create:
1045 v_out2 = reduc_expr <v_out1> */
1047 if (vect_print_dump_info (REPORT_DETAILS))
1048 fprintf (vect_dump, "Reduce using direct vector reduction.");
1050 vec_dest = vect_create_destination_var (scalar_dest, vectype);
1051 epilog_stmt = build2 (MODIFY_EXPR, vectype, vec_dest,
1052 build1 (reduc_code, vectype, PHI_RESULT (new_phi)));
1053 new_temp = make_ssa_name (vec_dest, epilog_stmt);
1054 TREE_OPERAND (epilog_stmt, 0) = new_temp;
1055 bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT);
1057 extract_scalar_result = true;
1061 enum tree_code shift_code = 0;
1062 bool have_whole_vector_shift = true;
1064 int element_bitsize = tree_low_cst (bitsize, 1);
1065 int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
1068 if (vec_shr_optab->handlers[mode].insn_code != CODE_FOR_nothing)
1069 shift_code = VEC_RSHIFT_EXPR;
1071 have_whole_vector_shift = false;
1073 /* Regardless of whether we have a whole vector shift, if we're
1074 emulating the operation via tree-vect-generic, we don't want
1075 to use it. Only the first round of the reduction is likely
1076 to still be profitable via emulation. */
1077 /* ??? It might be better to emit a reduction tree code here, so that
1078 tree-vect-generic can expand the first round via bit tricks. */
1079 if (!VECTOR_MODE_P (mode))
1080 have_whole_vector_shift = false;
1083 optab optab = optab_for_tree_code (code, vectype);
1084 if (optab->handlers[mode].insn_code == CODE_FOR_nothing)
1085 have_whole_vector_shift = false;
1088 if (have_whole_vector_shift)
1090 /*** Case 2: Create:
1091 for (offset = VS/2; offset >= element_size; offset/=2)
1093 Create: va' = vec_shift <va, offset>
1094 Create: va = vop <va, va'>
1097 if (vect_print_dump_info (REPORT_DETAILS))
1098 fprintf (vect_dump, "Reduce using vector shifts");
1100 vec_dest = vect_create_destination_var (scalar_dest, vectype);
1101 new_temp = PHI_RESULT (new_phi);
1103 for (bit_offset = vec_size_in_bits/2;
1104 bit_offset >= element_bitsize;
1107 tree bitpos = size_int (bit_offset);
1109 epilog_stmt = build2 (MODIFY_EXPR, vectype, vec_dest,
1110 build2 (shift_code, vectype,
1112 new_name = make_ssa_name (vec_dest, epilog_stmt);
1113 TREE_OPERAND (epilog_stmt, 0) = new_name;
1114 bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT);
1116 epilog_stmt = build2 (MODIFY_EXPR, vectype, vec_dest,
1117 build2 (code, vectype,
1118 new_name, new_temp));
1119 new_temp = make_ssa_name (vec_dest, epilog_stmt);
1120 TREE_OPERAND (epilog_stmt, 0) = new_temp;
1121 bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT);
1124 extract_scalar_result = true;
1130 /*** Case 3: Create:
1131 s = extract_field <v_out2, 0>
1132 for (offset = element_size;
1133 offset < vector_size;
1134 offset += element_size;)
1136 Create: s' = extract_field <v_out2, offset>
1137 Create: s = op <s, s'>
1140 if (vect_print_dump_info (REPORT_DETAILS))
1141 fprintf (vect_dump, "Reduce using scalar code. ");
1143 vec_temp = PHI_RESULT (new_phi);
1144 vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
1145 rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
1147 BIT_FIELD_REF_UNSIGNED (rhs) = TYPE_UNSIGNED (scalar_type);
1148 epilog_stmt = build2 (MODIFY_EXPR, scalar_type, new_scalar_dest, rhs);
1149 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
1150 TREE_OPERAND (epilog_stmt, 0) = new_temp;
1151 bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT);
1153 for (bit_offset = element_bitsize;
1154 bit_offset < vec_size_in_bits;
1155 bit_offset += element_bitsize)
1157 tree bitpos = bitsize_int (bit_offset);
1158 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
1161 BIT_FIELD_REF_UNSIGNED (rhs) = TYPE_UNSIGNED (scalar_type);
1162 epilog_stmt = build2 (MODIFY_EXPR, scalar_type, new_scalar_dest,
1164 new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
1165 TREE_OPERAND (epilog_stmt, 0) = new_name;
1166 bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT);
1168 epilog_stmt = build2 (MODIFY_EXPR, scalar_type, new_scalar_dest,
1169 build2 (code, scalar_type, new_name, new_temp));
1170 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
1171 TREE_OPERAND (epilog_stmt, 0) = new_temp;
1172 bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT);
1175 extract_scalar_result = false;
1179 /* 2.4 Extract the final scalar result. Create:
1180 s_out3 = extract_field <v_out2, bitpos> */
1182 if (extract_scalar_result)
1186 if (vect_print_dump_info (REPORT_DETAILS))
1187 fprintf (vect_dump, "extract scalar result");
1189 if (BYTES_BIG_ENDIAN)
1190 bitpos = size_binop (MULT_EXPR,
1191 bitsize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1),
1192 TYPE_SIZE (scalar_type));
1194 bitpos = bitsize_zero_node;
1196 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp, bitsize, bitpos);
1197 BIT_FIELD_REF_UNSIGNED (rhs) = TYPE_UNSIGNED (scalar_type);
1198 epilog_stmt = build2 (MODIFY_EXPR, scalar_type, new_scalar_dest, rhs);
1199 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
1200 TREE_OPERAND (epilog_stmt, 0) = new_temp;
1201 bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT);
1204 /* 2.4 Adjust the final result by the initial value of the reduction
1205 variable. (When such adjustment is not needed, then
1206 'scalar_initial_def' is zero).
1209 s_out4 = scalar_expr <s_out3, scalar_initial_def> */
1211 if (scalar_initial_def)
1213 epilog_stmt = build2 (MODIFY_EXPR, scalar_type, new_scalar_dest,
1214 build2 (code, scalar_type, new_temp, scalar_initial_def));
1215 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
1216 TREE_OPERAND (epilog_stmt, 0) = new_temp;
1217 bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT);
1220 /* 2.6 Replace uses of s_out0 with uses of s_out3 */
1222 /* Find the loop-closed-use at the loop exit of the original scalar result.
1223 (The reduction result is expected to have two immediate uses - one at the
1224 latch block, and one at the loop exit). */
1226 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
1228 if (!flow_bb_inside_loop_p (loop, bb_for_stmt (USE_STMT (use_p))))
1230 exit_phi = USE_STMT (use_p);
1234 /* We expect to have found an exit_phi because of loop-closed-ssa form. */
1235 gcc_assert (exit_phi);
1236 /* Replace the uses: */
1237 orig_name = PHI_RESULT (exit_phi);
1238 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
1239 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
1240 SET_USE (use_p, new_temp);
1244 /* Function vectorizable_reduction.
1246 Check if STMT performs a reduction operation that can be vectorized.
1247 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
1248 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
1249 Return FALSE if not a vectorizable STMT, TRUE otherwise.
1251 This function also handles reduction idioms (patterns) that have been
1252 recognized in advance during vect_pattern_recog. In this case, STMT may be
1254 X = pattern_expr (arg0, arg1, ..., X)
1255 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
1256 sequence that had been detected and replaced by the pattern-stmt (STMT).
1258 In some cases of reduction patterns, the type of the reduction variable X is
1259 different than the type of the other arguments of STMT.
1260 In such cases, the vectype that is used when transforming STMT into a vector
1261 stmt is different than the vectype that is used to determine the
1262 vectorization factor, because it consists of a different number of elements
1263 than the actual number of elements that are being operated upon in parallel.
1265 For example, consider an accumulation of shorts into an int accumulator.
1266 On some targets it's possible to vectorize this pattern operating on 8
1267 shorts at a time (hence, the vectype for purposes of determining the
1268 vectorization factor should be V8HI); on the other hand, the vectype that
1269 is used to create the vector form is actually V4SI (the type of the result).
1271 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
1272 indicates what is the actual level of parallelism (V8HI in the example), so
1273 that the right vectorization factor would be derived. This vectype
1274 corresponds to the type of arguments to the reduction stmt, and should *NOT*
1275 be used to create the vectorized stmt. The right vectype for the vectorized
1276 stmt is obtained from the type of the result X:
1277 get_vectype_for_scalar_type (TREE_TYPE (X))
1279 This means that, contrary to "regular" reductions (or "regular" stmts in
1280 general), the following equation:
1281 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
1282 does *NOT* necessarily hold for reduction patterns. */
1285 vectorizable_reduction (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
/* Analyze a reduction STMT and, when VEC_STMT is non-null, emit the
   vectorized loop stmt(s) plus the epilog reduction.
   NOTE(review): blank lines, braces, and some 'else'/'return' lines of the
   original file appear to have been dropped by the extraction; the comments
   below describe only what the surviving statements show.  */
1290 tree loop_vec_def0 = NULL_TREE, loop_vec_def1 = NULL_TREE;
1291 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1292 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1293 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1294 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1296 enum tree_code code, orig_code, epilog_reduc_code = 0;
1297 enum machine_mode vec_mode;
1299 optab optab, reduc_optab;
1300 tree new_temp = NULL_TREE;
1302 enum vect_def_type dt;
1307 stmt_vec_info orig_stmt_info;
1308 tree expr = NULL_TREE;
1310 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
/* ncopies: how many copies of the vector stmt are needed when the
   vectorization factor exceeds the number of elements per vector.  */
1311 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
1312 stmt_vec_info prev_stmt_info;
1314 tree new_stmt = NULL_TREE;
1317 gcc_assert (ncopies >= 1);
1319 /* 1. Is vectorizable reduction? */
1321 /* Not supportable if the reduction variable is used in the loop. */
1322 if (STMT_VINFO_RELEVANT_P (stmt_info))
1325 if (!STMT_VINFO_LIVE_P (stmt_info))
1328 /* Make sure it was already recognized as a reduction computation. */
1329 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def)
1332 /* 2. Has this been recognized as a reduction pattern?
1334 Check if STMT represents a pattern that has been recognized
1335 in earlier analysis stages. For stmts that represent a pattern,
1336 the STMT_VINFO_RELATED_STMT field records the last stmt in
1337 the original sequence that constitutes the pattern. */
1339 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
/* When ORIG_STMT exists, the asserts below sanity-check the two-way link
   between the pattern stmt and the original scalar stmt.  */
1342 orig_stmt_info = vinfo_for_stmt (orig_stmt);
1343 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt);
1344 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
1345 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
1348 /* 3. Check the operands of the operation. The first operands are defined
1349 inside the loop body. The last operand is the reduction variable,
1350 which is defined by the loop-header-phi. */
1352 gcc_assert (TREE_CODE (stmt) == MODIFY_EXPR);
1354 operation = TREE_OPERAND (stmt, 1);
1355 code = TREE_CODE (operation);
1356 op_type = TREE_CODE_LENGTH (code);
1357 if (op_type != binary_op && op_type != ternary_op)
1359 scalar_dest = TREE_OPERAND (stmt, 0);
1360 scalar_type = TREE_TYPE (scalar_dest);
1362 /* All uses but the last are expected to be defined in the loop.
1363 The last use is the reduction variable. */
1364 for (i = 0; i < op_type-1; i++)
1366 op = TREE_OPERAND (operation, i);
1367 is_simple_use = vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt);
1368 gcc_assert (is_simple_use);
1369 gcc_assert (dt == vect_loop_def || dt == vect_invariant_def ||
1370 dt == vect_constant_def);
/* The last operand: must be the reduction variable, defined by a PHI.  */
1373 op = TREE_OPERAND (operation, i);
1374 is_simple_use = vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt);
1375 gcc_assert (is_simple_use);
1376 gcc_assert (dt == vect_reduction_def);
1377 gcc_assert (TREE_CODE (def_stmt) == PHI_NODE);
1379 gcc_assert (orig_stmt == vect_is_simple_reduction (loop, def_stmt));
1381 gcc_assert (stmt == vect_is_simple_reduction (loop, def_stmt));
1383 if (STMT_VINFO_LIVE_P (vinfo_for_stmt (def_stmt)))
1386 /* 4. Supportable by target? */
1388 /* 4.1. check support for the operation in the loop */
1389 optab = optab_for_tree_code (code, vectype);
1392 if (vect_print_dump_info (REPORT_DETAILS))
1393 fprintf (vect_dump, "no optab.");
1396 vec_mode = TYPE_MODE (vectype);
1397 if (optab->handlers[(int) vec_mode].insn_code == CODE_FOR_nothing)
1399 if (vect_print_dump_info (REPORT_DETAILS))
1400 fprintf (vect_dump, "op not supported by target.");
/* No direct insn: word-mode emulation is acceptable only when the mode is
   word-sized and the VF clears vect_min_worthwhile_factor.  */
1401 if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
1402 || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
1403 < vect_min_worthwhile_factor (code))
1405 if (vect_print_dump_info (REPORT_DETAILS))
1406 fprintf (vect_dump, "proceeding using word mode.");
1409 /* Worthwhile without SIMD support? */
1410 if (!VECTOR_MODE_P (TYPE_MODE (vectype))
1411 && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
1412 < vect_min_worthwhile_factor (code))
1414 if (vect_print_dump_info (REPORT_DETAILS))
1415 fprintf (vect_dump, "not worthwhile without SIMD support.");
1419 /* 4.2. Check support for the epilog operation.
1421 If STMT represents a reduction pattern, then the type of the
1422 reduction variable may be different than the type of the rest
1423 of the arguments. For example, consider the case of accumulation
1424 of shorts into an int accumulator; The original code:
1425 S1: int_a = (int) short_a;
1426 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
1429 STMT: int_acc = widen_sum <short_a, int_acc>
1432 1. The tree-code that is used to create the vector operation in the
1433 epilog code (that reduces the partial results) is not the
1434 tree-code of STMT, but is rather the tree-code of the original
1435 stmt from the pattern that STMT is replacing. I.e, in the example
1436 above we want to use 'widen_sum' in the loop, but 'plus' in the
1438 2. The type (mode) we use to check available target support
1439 for the vector operation to be created in the *epilog*, is
1440 determined by the type of the reduction variable (in the example
1441 above we'd check this: plus_optab[vect_int_mode]).
1442 However the type (mode) we use to check available target support
1443 for the vector operation to be created *inside the loop*, is
1444 determined by the type of the other arguments to STMT (in the
1445 example we'd check this: widen_sum_optab[vect_short_mode]).
1447 This is contrary to "regular" reductions, in which the types of all
1448 the arguments are the same as the type of the reduction variable.
1449 For "regular" reductions we can therefore use the same vector type
1450 (and also the same tree-code) when generating the epilog code and
1451 when generating the code inside the loop. */
1455 /* This is a reduction pattern: get the vectype from the type of the
1456 reduction variable, and get the tree-code from orig_stmt. */
1457 orig_code = TREE_CODE (TREE_OPERAND (orig_stmt, 1));
1458 vectype = get_vectype_for_scalar_type (TREE_TYPE (def));
1459 vec_mode = TYPE_MODE (vectype);
1463 /* Regular reduction: use the same vectype and tree-code as used for
1464 the vector code inside the loop can be used for the epilog code. */
/* Map the scalar tree-code to a direct vector-reduction code.  On any
   failure below, epilog_reduc_code is set to the NUM_TREE_CODES sentinel,
   which the epilog generator treats as "no direct reduction available".  */
1468 if (!reduction_code_for_scalar_code (orig_code, &epilog_reduc_code))
1470 reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype);
1473 if (vect_print_dump_info (REPORT_DETAILS))
1474 fprintf (vect_dump, "no optab for reduction.");
1475 epilog_reduc_code = NUM_TREE_CODES;
1477 if (reduc_optab->handlers[(int) vec_mode].insn_code == CODE_FOR_nothing)
1479 if (vect_print_dump_info (REPORT_DETAILS))
1480 fprintf (vect_dump, "reduc op not supported by target.");
1481 epilog_reduc_code = NUM_TREE_CODES;
1484 if (!vec_stmt) /* transformation not required. */
1486 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
/* ** Transform ** */
1492 if (vect_print_dump_info (REPORT_DETAILS))
1493 fprintf (vect_dump, "transform reduction.");
1495 /* Create the destination vector */
1496 vec_dest = vect_create_destination_var (scalar_dest, vectype);
1498 /* Create the reduction-phi that defines the reduction-operand. */
1499 new_phi = create_phi_node (vec_dest, loop->header);
1501 /* In case the vectorization factor (VF) is bigger than the number
1502 of elements that we can fit in a vectype (nunits), we have to generate
1503 more than one vector stmt - i.e - we need to "unroll" the
1504 vector stmt by a factor VF/nunits. For more details see documentation
1505 in vectorizable_operation. */
1507 prev_stmt_info = NULL;
1508 for (j = 0; j < ncopies; j++)
/* First iteration (j == 0): fetch vector defs for the loop operands.  */
1513 op = TREE_OPERAND (operation, 0);
1514 loop_vec_def0 = vect_get_vec_def_for_operand (op, stmt, NULL);
1515 if (op_type == ternary_op)
1517 op = TREE_OPERAND (operation, 1);
1518 loop_vec_def1 = vect_get_vec_def_for_operand (op, stmt, NULL);
1521 /* Get the vector def for the reduction variable from the phi node */
1522 reduc_def = PHI_RESULT (new_phi);
/* Subsequent iterations (j > 0): chain from the previous copy.  */
1526 enum vect_def_type dt = vect_unknown_def_type; /* Dummy */
1527 loop_vec_def0 = vect_get_vec_def_for_stmt_copy (dt, loop_vec_def0);
1528 if (op_type == ternary_op)
1529 loop_vec_def1 = vect_get_vec_def_for_stmt_copy (dt, loop_vec_def1);
1531 /* Get the vector def for the reduction variable from the vectorized
1532 reduction operation generated in the previous iteration (j-1) */
1533 reduc_def = TREE_OPERAND (new_stmt ,0);
1536 /* Arguments are ready. Create the new vector stmt. */
1538 if (op_type == binary_op)
1539 expr = build2 (code, vectype, loop_vec_def0, reduc_def);
1541 expr = build3 (code, vectype, loop_vec_def0, loop_vec_def1,
1543 new_stmt = build2 (MODIFY_EXPR, void_type_node, vec_dest, expr);
1544 new_temp = make_ssa_name (vec_dest, new_stmt);
1545 TREE_OPERAND (new_stmt, 0) = new_temp;
1546 vect_finish_stmt_generation (stmt, new_stmt, bsi);
/* Record the first copy in VEC_STMT; chain later copies via RELATED_STMT.  */
1549 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
1551 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
1552 prev_stmt_info = vinfo_for_stmt (new_stmt);
1555 /* Finalize the reduction-phi (set its arguments) and create the
1556 epilog reduction code. */
1557 vect_create_epilog_for_reduction (new_temp, stmt, epilog_reduc_code, new_phi);
1562 /* Function vectorizable_assignment.
1564 Check if STMT performs an assignment (copy) that can be vectorized.
1565 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
1566 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
1567 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
1570 vectorizable_assignment (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
/* Analyze (and, when VEC_STMT is non-null, transform) a scalar copy STMT
   into a single vector copy.
   NOTE(review): brace/else/return lines appear to have been dropped by the
   extraction; see the FORNOW bail-out note below.  */
1576 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1577 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1578 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1581 enum vect_def_type dt;
1582 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
1583 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
1585 gcc_assert (ncopies >= 1);
/* Early bail-out marked FORNOW.  NOTE(review): the guarding condition line
   (presumably "if (ncopies > 1)") is not visible in this extraction —
   confirm against the original file before relying on this path.  */
1587 return false; /* FORNOW */
1589 /* Is vectorizable assignment? */
1590 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1593 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_loop_def);
1595 if (TREE_CODE (stmt) != MODIFY_EXPR)
1598 scalar_dest = TREE_OPERAND (stmt, 0);
1599 if (TREE_CODE (scalar_dest) != SSA_NAME)
1602 op = TREE_OPERAND (stmt, 1);
1603 if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt))
1605 if (vect_print_dump_info (REPORT_DETAILS))
1606 fprintf (vect_dump, "use not simple.");
1610 if (!vec_stmt) /* transformation not required. */
1612 STMT_VINFO_TYPE (stmt_info) = assignment_vec_info_type;
/* ** Transform ** */
1617 if (vect_print_dump_info (REPORT_DETAILS))
1618 fprintf (vect_dump, "transform assignment.");
/* Handle def.  */
1621 vec_dest = vect_create_destination_var (scalar_dest, vectype);
/* Handle use.  */
1624 op = TREE_OPERAND (stmt, 1);
1625 vec_oprnd = vect_get_vec_def_for_operand (op, stmt, NULL);
1627 /* Arguments are ready. Create the new vector stmt. */
1628 *vec_stmt = build2 (MODIFY_EXPR, vectype, vec_dest, vec_oprnd);
1629 new_temp = make_ssa_name (vec_dest, *vec_stmt);
1630 TREE_OPERAND (*vec_stmt, 0) = new_temp;
1631 vect_finish_stmt_generation (stmt, *vec_stmt, bsi);
1637 /* Function vect_min_worthwhile_factor.
1639 For a loop where we could vectorize the operation indicated by CODE,
1640 return the minimum vectorization factor that makes it worthwhile
1641 to use generic vectors. */
1643 vect_min_worthwhile_factor (enum tree_code code)
/* NOTE(review): the function body (original lines 1644-1662) is not visible
   in this extraction — only the signature survives.  Per the header comment
   above, it returns the minimum vectorization factor at which emulating
   CODE with generic vectors is worthwhile; confirm against the original.  */
1664 /* Function vectorizable_operation.
1666 Check if STMT performs a binary or unary operation that can be vectorized.
1667 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
1668 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
1669 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
1672 vectorizable_operation (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
/* Analyze (and, when VEC_STMT is non-null, transform) a unary/binary
   operation STMT into ncopies vector stmts.
   NOTE(review): blank/brace/else/return lines appear to have been dropped by
   the extraction; comments below describe only the surviving statements.  */
1677 tree op0, op1 = NULL;
1678 tree vec_oprnd0 = NULL_TREE, vec_oprnd1 = NULL_TREE;
1679 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1680 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1681 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1682 enum tree_code code;
1683 enum machine_mode vec_mode;
1688 enum machine_mode optab_op2_mode;
1690 enum vect_def_type dt0, dt1;
1692 stmt_vec_info prev_stmt_info;
1693 int nunits_in = TYPE_VECTOR_SUBPARTS (vectype);
1696 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in;
1699 gcc_assert (ncopies >= 1);
1701 /* Is STMT a vectorizable binary/unary operation? */
1702 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1705 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_loop_def);
1707 if (STMT_VINFO_LIVE_P (stmt_info))
1709 /* FORNOW: not yet supported. */
1710 if (vect_print_dump_info (REPORT_DETAILS))
1711 fprintf (vect_dump, "value used after loop.");
1715 if (TREE_CODE (stmt) != MODIFY_EXPR)
1718 if (TREE_CODE (TREE_OPERAND (stmt, 0)) != SSA_NAME)
/* Require input and output vectors to have the same number of elements.  */
1721 scalar_dest = TREE_OPERAND (stmt, 0);
1722 vectype_out = get_vectype_for_scalar_type (TREE_TYPE (scalar_dest));
1723 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
1724 if (nunits_out != nunits_in)
1727 operation = TREE_OPERAND (stmt, 1);
1728 code = TREE_CODE (operation);
1729 optab = optab_for_tree_code (code, vectype);
1731 /* Support only unary or binary operations. */
1732 op_type = TREE_CODE_LENGTH (code);
1733 if (op_type != unary_op && op_type != binary_op)
1735 if (vect_print_dump_info (REPORT_DETAILS))
1736 fprintf (vect_dump, "num. args = %d (not unary/binary op).", op_type);
1740 op0 = TREE_OPERAND (operation, 0);
1741 if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt0))
1743 if (vect_print_dump_info (REPORT_DETAILS))
1744 fprintf (vect_dump, "use not simple.");
1748 if (op_type == binary_op)
1750 op1 = TREE_OPERAND (operation, 1);
1751 if (!vect_is_simple_use (op1, loop_vinfo, &def_stmt, &def, &dt1))
1753 if (vect_print_dump_info (REPORT_DETAILS))
1754 fprintf (vect_dump, "use not simple.");
1759 /* Supportable by target? */
1762 if (vect_print_dump_info (REPORT_DETAILS))
1763 fprintf (vect_dump, "no optab.");
/* ICODE is saved here and reused below to query the shift-operand mode.  */
1766 vec_mode = TYPE_MODE (vectype);
1767 icode = (int) optab->handlers[(int) vec_mode].insn_code;
1768 if (icode == CODE_FOR_nothing)
1770 if (vect_print_dump_info (REPORT_DETAILS))
1771 fprintf (vect_dump, "op not supported by target.");
/* No direct insn: word-mode emulation is acceptable only when the mode is
   word-sized and the VF clears vect_min_worthwhile_factor.  */
1772 if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
1773 || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
1774 < vect_min_worthwhile_factor (code))
1776 if (vect_print_dump_info (REPORT_DETAILS))
1777 fprintf (vect_dump, "proceeding using word mode.");
1780 /* Worthwhile without SIMD support? */
1781 if (!VECTOR_MODE_P (TYPE_MODE (vectype))
1782 && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
1783 < vect_min_worthwhile_factor (code))
1785 if (vect_print_dump_info (REPORT_DETAILS))
1786 fprintf (vect_dump, "not worthwhile without SIMD support.");
1790 if (code == LSHIFT_EXPR || code == RSHIFT_EXPR)
1792 /* FORNOW: not yet supported. */
1793 if (!VECTOR_MODE_P (vec_mode))
1796 /* Invariant argument is needed for a vector shift
1797 by a scalar shift operand. */
1798 optab_op2_mode = insn_data[icode].operand[2].mode;
1799 if (! (VECTOR_MODE_P (optab_op2_mode)
1800 || dt1 == vect_constant_def
1801 || dt1 == vect_invariant_def))
1803 if (vect_print_dump_info (REPORT_DETAILS))
1804 fprintf (vect_dump, "operand mode requires invariant argument.");
1809 if (!vec_stmt) /* transformation not required. */
1811 STMT_VINFO_TYPE (stmt_info) = op_vec_info_type;
/* ** Transform ** */
1817 if (vect_print_dump_info (REPORT_DETAILS))
1818 fprintf (vect_dump, "transform binary/unary operation.");
/* Handle def.  */
1821 vec_dest = vect_create_destination_var (scalar_dest, vectype);
1823 /* In case the vectorization factor (VF) is bigger than the number
1824 of elements that we can fit in a vectype (nunits), we have to generate
1825 more than one vector stmt - i.e - we need to "unroll" the
1826 vector stmt by a factor VF/nunits. In doing so, we record a pointer
1827 from one copy of the vector stmt to the next, in the field
1828 STMT_VINFO_RELATED_STMT. This is necessary in order to allow following
1829 stages to find the correct vector defs to be used when vectorizing
1830 stmts that use the defs of the current stmt. The example below illustrates
1831 the vectorization process when VF=16 and nunits=4 (i.e - we need to create
1832 4 vectorized stmts):
1834 before vectorization:
1835 RELATED_STMT VEC_STMT
1839 step 1: vectorize stmt S1 (done in vectorizable_load. See more details
1841 RELATED_STMT VEC_STMT
1842 VS1_0: vx0 = memref0 VS1_1 -
1843 VS1_1: vx1 = memref1 VS1_2 -
1844 VS1_2: vx2 = memref2 VS1_3 -
1845 VS1_3: vx3 = memref3 - -
1846 S1: x = load - VS1_0
1849 step2: vectorize stmt S2 (done here):
1850 To vectorize stmt S2 we first need to find the relevant vector
1851 def for the first operand 'x'. This is, as usual, obtained from
1852 the vector stmt recorded in the STMT_VINFO_VEC_STMT of the stmt
1853 that defines 'x' (S1). This way we find the stmt VS1_0, and the
1854 relevant vector def 'vx0'. Having found 'vx0' we can generate
1855 the vector stmt VS2_0, and as usual, record it in the
1856 STMT_VINFO_VEC_STMT of stmt S2.
1857 When creating the second copy (VS2_1), we obtain the relevant vector
1858 def from the vector stmt recorded in the STMT_VINFO_RELATED_STMT of
1859 stmt VS1_0. This way we find the stmt VS1_1 and the relevant
1860 vector def 'vx1'. Using 'vx1' we create stmt VS2_1 and record a
1861 pointer to it in the STMT_VINFO_RELATED_STMT of the vector stmt VS2_0.
1862 Similarly when creating stmts VS2_2 and VS2_3. This is the resulting
1863 chain of stmts and pointers:
1864 RELATED_STMT VEC_STMT
1865 VS1_0: vx0 = memref0 VS1_1 -
1866 VS1_1: vx1 = memref1 VS1_2 -
1867 VS1_2: vx2 = memref2 VS1_3 -
1868 VS1_3: vx3 = memref3 - -
1869 S1: x = load - VS1_0
1870 VS2_0: vz0 = vx0 + v1 VS2_1 -
1871 VS2_1: vz1 = vx1 + v1 VS2_2 -
1872 VS2_2: vz2 = vx2 + v1 VS2_3 -
1873 VS2_3: vz3 = vx3 + v1 - -
1874 S2: z = x + 1 - VS2_0 */
1876 prev_stmt_info = NULL;
1877 for (j = 0; j < ncopies; j++)
/* First iteration (j == 0): fetch vector defs for the operands.  */
1882 vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
1883 if (op_type == binary_op)
1885 if (code == LSHIFT_EXPR || code == RSHIFT_EXPR)
1887 /* Vector shl and shr insn patterns can be defined with
1888 scalar operand 2 (shift operand). In this case, use
1889 constant or loop invariant op1 directly, without
1890 extending it to vector mode first. */
1891 optab_op2_mode = insn_data[icode].operand[2].mode;
1892 if (!VECTOR_MODE_P (optab_op2_mode))
1894 if (vect_print_dump_info (REPORT_DETAILS))
1895 fprintf (vect_dump, "operand 1 using scalar mode.");
1900 vec_oprnd1 = vect_get_vec_def_for_operand (op1, stmt, NULL);
/* Subsequent iterations (j > 0): chain from the previous copy's defs.  */
1905 vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt0, vec_oprnd0);
1906 if (op_type == binary_op)
1907 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt1, vec_oprnd1);
1910 /* Arguments are ready. Create the new vector stmt. */
1912 if (op_type == binary_op)
1913 new_stmt = build2 (MODIFY_EXPR, void_type_node, vec_dest,
1914 build2 (code, vectype, vec_oprnd0, vec_oprnd1));
1916 new_stmt = build2 (MODIFY_EXPR, void_type_node, vec_dest,
1917 build1 (code, vectype, vec_oprnd0));
1918 new_temp = make_ssa_name (vec_dest, new_stmt);
1919 TREE_OPERAND (new_stmt, 0) = new_temp;
1920 vect_finish_stmt_generation (stmt, new_stmt, bsi);
/* Record the first copy in VEC_STMT; chain later copies via RELATED_STMT.  */
1923 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
1925 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
1926 prev_stmt_info = vinfo_for_stmt (new_stmt);
1933 /* Function vectorizable_type_demotion
1935 Check if STMT performs a binary or unary operation that involves
1936 type demotion, and if it can be vectorized.
1937 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
1938 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
1939 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
1942 vectorizable_type_demotion (tree stmt, block_stmt_iterator *bsi,
/* Analyze (and, when VEC_STMT is non-null, transform) a narrowing
   conversion STMT: two input vectors are packed into one narrower output
   vector via VEC_PACK_MOD_EXPR.
   NOTE(review): blank/brace/else/return lines appear to have been dropped
   by the extraction; comments describe only the surviving statements.  */
1949 tree vec_oprnd0=NULL, vec_oprnd1=NULL;
1950 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1951 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1952 enum tree_code code;
1955 enum vect_def_type dt0;
1957 stmt_vec_info prev_stmt_info;
1967 enum machine_mode vec_mode;
1969 /* Is STMT a vectorizable type-demotion operation? */
1971 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1974 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_loop_def);
1976 if (STMT_VINFO_LIVE_P (stmt_info))
1978 /* FORNOW: not yet supported. */
1979 if (vect_print_dump_info (REPORT_DETAILS))
1980 fprintf (vect_dump, "value used after loop.");
1984 if (TREE_CODE (stmt) != MODIFY_EXPR)
1987 if (TREE_CODE (TREE_OPERAND (stmt, 0)) != SSA_NAME)
1990 operation = TREE_OPERAND (stmt, 1);
1991 code = TREE_CODE (operation);
1992 if (code != NOP_EXPR && code != CONVERT_EXPR)
1995 op0 = TREE_OPERAND (operation, 0);
1996 vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op0));
1997 nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
1999 scalar_dest = TREE_OPERAND (stmt, 0);
2000 scalar_type = TREE_TYPE (scalar_dest);
2001 vectype_out = get_vectype_for_scalar_type (scalar_type);
2002 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
/* Only exact 2:1 narrowing is handled: the output vector must hold twice
   as many elements as the input vector.  */
2003 if (nunits_in != nunits_out / 2) /* FORNOW */
2006 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_out;
2007 gcc_assert (ncopies >= 1);
2009 /* Check the operands of the operation. */
2010 if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt0))
2012 if (vect_print_dump_info (REPORT_DETAILS))
2013 fprintf (vect_dump, "use not simple.");
2017 /* Supportable by target? */
2018 code = VEC_PACK_MOD_EXPR;
2019 optab = optab_for_tree_code (VEC_PACK_MOD_EXPR, vectype_in);
2023 vec_mode = TYPE_MODE (vectype_in);
2024 if (optab->handlers[(int) vec_mode].insn_code == CODE_FOR_nothing)
/* Record the INPUT vectype on the stmt-info: later stages operate on the
   wider input vectors, not on the narrow result vectype.  */
2027 STMT_VINFO_VECTYPE (stmt_info) = vectype_in;
2029 if (!vec_stmt) /* transformation not required. */
2031 STMT_VINFO_TYPE (stmt_info) = type_demotion_vec_info_type;
/* ** Transform ** */
2037 if (vect_print_dump_info (REPORT_DETAILS))
2038 fprintf (vect_dump, "transform type demotion operation. ncopies = %d.",
/* Handle def.  */
2042 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
2044 /* In case the vectorization factor (VF) is bigger than the number
2045 of elements that we can fit in a vectype (nunits), we have to generate
2046 more than one vector stmt - i.e - we need to "unroll" the
2047 vector stmt by a factor VF/nunits. */
2048 prev_stmt_info = NULL;
2049 for (j = 0; j < ncopies; j++)
/* Each copy consumes TWO input vector defs (2:1 narrowing).  First
   iteration fetches them; later iterations chain from previous copies.  */
2054 enum vect_def_type dt = vect_unknown_def_type; /* Dummy */
2055 vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
2056 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt, vec_oprnd0);
2060 vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt0, vec_oprnd1);
2061 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt0, vec_oprnd0);
2064 /* Arguments are ready. Create the new vector stmt. */
2065 expr = build2 (code, vectype_out, vec_oprnd0, vec_oprnd1);
2066 new_stmt = build2 (MODIFY_EXPR, void_type_node, vec_dest, expr);
2067 new_temp = make_ssa_name (vec_dest, new_stmt);
2068 TREE_OPERAND (new_stmt, 0) = new_temp;
2069 vect_finish_stmt_generation (stmt, new_stmt, bsi);
/* Record the first copy; chain later copies via RELATED_STMT.  */
2072 STMT_VINFO_VEC_STMT (stmt_info) = new_stmt;
2074 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
2076 prev_stmt_info = vinfo_for_stmt (new_stmt);
2079 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
2084 /* Function vect_gen_widened_results_half
2086 Create a vector stmt whose code, type, number of arguments, and result
2087 variable are CODE, VECTYPE, OP_TYPE, and VEC_DEST, and its arguments are
2088 VEC_OPRND0 and VEC_OPRND1. The new vector stmt is to be inserted at BSI.
2089 In the case that CODE is a CALL_EXPR, this means that a call to DECL
2090 needs to be created (DECL is a function-decl of a target-builtin).
2091 STMT is the original scalar stmt that we are vectorizing. */
2094 vect_gen_widened_results_half (enum tree_code code, tree vectype, tree decl,
2095 tree vec_oprnd0, tree vec_oprnd1, int op_type,
2096 tree vec_dest, block_stmt_iterator *bsi,
/* Emit one half of a widened result (see the header comment above):
   either a call to the target builtin DECL (CODE == CALL_EXPR) or a
   generic unary/binary expression, assigned to a fresh SSA name of
   VEC_DEST and inserted at BSI.
   NOTE(review): brace/else/return lines appear to have been dropped by the
   extraction; the new SSA name is presumably returned — confirm against
   the original file.  */
2106 /* Generate half of the widened result: */
2107 if (code == CALL_EXPR)
2109 /* Target specific support */
2110 vec_params = build_tree_list (NULL_TREE, vec_oprnd0);
2111 if (op_type == binary_op)
2112 vec_params = tree_cons (NULL_TREE, vec_oprnd1, vec_params);
2113 expr = build_function_call_expr (decl, vec_params);
2117 /* Generic support */
2118 gcc_assert (op_type == TREE_CODE_LENGTH (code));
2119 if (op_type == binary_op)
2120 expr = build2 (code, vectype, vec_oprnd0, vec_oprnd1);
2122 expr = build1 (code, vectype, vec_oprnd0);
2124 new_stmt = build2 (MODIFY_EXPR, void_type_node, vec_dest, expr);
2125 new_temp = make_ssa_name (vec_dest, new_stmt);
2126 TREE_OPERAND (new_stmt, 0) = new_temp;
2127 vect_finish_stmt_generation (stmt, new_stmt, bsi);
/* A builtin call may touch memory: mark its virtual operands so the SSA
   updater re-renames them.  */
2129 if (code == CALL_EXPR)
2131 FOR_EACH_SSA_TREE_OPERAND (sym, new_stmt, iter, SSA_OP_ALL_VIRTUALS)
2133 if (TREE_CODE (sym) == SSA_NAME)
2134 sym = SSA_NAME_VAR (sym);
2135 mark_sym_for_renaming (sym);
2143 /* Function vectorizable_type_promotion
2145 Check if STMT performs a binary or unary operation that involves
2146 type promotion, and if it can be vectorized.
2147 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
2148 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
2149 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
2152 vectorizable_type_promotion (tree stmt, block_stmt_iterator *bsi,
2158 tree op0, op1 = NULL;
2159 tree vec_oprnd0=NULL, vec_oprnd1=NULL;
2160 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2161 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2162 enum tree_code code, code1 = CODE_FOR_nothing, code2 = CODE_FOR_nothing;
2163 tree decl1 = NULL_TREE, decl2 = NULL_TREE;
2166 enum vect_def_type dt0, dt1;
2168 stmt_vec_info prev_stmt_info;
2176 /* Is STMT a vectorizable type-promotion operation? */
2178 if (!STMT_VINFO_RELEVANT_P (stmt_info))
2181 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_loop_def);
2183 if (STMT_VINFO_LIVE_P (stmt_info))
2185 /* FORNOW: not yet supported. */
2186 if (vect_print_dump_info (REPORT_DETAILS))
2187 fprintf (vect_dump, "value used after loop.");
/* Only handle MODIFY_EXPRs whose LHS is an SSA name and whose RHS is a
   widening conversion or widening multiply.  */
2191 if (TREE_CODE (stmt) != MODIFY_EXPR)
2194 if (TREE_CODE (TREE_OPERAND (stmt, 0)) != SSA_NAME)
2197 operation = TREE_OPERAND (stmt, 1);
2198 code = TREE_CODE (operation);
2199 if (code != NOP_EXPR && code != WIDEN_MULT_EXPR)
/* NCOPIES is computed from the *input* vector type; a promotion produces
   two output vectors per input vector (see the two halves below).  */
2202 op0 = TREE_OPERAND (operation, 0);
2203 vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op0));
2204 nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
2205 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in;
2206 gcc_assert (ncopies >= 1);
2208 scalar_dest = TREE_OPERAND (stmt, 0);
2209 vectype_out = get_vectype_for_scalar_type (TREE_TYPE (scalar_dest));
2210 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
2211 if (nunits_out != nunits_in / 2) /* FORNOW */
2214 /* Check the operands of the operation. */
2215 if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt0))
2217 if (vect_print_dump_info (REPORT_DETAILS))
2218 fprintf (vect_dump, "use not simple.");
2222 op_type = TREE_CODE_LENGTH (code);
2223 if (op_type == binary_op)
2225 op1 = TREE_OPERAND (operation, 1);
2226 if (!vect_is_simple_use (op1, loop_vinfo, &def_stmt, &def, &dt1))
2228 if (vect_print_dump_info (REPORT_DETAILS))
2229 fprintf (vect_dump, "use not simple.");
2234 /* Supportable by target? */
2235 if (!supportable_widening_operation (code, stmt, vectype_in,
2236 &decl1, &decl2, &code1, &code2))
2239 STMT_VINFO_VECTYPE (stmt_info) = vectype_in;
2241 if (!vec_stmt) /* transformation not required. */
2243 STMT_VINFO_TYPE (stmt_info) = type_promotion_vec_info_type;
2249 if (vect_print_dump_info (REPORT_DETAILS))
2250 fprintf (vect_dump, "transform type promotion operation. ncopies = %d.",
2254 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
2256 /* In case the vectorization factor (VF) is bigger than the number
2257 of elements that we can fit in a vectype (nunits), we have to generate
2258 more than one vector stmt - i.e - we need to "unroll" the
2259 vector stmt by a factor VF/nunits. */
2261 prev_stmt_info = NULL;
2262 for (j = 0; j < ncopies; j++)
/* j == 0: fetch vector defs for the scalar operands; j > 0: fetch the
   defs produced for the previous copy of this stmt.  */
2267 vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
2268 if (op_type == binary_op)
2269 vec_oprnd1 = vect_get_vec_def_for_operand (op1, stmt, NULL);
2273 vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt0, vec_oprnd0);
2274 if (op_type == binary_op)
2275 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt1, vec_oprnd1);
2278 /* Arguments are ready. Create the new vector stmt. We are creating
2279 two vector defs because the widened result does not fit in one vector.
2280 The vectorized stmt can be expressed as a call to a target builtin,
2281 or using a tree-code. */
2282 /* Generate first half of the widened result: */
2283 new_stmt = vect_gen_widened_results_half (code1, vectype_out, decl1,
2284 vec_oprnd0, vec_oprnd1, op_type, vec_dest, bsi, stmt);
2286 STMT_VINFO_VEC_STMT (stmt_info) = new_stmt;
2288 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
2289 prev_stmt_info = vinfo_for_stmt (new_stmt);
2291 /* Generate second half of the widened result: */
2292 new_stmt = vect_gen_widened_results_half (code2, vectype_out, decl2,
2293 vec_oprnd0, vec_oprnd1, op_type, vec_dest, bsi, stmt);
2294 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
2295 prev_stmt_info = vinfo_for_stmt (new_stmt);
2299 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
2304 /* Function vect_strided_store_supported.
2306 Returns TRUE if INTERLEAVE_HIGH and INTERLEAVE_LOW operations are supported,
2307 and FALSE otherwise. */
2310 vect_strided_store_supported (tree vectype)
2312 optab interleave_high_optab, interleave_low_optab;
2315 mode = (int) TYPE_MODE (vectype);
2317 /* Check that the operation is supported. */
2318 interleave_high_optab = optab_for_tree_code (VEC_INTERLEAVE_HIGH_EXPR,
2320 interleave_low_optab = optab_for_tree_code (VEC_INTERLEAVE_LOW_EXPR,
2322 if (!interleave_high_optab || !interleave_low_optab)
2324 if (vect_print_dump_info (REPORT_DETAILS))
2325 fprintf (vect_dump, "no optab for interleave.");
/* An optab may exist but still lack an expander (insn_code) for this
   particular machine mode -- check both operations.  */
2329 if (interleave_high_optab->handlers[(int) mode].insn_code
2331 || interleave_low_optab->handlers[(int) mode].insn_code
2332 == CODE_FOR_nothing)
2334 if (vect_print_dump_info (REPORT_DETAILS))
2335 fprintf (vect_dump, "interleave op not supported by target.");
2342 /* Function vect_permute_store_chain.
2344 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
2345 a power of 2, generate interleave_high/low stmts to reorder the data
2346 correctly for the stores. Return the final references for stores in
2349 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
2350 The input is 4 vectors each containing 8 elements. We assign a number to each
2351 element, the input sequence is:
2353 1st vec: 0 1 2 3 4 5 6 7
2354 2nd vec: 8 9 10 11 12 13 14 15
2355 3rd vec: 16 17 18 19 20 21 22 23
2356 4th vec: 24 25 26 27 28 29 30 31
2358 The output sequence should be:
2360 1st vec: 0 8 16 24 1 9 17 25
2361 2nd vec: 2 10 18 26 3 11 19 27
2362 3rd vec: 4 12 20 28 5 13 21 30
2363 4th vec: 6 14 22 30 7 15 23 31
2365 i.e., we interleave the contents of the four vectors in their order.
2367 We use interleave_high/low instructions to create such output. The input of
2368 each interleave_high/low operation is two vectors:
2371 the even elements of the result vector are obtained left-to-right from the
2372 high/low elements of the first vector. The odd elements of the result are
2373 obtained left-to-right from the high/low elements of the second vector.
2374 The output of interleave_high will be: 0 4 1 5
2375 and of interleave_low: 2 6 3 7
2378 The permutation is done in log LENGTH stages. In each stage interleave_high
2379 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
2380 where the first argument is taken from the first half of DR_CHAIN and the
2381 second argument from its second half.
2384 I1: interleave_high (1st vec, 3rd vec)
2385 I2: interleave_low (1st vec, 3rd vec)
2386 I3: interleave_high (2nd vec, 4th vec)
2387 I4: interleave_low (2nd vec, 4th vec)
2389 The output for the first stage is:
2391 I1: 0 16 1 17 2 18 3 19
2392 I2: 4 20 5 21 6 22 7 23
2393 I3: 8 24 9 25 10 26 11 27
2394 I4: 12 28 13 29 14 30 15 31
2396 The output of the second stage, i.e. the final result is:
2398 I1: 0 8 16 24 1 9 17 25
2399 I2: 2 10 18 26 3 11 19 27
2400 I3: 4 12 20 28 5 13 21 30
2401 I4: 6 14 22 30 7 15 23 31. */
2404 vect_permute_store_chain (VEC(tree,heap) *dr_chain,
2405 unsigned int length,
2407 block_stmt_iterator *bsi,
2408 VEC(tree,heap) **result_chain)
2410 tree perm_dest, perm_stmt, vect1, vect2, high, low;
2411 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
2415 VEC(tree,heap) *first, *second;
2417 scalar_dest = TREE_OPERAND (stmt, 0);
2418 first = VEC_alloc (tree, heap, length/2);
2419 second = VEC_alloc (tree, heap, length/2);
2421 /* Check that the operation is supported. */
2422 if (!vect_strided_store_supported (vectype))
2425 *result_chain = VEC_copy (tree, heap, dr_chain);
/* log2(LENGTH) stages; each stage pairs element J of the first half of
   DR_CHAIN with element J+LENGTH/2 of the second half.  */
2427 for (i = 0; i < exact_log2 (length); i++)
2429 for (j = 0; j < length/2; j++)
2431 vect1 = VEC_index (tree, dr_chain, j);
2432 vect2 = VEC_index (tree, dr_chain, j+length/2);
2434 /* high = interleave_high (vect1, vect2); */
2435 perm_dest = create_tmp_var (vectype, "vect_inter_high");
2436 add_referenced_var (perm_dest);
2437 perm_stmt = build2 (MODIFY_EXPR, vectype, perm_dest,
2438 build2 (VEC_INTERLEAVE_HIGH_EXPR, vectype, vect1,
2440 high = make_ssa_name (perm_dest, perm_stmt);
2441 TREE_OPERAND (perm_stmt, 0) = high;
2442 vect_finish_stmt_generation (stmt, perm_stmt, bsi);
2443 VEC_replace (tree, *result_chain, 2*j, high);
2445 /* low = interleave_low (vect1, vect2); */
2446 perm_dest = create_tmp_var (vectype, "vect_inter_low");
2447 add_referenced_var (perm_dest);
2448 perm_stmt = build2 (MODIFY_EXPR, vectype, perm_dest,
2449 build2 (VEC_INTERLEAVE_LOW_EXPR, vectype, vect1,
2451 low = make_ssa_name (perm_dest, perm_stmt);
2452 TREE_OPERAND (perm_stmt, 0) = low;
2453 vect_finish_stmt_generation (stmt, perm_stmt, bsi);
2454 VEC_replace (tree, *result_chain, 2*j+1, low);
/* The output of this stage becomes the input of the next stage.  */
2456 dr_chain = VEC_copy (tree, heap, *result_chain);
2462 /* Function vectorizable_store.
2464 Check if STMT defines a non scalar data-ref (array/pointer/structure) that
2466 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
2467 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
2468 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
2471 vectorizable_store (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
2476 tree vec_oprnd = NULL_TREE;
2477 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2478 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info), *first_dr = NULL;
2479 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2480 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2481 enum machine_mode vec_mode;
/* NOTE(review): variable name is misspelled throughout ("cheme" should
   be "scheme"); left unchanged here since renaming is a code change.  */
2483 enum dr_alignment_support alignment_support_cheme;
2485 def_operand_p def_p;
2487 enum vect_def_type dt;
2488 stmt_vec_info prev_stmt_info = NULL;
2489 tree dataref_ptr = NULL_TREE;
2490 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
2491 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
2493 tree next_stmt, first_stmt;
2494 bool strided_store = false;
2495 unsigned int group_size, i;
2496 VEC(tree,heap) *dr_chain = NULL, *oprnds = NULL, *result_chain = NULL;
2497 gcc_assert (ncopies >= 1);
2499 /* Is vectorizable store? */
2501 if (TREE_CODE (stmt) != MODIFY_EXPR)
2504 scalar_dest = TREE_OPERAND (stmt, 0);
2505 if (TREE_CODE (scalar_dest) != ARRAY_REF
2506 && TREE_CODE (scalar_dest) != INDIRECT_REF
2507 && !DR_GROUP_FIRST_DR (stmt_info))
2510 op = TREE_OPERAND (stmt, 1);
2511 if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt))
2513 if (vect_print_dump_info (REPORT_DETAILS))
2514 fprintf (vect_dump, "use not simple.");
2518 vec_mode = TYPE_MODE (vectype);
2519 /* FORNOW. In some cases can vectorize even if data-type not supported
2520 (e.g. - array initialization with 0). */
2521 if (mov_optab->handlers[(int)vec_mode].insn_code == CODE_FOR_nothing)
2524 if (!STMT_VINFO_DATA_REF (stmt_info))
/* A store that belongs to an interleaving group is a strided store and
   additionally requires target support for the interleave operations.  */
2527 if (DR_GROUP_FIRST_DR (stmt_info))
2529 strided_store = true;
2530 if (!vect_strided_store_supported (vectype))
2534 if (!vec_stmt) /* transformation not required. */
2536 STMT_VINFO_TYPE (stmt_info) = store_vec_info_type;
2542 if (vect_print_dump_info (REPORT_DETAILS))
2543 fprintf (vect_dump, "transform store. ncopies = %d",ncopies);
2547 first_stmt = DR_GROUP_FIRST_DR (stmt_info);
2548 first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2549 group_size = DR_GROUP_SIZE (vinfo_for_stmt (first_stmt));
2551 DR_GROUP_STORE_COUNT (vinfo_for_stmt (first_stmt))++;
2553 /* We vectorize all the stmts of the interleaving group when we
2554 reach the last stmt in the group. */
2555 if (DR_GROUP_STORE_COUNT (vinfo_for_stmt (first_stmt))
2556 < DR_GROUP_SIZE (vinfo_for_stmt (first_stmt)))
2558 *vec_stmt = NULL_TREE;
2569 dr_chain = VEC_alloc (tree, heap, group_size);
2570 oprnds = VEC_alloc (tree, heap, group_size);
2572 alignment_support_cheme = vect_supportable_dr_alignment (first_dr);
2573 gcc_assert (alignment_support_cheme);
2574 gcc_assert (alignment_support_cheme == dr_aligned); /* FORNOW */
2576 /* In case the vectorization factor (VF) is bigger than the number
2577 of elements that we can fit in a vectype (nunits), we have to generate
2578 more than one vector stmt - i.e - we need to "unroll" the
2579 vector stmt by a factor VF/nunits. For more details see documentation in
2580 vect_get_vec_def_for_copy_stmt. */
2582 /* In case of interleaving (non-unit strided access):
2589 We create vectorized stores starting from base address (the access of the
2590 first stmt in the chain (S2 in the above example), when the last store stmt
2591 of the chain (S4) is reached:
2594 VS2: &base + vec_size*1 = vx0
2595 VS3: &base + vec_size*2 = vx1
2596 VS4: &base + vec_size*3 = vx3
2598 Then permutation statements are generated:
2600 VS5: vx5 = VEC_INTERLEAVE_HIGH_EXPR < vx0, vx3 >
2601 VS6: vx6 = VEC_INTERLEAVE_LOW_EXPR < vx0, vx3 >
2604 And they are put in STMT_VINFO_VEC_STMT of the corresponding scalar stmts
2605 (the order of the data-refs in the output of vect_permute_store_chain
2606 corresponds to the order of scalar stmts in the interleaving chain - see
2607 the documentation of vect_permute_store_chain()).
2609 In case of both multiple types and interleaving, above vector stores and
2610 permutation stmts are created for every copy. The result vector stmts are
2611 put in STMT_VINFO_VEC_STMT for the first copy and in the corresponding
2612 STMT_VINFO_RELATED_STMT for the next copies.
2615 prev_stmt_info = NULL;
2616 for (j = 0; j < ncopies; j++)
2623 /* For interleaved stores we collect vectorized defs for all the
2624 stores in the group in DR_CHAIN and OPRNDS. DR_CHAIN is then used
2625 as an input to vect_permute_store_chain(), and OPRNDS as an input
2626 to vect_get_vec_def_for_stmt_copy() for the next copy.
2627 If the store is not strided, GROUP_SIZE is 1, and DR_CHAIN and
2628 OPRNDS are of size 1.
2630 next_stmt = first_stmt;
2631 for (i = 0; i < group_size; i++)
2633 /* Since gaps are not supported for interleaved stores, GROUP_SIZE
2634 is the exact number of stmts in the chain. Therefore, NEXT_STMT
2635 can't be NULL_TREE. In case that there is no interleaving,
2636 GROUP_SIZE is 1, and only one iteration of the loop will be
2639 gcc_assert (next_stmt);
2640 op = TREE_OPERAND (next_stmt, 1);
2641 vec_oprnd = vect_get_vec_def_for_operand (op, next_stmt, NULL);
2642 VEC_quick_push(tree, dr_chain, vec_oprnd);
2643 VEC_quick_push(tree, oprnds, vec_oprnd);
2644 next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt));
2646 dataref_ptr = vect_create_data_ref_ptr (first_stmt, bsi, NULL_TREE,
2647 &dummy, &ptr_incr, false);
2651 /* For interleaved stores we created vectorized defs for all the
2652 defs stored in OPRNDS in the previous iteration (previous copy).
2653 DR_CHAIN is then used as an input to vect_permute_store_chain(),
2654 and OPRNDS as an input to vect_get_vec_def_for_stmt_copy() for the
2656 If the store is not strided, GROUP_SIZE is 1, and DR_CHAIN and
2657 OPRNDS are of size 1.
2659 for (i = 0; i < group_size; i++)
2661 vec_oprnd = vect_get_vec_def_for_stmt_copy (dt,
2662 VEC_index (tree, oprnds, i));
2663 VEC_replace(tree, dr_chain, i, vec_oprnd);
2664 VEC_replace(tree, oprnds, i, vec_oprnd);
2666 dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, bsi, stmt);
2671 result_chain = VEC_alloc (tree, heap, group_size);
2673 if (!vect_permute_store_chain (dr_chain, group_size, stmt, bsi,
2678 next_stmt = first_stmt;
2679 for (i = 0; i < group_size; i++)
2681 /* For strided stores vectorized defs are interleaved in
2682 vect_permute_store_chain(). */
2684 vec_oprnd = VEC_index(tree, result_chain, i);
2686 data_ref = build_fold_indirect_ref (dataref_ptr);
2687 /* Arguments are ready. Create the new vector stmt. */
2688 new_stmt = build2 (MODIFY_EXPR, vectype, data_ref, vec_oprnd);
2689 vect_finish_stmt_generation (stmt, new_stmt, bsi);
2691 /* Set the V_MAY_DEFS for the vector pointer. If this virtual def has a
2692 use outside the loop and a loop peel is performed then the def may be
2693 renamed by the peel. Mark it for renaming so the later use will also
2695 copy_virtual_operands (new_stmt, next_stmt);
2698 /* The original store is deleted so the same SSA_NAMEs can be used.
2700 FOR_EACH_SSA_TREE_OPERAND (def, next_stmt, iter, SSA_OP_VMAYDEF)
2702 SSA_NAME_DEF_STMT (def) = new_stmt;
2703 mark_sym_for_renaming (SSA_NAME_VAR (def));
2706 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
2710 /* Create new names for all the definitions created by COPY and
2711 add replacement mappings for each new name. */
2712 FOR_EACH_SSA_DEF_OPERAND (def_p, new_stmt, iter, SSA_OP_VMAYDEF)
2714 create_new_def_for (DEF_FROM_PTR (def_p), new_stmt, def_p);
2715 mark_sym_for_renaming (SSA_NAME_VAR (DEF_FROM_PTR (def_p)));
2718 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
2721 prev_stmt_info = vinfo_for_stmt (new_stmt);
2722 next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt));
2725 /* Bump the vector pointer. */
2726 dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, bsi, stmt);
2734 /* Function vect_setup_realignment
2736 This function is called when vectorizing an unaligned load using
2737 the dr_unaligned_software_pipeline scheme.
2738 This function generates the following code at the loop prolog:
2741 msq_init = *(floor(p)); # prolog load
2742 realignment_token = call target_builtin;
2744 msq = phi (msq_init, ---)
2746 The code above sets up a new (vector) pointer, pointing to the first
2747 location accessed by STMT, and a "floor-aligned" load using that pointer.
2748 It also generates code to compute the "realignment-token" (if the relevant
2749 target hook was defined), and creates a phi-node at the loop-header bb
2750 whose arguments are the result of the prolog-load (created by this
2751 function) and the result of a load that takes place in the loop (to be
2752 created by the caller to this function).
2753 The caller to this function uses the phi-result (msq) to create the
2754 realignment code inside the loop, and sets up the missing phi argument,
2758 msq = phi (msq_init, lsq)
2759 lsq = *(floor(p')); # load in loop
2760 result = realign_load (msq, lsq, realignment_token);
2763 STMT - (scalar) load stmt to be vectorized. This load accesses
2764 a memory location that may be unaligned.
2765 BSI - place where new code is to be inserted.
2768 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
2769 target hook, if defined.
2770 Return value - the result of the loop-header phi node. */
2773 vect_setup_realignment (tree stmt, block_stmt_iterator *bsi,
2774 tree *realignment_token)
2776 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2777 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2778 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2779 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2780 edge pe = loop_preheader_edge (loop);
2781 tree scalar_dest = TREE_OPERAND (stmt, 0);
2794 /* 1. Create msq_init = *(floor(p1)) in the loop preheader */
2795 vec_dest = vect_create_destination_var (scalar_dest, vectype);
2796 ptr = vect_create_data_ref_ptr (stmt, bsi, NULL_TREE, &init_addr, &inc, true);
/* ALIGN_INDIRECT_REF performs the "floor" (aligned) access.  */
2797 data_ref = build1 (ALIGN_INDIRECT_REF, vectype, ptr);
2798 new_stmt = build2 (MODIFY_EXPR, vectype, vec_dest, data_ref);
2799 new_temp = make_ssa_name (vec_dest, new_stmt);
2800 TREE_OPERAND (new_stmt, 0) = new_temp;
/* Inserting on the preheader edge must not split it into a new bb.  */
2801 new_bb = bsi_insert_on_edge_immediate (pe, new_stmt);
2802 gcc_assert (!new_bb);
2803 msq_init = TREE_OPERAND (new_stmt, 0);
2804 copy_virtual_operands (new_stmt, stmt);
2805 update_vuses_to_preheader (new_stmt, loop);
2807 /* 2. Create permutation mask, if required, in loop preheader. */
2808 if (targetm.vectorize.builtin_mask_for_load)
2811 tree params = build_tree_list (NULL_TREE, init_addr);
2813 vec_dest = vect_create_destination_var (scalar_dest,
2814 TREE_TYPE (new_stmt));
2815 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
2816 new_stmt = build_function_call_expr (builtin_decl, params);
2817 new_stmt = build2 (MODIFY_EXPR, void_type_node, vec_dest, new_stmt);
2818 new_temp = make_ssa_name (vec_dest, new_stmt);
2819 TREE_OPERAND (new_stmt, 0) = new_temp;
2820 new_bb = bsi_insert_on_edge_immediate (pe, new_stmt);
2821 gcc_assert (!new_bb);
2822 *realignment_token = TREE_OPERAND (new_stmt, 0);
2824 /* The result of the CALL_EXPR to this builtin is determined from
2825 the value of the parameter and no global variables are touched
2826 which makes the builtin a "const" function. Requiring the
2827 builtin to have the "const" attribute makes it unnecessary
2828 to call mark_call_clobbered. */
2829 gcc_assert (TREE_READONLY (builtin_decl));
2832 /* 3. Create msq = phi <msq_init, lsq> in loop */
/* Only the preheader phi argument is set here; the caller adds the
   latch argument (lsq) once the in-loop load is created.  */
2833 vec_dest = vect_create_destination_var (scalar_dest, vectype);
2834 msq = make_ssa_name (vec_dest, NULL_TREE);
2835 phi_stmt = create_phi_node (msq, loop->header);
2836 SSA_NAME_DEF_STMT (msq) = phi_stmt;
2837 add_phi_arg (phi_stmt, msq_init, loop_preheader_edge (loop));
2843 /* Function vect_strided_load_supported.
2845 Returns TRUE if EXTRACT_EVEN and EXTRACT_ODD operations are supported,
2846 and FALSE otherwise. */
2849 vect_strided_load_supported (tree vectype)
2851 optab perm_even_optab, perm_odd_optab;
2854 mode = (int) TYPE_MODE (vectype);
/* Both the optab and an expander for this machine mode must exist.  */
2856 perm_even_optab = optab_for_tree_code (VEC_EXTRACT_EVEN_EXPR, vectype);
2857 if (!perm_even_optab)
2859 if (vect_print_dump_info (REPORT_DETAILS))
2860 fprintf (vect_dump, "no optab for perm_even.");
2864 if (perm_even_optab->handlers[mode].insn_code == CODE_FOR_nothing)
2866 if (vect_print_dump_info (REPORT_DETAILS))
2867 fprintf (vect_dump, "perm_even op not supported by target.");
2871 perm_odd_optab = optab_for_tree_code (VEC_EXTRACT_ODD_EXPR, vectype);
2872 if (!perm_odd_optab)
2874 if (vect_print_dump_info (REPORT_DETAILS))
2875 fprintf (vect_dump, "no optab for perm_odd.");
2879 if (perm_odd_optab->handlers[mode].insn_code == CODE_FOR_nothing)
2881 if (vect_print_dump_info (REPORT_DETAILS))
2882 fprintf (vect_dump, "perm_odd op not supported by target.");
2889 /* Function vect_permute_load_chain.
2891 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
2892 a power of 2, generate extract_even/odd stmts to reorder the input data
2893 correctly. Return the final references for loads in RESULT_CHAIN.
2895 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
2896 The input is 4 vectors each containing 8 elements. We assign a number to each
2897 element, the input sequence is:
2899 1st vec: 0 1 2 3 4 5 6 7
2900 2nd vec: 8 9 10 11 12 13 14 15
2901 3rd vec: 16 17 18 19 20 21 22 23
2902 4th vec: 24 25 26 27 28 29 30 31
2904 The output sequence should be:
2906 1st vec: 0 4 8 12 16 20 24 28
2907 2nd vec: 1 5 9 13 17 21 25 29
2908 3rd vec: 2 6 10 14 18 22 26 30
2909 4th vec: 3 7 11 15 19 23 27 31
2911 i.e., the first output vector should contain the first elements of each
2912 interleaving group, etc.
2914 We use extract_even/odd instructions to create such output. The input of each
2915 extract_even/odd operation is two vectors
2919 and the output is the vector of extracted even/odd elements. The output of
2920 extract_even will be: 0 2 4 6
2921 and of extract_odd: 1 3 5 7
2924 The permutation is done in log LENGTH stages. In each stage extract_even and
2925 extract_odd stmts are created for each pair of vectors in DR_CHAIN in their
2926 order. In our example,
2928 E1: extract_even (1st vec, 2nd vec)
2929 E2: extract_odd (1st vec, 2nd vec)
2930 E3: extract_even (3rd vec, 4th vec)
2931 E4: extract_odd (3rd vec, 4th vec)
2933 The output for the first stage will be:
2935 E1: 0 2 4 6 8 10 12 14
2936 E2: 1 3 5 7 9 11 13 15
2937 E3: 16 18 20 22 24 26 28 30
2938 E4: 17 19 21 23 25 27 29 31
2940 In order to proceed and create the correct sequence for the next stage (or
2941 for the correct output, if the second stage is the last one, as in our
2942 example), we first put the output of extract_even operation and then the
2943 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
2944 The input for the second stage is:
2946 1st vec (E1): 0 2 4 6 8 10 12 14
2947 2nd vec (E3): 16 18 20 22 24 26 28 30
2948 3rd vec (E2): 1 3 5 7 9 11 13 15
2949 4th vec (E4): 17 19 21 23 25 27 29 31
2951 The output of the second stage:
2953 E1: 0 4 8 12 16 20 24 28
2954 E2: 2 6 10 14 18 22 26 30
2955 E3: 1 5 9 13 17 21 25 29
2956 E4: 3 7 11 15 19 23 27 31
2958 And RESULT_CHAIN after reordering:
2960 1st vec (E1): 0 4 8 12 16 20 24 28
2961 2nd vec (E3): 1 5 9 13 17 21 25 29
2962 3rd vec (E2): 2 6 10 14 18 22 26 30
2963 4th vec (E4): 3 7 11 15 19 23 27 31. */
2966 vect_permute_load_chain (VEC(tree,heap) *dr_chain,
2967 unsigned int length,
2969 block_stmt_iterator *bsi,
2970 VEC(tree,heap) **result_chain)
2972 tree perm_dest, perm_stmt, data_ref, first_vect, second_vect;
2973 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
2977 /* Check that the operation is supported. */
2978 if (!vect_strided_load_supported (vectype))
2981 *result_chain = VEC_copy (tree, heap, dr_chain);
/* log2(LENGTH) stages; each stage processes adjacent vector pairs.  */
2982 for (i = 0; i < exact_log2 (length); i++)
2984 for (j = 0; j < length; j +=2)
2986 first_vect = VEC_index (tree, dr_chain, j);
2987 second_vect = VEC_index (tree, dr_chain, j+1);
2989 /* data_ref = permute_even (first_data_ref, second_data_ref); */
2990 perm_dest = create_tmp_var (vectype, "vect_perm_even");
2991 add_referenced_var (perm_dest);
2993 perm_stmt = build2 (MODIFY_EXPR, vectype, perm_dest,
2994 build2 (VEC_EXTRACT_EVEN_EXPR, vectype,
2995 first_vect, second_vect));
2997 data_ref = make_ssa_name (perm_dest, perm_stmt);
2998 TREE_OPERAND (perm_stmt, 0) = data_ref;
2999 vect_finish_stmt_generation (stmt, perm_stmt, bsi);
3000 mark_new_vars_to_rename (perm_stmt);
/* Even results go to the first half of RESULT_CHAIN, odd results to
   the second half (see the function comment above).  */
3002 VEC_replace (tree, *result_chain, j/2, data_ref);
3004 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
3005 perm_dest = create_tmp_var (vectype, "vect_perm_odd");
3006 add_referenced_var (perm_dest);
3008 perm_stmt = build2 (MODIFY_EXPR, vectype, perm_dest,
3009 build2 (VEC_EXTRACT_ODD_EXPR, vectype,
3010 first_vect, second_vect));
3011 data_ref = make_ssa_name (perm_dest, perm_stmt);
3012 TREE_OPERAND (perm_stmt, 0) = data_ref;
3013 vect_finish_stmt_generation (stmt, perm_stmt, bsi);
3014 mark_new_vars_to_rename (perm_stmt);
3016 VEC_replace (tree, *result_chain, j/2+length/2, data_ref);
/* The output of this stage becomes the input of the next stage.  */
3018 dr_chain = VEC_copy (tree, heap, *result_chain);
3024 /* Function vect_transform_strided_load.
3026 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
3027 to perform their permutation and ascribe the result vectorized statements to
3028 the scalar statements.
3032 vect_transform_strided_load (tree stmt, VEC(tree,heap) *dr_chain, int size,
3033 block_stmt_iterator *bsi)
3035 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3036 tree first_stmt = DR_GROUP_FIRST_DR (stmt_info);
3037 tree next_stmt, new_stmt;
3038 VEC(tree,heap) *result_chain = NULL;
3039 unsigned int i, gap_count;
3042 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
3043 RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
3044 vectors, that are ready for vector computation. */
3045 result_chain = VEC_alloc (tree, heap, size);
3047 if (!vect_permute_load_chain (dr_chain, size, stmt, bsi, &result_chain))
3050 /* Put a permuted data-ref in the VECTORIZED_STMT field.
3051 Since we scan the chain starting from its first node, their order
3052 corresponds the order of data-refs in RESULT_CHAIN. */
3053 next_stmt = first_stmt;
3055 for (i = 0; VEC_iterate(tree, result_chain, i, tmp_data_ref); i++)
3060 /* Skip the gaps. Loads created for the gaps will be removed by dead
3061 code elimination pass later.
3062 DR_GROUP_GAP is the number of steps in elements from the previous
3063 access (if there is no gap DR_GROUP_GAP is 1). We skip loads that
3064 correspond to the gaps.
3066 if (gap_count < DR_GROUP_GAP (vinfo_for_stmt (next_stmt)))
3074 new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
3075 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
3076 copies, and we put the new vector statement in the first available
3078 if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
3079 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
/* Walk the RELATED_STMT chain to its end and append the new stmt
   there (multiple-copies case).  */
3082 tree prev_stmt = STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
3083 tree rel_stmt = STMT_VINFO_RELATED_STMT (
3084 vinfo_for_stmt (prev_stmt));
3087 prev_stmt = rel_stmt;
3088 rel_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
3090 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) = new_stmt;
3092 next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt));
3094 /* If NEXT_STMT accesses the same DR as the previous statement,
3095 put the same TMP_DATA_REF as its vectorized statement; otherwise
3096 get the next data-ref from RESULT_CHAIN. */
3097 if (!next_stmt || !DR_GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
3105 /* vectorizable_load.
3107 Check if STMT reads a non scalar data-ref (array/pointer/structure) that
3109 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
3110 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
3111 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
3114 vectorizable_load (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
/* Two-phase contract: when VEC_STMT is NULL this only performs the
   analysis checks and records load_vec_info_type; otherwise it emits
   the vector load(s) at BSI (see the "!vec_stmt" test below).  */
3117 tree vec_dest = NULL;
3118 tree data_ref = NULL;
3120 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3121 stmt_vec_info prev_stmt_info;
3122 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3123 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3124 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info), *first_dr;
3125 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3128 tree new_stmt = NULL_TREE;
/* NOTE(review): "cheme" is a long-standing misspelling of "scheme";
   renaming would touch every use, so it is only flagged here.  */
3130 enum dr_alignment_support alignment_support_cheme;
3131 tree dataref_ptr = NULL_TREE;
/* NCOPIES is the unroll factor: how many vector stmts are needed to
   cover one vectorization factor's worth of scalar iterations.  */
3133 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
3134 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
3135 int i, j, group_size;
3136 tree msq = NULL_TREE, lsq;
3137 tree offset = NULL_TREE;
3138 tree realignment_token = NULL_TREE;
3139 tree phi_stmt = NULL_TREE;
3140 VEC(tree,heap) *dr_chain = NULL;
3141 bool strided_load = false;
3144 /* Is vectorizable load? */
3145 if (!STMT_VINFO_RELEVANT_P (stmt_info))
3148 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_loop_def);
3150 if (STMT_VINFO_LIVE_P (stmt_info))
3152 /* FORNOW: not yet supported. */
3153 if (vect_print_dump_info (REPORT_DETAILS))
3154 fprintf (vect_dump, "value used after loop.")
3158 if (TREE_CODE (stmt) != MODIFY_EXPR)
3161 scalar_dest = TREE_OPERAND (stmt, 0);
3162 if (TREE_CODE (scalar_dest) != SSA_NAME)
/* The RHS must be a memory reference, unless the stmt belongs to an
   interleaving chain (then the first stmt of the chain carries the
   data-ref).  */
3165 op = TREE_OPERAND (stmt, 1);
3166 if (TREE_CODE (op) != ARRAY_REF
3167 && TREE_CODE (op) != INDIRECT_REF
3168 && !DR_GROUP_FIRST_DR (stmt_info))
3171 if (!STMT_VINFO_DATA_REF (stmt_info))
3174 mode = (int) TYPE_MODE (vectype);
3176 /* FORNOW. In some cases can vectorize even if data-type not supported
3177 (e.g. - data copies). */
3178 if (mov_optab->handlers[mode].insn_code == CODE_FOR_nothing)
3180 if (vect_print_dump_info (REPORT_DETAILS))
3181 fprintf (vect_dump, "Aligned load, but unsupported type.");
3185 /* Check if the load is a part of an interleaving chain. */
3186 if (DR_GROUP_FIRST_DR (stmt_info))
3188 strided_load = true;
3190 /* Check if interleaving is supported. */
3191 if (!vect_strided_load_supported (vectype))
3195 if (!vec_stmt) /* transformation not required. */
3197 STMT_VINFO_TYPE (stmt_info) = load_vec_info_type;
3203 if (vect_print_dump_info (REPORT_DETAILS))
3204 fprintf (vect_dump, "transform load.");
/* Strided (interleaved) case: all loads of the chain are generated
   when the FIRST stmt of the chain is transformed; later chain
   members just pick up the already-recorded vector stmt.  */
3208 first_stmt = DR_GROUP_FIRST_DR (stmt_info);
3209 /* Check if the chain of loads is already vectorized. */
3210 if (STMT_VINFO_VEC_STMT (vinfo_for_stmt (first_stmt)))
3212 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
3215 first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
3216 group_size = DR_GROUP_SIZE (vinfo_for_stmt (first_stmt));
3217 dr_chain = VEC_alloc (tree, heap, group_size);
3226 alignment_support_cheme = vect_supportable_dr_alignment (first_dr);
3227 gcc_assert (alignment_support_cheme);
3230 /* In case the vectorization factor (VF) is bigger than the number
3231 of elements that we can fit in a vectype (nunits), we have to generate
3232 more than one vector stmt - i.e - we need to "unroll" the
3233 vector stmt by a factor VF/nunits. In doing so, we record a pointer
3234 from one copy of the vector stmt to the next, in the field
3235 STMT_VINFO_RELATED_STMT. This is necessary in order to allow following
3236 stages to find the correct vector defs to be used when vectorizing
3237 stmts that use the defs of the current stmt. The example below illustrates
3238 the vectorization process when VF=16 and nunits=4 (i.e - we need to create
3239 4 vectorized stmts):
3241 before vectorization:
3242 RELATED_STMT VEC_STMT
3246 step 1: vectorize stmt S1:
3247 We first create the vector stmt VS1_0, and, as usual, record a
3248 pointer to it in the STMT_VINFO_VEC_STMT of the scalar stmt S1.
3249 Next, we create the vector stmt VS1_1, and record a pointer to
3250 it in the STMT_VINFO_RELATED_STMT of the vector stmt VS1_0.
3251 Similarly, for VS1_2 and VS1_3. This is the resulting chain of
3253 RELATED_STMT VEC_STMT
3254 VS1_0: vx0 = memref0 VS1_1 -
3255 VS1_1: vx1 = memref1 VS1_2 -
3256 VS1_2: vx2 = memref2 VS1_3 -
3257 VS1_3: vx3 = memref3 - -
3258 S1: x = load - VS1_0
3261 See in documentation in vect_get_vec_def_for_stmt_copy for how the
3262 information we recorded in RELATED_STMT field is used to vectorize
3265 /* In case of interleaving (non-unit strided access):
3272 Vectorized loads are created in the order of memory accesses
3273 starting from the access of the first stmt of the chain:
3276 VS2: vx1 = &base + vec_size*1
3277 VS3: vx3 = &base + vec_size*2
3278 VS4: vx4 = &base + vec_size*3
3280 Then permutation statements are generated:
3282 VS5: vx5 = VEC_EXTRACT_EVEN_EXPR < vx0, vx1 >
3283 VS6: vx6 = VEC_EXTRACT_ODD_EXPR < vx0, vx1 >
3286 And they are put in STMT_VINFO_VEC_STMT of the corresponding scalar stmts
3287 (the order of the data-refs in the output of vect_permute_load_chain
3288 corresponds to the order of scalar stmts in the interleaving chain - see
3289 the documentation of vect_permute_load_chain()).
3290 The generation of permutation stmts and recording them in
3291 STMT_VINFO_VEC_STMT is done in vect_transform_strided_load().
3293 In case of both multiple types and interleaving, the vector loads and
3294 permutation stmts above are created for every copy. The result vector stmts
3295 are put in STMT_VINFO_VEC_STMT for the first copy and in the corresponding
3296 STMT_VINFO_RELATED_STMT for the next copies. */
3298 /* If the data reference is aligned (dr_aligned) or potentially unaligned
3299 on a target that supports unaligned accesses (dr_unaligned_supported)
3300 we generate the following code:
3304 p = p + indx * vectype_size;
3309 Otherwise, the data reference is potentially unaligned on a target that
3310 does not support unaligned accesses (dr_unaligned_software_pipeline) -
3311 then generate the following code, in which the data in each iteration is
3312 obtained by two vector loads, one from the previous iteration, and one
3313 from the current iteration:
3315 msq_init = *(floor(p1))
3316 p2 = initial_addr + VS - 1;
3317 realignment_token = call target_builtin;
3320 p2 = p2 + indx * vectype_size
3322 vec_dest = realign_load (msq, lsq, realignment_token)
/* Software-pipelined realignment: emit the loop-header PHI for MSQ and
   the setup code in the preheader; the matching latch PHI argument is
   added after the last load is generated (see add_phi_arg below).  */
3327 if (alignment_support_cheme == dr_unaligned_software_pipeline)
3329 msq = vect_setup_realignment (first_stmt, bsi, &realignment_token);
3330 phi_stmt = SSA_NAME_DEF_STMT (msq);
3331 offset = size_int (TYPE_VECTOR_SUBPARTS (vectype) - 1);
/* Outer loop: one iteration per unrolled copy (ncopies).
   Inner loop: one iteration per member of the interleaving group.  */
3334 prev_stmt_info = NULL;
3335 for (j = 0; j < ncopies; j++)
3337 /* 1. Create the vector pointer update chain. */
3339 dataref_ptr = vect_create_data_ref_ptr (first_stmt, bsi, offset,
3340 &dummy, &ptr_incr, false);
3342 dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, bsi, stmt);
3344 for (i = 0; i < group_size; i++)
3346 /* 2. Create the vector-load in the loop. */
3347 switch (alignment_support_cheme)
3350 gcc_assert (aligned_access_p (first_dr));
3351 data_ref = build_fold_indirect_ref (dataref_ptr);
3353 case dr_unaligned_supported:
3355 int mis = DR_MISALIGNMENT (first_dr);
3356 tree tmis = (mis == -1 ? size_zero_node : size_int (mis));
/* Misalignment is recorded in bytes; MISALIGNED_INDIRECT_REF
   carries it in bits.  */
3358 gcc_assert (!aligned_access_p (first_dr));
3359 tmis = size_binop (MULT_EXPR, tmis, size_int(BITS_PER_UNIT));
3361 build2 (MISALIGNED_INDIRECT_REF, vectype, dataref_ptr, tmis);
3364 case dr_unaligned_software_pipeline:
3365 gcc_assert (!aligned_access_p (first_dr));
3366 data_ref = build1 (ALIGN_INDIRECT_REF, vectype, dataref_ptr);
3371 vec_dest = vect_create_destination_var (scalar_dest, vectype);
3372 new_stmt = build2 (MODIFY_EXPR, vectype, vec_dest, data_ref);
3373 new_temp = make_ssa_name (vec_dest, new_stmt);
3374 TREE_OPERAND (new_stmt, 0) = new_temp;
3375 vect_finish_stmt_generation (stmt, new_stmt, bsi);
3376 copy_virtual_operands (new_stmt, stmt);
3377 mark_new_vars_to_rename (new_stmt);
3379 /* 3. Handle explicit realignment if necessary/supported. */
3380 if (alignment_support_cheme == dr_unaligned_software_pipeline)
3383 <vec_dest = realign_load (msq, lsq, realignment_token)> */
3384 lsq = TREE_OPERAND (new_stmt, 0);
3385 if (!realignment_token)
3386 realignment_token = dataref_ptr;
3387 vec_dest = vect_create_destination_var (scalar_dest, vectype);
3389 build3 (REALIGN_LOAD_EXPR, vectype, msq, lsq, realignment_token);
3390 new_stmt = build2 (MODIFY_EXPR, vectype, vec_dest, new_stmt);
3391 new_temp = make_ssa_name (vec_dest, new_stmt);
3392 TREE_OPERAND (new_stmt, 0) = new_temp;
3393 vect_finish_stmt_generation (stmt, new_stmt, bsi);
/* Close the realignment PHI: the last LSQ generated feeds the
   next iteration's MSQ via the latch edge.  */
3394 if (i == group_size - 1 && j == ncopies - 1)
3395 add_phi_arg (phi_stmt, lsq, loop_latch_edge (loop));
3399 VEC_quick_push (tree, dr_chain, new_temp);
3400 if (i < group_size - 1)
3401 dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, bsi, stmt);
/* Emit the extract-even/odd permutations and record the per-stmt
   vector results for the whole interleaving chain.  */
3406 if (!vect_transform_strided_load (stmt, dr_chain, group_size, bsi))
3408 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
3409 dr_chain = VEC_alloc (tree, heap, group_size);
3414 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
3416 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3417 prev_stmt_info = vinfo_for_stmt (new_stmt);
3425 /* Function vectorizable_live_operation.
3427 STMT computes a value that is used outside the loop. Check if
3428 it can be supported. */
/* Analysis-only: BSI and VEC_STMT are unused because no vector code is
   emitted here — the scalar stmt is left in place (see trailing comment).  */
3431 vectorizable_live_operation (tree stmt,
3432 block_stmt_iterator *bsi ATTRIBUTE_UNUSED,
3433 tree *vec_stmt ATTRIBUTE_UNUSED)
3436 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3437 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3439 enum tree_code code;
3443 enum vect_def_type dt;
3445 if (!STMT_VINFO_LIVE_P (stmt_info))
3448 if (TREE_CODE (stmt) != MODIFY_EXPR)
3451 if (TREE_CODE (TREE_OPERAND (stmt, 0)) != SSA_NAME)
3454 operation = TREE_OPERAND (stmt, 1);
3455 code = TREE_CODE (operation);
/* Number of operands of the RHS operation.  */
3457 op_type = TREE_CODE_LENGTH (code);
3459 /* FORNOW: support only if all uses are invariant. This means
3460 that the scalar operations can remain in place, unvectorized.
3461 The original last scalar value that they compute will be used. */
3463 for (i = 0; i < op_type; i++)
3465 op = TREE_OPERAND (operation, i);
3466 if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt))
3468 if (vect_print_dump_info (REPORT_DETAILS))
3469 fprintf (vect_dump, "use not simple.");
3473 if (dt != vect_invariant_def && dt != vect_constant_def)
3477 /* No transformation is required for the cases we currently support. */
3482 /* Function vect_is_simple_cond.
3485 LOOP - the loop that is being vectorized.
3486 COND - Condition that is checked for simple use.
3488 Returns whether a COND can be vectorized. Checks whether
3489 condition operands are supportable using vect_is_simple_use. */
3492 vect_is_simple_cond (tree cond, loop_vec_info loop_vinfo)
3496 enum vect_def_type dt;
/* COND must be a comparison; each side must be either an SSA_NAME with a
   "simple" def, or an integer/real constant.  */
3498 if (!COMPARISON_CLASS_P (cond))
3501 lhs = TREE_OPERAND (cond, 0);
3502 rhs = TREE_OPERAND (cond, 1);
3504 if (TREE_CODE (lhs) == SSA_NAME)
3506 tree lhs_def_stmt = SSA_NAME_DEF_STMT (lhs);
3507 if (!vect_is_simple_use (lhs, loop_vinfo, &lhs_def_stmt, &def, &dt))
3510 else if (TREE_CODE (lhs) != INTEGER_CST && TREE_CODE (lhs) != REAL_CST)
3513 if (TREE_CODE (rhs) == SSA_NAME)
3515 tree rhs_def_stmt = SSA_NAME_DEF_STMT (rhs);
3516 if (!vect_is_simple_use (rhs, loop_vinfo, &rhs_def_stmt, &def, &dt))
3519 else if (TREE_CODE (rhs) != INTEGER_CST && TREE_CODE (rhs) != REAL_CST)
3525 /* vectorizable_condition.
3527 Check if STMT is conditional modify expression that can be vectorized.
3528 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
3529 stmt using VEC_COND_EXPR to replace it, put it in VEC_STMT, and insert it
3532 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
3535 vectorizable_condition (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
3537 tree scalar_dest = NULL_TREE;
3538 tree vec_dest = NULL_TREE;
3539 tree op = NULL_TREE;
3540 tree cond_expr, then_clause, else_clause;
3541 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3542 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3543 tree vec_cond_lhs, vec_cond_rhs, vec_then_clause, vec_else_clause;
3544 tree vec_compare, vec_cond_expr;
3546 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3547 enum machine_mode vec_mode;
3549 enum vect_def_type dt;
3550 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
3551 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
3553 gcc_assert (ncopies >= 1);
/* FORNOW: multiple-copy (ncopies > 1) COND_EXPRs are not handled.  */
3555 return false; /* FORNOW */
3557 if (!STMT_VINFO_RELEVANT_P (stmt_info))
3560 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_loop_def);
3562 if (STMT_VINFO_LIVE_P (stmt_info))
3564 /* FORNOW: not yet supported. */
3565 if (vect_print_dump_info (REPORT_DETAILS))
3566 fprintf (vect_dump, "value used after loop.");
3570 if (TREE_CODE (stmt) != MODIFY_EXPR)
3573 op = TREE_OPERAND (stmt, 1);
3575 if (TREE_CODE (op) != COND_EXPR)
3578 cond_expr = TREE_OPERAND (op, 0);
3579 then_clause = TREE_OPERAND (op, 1);
3580 else_clause = TREE_OPERAND (op, 2);
3582 if (!vect_is_simple_cond (cond_expr, loop_vinfo))
3585 /* We do not handle two different vector types for the condition
3587 if (TREE_TYPE (TREE_OPERAND (cond_expr, 0)) != TREE_TYPE (vectype))
/* THEN/ELSE clauses must each be an SSA_NAME with a simple def or an
   integer/real constant — same rule as the condition operands.  */
3590 if (TREE_CODE (then_clause) == SSA_NAME)
3592 tree then_def_stmt = SSA_NAME_DEF_STMT (then_clause);
3593 if (!vect_is_simple_use (then_clause, loop_vinfo,
3594 &then_def_stmt, &def, &dt))
3597 else if (TREE_CODE (then_clause) != INTEGER_CST
3598 && TREE_CODE (then_clause) != REAL_CST)
3601 if (TREE_CODE (else_clause) == SSA_NAME)
3603 tree else_def_stmt = SSA_NAME_DEF_STMT (else_clause);
3604 if (!vect_is_simple_use (else_clause, loop_vinfo,
3605 &else_def_stmt, &def, &dt))
3608 else if (TREE_CODE (else_clause) != INTEGER_CST
3609 && TREE_CODE (else_clause) != REAL_CST)
3613 vec_mode = TYPE_MODE (vectype);
/* Analysis phase: record the stmt type and ask the target whether it
   can expand a VEC_COND_EXPR in this mode.  */
3617 STMT_VINFO_TYPE (stmt_info) = condition_vec_info_type;
3618 return expand_vec_cond_expr_p (op, vec_mode);
3624 scalar_dest = TREE_OPERAND (stmt, 0);
3625 vec_dest = vect_create_destination_var (scalar_dest, vectype);
3627 /* Handle cond expr. */
3629 vect_get_vec_def_for_operand (TREE_OPERAND (cond_expr, 0), stmt, NULL);
3631 vect_get_vec_def_for_operand (TREE_OPERAND (cond_expr, 1), stmt, NULL);
3632 vec_then_clause = vect_get_vec_def_for_operand (then_clause, stmt, NULL);
3633 vec_else_clause = vect_get_vec_def_for_operand (else_clause, stmt, NULL);
3635 /* Arguments are ready. create the new vector stmt. */
3636 vec_compare = build2 (TREE_CODE (cond_expr), vectype,
3637 vec_cond_lhs, vec_cond_rhs);
3638 vec_cond_expr = build3 (VEC_COND_EXPR, vectype,
3639 vec_compare, vec_then_clause, vec_else_clause);
3641 *vec_stmt = build2 (MODIFY_EXPR, vectype, vec_dest, vec_cond_expr);
3642 new_temp = make_ssa_name (vec_dest, *vec_stmt);
3643 TREE_OPERAND (*vec_stmt, 0) = new_temp;
3644 vect_finish_stmt_generation (stmt, *vec_stmt, bsi);
3649 /* Function vect_transform_stmt.
3651 Create a vectorized stmt to replace STMT, and insert it at BSI. */
/* Dispatches on STMT_VINFO_TYPE to the vectorizable_* worker recorded
   during analysis.  Returns whether STMT was a store (IS_STORE), so the
   caller knows to remove the scalar store.  Sets *STRIDED_STORE when
   STMT belongs to an interleaved store group.  */
3654 vect_transform_stmt (tree stmt, block_stmt_iterator *bsi, bool *strided_store)
3656 bool is_store = false;
3657 tree vec_stmt = NULL_TREE;
3658 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3659 tree orig_stmt_in_pattern;
3662 if (STMT_VINFO_RELEVANT_P (stmt_info))
3664 switch (STMT_VINFO_TYPE (stmt_info))
3666 case type_demotion_vec_info_type:
3667 done = vectorizable_type_demotion (stmt, bsi, &vec_stmt);
3671 case type_promotion_vec_info_type:
3672 done = vectorizable_type_promotion (stmt, bsi, &vec_stmt);
3676 case op_vec_info_type:
3677 done = vectorizable_operation (stmt, bsi, &vec_stmt);
3681 case assignment_vec_info_type:
3682 done = vectorizable_assignment (stmt, bsi, &vec_stmt);
3686 case load_vec_info_type:
3687 done = vectorizable_load (stmt, bsi, &vec_stmt);
3691 case store_vec_info_type:
3692 done = vectorizable_store (stmt, bsi, &vec_stmt);
3694 if (DR_GROUP_FIRST_DR (stmt_info))
3696 /* In case of interleaving, the whole chain is vectorized when the
3697 last store in the chain is reached. Store stmts before the last
3698 one are skipped, and their vec_stmt_info shouldn't be freed
3700 *strided_store = true;
3701 if (STMT_VINFO_VEC_STMT (stmt_info))
3708 case condition_vec_info_type:
3709 done = vectorizable_condition (stmt, bsi, &vec_stmt);
3714 if (vect_print_dump_info (REPORT_DETAILS))
3715 fprintf (vect_dump, "stmt not supported.");
3719 gcc_assert (vec_stmt || *strided_store);
3722 STMT_VINFO_VEC_STMT (stmt_info) = vec_stmt;
3723 orig_stmt_in_pattern = STMT_VINFO_RELATED_STMT (stmt_info);
3724 if (orig_stmt_in_pattern)
3726 stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt_in_pattern);
3727 if (STMT_VINFO_IN_PATTERN_P (stmt_vinfo))
3729 gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
3731 /* STMT was inserted by the vectorizer to replace a
3732 computation idiom. ORIG_STMT_IN_PATTERN is a stmt in the
3733 original sequence that computed this idiom. We need to
3734 record a pointer to VEC_STMT in the stmt_info of
3735 ORIG_STMT_IN_PATTERN. See more details in the
3736 documentation of vect_pattern_recog. */
3738 STMT_VINFO_VEC_STMT (stmt_vinfo) = vec_stmt;
/* Live stmts (values used after the loop) are handled separately:
   reductions get an epilog; other live stmts are checked only.  */
3744 if (STMT_VINFO_LIVE_P (stmt_info))
3746 switch (STMT_VINFO_TYPE (stmt_info))
3748 case reduc_vec_info_type:
3749 done = vectorizable_reduction (stmt, bsi, &vec_stmt);
3754 done = vectorizable_live_operation (stmt, bsi, &vec_stmt);
3763 /* This function builds ni_name = number of iterations loop executes
3764 on the loop preheader. */
/* Returns an SSA name/gimple value holding the (unshared) loop iteration
   count; any statements needed to compute it are inserted on the
   preheader edge.  */
3767 vect_build_loop_niters (loop_vec_info loop_vinfo)
3769 tree ni_name, stmt, var;
3771 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3772 tree ni = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo));
3774 var = create_tmp_var (TREE_TYPE (ni), "niters");
3775 add_referenced_var (var);
3776 ni_name = force_gimple_operand (ni, &stmt, false, var);
3778 pe = loop_preheader_edge (loop);
/* Inserting on the preheader edge must not split it into a new block.  */
3781 basic_block new_bb = bsi_insert_on_edge_immediate (pe, stmt);
3782 gcc_assert (!new_bb);
3789 /* This function generates the following statements:
3791 ni_name = number of iterations loop executes
3792 ratio = ni_name / vf
3793 ratio_mult_vf_name = ratio * vf
3795 and places them at the loop preheader edge. */
3798 vect_generate_tmps_on_preheader (loop_vec_info loop_vinfo,
3800 tree *ratio_mult_vf_name_ptr,
3801 tree *ratio_name_ptr)
3809 tree ratio_mult_vf_name;
3810 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3811 tree ni = LOOP_VINFO_NITERS (loop_vinfo);
3812 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3815 pe = loop_preheader_edge (loop);
3817 /* Generate temporary variable that contains
3818 number of iterations loop executes. */
3820 ni_name = vect_build_loop_niters (loop_vinfo);
/* VF is a power of two, so division/multiplication by VF are emitted
   as shifts by log2(vf).  */
3821 log_vf = build_int_cst (TREE_TYPE (ni), exact_log2 (vf));
3823 /* Create: ratio = ni >> log2(vf) */
3825 ratio_name = fold_build2 (RSHIFT_EXPR, TREE_TYPE (ni_name), ni_name, log_vf);
3826 if (!is_gimple_val (ratio_name))
3828 var = create_tmp_var (TREE_TYPE (ni), "bnd");
3829 add_referenced_var (var);
3831 ratio_name = force_gimple_operand (ratio_name, &stmt, true, var);
3832 pe = loop_preheader_edge (loop);
3833 new_bb = bsi_insert_on_edge_immediate (pe, stmt);
3834 gcc_assert (!new_bb);
3837 /* Create: ratio_mult_vf = ratio << log2 (vf). */
3839 ratio_mult_vf_name = fold_build2 (LSHIFT_EXPR, TREE_TYPE (ratio_name),
3840 ratio_name, log_vf);
3841 if (!is_gimple_val (ratio_mult_vf_name))
3843 var = create_tmp_var (TREE_TYPE (ni), "ratio_mult_vf");
3844 add_referenced_var (var);
3846 ratio_mult_vf_name = force_gimple_operand (ratio_mult_vf_name, &stmt,
3848 pe = loop_preheader_edge (loop);
3849 new_bb = bsi_insert_on_edge_immediate (pe, stmt);
3850 gcc_assert (!new_bb);
3853 *ni_name_ptr = ni_name;
3854 *ratio_mult_vf_name_ptr = ratio_mult_vf_name;
3855 *ratio_name_ptr = ratio_name;
3861 /* Function update_vuses_to_preheader.
3864 STMT - a statement with potential VUSEs.
3865 LOOP - the loop whose preheader will contain STMT.
3867 It's possible to vectorize a loop even though an SSA_NAME from a VUSE
3868 appears to be defined in a V_MAY_DEF in another statement in a loop.
3869 One such case is when the VUSE is at the dereference of a __restricted__
3870 pointer in a load and the V_MAY_DEF is at the dereference of a different
3871 __restricted__ pointer in a store. Vectorization may result in
3872 copy_virtual_uses being called to copy the problematic VUSE to a new
3873 statement that is being inserted in the loop preheader. This procedure
3874 is called to change the SSA_NAME in the new statement's VUSE from the
3875 SSA_NAME updated in the loop to the related SSA_NAME available on the
3876 path entering the loop.
3878 When this function is called, we have the following situation:
3883 # name1 = phi < name0 , name2>
3888 # name2 = vdef <name1>
3893 Stmt S1 was created in the loop preheader block as part of misaligned-load
3894 handling. This function fixes the name of the vuse of S1 from 'name1' to
3898 update_vuses_to_preheader (tree stmt, struct loop *loop)
3900 basic_block header_bb = loop->header;
3901 edge preheader_e = loop_preheader_edge (loop);
3903 use_operand_p use_p;
3905 FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_VUSE)
3907 tree ssa_name = USE_FROM_PTR (use_p);
3908 tree def_stmt = SSA_NAME_DEF_STMT (ssa_name);
3909 tree name_var = SSA_NAME_VAR (ssa_name);
3910 basic_block bb = bb_for_stmt (def_stmt);
3912 /* For a use before any definitions, def_stmt is a NOP_EXPR. */
3913 if (!IS_EMPTY_STMT (def_stmt)
3914 && flow_bb_inside_loop_p (loop, bb))
3916 /* If the block containing the statement defining the SSA_NAME
3917 is in the loop then it's necessary to find the definition
3918 outside the loop using the PHI nodes of the header. */
3920 bool updated = false;
3922 for (phi = phi_nodes (header_bb); phi; phi = TREE_CHAIN (phi))
3924 if (SSA_NAME_VAR (PHI_RESULT (phi)) == name_var)
/* Replace the in-loop name with the value flowing in from
   the preheader edge.  */
3926 SET_USE (use_p, PHI_ARG_DEF (phi, preheader_e->dest_idx));
/* Every in-loop VUSE must have a matching header PHI.  */
3931 gcc_assert (updated);
3937 /* Function vect_update_ivs_after_vectorizer.
3939 "Advance" the induction variables of LOOP to the value they should take
3940 after the execution of LOOP. This is currently necessary because the
3941 vectorizer does not handle induction variables that are used after the
3942 loop. Such a situation occurs when the last iterations of LOOP are
3944 1. We introduced new uses after LOOP for IVs that were not originally used
3945 after LOOP: the IVs of LOOP are now used by an epilog loop.
3946 2. LOOP is going to be vectorized; this means that it will iterate N/VF
3947 times, whereas the loop IVs should be bumped N times.
3950 - LOOP - a loop that is going to be vectorized. The last few iterations
3951 of LOOP were peeled.
3952 - NITERS - the number of iterations that LOOP executes (before it is
3953 vectorized). i.e, the number of times the ivs should be bumped.
3954 - UPDATE_E - a successor edge of LOOP->exit that is on the (only) path
3955 coming out from LOOP on which there are uses of the LOOP ivs
3956 (this is the path from LOOP->exit to epilog_loop->preheader).
3958 The new definitions of the ivs are placed in LOOP->exit.
3959 The phi args associated with the edge UPDATE_E in the bb
3960 UPDATE_E->dest are updated accordingly.
3962 Assumption 1: Like the rest of the vectorizer, this function assumes
3963 a single loop exit that has a single predecessor.
3965 Assumption 2: The phi nodes in the LOOP header and in update_bb are
3966 organized in the same order.
3968 Assumption 3: The access function of the ivs is simple enough (see
3969 vect_can_advance_ivs_p). This assumption will be relaxed in the future.
3971 Assumption 4: Exactly one of the successors of LOOP exit-bb is on a path
3972 coming out of LOOP on which the ivs of LOOP are used (this is the path
3973 that leads to the epilog loop; other paths skip the epilog loop). This
3974 path starts with the edge UPDATE_E, and its destination (denoted update_bb)
3975 needs to have its phis updated.
3979 vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo, tree niters,
3982 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3983 basic_block exit_bb = single_exit (loop)->dest;
3985 basic_block update_bb = update_e->dest;
3987 /* gcc_assert (vect_can_advance_ivs_p (loop_vinfo)); */
3989 /* Make sure there exists a single-predecessor exit bb: */
3990 gcc_assert (single_pred_p (exit_bb));
/* Walk the header PHIs and the update_bb PHIs in lock-step
   (Assumption 2: same order).  */
3992 for (phi = phi_nodes (loop->header), phi1 = phi_nodes (update_bb);
3994 phi = PHI_CHAIN (phi), phi1 = PHI_CHAIN (phi1))
3996 tree access_fn = NULL;
3997 tree evolution_part;
4000 tree var, stmt, ni, ni_name;
4001 block_stmt_iterator last_bsi;
4003 if (vect_print_dump_info (REPORT_DETAILS))
4005 fprintf (vect_dump, "vect_update_ivs_after_vectorizer: phi: ");
4006 print_generic_expr (vect_dump, phi, TDF_SLIM);
4009 /* Skip virtual phi's. */
4010 if (!is_gimple_reg (SSA_NAME_VAR (PHI_RESULT (phi))))
4012 if (vect_print_dump_info (REPORT_DETAILS))
4013 fprintf (vect_dump, "virtual phi. skip.");
4017 /* Skip reduction phis. */
4018 if (STMT_VINFO_DEF_TYPE (vinfo_for_stmt (phi)) == vect_reduction_def)
4020 if (vect_print_dump_info (REPORT_DETAILS))
4021 fprintf (vect_dump, "reduc phi. skip.");
4025 access_fn = analyze_scalar_evolution (loop, PHI_RESULT (phi));
4026 gcc_assert (access_fn);
4028 unshare_expr (evolution_part_in_loop_num (access_fn, loop->num));
4029 gcc_assert (evolution_part != NULL_TREE);
4031 /* FORNOW: We do not support IVs whose evolution function is a polynomial
4032 of degree >= 2 or exponential. */
4033 gcc_assert (!tree_is_chrec (evolution_part));
4035 step_expr = evolution_part;
4036 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn,
/* Advanced IV value: init + niters * step.  */
4039 ni = fold_build2 (PLUS_EXPR, TREE_TYPE (init_expr),
4040 fold_build2 (MULT_EXPR, TREE_TYPE (niters),
4041 niters, step_expr), init_expr);
4043 var = create_tmp_var (TREE_TYPE (init_expr), "tmp");
4044 add_referenced_var (var);
4046 ni_name = force_gimple_operand (ni, &stmt, false, var);
4048 /* Insert stmt into exit_bb. */
4049 last_bsi = bsi_last (exit_bb);
4051 bsi_insert_before (&last_bsi, stmt, BSI_SAME_STMT);
4053 /* Fix phi expressions in the successor bb. */
4054 SET_PHI_ARG_DEF (phi1, update_e->dest_idx, ni_name);
4059 /* Function vect_do_peeling_for_loop_bound
4061 Peel the last iterations of the loop represented by LOOP_VINFO.
4062 The peeled iterations form a new epilog loop. Given that the loop now
4063 iterates NITERS times, the new epilog loop iterates
4064 NITERS % VECTORIZATION_FACTOR times.
4066 The original loop will later be made to iterate
4067 NITERS / VECTORIZATION_FACTOR times (this value is placed into RATIO). */
4070 vect_do_peeling_for_loop_bound (loop_vec_info loop_vinfo, tree *ratio)
4072 tree ni_name, ratio_mult_vf_name;
4073 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4074 struct loop *new_loop;
4076 basic_block preheader;
4079 if (vect_print_dump_info (REPORT_DETAILS))
4080 fprintf (vect_dump, "=== vect_do_peeling_for_loop_bound ===");
4082 initialize_original_copy_tables ();
4084 /* Generate the following variables on the preheader of original loop:
4086 ni_name = number of iteration the original loop executes
4087 ratio = ni_name / vf
4088 ratio_mult_vf_name = ratio * vf */
4089 vect_generate_tmps_on_preheader (loop_vinfo, &ni_name,
4090 &ratio_mult_vf_name, ratio);
/* Peeling must keep the original loop's number intact.  */
4092 loop_num = loop->num;
4093 new_loop = slpeel_tree_peel_loop_to_edge (loop, single_exit (loop),
4094 ratio_mult_vf_name, ni_name, false);
4095 gcc_assert (new_loop);
4096 gcc_assert (loop_num == loop->num);
4097 #ifdef ENABLE_CHECKING
4098 slpeel_verify_cfg_after_peeling (loop, new_loop);
4101 /* A guard that controls whether the new_loop is to be executed or skipped
4102 is placed in LOOP->exit. LOOP->exit therefore has two successors - one
4103 is the preheader of NEW_LOOP, where the IVs from LOOP are used. The other
4104 is a bb after NEW_LOOP, where these IVs are not used. Find the edge that
4105 is on the path where the LOOP IVs are used and need to be updated. */
4107 preheader = loop_preheader_edge (new_loop)->src;
4108 if (EDGE_PRED (preheader, 0)->src == single_exit (loop)->dest)
4109 update_e = EDGE_PRED (preheader, 0);
4111 update_e = EDGE_PRED (preheader, 1);
4113 /* Update IVs of original loop as if they were advanced
4114 by ratio_mult_vf_name steps. */
4115 vect_update_ivs_after_vectorizer (loop_vinfo, ratio_mult_vf_name, update_e);
4117 /* After peeling we have to reset scalar evolution analyzer. */
4120 free_original_copy_tables ();
4124 /* Function vect_gen_niters_for_prolog_loop
4126 Set the number of iterations for the loop represented by LOOP_VINFO
4127 to the minimum between LOOP_NITERS (the original iteration count of the loop)
4128 and the misalignment of DR - the data reference recorded in
4129 LOOP_VINFO_UNALIGNED_DR (LOOP_VINFO). As a result, after the execution of
4130 this loop, the data reference DR will refer to an aligned location.
4132 The following computation is generated:
4134 If the misalignment of DR is known at compile time:
4135 addr_mis = int mis = DR_MISALIGNMENT (dr);
4136 Else, compute address misalignment in bytes:
4137 addr_mis = addr & (vectype_size - 1)
4139 prolog_niters = min ( LOOP_NITERS , (VF - addr_mis/elem_size)&(VF-1) )
4141 (elem_size = element type size; an element is the scalar element
4142 whose type is the inner type of the vectype)
4146 prolog_niters = min ( LOOP_NITERS ,
4147 (VF/group_size - addr_mis/elem_size)&(VF/group_size-1) )
4148 where group_size is the size of the interleaved group.
4152 vect_gen_niters_for_prolog_loop (loop_vec_info loop_vinfo, tree loop_niters)
4154 struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
4155 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
4156 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4158 tree iters, iters_name;
4161 tree dr_stmt = DR_STMT (dr);
4162 stmt_vec_info stmt_info = vinfo_for_stmt (dr_stmt);
4163 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4164 int vectype_align = TYPE_ALIGN (vectype) / BITS_PER_UNIT;
4165 tree niters_type = TREE_TYPE (loop_niters);
4167 int element_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
4169 if (DR_GROUP_FIRST_DR (stmt_info))
4171 /* For interleaved access element size must be multiplied by the size of
4172 the interleaved group. */
4173 group_size = DR_GROUP_SIZE (vinfo_for_stmt (
4174 DR_GROUP_FIRST_DR (stmt_info)));
4175 element_size *= group_size;
4178 pe = loop_preheader_edge (loop);
/* Compile-time-known misalignment: compute the prolog count as a
   constant.  */
4180 if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
4182 int byte_misalign = LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo);
4183 int elem_misalign = byte_misalign / element_size;
4185 if (vect_print_dump_info (REPORT_DETAILS))
4186 fprintf (vect_dump, "known alignment = %d.", byte_misalign);
/* NOTE(review): the header comment prescribes
   (VF/group_size - addr_mis/elem_size) & (VF/group_size - 1) for the
   interleaved case, but the minuend here is plain VF — confirm whether
   the mask alone makes this equivalent or this diverges from the
   documented formula.  */
4187 iters = build_int_cst (niters_type,
4188 (vf - elem_misalign)&(vf/group_size-1));
/* Runtime misalignment: generate code computing it from the address.  */
4192 tree new_stmts = NULL_TREE;
4194 vect_create_addr_base_for_vector_ref (dr_stmt, &new_stmts, NULL_TREE);
4195 tree ptr_type = TREE_TYPE (start_addr);
4196 tree size = TYPE_SIZE (ptr_type);
4197 tree type = lang_hooks.types.type_for_size (tree_low_cst (size, 1), 1);
4198 tree vectype_size_minus_1 = build_int_cst (type, vectype_align - 1);
4199 tree elem_size_log =
4200 build_int_cst (type, exact_log2 (vectype_align/vf));
4201 tree vf_minus_1 = build_int_cst (type, vf - 1);
4202 tree vf_tree = build_int_cst (type, vf);
4206 new_bb = bsi_insert_on_edge_immediate (pe, new_stmts);
4207 gcc_assert (!new_bb);
4209 /* Create: byte_misalign = addr & (vectype_size - 1) */
4211 fold_build2 (BIT_AND_EXPR, type, start_addr, vectype_size_minus_1);
4213 /* Create: elem_misalign = byte_misalign / element_size */
4215 fold_build2 (RSHIFT_EXPR, type, byte_misalign, elem_size_log);
4217 /* Create: (niters_type) (VF - elem_misalign)&(VF - 1) */
4218 iters = fold_build2 (MINUS_EXPR, type, vf_tree, elem_misalign);
4219 iters = fold_build2 (BIT_AND_EXPR, type, iters, vf_minus_1);
4220 iters = fold_convert (niters_type, iters);
4223 /* Create: prolog_loop_niters = min (iters, loop_niters) */
4224 /* If the loop bound is known at compile time we already verified that it is
4225 greater than vf; since the misalignment ('iters') is at most vf, there's
4226 no need to generate the MIN_EXPR in this case. */
4227 if (TREE_CODE (loop_niters) != INTEGER_CST)
4228 iters = fold_build2 (MIN_EXPR, niters_type, iters, loop_niters);
4230 if (vect_print_dump_info (REPORT_DETAILS))
4232 fprintf (vect_dump, "niters for prolog loop: ");
4233 print_generic_expr (vect_dump, iters, TDF_SLIM);
4236 var = create_tmp_var (niters_type, "prolog_loop_niters");
4237 add_referenced_var (var);
4238 iters_name = force_gimple_operand (iters, &stmt, false, var);
4240 /* Insert stmt on loop preheader edge. */
4243 basic_block new_bb = bsi_insert_on_edge_immediate (pe, stmt);
4244 gcc_assert (!new_bb);
4251 /* Function vect_update_init_of_dr
4253 NITERS iterations were peeled from LOOP. DR represents a data reference
4254 in LOOP. This function updates the information recorded in DR to
4255 account for the fact that the first NITERS iterations had already been
4256 executed. Specifically, it updates the OFFSET field of DR. */
4259 vect_update_init_of_dr (struct data_reference *dr, tree niters)
4261 tree offset = DR_OFFSET (dr);
/* Advance the offset by niters full steps: offset += niters * step.  */
4263 niters = fold_build2 (MULT_EXPR, TREE_TYPE (niters), niters, DR_STEP (dr));
4264 offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset), offset, niters);
4265 DR_OFFSET (dr) = offset;
4269 /* Function vect_update_inits_of_drs
4271 NITERS iterations were peeled from the loop represented by LOOP_VINFO.
4272 This function updates the information recorded for the data references in
4273 the loop to account for the fact that the first NITERS iterations had
4274 already been executed. Specifically, it updates the initial_condition of the
4275 access_function of all the data_references in the loop. */
4278 vect_update_inits_of_drs (loop_vec_info loop_vinfo, tree niters)
4281 VEC (data_reference_p, heap) *datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
4282 struct data_reference *dr;
/* NOTE(review): this guard differs from the vect_print_dump_info
   (REPORT_DETAILS) idiom used elsewhere in this file — confirm whether
   the inconsistency is intentional.  */
4284 if (vect_dump && (dump_flags & TDF_DETAILS))
4285 fprintf (vect_dump, "=== vect_update_inits_of_dr ===");
4287 for (i = 0; VEC_iterate (data_reference_p, datarefs, i, dr); i++)
4288 vect_update_init_of_dr (dr, niters);
4292 /* Function vect_do_peeling_for_alignment
4294 Peel the first 'niters' iterations of the loop represented by LOOP_VINFO.
4295 'niters' is set to the misalignment of one of the data references in the
4296 loop, thereby forcing it to refer to an aligned location at the beginning
4297 of the execution of this loop. The data reference for which we are
4298 peeling is recorded in LOOP_VINFO_UNALIGNED_DR. */
4301 vect_do_peeling_for_alignment (loop_vec_info loop_vinfo)
4303 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4304 tree niters_of_prolog_loop, ni_name;
4306 struct loop *new_loop;
4308 if (vect_print_dump_info (REPORT_DETAILS))
4309 fprintf (vect_dump, "=== vect_do_peeling_for_alignment ===");
4311 initialize_original_copy_tables ();
4313 ni_name = vect_build_loop_niters (loop_vinfo);
4314 niters_of_prolog_loop = vect_gen_niters_for_prolog_loop (loop_vinfo, ni_name);
4316 /* Peel the prolog loop and iterate it niters_of_prolog_loop. */
4318 slpeel_tree_peel_loop_to_edge (loop, loop_preheader_edge (loop),
4319 niters_of_prolog_loop, ni_name, true);
4320 gcc_assert (new_loop);
4321 #ifdef ENABLE_CHECKING
4322 slpeel_verify_cfg_after_peeling (new_loop, loop);
4325 /* Update number of times loop executes. */
4326 n_iters = LOOP_VINFO_NITERS (loop_vinfo);
4327 LOOP_VINFO_NITERS (loop_vinfo) = fold_build2 (MINUS_EXPR,
4328 TREE_TYPE (n_iters), n_iters, niters_of_prolog_loop);
4330 /* Update the init conditions of the access functions of all data refs. */
4331 vect_update_inits_of_drs (loop_vinfo, niters_of_prolog_loop);
4333 /* After peeling we have to reset scalar evolution analyzer. */
4336 free_original_copy_tables ();
4340 /* Function vect_create_cond_for_align_checks.
4342 Create a conditional expression that represents the alignment checks for
4343 all of data references (array element references) whose alignment must be
4347 LOOP_VINFO - two fields of the loop information are used.
4348 LOOP_VINFO_PTR_MASK is the mask used to check the alignment.
4349 LOOP_VINFO_MAY_MISALIGN_STMTS contains the refs to be checked.
4352 COND_EXPR_STMT_LIST - statements needed to construct the conditional
4354 The returned value is the conditional expression to be used in the if
4355 statement that controls which version of the loop gets executed at runtime.
4357 The algorithm makes two assumptions:
4358 1) The number of bytes "n" in a vector is a power of 2.
4359 2) An address "a" is aligned if a%n is zero and that this
4360 test can be done as a&(n-1) == 0. For example, for 16
4361 byte vectors the test is a&0xf == 0. */
4364 vect_create_cond_for_align_checks (loop_vec_info loop_vinfo,
4365 tree *cond_expr_stmt_list)
4367 VEC(tree,heap) *may_misalign_stmts
4368 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
4370 int mask = LOOP_VINFO_PTR_MASK (loop_vinfo);
4374 tree int_ptrsize_type;
4376 tree or_tmp_name = NULL_TREE;
4377 tree and_tmp, and_tmp_name, and_stmt;
4380 /* Check that mask is one less than a power of 2, i.e., mask is
4381 all zeros followed by all ones. */
4382 gcc_assert ((mask != 0) && ((mask & (mask+1)) == 0));
4384 /* CHECKME: what is the best integer or unsigned type to use to hold a
4385 cast from a pointer value? */
4386 psize = TYPE_SIZE (ptr_type_node);
4388 = lang_hooks.types.type_for_size (tree_low_cst (psize, 1), 0);
4390 /* Create expression (mask & (dr_1 || ... || dr_n)) where dr_i is the address
4391 of the first vector of the i'th data reference. */
4393 for (i = 0; VEC_iterate (tree, may_misalign_stmts, i, ref_stmt); i++)
4395 tree new_stmt_list = NULL_TREE;
4397 tree addr_tmp, addr_tmp_name, addr_stmt;
4398 tree or_tmp, new_or_tmp_name, or_stmt;
4400 /* create: addr_tmp = (int)(address_of_first_vector) */
4401 addr_base = vect_create_addr_base_for_vector_ref (ref_stmt,
4405 if (new_stmt_list != NULL_TREE)
4406 append_to_statement_list_force (new_stmt_list, cond_expr_stmt_list);
4408 sprintf (tmp_name, "%s%d", "addr2int", i);
4409 addr_tmp = create_tmp_var (int_ptrsize_type, tmp_name);
4410 add_referenced_var (addr_tmp);
4411 addr_tmp_name = make_ssa_name (addr_tmp, NULL_TREE);
4412 addr_stmt = fold_convert (int_ptrsize_type, addr_base);
4413 addr_stmt = build2 (MODIFY_EXPR, void_type_node,
4414 addr_tmp_name, addr_stmt);
4415 SSA_NAME_DEF_STMT (addr_tmp_name) = addr_stmt;
4416 append_to_statement_list_force (addr_stmt, cond_expr_stmt_list);
4418 /* The addresses are OR together. */
4420 if (or_tmp_name != NULL_TREE)
4422 /* create: or_tmp = or_tmp | addr_tmp */
4423 sprintf (tmp_name, "%s%d", "orptrs", i);
4424 or_tmp = create_tmp_var (int_ptrsize_type, tmp_name);
4425 add_referenced_var (or_tmp);
4426 new_or_tmp_name = make_ssa_name (or_tmp, NULL_TREE);
4427 or_stmt = build2 (MODIFY_EXPR, void_type_node, new_or_tmp_name,
4428 build2 (BIT_IOR_EXPR, int_ptrsize_type,
4431 SSA_NAME_DEF_STMT (new_or_tmp_name) = or_stmt;
4432 append_to_statement_list_force (or_stmt, cond_expr_stmt_list);
4433 or_tmp_name = new_or_tmp_name;
4436 or_tmp_name = addr_tmp_name;
4440 mask_cst = build_int_cst (int_ptrsize_type, mask);
4442 /* create: and_tmp = or_tmp & mask */
4443 and_tmp = create_tmp_var (int_ptrsize_type, "andmask" );
4444 add_referenced_var (and_tmp);
4445 and_tmp_name = make_ssa_name (and_tmp, NULL_TREE);
4447 and_stmt = build2 (MODIFY_EXPR, void_type_node,
4449 build2 (BIT_AND_EXPR, int_ptrsize_type,
4450 or_tmp_name, mask_cst));
4451 SSA_NAME_DEF_STMT (and_tmp_name) = and_stmt;
4452 append_to_statement_list_force (and_stmt, cond_expr_stmt_list);
4454 /* Make and_tmp the left operand of the conditional test against zero.
4455 if and_tmp has a nonzero bit then some address is unaligned. */
4456 ptrsize_zero = build_int_cst (int_ptrsize_type, 0);
4457 return build2 (EQ_EXPR, boolean_type_node,
4458 and_tmp_name, ptrsize_zero);
4462 /* Function vect_transform_loop.
4464 The analysis phase has determined that the loop is vectorizable.
4465 Vectorize the loop - created vectorized stmts to replace the scalar
4466 stmts in the loop, and update the loop exit condition. */
4469 vect_transform_loop (loop_vec_info loop_vinfo)
4471 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4472 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
4473 int nbbs = loop->num_nodes;
4474 block_stmt_iterator si;
4477 int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
4482 if (vect_print_dump_info (REPORT_DETAILS))
4483 fprintf (vect_dump, "=== vec_transform_loop ===");
4485 /* If the loop has data references that may or may not be aligned then
4486 two versions of the loop need to be generated, one which is vectorized
4487 and one which isn't. A test is then generated to control which of the
4488 loops is executed. The test checks for the alignment of all of the
4489 data references that may or may not be aligned. */
4491 if (VEC_length (tree, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo)))
4495 tree cond_expr_stmt_list = NULL_TREE;
4496 basic_block condition_bb;
4497 block_stmt_iterator cond_exp_bsi;
4498 basic_block merge_bb;
4499 basic_block new_exit_bb;
4501 tree orig_phi, new_phi, arg;
4503 cond_expr = vect_create_cond_for_align_checks (loop_vinfo,
4504 &cond_expr_stmt_list);
4505 initialize_original_copy_tables ();
4506 nloop = loop_version (loop, cond_expr, &condition_bb, true);
4507 free_original_copy_tables();
4509 /** Loop versioning violates an assumption we try to maintain during
4510 vectorization - that the loop exit block has a single predecessor.
4511 After versioning, the exit block of both loop versions is the same
4512 basic block (i.e. it has two predecessors). Just in order to simplify
4513 following transformations in the vectorizer, we fix this situation
4514 here by adding a new (empty) block on the exit-edge of the loop,
4515 with the proper loop-exit phis to maintain loop-closed-form. **/
4517 merge_bb = single_exit (loop)->dest;
4518 gcc_assert (EDGE_COUNT (merge_bb->preds) == 2);
4519 new_exit_bb = split_edge (single_exit (loop));
4520 new_exit_e = single_exit (loop);
4521 e = EDGE_SUCC (new_exit_bb, 0);
4523 for (orig_phi = phi_nodes (merge_bb); orig_phi;
4524 orig_phi = PHI_CHAIN (orig_phi))
4526 new_phi = create_phi_node (SSA_NAME_VAR (PHI_RESULT (orig_phi)),
4528 arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, e);
4529 add_phi_arg (new_phi, arg, new_exit_e);
4530 SET_PHI_ARG_DEF (orig_phi, e->dest_idx, PHI_RESULT (new_phi));
4533 /** end loop-exit-fixes after versioning **/
4535 update_ssa (TODO_update_ssa);
4536 cond_exp_bsi = bsi_last (condition_bb);
4537 bsi_insert_before (&cond_exp_bsi, cond_expr_stmt_list, BSI_SAME_STMT);
4540 /* CHECKME: we wouldn't need this if we called update_ssa once
4542 bitmap_zero (vect_vnames_to_rename);
4544 /* Peel the loop if there are data refs with unknown alignment.
4545 Only one data ref with unknown store is allowed. */
4547 if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
4548 vect_do_peeling_for_alignment (loop_vinfo);
4550 /* If the loop has a symbolic number of iterations 'n' (i.e. it's not a
4551 compile time constant), or it is a constant that doesn't divide by the
4552 vectorization factor, then an epilog loop needs to be created.
4553 We therefore duplicate the loop: the original loop will be vectorized,
4554 and will compute the first (n/VF) iterations. The second copy of the loop
4555 will remain scalar and will compute the remaining (n%VF) iterations.
4556 (VF is the vectorization factor). */
4558 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
4559 || (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
4560 && LOOP_VINFO_INT_NITERS (loop_vinfo) % vectorization_factor != 0))
4561 vect_do_peeling_for_loop_bound (loop_vinfo, &ratio);
4563 ratio = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
4564 LOOP_VINFO_INT_NITERS (loop_vinfo) / vectorization_factor);
4566 /* 1) Make sure the loop header has exactly two entries
4567 2) Make sure we have a preheader basic block. */
4569 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
4571 split_edge (loop_preheader_edge (loop));
4573 /* FORNOW: the vectorizer supports only loops which body consist
4574 of one basic block (header + empty latch). When the vectorizer will
4575 support more involved loop forms, the order by which the BBs are
4576 traversed need to be reconsidered. */
4578 for (i = 0; i < nbbs; i++)
4580 basic_block bb = bbs[i];
4582 for (si = bsi_start (bb); !bsi_end_p (si);)
4584 tree stmt = bsi_stmt (si);
4585 stmt_vec_info stmt_info;
4588 if (vect_print_dump_info (REPORT_DETAILS))
4590 fprintf (vect_dump, "------>vectorizing statement: ");
4591 print_generic_expr (vect_dump, stmt, TDF_SLIM);
4593 stmt_info = vinfo_for_stmt (stmt);
4594 gcc_assert (stmt_info);
4595 if (!STMT_VINFO_RELEVANT_P (stmt_info)
4596 && !STMT_VINFO_LIVE_P (stmt_info))
4602 if ((TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
4603 != (unsigned HOST_WIDE_INT) vectorization_factor)
4604 && vect_print_dump_info (REPORT_DETAILS))
4605 fprintf (vect_dump, "multiple-types.");
4607 /* -------- vectorize statement ------------ */
4608 if (vect_print_dump_info (REPORT_DETAILS))
4609 fprintf (vect_dump, "transform statement.");
4611 strided_store = false;
4612 is_store = vect_transform_stmt (stmt, &si, &strided_store);
4616 if (DR_GROUP_FIRST_DR (stmt_info))
4618 /* Interleaving. If IS_STORE is TRUE, the vectorization of the
4619 interleaving chain was completed - free all the stores in
4621 tree next = DR_GROUP_FIRST_DR (stmt_info);
4623 stmt_vec_info next_stmt_info;
4627 next_stmt_info = vinfo_for_stmt (next);
4628 /* Free the attached stmt_vec_info and remove the stmt. */
4629 ann = stmt_ann (next);
4630 tmp = DR_GROUP_NEXT_DR (next_stmt_info);
4631 free (next_stmt_info);
4632 set_stmt_info (ann, NULL);
4635 bsi_remove (&si, true);
4640 /* Free the attached stmt_vec_info and remove the stmt. */
4641 ann = stmt_ann (stmt);
4643 set_stmt_info (ann, NULL);
4644 bsi_remove (&si, true);
4652 /* This is case of skipped interleaved store. We don't free
4653 its stmt_vec_info. */
4654 bsi_remove (&si, true);
4662 slpeel_make_loop_iterate_ntimes (loop, ratio);
4664 EXECUTE_IF_SET_IN_BITMAP (vect_vnames_to_rename, 0, j, bi)
4665 mark_sym_for_renaming (SSA_NAME_VAR (ssa_name (j)));
4667 /* The memory tags and pointers in vectorized statements need to
4668 have their SSA forms updated. FIXME, why can't this be delayed
4669 until all the loops have been transformed? */
4670 update_ssa (TODO_update_ssa);
4672 if (vect_print_dump_info (REPORT_VECTORIZED_LOOPS))
4673 fprintf (vect_dump, "LOOP VECTORIZED.");