1 /* Transformation Utilities for Loop Vectorization.
2 Copyright (C) 2003,2004,2005,2006 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
5 This file is part of GCC.
7 GCC is free software; you can redistribute it and/or modify it under
8 the terms of the GNU General Public License as published by the Free
Software Foundation; either version 2, or (at your option) any later
version.
12 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
13 WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING. If not, write to the Free
Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA
02110-1301, USA.  */
24 #include "coretypes.h"
30 #include "basic-block.h"
31 #include "diagnostic.h"
32 #include "tree-flow.h"
33 #include "tree-dump.h"
39 #include "tree-data-ref.h"
40 #include "tree-chrec.h"
41 #include "tree-scalar-evolution.h"
42 #include "tree-vectorizer.h"
43 #include "langhooks.h"
44 #include "tree-pass.h"
48 /* Utility functions for the code transformation. */
49 static bool vect_transform_stmt (tree, block_stmt_iterator *, bool *);
50 static tree vect_create_destination_var (tree, tree);
51 static tree vect_create_data_ref_ptr
52 (tree, block_stmt_iterator *, tree, tree *, tree *, bool, tree);
53 static tree vect_create_addr_base_for_vector_ref (tree, tree *, tree);
54 static tree vect_setup_realignment (tree, block_stmt_iterator *, tree *);
55 static tree vect_get_new_vect_var (tree, enum vect_var_kind, const char *);
56 static tree vect_get_vec_def_for_operand (tree, tree, tree *);
57 static tree vect_init_vector (tree, tree, tree);
58 static void vect_finish_stmt_generation
59 (tree stmt, tree vec_stmt, block_stmt_iterator *bsi);
60 static bool vect_is_simple_cond (tree, loop_vec_info);
61 static void update_vuses_to_preheader (tree, struct loop*);
62 static void vect_create_epilog_for_reduction (tree, tree, enum tree_code, tree);
63 static tree get_initial_def_for_reduction (tree, tree, tree *);
/* Utility functions dealing with loop peeling (not peeling itself). */
66 static void vect_generate_tmps_on_preheader
67 (loop_vec_info, tree *, tree *, tree *);
68 static tree vect_build_loop_niters (loop_vec_info);
69 static void vect_update_ivs_after_vectorizer (loop_vec_info, tree, edge);
70 static tree vect_gen_niters_for_prolog_loop (loop_vec_info, tree);
71 static void vect_update_init_of_dr (struct data_reference *, tree niters);
72 static void vect_update_inits_of_drs (loop_vec_info, tree);
73 static int vect_min_worthwhile_factor (enum tree_code);
76 /* Function vect_get_new_vect_var.
Returns a name for a new variable.  The current naming scheme uses the
prefix "vect_" or "vect_p" (depending on the value of VAR_KIND) for
vectorizer-generated variables, and appends NAME to the prefix if NAME is
provided.  */
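/* For example (illustration only): with VAR_KIND == vect_pointer_var and
   NAME == "a", the concat below would produce the variable name "vect_pa";
   with VAR_KIND == vect_simple_var it would produce "vect_a".  */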
84 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
97 case vect_pointer_var:
if (name)
  new_vect_var = create_tmp_var (type, concat (prefix, name, NULL));
else
  new_vect_var = create_tmp_var (type, prefix);
113 /* Function vect_create_addr_base_for_vector_ref.
115 Create an expression that computes the address of the first memory location
116 that will be accessed for a data reference.
119 STMT: The statement containing the data reference.
120 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
OFFSET: Optional.  If supplied, it is to be added to the initial address.
124 1. Return an SSA_NAME whose value is the address of the memory location of
125 the first vector of the data reference.
2. If NEW_STMT_LIST is not NULL_TREE after return, then the caller must insert
   these statements, which define the returned SSA_NAME.
129 FORNOW: We are only handling array accesses with step 1. */
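/* For example (a sketch, not actual compiler output): for a data-ref 'a[i]'
   with DR_BASE_ADDRESS == &a and DR_INIT == init, and no OFFSET, the
   statements appended to NEW_STMT_LIST would compute roughly:

        base_off.0 = init;
        addr_base.1 = (vectype *) (&a + base_off.0);

   and the SSA_NAME 'addr_base.1' would be returned.  */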
132 vect_create_addr_base_for_vector_ref (tree stmt,
136 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
137 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
138 tree data_ref_base = unshare_expr (DR_BASE_ADDRESS (dr));
139 tree base_name = build_fold_indirect_ref (data_ref_base);
141 tree addr_base, addr_expr;
143 tree base_offset = unshare_expr (DR_OFFSET (dr));
144 tree init = unshare_expr (DR_INIT (dr));
145 tree vect_ptr_type, addr_expr2;
147 /* Create base_offset */
148 base_offset = size_binop (PLUS_EXPR, base_offset, init);
149 dest = create_tmp_var (TREE_TYPE (base_offset), "base_off");
150 add_referenced_var (dest);
151 base_offset = force_gimple_operand (base_offset, &new_stmt, false, dest);
152 append_to_statement_list_force (new_stmt, new_stmt_list);
156 tree tmp = create_tmp_var (TREE_TYPE (base_offset), "offset");
/* For interleaved accesses we divide STEP by the size of the
   interleaving group.  */
if (DR_GROUP_SIZE (stmt_info))
  step = fold_build2 (TRUNC_DIV_EXPR, TREE_TYPE (offset), DR_STEP (dr),
                      build_int_cst (TREE_TYPE (offset),
                                     DR_GROUP_SIZE (stmt_info)));
else
  step = DR_STEP (dr);
168 add_referenced_var (tmp);
169 offset = fold_build2 (MULT_EXPR, TREE_TYPE (offset), offset, step);
170 base_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (base_offset),
171 base_offset, offset);
172 base_offset = force_gimple_operand (base_offset, &new_stmt, false, tmp);
173 append_to_statement_list_force (new_stmt, new_stmt_list);
176 /* base + base_offset */
addr_base = fold_build2 (PLUS_EXPR, TREE_TYPE (data_ref_base), data_ref_base,
                         base_offset);
180 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
182 /* addr_expr = addr_base */
183 addr_expr = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var,
184 get_name (base_name));
185 add_referenced_var (addr_expr);
186 vec_stmt = fold_convert (vect_ptr_type, addr_base);
187 addr_expr2 = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var,
188 get_name (base_name));
189 add_referenced_var (addr_expr2);
190 vec_stmt = force_gimple_operand (vec_stmt, &new_stmt, false, addr_expr2);
191 append_to_statement_list_force (new_stmt, new_stmt_list);
193 if (vect_print_dump_info (REPORT_DETAILS))
195 fprintf (vect_dump, "created ");
196 print_generic_expr (vect_dump, vec_stmt, TDF_SLIM);
202 /* Function vect_create_data_ref_ptr.
204 Create a new pointer to vector type (vp), that points to the first location
205 accessed in the loop by STMT, along with the def-use update chain to
206 appropriately advance the pointer through the loop iterations. Also set
207 aliasing information for the pointer. This vector pointer is used by the
208 callers to this function to create a memory reference expression for vector
212 1. STMT: a stmt that references memory. Expected to be of the form
213 GIMPLE_MODIFY_STMT <name, data-ref> or
214 GIMPLE_MODIFY_STMT <data-ref, name>.
215 2. BSI: block_stmt_iterator where new stmts can be added.
216 3. OFFSET (optional): an offset to be added to the initial address accessed
217 by the data-ref in STMT.
4. ONLY_INIT: indicates whether vp is to be updated inside the loop, or
   remain pointing to the initial address.
220 5. TYPE: if not NULL indicates the required type of the data-ref
223 1. Declare a new ptr to vector_type, and have it point to the base of the
data reference (the initial address accessed by the data reference).
225 For example, for vector of type V8HI, the following code is generated:
228 vp = (v8hi *)initial_address;
230 if OFFSET is not supplied:
231 initial_address = &a[init];
232 if OFFSET is supplied:
233 initial_address = &a[init + OFFSET];
235 Return the initial_address in INITIAL_ADDRESS.
237 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
238 update the pointer in each iteration of the loop.
240 Return the increment stmt that updates the pointer in PTR_INCR.
242 3. Return the pointer. */
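/* A typical call (an illustrative sketch; the surrounding declarations are
   assumed) from a load/store vectorizer:

        tree dummy, ptr_incr;
        tree dataref_ptr
          = vect_create_data_ref_ptr (stmt, bsi, NULL_TREE, &dummy,
                                      &ptr_incr, false, NULL_TREE);

   after which the vector memory reference can be built from DATAREF_PTR.  */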
245 vect_create_data_ref_ptr (tree stmt,
246 block_stmt_iterator *bsi ATTRIBUTE_UNUSED,
247 tree offset, tree *initial_address, tree *ptr_incr,
248 bool only_init, tree type)
251 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
252 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
253 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
254 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
260 tree new_stmt_list = NULL_TREE;
261 edge pe = loop_preheader_edge (loop);
264 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
266 base_name = build_fold_indirect_ref (unshare_expr (DR_BASE_ADDRESS (dr)));
268 if (vect_print_dump_info (REPORT_DETAILS))
270 tree data_ref_base = base_name;
271 fprintf (vect_dump, "create vector-pointer variable to type: ");
272 print_generic_expr (vect_dump, vectype, TDF_SLIM);
273 if (TREE_CODE (data_ref_base) == VAR_DECL)
274 fprintf (vect_dump, " vectorizing a one dimensional array ref: ");
275 else if (TREE_CODE (data_ref_base) == ARRAY_REF)
276 fprintf (vect_dump, " vectorizing a multidimensional array ref: ");
277 else if (TREE_CODE (data_ref_base) == COMPONENT_REF)
278 fprintf (vect_dump, " vectorizing a record based array ref: ");
279 else if (TREE_CODE (data_ref_base) == SSA_NAME)
280 fprintf (vect_dump, " vectorizing a pointer ref: ");
281 print_generic_expr (vect_dump, base_name, TDF_SLIM);
284 /** (1) Create the new vector-pointer variable: **/
if (type)
  vect_ptr_type = build_pointer_type (type);
else
  vect_ptr_type = build_pointer_type (vectype);
289 vect_ptr = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var,
290 get_name (base_name));
291 add_referenced_var (vect_ptr);
293 /** (2) Add aliasing information to the new vector-pointer:
294 (The points-to info (DR_PTR_INFO) may be defined later.) **/
296 tag = DR_MEMTAG (dr);
/* If tag is a variable (and NOT_A_TAG), then a new symbol memory
   tag must be created, with tag added to its may-alias list.  */
if (!MTAG_P (tag))
  new_type_alias (vect_ptr, tag, DR_REF (dr));
else
  set_symbol_mem_tag (vect_ptr, tag);
306 var_ann (vect_ptr)->subvars = DR_SUBVARS (dr);
/** (3) Calculate the initial address of the vector-pointer, and set
        the vector-pointer to point to it before the loop: **/
/* Create: &(base[init_val+offset]) in the loop preheader. */
new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
                                                 offset);
314 pe = loop_preheader_edge (loop);
315 new_bb = bsi_insert_on_edge_immediate (pe, new_stmt_list);
316 gcc_assert (!new_bb);
317 *initial_address = new_temp;
319 /* Create: p = (vectype *) initial_base */
320 vec_stmt = fold_convert (vect_ptr_type, new_temp);
321 vec_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node, vect_ptr, vec_stmt);
322 vect_ptr_init = make_ssa_name (vect_ptr, vec_stmt);
323 GIMPLE_STMT_OPERAND (vec_stmt, 0) = vect_ptr_init;
324 new_bb = bsi_insert_on_edge_immediate (pe, vec_stmt);
325 gcc_assert (!new_bb);
328 /** (4) Handle the updating of the vector-pointer inside the loop: **/
330 if (only_init) /* No update in loop is required. */
332 /* Copy the points-to information if it exists. */
333 if (DR_PTR_INFO (dr))
334 duplicate_ssa_name_ptr_info (vect_ptr_init, DR_PTR_INFO (dr));
335 return vect_ptr_init;
339 block_stmt_iterator incr_bsi;
341 tree indx_before_incr, indx_after_incr;
344 standard_iv_increment_position (loop, &incr_bsi, &insert_after);
345 create_iv (vect_ptr_init,
346 fold_convert (vect_ptr_type, TYPE_SIZE_UNIT (vectype)),
347 NULL_TREE, loop, &incr_bsi, insert_after,
348 &indx_before_incr, &indx_after_incr);
349 incr = bsi_stmt (incr_bsi);
350 set_stmt_info (stmt_ann (incr),
351 new_stmt_vec_info (incr, loop_vinfo));
353 /* Copy the points-to information if it exists. */
354 if (DR_PTR_INFO (dr))
356 duplicate_ssa_name_ptr_info (indx_before_incr, DR_PTR_INFO (dr));
357 duplicate_ssa_name_ptr_info (indx_after_incr, DR_PTR_INFO (dr));
359 merge_alias_info (vect_ptr_init, indx_before_incr);
360 merge_alias_info (vect_ptr_init, indx_after_incr);
364 return indx_before_incr;
/* Function bump_vector_ptr.
371 Increment a pointer (to a vector type) by vector-size. Connect the new
372 increment stmt to the existing def-use update-chain of the pointer.
374 The pointer def-use update-chain before this function:
375 DATAREF_PTR = phi (p_0, p_2)
377 PTR_INCR: p_2 = DATAREF_PTR + step
379 The pointer def-use update-chain after this function:
380 DATAREF_PTR = phi (p_0, p_2)
382 NEW_DATAREF_PTR = DATAREF_PTR + vector_size
384 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
387 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
389 PTR_INCR - the stmt that updates the pointer in each iteration of the loop.
The increment amount across iterations is also expected to be
vector_size.
392 BSI - location where the new update stmt is to be placed.
393 STMT - the original scalar memory-access stmt that is being vectorized.
395 Output: Return NEW_DATAREF_PTR as illustrated above.
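/* For example (sketch): when vectorizing a group of two interleaved loads,
   the caller can obtain the address for the second vector load of the same
   iteration with

        ptr2 = bump_vector_ptr (dataref_ptr, ptr_incr, bsi, stmt);

   so that ptr2 == dataref_ptr + sizeof (vectype), while PTR_INCR is rewired
   as illustrated above to keep advancing by the full step across
   iterations.  */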
400 bump_vector_ptr (tree dataref_ptr, tree ptr_incr, block_stmt_iterator *bsi,
403 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
404 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
405 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
406 tree vptr_type = TREE_TYPE (dataref_ptr);
407 tree ptr_var = SSA_NAME_VAR (dataref_ptr);
408 tree update = fold_convert (vptr_type, TYPE_SIZE_UNIT (vectype));
412 tree new_dataref_ptr;
414 incr_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node, ptr_var,
415 build2 (PLUS_EXPR, vptr_type, dataref_ptr, update));
416 new_dataref_ptr = make_ssa_name (ptr_var, incr_stmt);
417 GIMPLE_STMT_OPERAND (incr_stmt, 0) = new_dataref_ptr;
418 vect_finish_stmt_generation (stmt, incr_stmt, bsi);
420 /* Update the vector-pointer's cross-iteration increment. */
421 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
423 tree use = USE_FROM_PTR (use_p);
425 if (use == dataref_ptr)
426 SET_USE (use_p, new_dataref_ptr);
else
  gcc_assert (tree_int_cst_compare (use, update) == 0);
431 /* Copy the points-to information if it exists. */
432 if (DR_PTR_INFO (dr))
433 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
434 merge_alias_info (new_dataref_ptr, dataref_ptr);
436 return new_dataref_ptr;
440 /* Function vect_create_destination_var.
442 Create a new temporary of type VECTYPE. */
445 vect_create_destination_var (tree scalar_dest, tree vectype)
448 const char *new_name;
450 enum vect_var_kind kind;
452 kind = vectype ? vect_simple_var : vect_scalar_var;
453 type = vectype ? vectype : TREE_TYPE (scalar_dest);
455 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
457 new_name = get_name (scalar_dest);
vec_dest = vect_get_new_vect_var (type, kind, new_name);
461 add_referenced_var (vec_dest);
467 /* Function vect_init_vector.
469 Insert a new stmt (INIT_STMT) that initializes a new vector variable with
470 the vector elements of VECTOR_VAR. Return the DEF of INIT_STMT. It will be
471 used in the vectorization of STMT. */
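/* For example (illustration): if VECTOR_VAR is the constant vector
   {3,3,3,3}, the loop preheader gains

        cst_.5 = {3,3,3,3};

   and the SSA name 'cst_.5' is the returned def (the "_.5" suffix is
   hypothetical).  */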
474 vect_init_vector (tree stmt, tree vector_var, tree vector_type)
476 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
477 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
478 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
486 new_var = vect_get_new_vect_var (vector_type, vect_simple_var, "cst_");
487 add_referenced_var (new_var);
489 init_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node, new_var, vector_var);
490 new_temp = make_ssa_name (new_var, init_stmt);
491 GIMPLE_STMT_OPERAND (init_stmt, 0) = new_temp;
493 pe = loop_preheader_edge (loop);
494 new_bb = bsi_insert_on_edge_immediate (pe, init_stmt);
495 gcc_assert (!new_bb);
497 if (vect_print_dump_info (REPORT_DETAILS))
499 fprintf (vect_dump, "created new init_stmt: ");
500 print_generic_expr (vect_dump, init_stmt, TDF_SLIM);
503 vec_oprnd = GIMPLE_STMT_OPERAND (init_stmt, 0);
508 /* Function vect_get_vec_def_for_operand.
510 OP is an operand in STMT. This function returns a (vector) def that will be
511 used in the vectorized stmt for STMT.
513 In the case that OP is an SSA_NAME which is defined in the loop, then
514 STMT_VINFO_VEC_STMT of the defining stmt holds the relevant def.
516 In case OP is an invariant or constant, a new stmt that creates a vector def
517 needs to be introduced. */
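/* For example (a sketch): when vectorizing 'x_1 = y_2 + 3' with a V4SI
   vectype, the constant operand '3' yields a new vector def {3,3,3,3}
   built by vect_init_vector in the preheader, whereas the loop-defined
   operand 'y_2' yields the def of STMT_VINFO_VEC_STMT of y_2's defining
   stmt.  */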
520 vect_get_vec_def_for_operand (tree op, tree stmt, tree *scalar_def)
525 stmt_vec_info def_stmt_info = NULL;
526 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
527 tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
528 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
529 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
530 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
536 enum vect_def_type dt;
540 if (vect_print_dump_info (REPORT_DETAILS))
542 fprintf (vect_dump, "vect_get_vec_def_for_operand: ");
543 print_generic_expr (vect_dump, op, TDF_SLIM);
546 is_simple_use = vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt);
547 gcc_assert (is_simple_use);
548 if (vect_print_dump_info (REPORT_DETAILS))
552 fprintf (vect_dump, "def = ");
553 print_generic_expr (vect_dump, def, TDF_SLIM);
557 fprintf (vect_dump, " def_stmt = ");
558 print_generic_expr (vect_dump, def_stmt, TDF_SLIM);
564 /* Case 1: operand is a constant. */
565 case vect_constant_def:
570 /* Create 'vect_cst_ = {cst,cst,...,cst}' */
571 if (vect_print_dump_info (REPORT_DETAILS))
572 fprintf (vect_dump, "Create vector_cst. nunits = %d", nunits);
574 for (i = nunits - 1; i >= 0; --i)
576 t = tree_cons (NULL_TREE, op, t);
578 vector_type = get_vectype_for_scalar_type (TREE_TYPE (op));
579 vec_cst = build_vector (vector_type, t);
581 return vect_init_vector (stmt, vec_cst, vector_type);
584 /* Case 2: operand is defined outside the loop - loop invariant. */
585 case vect_invariant_def:
590 /* Create 'vec_inv = {inv,inv,..,inv}' */
591 if (vect_print_dump_info (REPORT_DETAILS))
592 fprintf (vect_dump, "Create vector_inv.");
594 for (i = nunits - 1; i >= 0; --i)
596 t = tree_cons (NULL_TREE, def, t);
599 /* FIXME: use build_constructor directly. */
600 vector_type = get_vectype_for_scalar_type (TREE_TYPE (def));
601 vec_inv = build_constructor_from_list (vector_type, t);
602 return vect_init_vector (stmt, vec_inv, vector_type);
605 /* Case 3: operand is defined inside the loop. */
case vect_loop_def:
  if (scalar_def)
    *scalar_def = def_stmt;
611 /* Get the def from the vectorized stmt. */
612 def_stmt_info = vinfo_for_stmt (def_stmt);
613 vec_stmt = STMT_VINFO_VEC_STMT (def_stmt_info);
614 gcc_assert (vec_stmt);
615 vec_oprnd = GIMPLE_STMT_OPERAND (vec_stmt, 0);
619 /* Case 4: operand is defined by a loop header phi - reduction */
620 case vect_reduction_def:
622 gcc_assert (TREE_CODE (def_stmt) == PHI_NODE);
624 /* Get the def before the loop */
625 op = PHI_ARG_DEF_FROM_EDGE (def_stmt, loop_preheader_edge (loop));
626 return get_initial_def_for_reduction (stmt, op, scalar_def);
629 /* Case 5: operand is defined by loop-header phi - induction. */
630 case vect_induction_def:
632 if (vect_print_dump_info (REPORT_DETAILS))
633 fprintf (vect_dump, "induction - unsupported.");
634 internal_error ("no support for induction"); /* FORNOW */
643 /* Function vect_get_vec_def_for_stmt_copy
645 Return a vector-def for an operand. This function is used when the
646 vectorized stmt to be created (by the caller to this function) is a "copy"
647 created in case the vectorized result cannot fit in one vector, and several
648 copies of the vector-stmt are required. In this case the vector-def is
649 retrieved from the vector stmt recorded in the STMT_VINFO_RELATED_STMT field
650 of the stmt that defines VEC_OPRND.
651 DT is the type of the vector def VEC_OPRND.
654 In case the vectorization factor (VF) is bigger than the number
655 of elements that can fit in a vectype (nunits), we have to generate
656 more than one vector stmt to vectorize the scalar stmt. This situation
657 arises when there are multiple data-types operated upon in the loop; the
658 smallest data-type determines the VF, and as a result, when vectorizing
659 stmts operating on wider types we need to create 'VF/nunits' "copies" of the
660 vector stmt (each computing a vector of 'nunits' results, and together
661 computing 'VF' results in each iteration). This function is called when
662 vectorizing such a stmt (e.g. vectorizing S2 in the illustration below, in
which VF=16 and nunits=4, so the number of copies required is 4):
665 scalar stmt: vectorized into: STMT_VINFO_RELATED_STMT
667 S1: x = load VS1.0: vx.0 = memref0 VS1.1
668 VS1.1: vx.1 = memref1 VS1.2
669 VS1.2: vx.2 = memref2 VS1.3
670 VS1.3: vx.3 = memref3
672 S2: z = x + ... VSnew.0: vz0 = vx.0 + ... VSnew.1
673 VSnew.1: vz1 = vx.1 + ... VSnew.2
674 VSnew.2: vz2 = vx.2 + ... VSnew.3
675 VSnew.3: vz3 = vx.3 + ...
677 The vectorization of S1 is explained in vectorizable_load.
678 The vectorization of S2:
679 To create the first vector-stmt out of the 4 copies - VSnew.0 -
680 the function 'vect_get_vec_def_for_operand' is called to
681 get the relevant vector-def for each operand of S2. For operand x it
682 returns the vector-def 'vx.0'.
684 To create the remaining copies of the vector-stmt (VSnew.j), this
685 function is called to get the relevant vector-def for each operand. It is
686 obtained from the respective VS1.j stmt, which is recorded in the
687 STMT_VINFO_RELATED_STMT field of the stmt that defines VEC_OPRND.
689 For example, to obtain the vector-def 'vx.1' in order to create the
690 vector stmt 'VSnew.1', this function is called with VEC_OPRND='vx.0'.
Given 'vx.0' we obtain the stmt that defines it ('VS1.0'); from the
692 STMT_VINFO_RELATED_STMT field of 'VS1.0' we obtain the next copy - 'VS1.1',
693 and return its def ('vx.1').
694 Overall, to create the above sequence this function will be called 3 times:
695 vx.1 = vect_get_vec_def_for_stmt_copy (dt, vx.0);
696 vx.2 = vect_get_vec_def_for_stmt_copy (dt, vx.1);
697 vx.3 = vect_get_vec_def_for_stmt_copy (dt, vx.2); */
700 vect_get_vec_def_for_stmt_copy (enum vect_def_type dt, tree vec_oprnd)
702 tree vec_stmt_for_operand;
703 stmt_vec_info def_stmt_info;
705 if (dt == vect_invariant_def || dt == vect_constant_def)
/* Nothing to do; the same def can be reused.  */
return vec_oprnd;
711 vec_stmt_for_operand = SSA_NAME_DEF_STMT (vec_oprnd);
712 def_stmt_info = vinfo_for_stmt (vec_stmt_for_operand);
713 gcc_assert (def_stmt_info);
714 vec_stmt_for_operand = STMT_VINFO_RELATED_STMT (def_stmt_info);
715 gcc_assert (vec_stmt_for_operand);
vec_oprnd = GIMPLE_STMT_OPERAND (vec_stmt_for_operand, 0);
return vec_oprnd;
722 /* Function vect_finish_stmt_generation.
724 Insert a new stmt. */
727 vect_finish_stmt_generation (tree stmt, tree vec_stmt,
728 block_stmt_iterator *bsi)
730 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
731 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
733 bsi_insert_before (bsi, vec_stmt, BSI_SAME_STMT);
734 set_stmt_info (get_stmt_ann (vec_stmt),
735 new_stmt_vec_info (vec_stmt, loop_vinfo));
737 if (vect_print_dump_info (REPORT_DETAILS))
739 fprintf (vect_dump, "add new stmt: ");
740 print_generic_expr (vect_dump, vec_stmt, TDF_SLIM);
743 /* Make sure bsi points to the stmt that is being vectorized. */
744 gcc_assert (stmt == bsi_stmt (*bsi));
746 #ifdef USE_MAPPED_LOCATION
747 SET_EXPR_LOCATION (vec_stmt, EXPR_LOCATION (stmt));
749 SET_EXPR_LOCUS (vec_stmt, EXPR_LOCUS (stmt));
754 #define ADJUST_IN_EPILOG 1
756 /* Function get_initial_def_for_reduction
759 STMT - a stmt that performs a reduction operation in the loop.
760 INIT_VAL - the initial value of the reduction variable
763 SCALAR_DEF - a tree that holds a value to be added to the final result
764 of the reduction (used for "ADJUST_IN_EPILOG" - see below).
765 Return a vector variable, initialized according to the operation that STMT
766 performs. This vector will be used as the initial value of the
767 vector of partial results.
Option1 ("ADJUST_IN_EPILOG"): Initialize the vector as follows:
  add:        [0,0,...,0,0]
  mult:       [1,1,...,1,1]
  min/max:    [init_val,init_val,..,init_val,init_val]
  bit and/or: [init_val,init_val,..,init_val,init_val]
774 and when necessary (e.g. add/mult case) let the caller know
775 that it needs to adjust the result by init_val.
777 Option2: Initialize the vector as follows:
778 add: [0,0,...,0,init_val]
779 mult: [1,1,...,1,init_val]
780 min/max: [init_val,init_val,...,init_val]
781 bit and/or: [init_val,init_val,...,init_val]
782 and no adjustments are needed.
784 For example, for the following code:
790 STMT is 's = s + a[i]', and the reduction variable is 's'.
791 For a vector of 4 units, we want to return either [0,0,0,init_val],
792 or [0,0,0,0] and let the caller know that it needs to adjust
793 the result at the end by 'init_val'.
795 FORNOW: We use the "ADJUST_IN_EPILOG" scheme.
796 TODO: Use some cost-model to estimate which scheme is more profitable.
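/* For example (a sketch of the ADJUST_IN_EPILOG scheme): for 's = s + a[i]'
   with INIT_VAL == 5 and a V4SI vectype, the routine builds the vector
   {0,0,0,0}, sets *SCALAR_DEF to 5 (so the epilog adds it back), and
   returns the preheader def created by vect_init_vector.  */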
800 get_initial_def_for_reduction (tree stmt, tree init_val, tree *scalar_def)
802 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
803 tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
804 int nunits = GET_MODE_NUNITS (TYPE_MODE (vectype));
806 enum tree_code code = TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 1));
807 tree type = TREE_TYPE (init_val);
809 tree vec, t = NULL_TREE;
810 bool need_epilog_adjust;
814 gcc_assert (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type));
821 if (INTEGRAL_TYPE_P (type))
822 def = build_int_cst (type, 0);
else
  def = build_real (type, dconst0);
826 #ifdef ADJUST_IN_EPILOG
827 /* All the 'nunits' elements are set to 0. The final result will be
828 adjusted by 'init_val' at the loop epilog. */
nelements = nunits;
need_epilog_adjust = true;
#else
/* 'nunits - 1' elements are set to 0; the last element is set to
   'init_val'.  No further adjustments at the epilog are needed.  */
nelements = nunits - 1;
need_epilog_adjust = false;
#endif
843 need_epilog_adjust = false;
850 for (i = nelements - 1; i >= 0; --i)
851 t = tree_cons (NULL_TREE, def, t);
853 if (nelements == nunits - 1)
855 /* Set the last element of the vector. */
856 t = tree_cons (NULL_TREE, init_val, t);
else
  gcc_assert (nelements == nunits);
861 vector_type = get_vectype_for_scalar_type (TREE_TYPE (def));
862 if (TREE_CODE (init_val) == INTEGER_CST || TREE_CODE (init_val) == REAL_CST)
863 vec = build_vector (vector_type, t);
865 vec = build_constructor_from_list (vector_type, t);
867 if (!need_epilog_adjust)
868 *scalar_def = NULL_TREE;
else
  *scalar_def = init_val;
872 return vect_init_vector (stmt, vec, vector_type);
876 /* Function vect_create_epilog_for_reduction
878 Create code at the loop-epilog to finalize the result of a reduction
881 VECT_DEF is a vector of partial results.
882 REDUC_CODE is the tree-code for the epilog reduction.
883 STMT is the scalar reduction stmt that is being vectorized.
884 REDUCTION_PHI is the phi-node that carries the reduction computation.
1. Creates the reduction def-use cycle: sets the arguments for REDUCTION_PHI:
889 The loop-entry argument is the vectorized initial-value of the reduction.
890 The loop-latch argument is VECT_DEF - the vector of partial sums.
891 2. "Reduces" the vector of partial results VECT_DEF into a single result,
892 by applying the operation specified by REDUC_CODE if available, or by
893 other means (whole-vector shifts or a scalar loop).
894 The function also creates a new phi node at the loop exit to preserve
895 loop-closed form, as illustrated below.
897 The flow at the entry to this function:
900 vec_def = phi <null, null> # REDUCTION_PHI
901 VECT_DEF = vector_stmt # vectorized form of STMT
902 s_loop = scalar_stmt # (scalar) STMT
904 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
908 The above is transformed by this function into:
911 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
912 VECT_DEF = vector_stmt # vectorized form of STMT
913 s_loop = scalar_stmt # (scalar) STMT
915 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
916 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
917 v_out2 = reduce <v_out1>
918 s_out3 = extract_field <v_out2, 0>
919 s_out4 = adjust_result <s_out3>
925 vect_create_epilog_for_reduction (tree vect_def, tree stmt,
926 enum tree_code reduc_code, tree reduction_phi)
928 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
930 enum machine_mode mode;
931 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
932 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
937 block_stmt_iterator exit_bsi;
942 tree new_scalar_dest, exit_phi;
943 tree bitsize, bitpos, bytesize;
944 enum tree_code code = TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 1));
945 tree scalar_initial_def;
946 tree vec_initial_def;
948 imm_use_iterator imm_iter;
950 bool extract_scalar_result;
954 tree operation = GIMPLE_STMT_OPERAND (stmt, 1);
957 op_type = TREE_CODE_LENGTH (TREE_CODE (operation));
958 reduction_op = TREE_OPERAND (operation, op_type-1);
959 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
960 mode = TYPE_MODE (vectype);
962 /*** 1. Create the reduction def-use cycle ***/
964 /* 1.1 set the loop-entry arg of the reduction-phi: */
965 /* For the case of reduction, vect_get_vec_def_for_operand returns
966 the scalar def before the loop, that defines the initial value
967 of the reduction variable. */
968 vec_initial_def = vect_get_vec_def_for_operand (reduction_op, stmt,
969 &scalar_initial_def);
970 add_phi_arg (reduction_phi, vec_initial_def, loop_preheader_edge (loop));
972 /* 1.2 set the loop-latch arg for the reduction-phi: */
973 add_phi_arg (reduction_phi, vect_def, loop_latch_edge (loop));
975 if (vect_print_dump_info (REPORT_DETAILS))
977 fprintf (vect_dump, "transform reduction: created def-use cycle:");
978 print_generic_expr (vect_dump, reduction_phi, TDF_SLIM);
979 fprintf (vect_dump, "\n");
980 print_generic_expr (vect_dump, SSA_NAME_DEF_STMT (vect_def), TDF_SLIM);
984 /*** 2. Create epilog code
985 The reduction epilog code operates across the elements of the vector
986 of partial results computed by the vectorized loop.
987 The reduction epilog code consists of:
988 step 1: compute the scalar result in a vector (v_out2)
989 step 2: extract the scalar result (s_out3) from the vector (v_out2)
990 step 3: adjust the scalar result (s_out3) if needed.
Step 1 can be accomplished using one of the following three schemes:
993 (scheme 1) using reduc_code, if available.
994 (scheme 2) using whole-vector shifts, if available.
995 (scheme 3) using a scalar loop. In this case steps 1+2 above are
998 The overall epilog code looks like this:
1000 s_out0 = phi <s_loop> # original EXIT_PHI
1001 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
1002 v_out2 = reduce <v_out1> # step 1
1003 s_out3 = extract_field <v_out2, 0> # step 2
1004 s_out4 = adjust_result <s_out3> # step 3
(step 3 is optional, and steps 1 and 2 may be combined).
1007 Lastly, the uses of s_out0 are replaced by s_out4.
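  /* For example (an illustrative expansion of scheme 2): reducing a V4SI
     sum v = {v0,v1,v2,v3} with whole-vector shifts generates roughly:

        v' = vec_shift <v, 64>;    # {v2,v3,_,_}
        v  = vop <v, v'>;          # {v0+v2, v1+v3, _, _}
        v' = vec_shift <v, 32>;    # {v1+v3, _, _, _}
        v  = vop <v, v'>;          # {v0+v1+v2+v3, _, _, _}
        s  = extract_field <v, 0>;

     (element positions are shown for one endianness; see the
     BYTES_BIG_ENDIAN handling below).  */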
1011 /* 2.1 Create new loop-exit-phi to preserve loop-closed form:
1012 v_out1 = phi <v_loop> */
1014 exit_bb = single_exit (loop)->dest;
1015 new_phi = create_phi_node (SSA_NAME_VAR (vect_def), exit_bb);
1016 SET_PHI_ARG_DEF (new_phi, single_exit (loop)->dest_idx, vect_def);
1017 exit_bsi = bsi_start (exit_bb);
1019 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
1020 (i.e. when reduc_code is not available) and in the final adjustment code
1021 (if needed). Also get the original scalar reduction variable as
1022 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
1023 represents a reduction pattern), the tree-code and scalar-def are
1024 taken from the original stmt that the pattern-stmt (STMT) replaces.
1025 Otherwise (it is a regular reduction) - the tree-code and scalar-def
1026 are taken from STMT. */
1028 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
if (!orig_stmt)
  /* Regular reduction.  */
  orig_stmt = stmt;
else
  {
    /* Reduction pattern.  */
1037 stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
1038 gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
    gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
  }
1041 code = TREE_CODE (GIMPLE_STMT_OPERAND (orig_stmt, 1));
1042 scalar_dest = GIMPLE_STMT_OPERAND (orig_stmt, 0);
1043 scalar_type = TREE_TYPE (scalar_dest);
1044 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
1045 bitsize = TYPE_SIZE (scalar_type);
1046 bytesize = TYPE_SIZE_UNIT (scalar_type);
1048 /* 2.3 Create the reduction code, using one of the three schemes described
1051 if (reduc_code < NUM_TREE_CODES)
1053 /*** Case 1: Create:
1054 v_out2 = reduc_expr <v_out1> */
1056 if (vect_print_dump_info (REPORT_DETAILS))
1057 fprintf (vect_dump, "Reduce using direct vector reduction.");
1059 vec_dest = vect_create_destination_var (scalar_dest, vectype);
1060 epilog_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node, vec_dest,
1061 build1 (reduc_code, vectype, PHI_RESULT (new_phi)));
1062 new_temp = make_ssa_name (vec_dest, epilog_stmt);
1063 GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_temp;
1064 bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT);
1066 extract_scalar_result = true;
1070 enum tree_code shift_code = 0;
1071 bool have_whole_vector_shift = true;
1073 int element_bitsize = tree_low_cst (bitsize, 1);
1074 int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
1077 if (vec_shr_optab->handlers[mode].insn_code != CODE_FOR_nothing)
1078 shift_code = VEC_RSHIFT_EXPR;
else
  have_whole_vector_shift = false;
1082 /* Regardless of whether we have a whole vector shift, if we're
1083 emulating the operation via tree-vect-generic, we don't want
1084 to use it. Only the first round of the reduction is likely
1085 to still be profitable via emulation. */
1086 /* ??? It might be better to emit a reduction tree code here, so that
1087 tree-vect-generic can expand the first round via bit tricks. */
1088 if (!VECTOR_MODE_P (mode))
1089 have_whole_vector_shift = false;
1092 optab optab = optab_for_tree_code (code, vectype);
1093 if (optab->handlers[mode].insn_code == CODE_FOR_nothing)
1094 have_whole_vector_shift = false;
1097 if (have_whole_vector_shift)
1099 /*** Case 2: Create:
1100 for (offset = VS/2; offset >= element_size; offset/=2)
1102 Create: va' = vec_shift <va, offset>
1103 Create: va = vop <va, va'>
1106 if (vect_print_dump_info (REPORT_DETAILS))
1107 fprintf (vect_dump, "Reduce using vector shifts");
1109 vec_dest = vect_create_destination_var (scalar_dest, vectype);
1110 new_temp = PHI_RESULT (new_phi);
1112 for (bit_offset = vec_size_in_bits/2;
bit_offset >= element_bitsize;
bit_offset /= 2)
1116 tree bitpos = size_int (bit_offset);
epilog_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node, vec_dest,
                      build2 (shift_code, vectype, new_temp, bitpos));
1122 new_name = make_ssa_name (vec_dest, epilog_stmt);
1123 GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_name;
1124 bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT);
epilog_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node, vec_dest,
                      build2 (code, vectype,
1129 new_name, new_temp));
1130 new_temp = make_ssa_name (vec_dest, epilog_stmt);
1131 GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_temp;
1132 bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT);
1135 extract_scalar_result = true;
1141 /*** Case 3: Create:
1142 s = extract_field <v_out2, 0>
1143 for (offset = element_size;
1144 offset < vector_size;
offset += element_size)
1147 Create: s' = extract_field <v_out2, offset>
1148 Create: s = op <s, s'>
1151 if (vect_print_dump_info (REPORT_DETAILS))
1152 fprintf (vect_dump, "Reduce using scalar code. ");
1154 vec_temp = PHI_RESULT (new_phi);
1155 vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
              bitsize_zero_node);
1158 BIT_FIELD_REF_UNSIGNED (rhs) = TYPE_UNSIGNED (scalar_type);
1159 epilog_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node,
1160 new_scalar_dest, rhs);
1161 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
1162 GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_temp;
1163 bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT);
1165 for (bit_offset = element_bitsize;
1166 bit_offset < vec_size_in_bits;
1167 bit_offset += element_bitsize)
1169 tree bitpos = bitsize_int (bit_offset);
tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
                   bitpos);
1173 BIT_FIELD_REF_UNSIGNED (rhs) = TYPE_UNSIGNED (scalar_type);
1174 epilog_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node,
1175 new_scalar_dest, rhs);
1176 new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
1177 GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_name;
1178 bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT);
epilog_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node, new_scalar_dest,
                      build2 (code, scalar_type, new_name, new_temp));
1183 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
1184 GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_temp;
1185 bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT);
1188 extract_scalar_result = false;
1192 /* 2.4 Extract the final scalar result. Create:
1193 s_out3 = extract_field <v_out2, bitpos> */
1195 if (extract_scalar_result)
1199 if (vect_print_dump_info (REPORT_DETAILS))
1200 fprintf (vect_dump, "extract scalar result");
1202 if (BYTES_BIG_ENDIAN)
1203 bitpos = size_binop (MULT_EXPR,
1204 bitsize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1),
1205 TYPE_SIZE (scalar_type));
else
  bitpos = bitsize_zero_node;
1209 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp, bitsize, bitpos);
1210 BIT_FIELD_REF_UNSIGNED (rhs) = TYPE_UNSIGNED (scalar_type);
1211 epilog_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node,
1212 new_scalar_dest, rhs);
1213 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
1214 GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_temp;
1215 bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT);
/* 2.5 Adjust the final result by the initial value of the reduction
1219 variable. (When such adjustment is not needed, then
1220 'scalar_initial_def' is zero).
1223 s_out4 = scalar_expr <s_out3, scalar_initial_def> */
1225 if (scalar_initial_def)
epilog_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node, new_scalar_dest,
                      build2 (code, scalar_type, new_temp, scalar_initial_def));
1230 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
1231 GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_temp;
1232 bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT);
/* 2.6 Replace uses of s_out0 with uses of s_out4 (or s_out3, when no
   adjustment was needed). */
1237 /* Find the loop-closed-use at the loop exit of the original scalar result.
1238 (The reduction result is expected to have two immediate uses - one at the
1239 latch block, and one at the loop exit). */
1241 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
1243 if (!flow_bb_inside_loop_p (loop, bb_for_stmt (USE_STMT (use_p))))
1245 exit_phi = USE_STMT (use_p);
1249 /* We expect to have found an exit_phi because of loop-closed-ssa form. */
1250 gcc_assert (exit_phi);
1251 /* Replace the uses: */
1252 orig_name = PHI_RESULT (exit_phi);
1253 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
1254 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
1255 SET_USE (use_p, new_temp);
1259 /* Function vectorizable_reduction.
1261 Check if STMT performs a reduction operation that can be vectorized.
1262 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
1263 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
1264 Return FALSE if not a vectorizable STMT, TRUE otherwise.
1266 This function also handles reduction idioms (patterns) that have been
1267 recognized in advance during vect_pattern_recog. In this case, STMT may be
1269 X = pattern_expr (arg0, arg1, ..., X)
and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
1271 sequence that had been detected and replaced by the pattern-stmt (STMT).
1273 In some cases of reduction patterns, the type of the reduction variable X is
1274 different than the type of the other arguments of STMT.
1275 In such cases, the vectype that is used when transforming STMT into a vector
1276 stmt is different than the vectype that is used to determine the
1277 vectorization factor, because it consists of a different number of elements
1278 than the actual number of elements that are being operated upon in parallel.
1280 For example, consider an accumulation of shorts into an int accumulator.
1281 On some targets it's possible to vectorize this pattern operating on 8
1282 shorts at a time (hence, the vectype for purposes of determining the
1283 vectorization factor should be V8HI); on the other hand, the vectype that
1284 is used to create the vector form is actually V4SI (the type of the result).
1286 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
indicates the actual level of parallelism (V8HI in the example), so
1288 that the right vectorization factor would be derived. This vectype
1289 corresponds to the type of arguments to the reduction stmt, and should *NOT*
1290 be used to create the vectorized stmt. The right vectype for the vectorized
1291 stmt is obtained from the type of the result X:
1292 get_vectype_for_scalar_type (TREE_TYPE (X))
1294 This means that, contrary to "regular" reductions (or "regular" stmts in
1295 general), the following equation:
1296 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
1297 does *NOT* necessarily hold for reduction patterns. */
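/* For example (sketch): for the pattern

     S1:   short_x = ...
     STMT: int_acc = widen_sum <short_x, int_acc>

   STMT_VINFO_VECTYPE is V8HI (eight shorts per vector determine the
   vectorization factor), while the vectorized STMT produces a V4SI value,
   whose vectype is obtained as get_vectype_for_scalar_type (int).  */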
1300 vectorizable_reduction (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
1305 tree loop_vec_def0 = NULL_TREE, loop_vec_def1 = NULL_TREE;
1306 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1307 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1308 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1309 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1311 enum tree_code code, orig_code, epilog_reduc_code = 0;
1312 enum machine_mode vec_mode;
1314 optab optab, reduc_optab;
1315 tree new_temp = NULL_TREE;
1317 enum vect_def_type dt;
1322 stmt_vec_info orig_stmt_info;
1323 tree expr = NULL_TREE;
1325 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
1326 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
1327 stmt_vec_info prev_stmt_info;
1329 tree new_stmt = NULL_TREE;
1332 gcc_assert (ncopies >= 1);
1334 /* 1. Is vectorizable reduction? */
1336 /* Not supportable if the reduction variable is used in the loop. */
1337 if (STMT_VINFO_RELEVANT_P (stmt_info))
1340 if (!STMT_VINFO_LIVE_P (stmt_info))
1343 /* Make sure it was already recognized as a reduction computation. */
1344 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def)
1347 /* 2. Has this been recognized as a reduction pattern?
1349 Check if STMT represents a pattern that has been recognized
1350 in earlier analysis stages. For stmts that represent a pattern,
1351 the STMT_VINFO_RELATED_STMT field records the last stmt in
1352 the original sequence that constitutes the pattern. */
1354 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
1357 orig_stmt_info = vinfo_for_stmt (orig_stmt);
1358 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt);
1359 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
1360 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
1363 /* 3. Check the operands of the operation. The first operands are defined
1364 inside the loop body. The last operand is the reduction variable,
1365 which is defined by the loop-header-phi. */
1367 gcc_assert (TREE_CODE (stmt) == GIMPLE_MODIFY_STMT);
1369 operation = GIMPLE_STMT_OPERAND (stmt, 1);
1370 code = TREE_CODE (operation);
1371 op_type = TREE_CODE_LENGTH (code);
1372 if (op_type != binary_op && op_type != ternary_op)
1374 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
1375 scalar_type = TREE_TYPE (scalar_dest);
1377 /* All uses but the last are expected to be defined in the loop.
1378 The last use is the reduction variable. */
1379 for (i = 0; i < op_type-1; i++)
1381 op = TREE_OPERAND (operation, i);
1382 is_simple_use = vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt);
1383 gcc_assert (is_simple_use);
1384 gcc_assert (dt == vect_loop_def || dt == vect_invariant_def ||
1385 dt == vect_constant_def);
1388 op = TREE_OPERAND (operation, i);
1389 is_simple_use = vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt);
1390 gcc_assert (is_simple_use);
1391 gcc_assert (dt == vect_reduction_def);
1392 gcc_assert (TREE_CODE (def_stmt) == PHI_NODE);
if (orig_stmt)
  gcc_assert (orig_stmt == vect_is_simple_reduction (loop, def_stmt));
else
  gcc_assert (stmt == vect_is_simple_reduction (loop, def_stmt));
1398 if (STMT_VINFO_LIVE_P (vinfo_for_stmt (def_stmt)))
1401 /* 4. Supportable by target? */
1403 /* 4.1. check support for the operation in the loop */
1404 optab = optab_for_tree_code (code, vectype);
1407 if (vect_print_dump_info (REPORT_DETAILS))
1408 fprintf (vect_dump, "no optab.");
1411 vec_mode = TYPE_MODE (vectype);
1412 if (optab->handlers[(int) vec_mode].insn_code == CODE_FOR_nothing)
1414 if (vect_print_dump_info (REPORT_DETAILS))
1415 fprintf (vect_dump, "op not supported by target.");
1416 if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
1417 || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
1418 < vect_min_worthwhile_factor (code))
1420 if (vect_print_dump_info (REPORT_DETAILS))
1421 fprintf (vect_dump, "proceeding using word mode.");
1424 /* Worthwhile without SIMD support? */
1425 if (!VECTOR_MODE_P (TYPE_MODE (vectype))
1426 && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
1427 < vect_min_worthwhile_factor (code))
1429 if (vect_print_dump_info (REPORT_DETAILS))
1430 fprintf (vect_dump, "not worthwhile without SIMD support.");
1434 /* 4.2. Check support for the epilog operation.
1436 If STMT represents a reduction pattern, then the type of the
1437 reduction variable may be different than the type of the rest
1438 of the arguments. For example, consider the case of accumulation
1439 of shorts into an int accumulator; The original code:
1440 S1: int_a = (int) short_a;
1441 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
1444 STMT: int_acc = widen_sum <short_a, int_acc>
1447 1. The tree-code that is used to create the vector operation in the
1448 epilog code (that reduces the partial results) is not the
1449 tree-code of STMT, but is rather the tree-code of the original
1450 stmt from the pattern that STMT is replacing. I.e, in the example
1451 above we want to use 'widen_sum' in the loop, but 'plus' in the
1453 2. The type (mode) we use to check available target support
1454 for the vector operation to be created in the *epilog*, is
1455 determined by the type of the reduction variable (in the example
1456 above we'd check this: plus_optab[vect_int_mode]).
1457 However the type (mode) we use to check available target support
1458 for the vector operation to be created *inside the loop*, is
1459 determined by the type of the other arguments to STMT (in the
1460 example we'd check this: widen_sum_optab[vect_short_mode]).
1462 This is contrary to "regular" reductions, in which the types of all
1463 the arguments are the same as the type of the reduction variable.
1464 For "regular" reductions we can therefore use the same vector type
1465 (and also the same tree-code) when generating the epilog code and
1466 when generating the code inside the loop. */
if (orig_stmt)
  {
    /* This is a reduction pattern: get the vectype from the type of the
       reduction variable, and get the tree-code from orig_stmt.  */
1472 orig_code = TREE_CODE (GIMPLE_STMT_OPERAND (orig_stmt, 1));
1473 vectype = get_vectype_for_scalar_type (TREE_TYPE (def));
1474 vec_mode = TYPE_MODE (vectype);
  }
else
  /* Regular reduction: the vectype and tree-code used for the vector code
     inside the loop can also be used for the epilog code.  */
  orig_code = code;
1483 if (!reduction_code_for_scalar_code (orig_code, &epilog_reduc_code))
1485 reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype);
1488 if (vect_print_dump_info (REPORT_DETAILS))
1489 fprintf (vect_dump, "no optab for reduction.");
1490 epilog_reduc_code = NUM_TREE_CODES;
1492 if (reduc_optab->handlers[(int) vec_mode].insn_code == CODE_FOR_nothing)
1494 if (vect_print_dump_info (REPORT_DETAILS))
1495 fprintf (vect_dump, "reduc op not supported by target.");
1496 epilog_reduc_code = NUM_TREE_CODES;
1499 if (!vec_stmt) /* transformation not required. */
1501 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
1507 if (vect_print_dump_info (REPORT_DETAILS))
1508 fprintf (vect_dump, "transform reduction.");
1510 /* Create the destination vector */
1511 vec_dest = vect_create_destination_var (scalar_dest, vectype);
1513 /* Create the reduction-phi that defines the reduction-operand. */
1514 new_phi = create_phi_node (vec_dest, loop->header);
1516 /* In case the vectorization factor (VF) is bigger than the number
1517 of elements that we can fit in a vectype (nunits), we have to generate
1518 more than one vector stmt - i.e - we need to "unroll" the
1519 vector stmt by a factor VF/nunits. For more details see documentation
1520 in vectorizable_operation. */
1522 prev_stmt_info = NULL;
1523 for (j = 0; j < ncopies; j++)
1528 op = TREE_OPERAND (operation, 0);
1529 loop_vec_def0 = vect_get_vec_def_for_operand (op, stmt, NULL);
1530 if (op_type == ternary_op)
1532 op = TREE_OPERAND (operation, 1);
1533 loop_vec_def1 = vect_get_vec_def_for_operand (op, stmt, NULL);
1536 /* Get the vector def for the reduction variable from the phi node */
1537 reduc_def = PHI_RESULT (new_phi);
1541 enum vect_def_type dt = vect_unknown_def_type; /* Dummy */
1542 loop_vec_def0 = vect_get_vec_def_for_stmt_copy (dt, loop_vec_def0);
1543 if (op_type == ternary_op)
1544 loop_vec_def1 = vect_get_vec_def_for_stmt_copy (dt, loop_vec_def1);
1546 /* Get the vector def for the reduction variable from the vectorized
1547 reduction operation generated in the previous iteration (j-1) */
reduc_def = GIMPLE_STMT_OPERAND (new_stmt, 0);
/* Arguments are ready.  Create the new vector stmt.  */
1553 if (op_type == binary_op)
1554 expr = build2 (code, vectype, loop_vec_def0, reduc_def);
expr = build3 (code, vectype, loop_vec_def0, loop_vec_def1,
               reduc_def);
1558 new_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node, vec_dest, expr);
1559 new_temp = make_ssa_name (vec_dest, new_stmt);
1560 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
1561 vect_finish_stmt_generation (stmt, new_stmt, bsi);
if (j == 0)
  STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
else
  STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
1567 prev_stmt_info = vinfo_for_stmt (new_stmt);
/* Finalize the reduction-phi (set its arguments) and create the
1571 epilog reduction code. */
1572 vect_create_epilog_for_reduction (new_temp, stmt, epilog_reduc_code, new_phi);
1576 /* Checks if CALL can be vectorized in type VECTYPE. Returns
1577 true if the target has a vectorized version of the function,
1578 or false if the function cannot be vectorized. */
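/* For example (illustration): a call 'y_1 = sqrtf (x_2)' is considered
   vectorizable here only if the target hook returns a vectorized builtin
   (e.g. a V4SF square root) for the requested VECTYPE; otherwise this
   function returns false.  */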
1581 vectorizable_function (tree call, tree vectype)
1583 tree fndecl = get_callee_fndecl (call);
1585 /* We only handle functions that do not read or clobber memory -- i.e.
1586 const or novops ones. */
if (!(call_expr_flags (call) & (ECF_CONST | ECF_NOVOPS)))
  return false;

if (!fndecl
    || TREE_CODE (fndecl) != FUNCTION_DECL
    || !DECL_BUILT_IN (fndecl))
  return false;
if (targetm.vectorize.builtin_vectorized_function (DECL_FUNCTION_CODE (fndecl), vectype))
  return true;

return false;
1601 /* Returns an expression that performs a call to vectorized version
1602 of FNDECL in type VECTYPE, with the arguments given by ARGS.
If extra statements need to be generated, they are inserted
before BSI.  */
1607 build_vectorized_function_call (tree fndecl,
1608 tree vectype, tree args)
1611 enum built_in_function code = DECL_FUNCTION_CODE (fndecl);
1613 /* The target specific builtin should be available. */
1614 vfndecl = targetm.vectorize.builtin_vectorized_function (code, vectype);
1615 gcc_assert (vfndecl != NULL_TREE);
1617 return build_function_call_expr (vfndecl, args);
1620 /* Function vectorizable_call.
1622 Check if STMT performs a function call that can be vectorized.
1623 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
1624 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
1625 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
1628 vectorizable_call (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
1633 tree op, args, type;
1634 tree vec_oprnd, vargs, *pvargs_end;
1635 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1636 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1637 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1638 tree fndecl, rhs, new_temp, def, def_stmt;
1639 enum vect_def_type dt;
1641 /* Is STMT a vectorizable call? */
1642 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
1645 if (TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 0)) != SSA_NAME)
1648 operation = GIMPLE_STMT_OPERAND (stmt, 1);
1649 if (TREE_CODE (operation) != CALL_EXPR)
1652 /* For now, we only vectorize functions if a target specific builtin
1653 is available. TODO -- in some cases, it might be profitable to
1654 insert the calls for pieces of the vector, in order to be able
1655 to vectorize other operations in the loop. */
1656 if (!vectorizable_function (operation, vectype))
1658 if (vect_print_dump_info (REPORT_DETAILS))
1659 fprintf (vect_dump, "function is not vectorizable.");
1663 gcc_assert (ZERO_SSA_OPERANDS (stmt, SSA_OP_ALL_VIRTUALS));
1665 for (args = TREE_OPERAND (operation, 1); args; args = TREE_CHAIN (args))
1667 op = TREE_VALUE (args);
1669 if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt))
1671 if (vect_print_dump_info (REPORT_DETAILS))
1672 fprintf (vect_dump, "use not simple.");
1677 if (!vec_stmt) /* transformation not required. */
1679 STMT_VINFO_TYPE (stmt_info) = call_vec_info_type;
1685 if (vect_print_dump_info (REPORT_DETAILS))
1686 fprintf (vect_dump, "transform operation.");
1689 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
1690 vec_dest = vect_create_destination_var (scalar_dest, vectype);
vargs = NULL_TREE;
pvargs_end = &vargs;
1695 for (args = TREE_OPERAND (operation, 1); args; args = TREE_CHAIN (args))
1697 op = TREE_VALUE (args);
1698 vec_oprnd = vect_get_vec_def_for_operand (op, stmt, NULL);
1700 *pvargs_end = tree_cons (NULL_TREE, vec_oprnd, NULL_TREE);
1701 pvargs_end = &TREE_CHAIN (*pvargs_end);
1704 fndecl = get_callee_fndecl (operation);
1705 rhs = build_vectorized_function_call (fndecl, vectype, vargs);
1706 *vec_stmt = build2 (GIMPLE_MODIFY_STMT, vectype, vec_dest, rhs);
1707 new_temp = make_ssa_name (vec_dest, *vec_stmt);
1708 GIMPLE_STMT_OPERAND (*vec_stmt, 0) = new_temp;
1710 vect_finish_stmt_generation (stmt, *vec_stmt, bsi);
1712 /* The call in STMT might prevent it from being removed in dce. We however
1713 cannot remove it here, due to the way the ssa name it defines is mapped
to the new definition.  So just replace the rhs of the statement with something
harmless.  */
1716 type = TREE_TYPE (scalar_dest);
1717 GIMPLE_STMT_OPERAND (stmt, 1) = fold_convert (type, integer_zero_node);
1723 /* Function vectorizable_assignment.
1725 Check if STMT performs an assignment (copy) that can be vectorized.
1726 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
1727 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
1728 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
1731 vectorizable_assignment (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
1737 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1738 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1739 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1742 enum vect_def_type dt;
1743 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
1744 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
1746 gcc_assert (ncopies >= 1);
if (ncopies > 1)
  return false; /* FORNOW */
1750 /* Is vectorizable assignment? */
1751 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1754 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_loop_def);
1756 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
1759 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
1760 if (TREE_CODE (scalar_dest) != SSA_NAME)
1763 op = GIMPLE_STMT_OPERAND (stmt, 1);
1764 if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt))
1766 if (vect_print_dump_info (REPORT_DETAILS))
1767 fprintf (vect_dump, "use not simple.");
1771 if (!vec_stmt) /* transformation not required. */
1773 STMT_VINFO_TYPE (stmt_info) = assignment_vec_info_type;
1778 if (vect_print_dump_info (REPORT_DETAILS))
1779 fprintf (vect_dump, "transform assignment.");
1782 vec_dest = vect_create_destination_var (scalar_dest, vectype);
1785 op = GIMPLE_STMT_OPERAND (stmt, 1);
1786 vec_oprnd = vect_get_vec_def_for_operand (op, stmt, NULL);
/* Arguments are ready.  Create the new vector stmt.  */
1789 *vec_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node, vec_dest, vec_oprnd);
1790 new_temp = make_ssa_name (vec_dest, *vec_stmt);
1791 GIMPLE_STMT_OPERAND (*vec_stmt, 0) = new_temp;
1792 vect_finish_stmt_generation (stmt, *vec_stmt, bsi);
1798 /* Function vect_min_worthwhile_factor.
1800 For a loop where we could vectorize the operation indicated by CODE,
1801 return the minimum vectorization factor that makes it worthwhile
1802 to use generic vectors. */
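/* For example (a sketch of plausible thresholds, not the actual table):
   a cheap code like PLUS_EXPR might be worthwhile from a small factor,
   bitwise ops from an even smaller one, while an unhandled code would
   return INT_MAX so that generic-vector emulation is never attempted.  */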
1804 vect_min_worthwhile_factor (enum tree_code code)
1825 /* Function vectorizable_operation.
1827 Check if STMT performs a binary or unary operation that can be vectorized.
1828 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
1829 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
1830 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
1833 vectorizable_operation (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
1838 tree op0, op1 = NULL;
1839 tree vec_oprnd0 = NULL_TREE, vec_oprnd1 = NULL_TREE;
1840 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1841 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1842 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1843 enum tree_code code;
1844 enum machine_mode vec_mode;
1849 enum machine_mode optab_op2_mode;
1851 enum vect_def_type dt0, dt1;
1853 stmt_vec_info prev_stmt_info;
1854 int nunits_in = TYPE_VECTOR_SUBPARTS (vectype);
1857 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in;
1860 gcc_assert (ncopies >= 1);
1862 /* Is STMT a vectorizable binary/unary operation? */
1863 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1866 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_loop_def);
1868 if (STMT_VINFO_LIVE_P (stmt_info))
1870 /* FORNOW: not yet supported. */
1871 if (vect_print_dump_info (REPORT_DETAILS))
1872 fprintf (vect_dump, "value used after loop.");
1876 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
1879 if (TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 0)) != SSA_NAME)
1882 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
1883 vectype_out = get_vectype_for_scalar_type (TREE_TYPE (scalar_dest));
1884 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
1885 if (nunits_out != nunits_in)
1888 operation = GIMPLE_STMT_OPERAND (stmt, 1);
1889 code = TREE_CODE (operation);
1890 optab = optab_for_tree_code (code, vectype);
1892 /* Support only unary or binary operations. */
1893 op_type = TREE_CODE_LENGTH (code);
1894 if (op_type != unary_op && op_type != binary_op)
1896 if (vect_print_dump_info (REPORT_DETAILS))
1897 fprintf (vect_dump, "num. args = %d (not unary/binary op).", op_type);
1901 op0 = TREE_OPERAND (operation, 0);
1902 if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt0))
1904 if (vect_print_dump_info (REPORT_DETAILS))
1905 fprintf (vect_dump, "use not simple.");
1909 if (op_type == binary_op)
1911 op1 = TREE_OPERAND (operation, 1);
1912 if (!vect_is_simple_use (op1, loop_vinfo, &def_stmt, &def, &dt1))
1914 if (vect_print_dump_info (REPORT_DETAILS))
1915 fprintf (vect_dump, "use not simple.");
1920 /* Supportable by target? */
1923 if (vect_print_dump_info (REPORT_DETAILS))
1924 fprintf (vect_dump, "no optab.");
1927 vec_mode = TYPE_MODE (vectype);
1928 icode = (int) optab->handlers[(int) vec_mode].insn_code;
1929 if (icode == CODE_FOR_nothing)
1931 if (vect_print_dump_info (REPORT_DETAILS))
1932 fprintf (vect_dump, "op not supported by target.");
1933 if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
1934 || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
1935 < vect_min_worthwhile_factor (code))
1937 if (vect_print_dump_info (REPORT_DETAILS))
1938 fprintf (vect_dump, "proceeding using word mode.");
1941 /* Worthwhile without SIMD support? */
1942 if (!VECTOR_MODE_P (TYPE_MODE (vectype))
1943 && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
1944 < vect_min_worthwhile_factor (code))
1946 if (vect_print_dump_info (REPORT_DETAILS))
1947 fprintf (vect_dump, "not worthwhile without SIMD support.");
1951 if (code == LSHIFT_EXPR || code == RSHIFT_EXPR)
1953 /* FORNOW: not yet supported. */
1954 if (!VECTOR_MODE_P (vec_mode))
1957 /* Invariant argument is needed for a vector shift
1958 by a scalar shift operand. */
1959 optab_op2_mode = insn_data[icode].operand[2].mode;
1960 if (! (VECTOR_MODE_P (optab_op2_mode)
1961 || dt1 == vect_constant_def
1962 || dt1 == vect_invariant_def))
1964 if (vect_print_dump_info (REPORT_DETAILS))
1965 fprintf (vect_dump, "operand mode requires invariant argument.");
1970 if (!vec_stmt) /* transformation not required. */
1972 STMT_VINFO_TYPE (stmt_info) = op_vec_info_type;
1978 if (vect_print_dump_info (REPORT_DETAILS))
1979 fprintf (vect_dump, "transform binary/unary operation.");
1982 vec_dest = vect_create_destination_var (scalar_dest, vectype);
1984 /* In case the vectorization factor (VF) is bigger than the number
1985 of elements that we can fit in a vectype (nunits), we have to generate
1986 more than one vector stmt - i.e - we need to "unroll" the
1987 vector stmt by a factor VF/nunits. In doing so, we record a pointer
1988 from one copy of the vector stmt to the next, in the field
1989 STMT_VINFO_RELATED_STMT. This is necessary in order to allow following
1990 stages to find the correct vector defs to be used when vectorizing
1991 stmts that use the defs of the current stmt. The example below illustrates
1992 the vectorization process when VF=16 and nunits=4 (i.e - we need to create
1993 4 vectorized stmts):
1995 before vectorization:
RELATED_STMT VEC_STMT
S1: x = load - -
S2: z = x + 1 - -
step 1: vectorize stmt S1 (done in vectorizable_load; see more details there):
2002 RELATED_STMT VEC_STMT
2003 VS1_0: vx0 = memref0 VS1_1 -
2004 VS1_1: vx1 = memref1 VS1_2 -
2005 VS1_2: vx2 = memref2 VS1_3 -
2006 VS1_3: vx3 = memref3 - -
2007 S1: x = load - VS1_0
2010 step2: vectorize stmt S2 (done here):
2011 To vectorize stmt S2 we first need to find the relevant vector
2012 def for the first operand 'x'. This is, as usual, obtained from
2013 the vector stmt recorded in the STMT_VINFO_VEC_STMT of the stmt
2014 that defines 'x' (S1). This way we find the stmt VS1_0, and the
2015 relevant vector def 'vx0'. Having found 'vx0' we can generate
2016 the vector stmt VS2_0, and as usual, record it in the
2017 STMT_VINFO_VEC_STMT of stmt S2.
2018 When creating the second copy (VS2_1), we obtain the relevant vector
2019 def from the vector stmt recorded in the STMT_VINFO_RELATED_STMT of
2020 stmt VS1_0. This way we find the stmt VS1_1 and the relevant
2021 vector def 'vx1'. Using 'vx1' we create stmt VS2_1 and record a
2022 pointer to it in the STMT_VINFO_RELATED_STMT of the vector stmt VS2_0.
2023 Similarly when creating stmts VS2_2 and VS2_3. This is the resulting
2024 chain of stmts and pointers:
2025 RELATED_STMT VEC_STMT
2026 VS1_0: vx0 = memref0 VS1_1 -
2027 VS1_1: vx1 = memref1 VS1_2 -
2028 VS1_2: vx2 = memref2 VS1_3 -
2029 VS1_3: vx3 = memref3 - -
2030 S1: x = load - VS1_0
2031 VS2_0: vz0 = vx0 + v1 VS2_1 -
2032 VS2_1: vz1 = vx1 + v1 VS2_2 -
2033 VS2_2: vz2 = vx2 + v1 VS2_3 -
2034 VS2_3: vz3 = vx3 + v1 - -
2035 S2: z = x + 1 - VS2_0 */
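/* A hypothetical source loop matching the example above (not from the
   original sources): with VF=16 and 4-element vectors,

       for (i = 0; i < N; i++)
         z[i] = x[i] + 1;

   each scalar stmt expands into ncopies = VF/nunits = 4 vector stmts,
   chained through STMT_VINFO_RELATED_STMT as shown.  */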
2037 prev_stmt_info = NULL;
2038 for (j = 0; j < ncopies; j++)
2043 vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
2044 if (op_type == binary_op)
2046 if (code == LSHIFT_EXPR || code == RSHIFT_EXPR)
2048 /* Vector shl and shr insn patterns can be defined with
2049 scalar operand 2 (shift operand). In this case, use
2050 constant or loop invariant op1 directly, without
2051 extending it to vector mode first. */
2052 optab_op2_mode = insn_data[icode].operand[2].mode;
2053 if (!VECTOR_MODE_P (optab_op2_mode))
2055 if (vect_print_dump_info (REPORT_DETAILS))
2056 fprintf (vect_dump, "operand 1 using scalar mode.");
2061 vec_oprnd1 = vect_get_vec_def_for_operand (op1, stmt, NULL);
2066 vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt0, vec_oprnd0);
2067 if (op_type == binary_op)
2068 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt1, vec_oprnd1);
/* Arguments are ready. Create the new vector stmt. */
2073 if (op_type == binary_op)
2074 new_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node, vec_dest,
2075 build2 (code, vectype, vec_oprnd0, vec_oprnd1));
2077 new_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node, vec_dest,
2078 build1 (code, vectype, vec_oprnd0));
2079 new_temp = make_ssa_name (vec_dest, new_stmt);
2080 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
2081 vect_finish_stmt_generation (stmt, new_stmt, bsi);
2084 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
2086 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
2087 prev_stmt_info = vinfo_for_stmt (new_stmt);
2094 /* Function vectorizable_type_demotion
2096 Check if STMT performs a binary or unary operation that involves
2097 type demotion, and if it can be vectorized.
2098 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
2099 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
2100 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
vectorizable_type_demotion (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
tree vec_oprnd0 = NULL, vec_oprnd1 = NULL;
2111 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2112 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2113 enum tree_code code;
2116 enum vect_def_type dt0;
2118 stmt_vec_info prev_stmt_info;
2128 enum machine_mode vec_mode;
2130 /* Is STMT a vectorizable type-demotion operation? */
2132 if (!STMT_VINFO_RELEVANT_P (stmt_info))
2135 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_loop_def);
2137 if (STMT_VINFO_LIVE_P (stmt_info))
2139 /* FORNOW: not yet supported. */
2140 if (vect_print_dump_info (REPORT_DETAILS))
2141 fprintf (vect_dump, "value used after loop.");
2145 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
2148 if (TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 0)) != SSA_NAME)
2151 operation = GIMPLE_STMT_OPERAND (stmt, 1);
2152 code = TREE_CODE (operation);
2153 if (code != NOP_EXPR && code != CONVERT_EXPR)
2156 op0 = TREE_OPERAND (operation, 0);
2157 vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op0));
2158 nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
2160 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
2161 scalar_type = TREE_TYPE (scalar_dest);
2162 vectype_out = get_vectype_for_scalar_type (scalar_type);
2163 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
2164 if (nunits_in != nunits_out / 2) /* FORNOW */
2167 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_out;
2168 gcc_assert (ncopies >= 1);
2170 /* Check the operands of the operation. */
2171 if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt0))
2173 if (vect_print_dump_info (REPORT_DETAILS))
2174 fprintf (vect_dump, "use not simple.");
2178 /* Supportable by target? */
2179 code = VEC_PACK_MOD_EXPR;
2180 optab = optab_for_tree_code (VEC_PACK_MOD_EXPR, vectype_in);
2184 vec_mode = TYPE_MODE (vectype_in);
2185 if (optab->handlers[(int) vec_mode].insn_code == CODE_FOR_nothing)
2188 STMT_VINFO_VECTYPE (stmt_info) = vectype_in;
2190 if (!vec_stmt) /* transformation not required. */
2192 STMT_VINFO_TYPE (stmt_info) = type_demotion_vec_info_type;
2198 if (vect_print_dump_info (REPORT_DETAILS))
fprintf (vect_dump, "transform type demotion operation. ncopies = %d.",
         ncopies);
2203 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
2205 /* In case the vectorization factor (VF) is bigger than the number
2206 of elements that we can fit in a vectype (nunits), we have to generate
2207 more than one vector stmt - i.e - we need to "unroll" the
2208 vector stmt by a factor VF/nunits. */
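/* For illustration (an assumed int->short example, not from the original
   sources): with V4SI inputs and a V8HI result, each copy packs two
   input defs into one narrower vector:
       vs0 = VEC_PACK_MOD_EXPR <vi0, vi1>;
   which is why the loop below consumes two vector defs (vec_oprnd0 and
   vec_oprnd1) per generated stmt.  */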
2209 prev_stmt_info = NULL;
2210 for (j = 0; j < ncopies; j++)
2215 enum vect_def_type dt = vect_unknown_def_type; /* Dummy */
2216 vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
2217 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt, vec_oprnd0);
2221 vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt0, vec_oprnd1);
2222 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt0, vec_oprnd0);
2225 /* Arguments are ready. Create the new vector stmt. */
2226 expr = build2 (code, vectype_out, vec_oprnd0, vec_oprnd1);
2227 new_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node, vec_dest, expr);
2228 new_temp = make_ssa_name (vec_dest, new_stmt);
2229 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
2230 vect_finish_stmt_generation (stmt, new_stmt, bsi);
2233 STMT_VINFO_VEC_STMT (stmt_info) = new_stmt;
2235 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
2237 prev_stmt_info = vinfo_for_stmt (new_stmt);
2240 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
2245 /* Function vect_gen_widened_results_half
2247 Create a vector stmt whose code, type, number of arguments, and result
2248 variable are CODE, VECTYPE, OP_TYPE, and VEC_DEST, and its arguments are
2249 VEC_OPRND0 and VEC_OPRND1. The new vector stmt is to be inserted at BSI.
2250 In the case that CODE is a CALL_EXPR, this means that a call to DECL
2251 needs to be created (DECL is a function-decl of a target-builtin).
2252 STMT is the original scalar stmt that we are vectorizing. */
2255 vect_gen_widened_results_half (enum tree_code code, tree vectype, tree decl,
2256 tree vec_oprnd0, tree vec_oprnd1, int op_type,
tree vec_dest, block_stmt_iterator *bsi, tree stmt)
2267 /* Generate half of the widened result: */
2268 if (code == CALL_EXPR)
2270 /* Target specific support */
2271 vec_params = build_tree_list (NULL_TREE, vec_oprnd0);
2272 if (op_type == binary_op)
2273 vec_params = tree_cons (NULL_TREE, vec_oprnd1, vec_params);
2274 expr = build_function_call_expr (decl, vec_params);
2278 /* Generic support */
2279 gcc_assert (op_type == TREE_CODE_LENGTH (code));
2280 if (op_type == binary_op)
2281 expr = build2 (code, vectype, vec_oprnd0, vec_oprnd1);
2283 expr = build1 (code, vectype, vec_oprnd0);
2285 new_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node, vec_dest, expr);
2286 new_temp = make_ssa_name (vec_dest, new_stmt);
2287 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
2288 vect_finish_stmt_generation (stmt, new_stmt, bsi);
2290 if (code == CALL_EXPR)
2292 FOR_EACH_SSA_TREE_OPERAND (sym, new_stmt, iter, SSA_OP_ALL_VIRTUALS)
2294 if (TREE_CODE (sym) == SSA_NAME)
2295 sym = SSA_NAME_VAR (sym);
2296 mark_sym_for_renaming (sym);
2304 /* Function vectorizable_type_promotion
2306 Check if STMT performs a binary or unary operation that involves
2307 type promotion, and if it can be vectorized.
2308 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
2309 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
2310 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
vectorizable_type_promotion (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
2319 tree op0, op1 = NULL;
tree vec_oprnd0 = NULL, vec_oprnd1 = NULL;
2321 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2322 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
enum tree_code code, code1 = ERROR_MARK, code2 = ERROR_MARK;
2324 tree decl1 = NULL_TREE, decl2 = NULL_TREE;
2327 enum vect_def_type dt0, dt1;
2329 stmt_vec_info prev_stmt_info;
2337 /* Is STMT a vectorizable type-promotion operation? */
2339 if (!STMT_VINFO_RELEVANT_P (stmt_info))
2342 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_loop_def);
2344 if (STMT_VINFO_LIVE_P (stmt_info))
2346 /* FORNOW: not yet supported. */
2347 if (vect_print_dump_info (REPORT_DETAILS))
2348 fprintf (vect_dump, "value used after loop.");
2352 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
2355 if (TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 0)) != SSA_NAME)
2358 operation = GIMPLE_STMT_OPERAND (stmt, 1);
2359 code = TREE_CODE (operation);
2360 if (code != NOP_EXPR && code != WIDEN_MULT_EXPR)
2363 op0 = TREE_OPERAND (operation, 0);
2364 vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op0));
2365 nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
2366 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in;
2367 gcc_assert (ncopies >= 1);
2369 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
2370 vectype_out = get_vectype_for_scalar_type (TREE_TYPE (scalar_dest));
2371 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
2372 if (nunits_out != nunits_in / 2) /* FORNOW */
2375 /* Check the operands of the operation. */
2376 if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt0))
2378 if (vect_print_dump_info (REPORT_DETAILS))
2379 fprintf (vect_dump, "use not simple.");
2383 op_type = TREE_CODE_LENGTH (code);
2384 if (op_type == binary_op)
2386 op1 = TREE_OPERAND (operation, 1);
2387 if (!vect_is_simple_use (op1, loop_vinfo, &def_stmt, &def, &dt1))
2389 if (vect_print_dump_info (REPORT_DETAILS))
2390 fprintf (vect_dump, "use not simple.");
2395 /* Supportable by target? */
2396 if (!supportable_widening_operation (code, stmt, vectype_in,
2397 &decl1, &decl2, &code1, &code2))
2400 STMT_VINFO_VECTYPE (stmt_info) = vectype_in;
2402 if (!vec_stmt) /* transformation not required. */
2404 STMT_VINFO_TYPE (stmt_info) = type_promotion_vec_info_type;
2410 if (vect_print_dump_info (REPORT_DETAILS))
fprintf (vect_dump, "transform type promotion operation. ncopies = %d.",
         ncopies);
2415 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
2417 /* In case the vectorization factor (VF) is bigger than the number
2418 of elements that we can fit in a vectype (nunits), we have to generate
2419 more than one vector stmt - i.e - we need to "unroll" the
2420 vector stmt by a factor VF/nunits. */
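/* For illustration (an assumed short->int example, not from the original
   sources): with a V8HI input and V4SI results, each input def yields a
   lo/hi pair of widened vectors, e.g.
       vi_lo = unpack_lo <vs>;
       vi_hi = unpack_hi <vs>;
   which is why two result "halves" are generated per copy below.  */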
2422 prev_stmt_info = NULL;
2423 for (j = 0; j < ncopies; j++)
2428 vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
2429 if (op_type == binary_op)
2430 vec_oprnd1 = vect_get_vec_def_for_operand (op1, stmt, NULL);
2434 vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt0, vec_oprnd0);
2435 if (op_type == binary_op)
2436 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt1, vec_oprnd1);
2439 /* Arguments are ready. Create the new vector stmt. We are creating
2440 two vector defs because the widened result does not fit in one vector.
The vectorized stmt can be expressed as a call to a target builtin,
or using a tree-code. */
2443 /* Generate first half of the widened result: */
2444 new_stmt = vect_gen_widened_results_half (code1, vectype_out, decl1,
2445 vec_oprnd0, vec_oprnd1, op_type, vec_dest, bsi, stmt);
2447 STMT_VINFO_VEC_STMT (stmt_info) = new_stmt;
2449 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
2450 prev_stmt_info = vinfo_for_stmt (new_stmt);
2452 /* Generate second half of the widened result: */
2453 new_stmt = vect_gen_widened_results_half (code2, vectype_out, decl2,
2454 vec_oprnd0, vec_oprnd1, op_type, vec_dest, bsi, stmt);
2455 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
2456 prev_stmt_info = vinfo_for_stmt (new_stmt);
2460 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
2465 /* Function vect_strided_store_supported.
Returns TRUE if INTERLEAVE_HIGH and INTERLEAVE_LOW operations are supported,
2468 and FALSE otherwise. */
2471 vect_strided_store_supported (tree vectype)
2473 optab interleave_high_optab, interleave_low_optab;
2476 mode = (int) TYPE_MODE (vectype);
2478 /* Check that the operation is supported. */
2479 interleave_high_optab = optab_for_tree_code (VEC_INTERLEAVE_HIGH_EXPR,
2481 interleave_low_optab = optab_for_tree_code (VEC_INTERLEAVE_LOW_EXPR,
2483 if (!interleave_high_optab || !interleave_low_optab)
2485 if (vect_print_dump_info (REPORT_DETAILS))
2486 fprintf (vect_dump, "no optab for interleave.");
if (interleave_high_optab->handlers[(int) mode].insn_code == CODE_FOR_nothing
2492 || interleave_low_optab->handlers[(int) mode].insn_code
2493 == CODE_FOR_nothing)
2495 if (vect_print_dump_info (REPORT_DETAILS))
2496 fprintf (vect_dump, "interleave op not supported by target.");
2503 /* Function vect_permute_store_chain.
2505 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
2506 a power of 2, generate interleave_high/low stmts to reorder the data
2507 correctly for the stores. Return the final references for stores in
2510 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
2511 The input is 4 vectors each containing 8 elements. We assign a number to each
2512 element, the input sequence is:
2514 1st vec: 0 1 2 3 4 5 6 7
2515 2nd vec: 8 9 10 11 12 13 14 15
2516 3rd vec: 16 17 18 19 20 21 22 23
2517 4th vec: 24 25 26 27 28 29 30 31
2519 The output sequence should be:
2521 1st vec: 0 8 16 24 1 9 17 25
2522 2nd vec: 2 10 18 26 3 11 19 27
3rd vec: 4 12 20 28 5 13 21 29
2524 4th vec: 6 14 22 30 7 15 23 31
2526 i.e., we interleave the contents of the four vectors in their order.
2528 We use interleave_high/low instructions to create such output. The input of
each interleave_high/low operation is two vectors:

1st vec: 0 1 2 3
2nd vec: 4 5 6 7
2532 the even elements of the result vector are obtained left-to-right from the
2533 high/low elements of the first vector. The odd elements of the result are
2534 obtained left-to-right from the high/low elements of the second vector.
2535 The output of interleave_high will be: 0 4 1 5
2536 and of interleave_low: 2 6 3 7
2539 The permutation is done in log LENGTH stages. In each stage interleave_high
2540 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
2541 where the first argument is taken from the first half of DR_CHAIN and the
second argument from its second half.
2545 I1: interleave_high (1st vec, 3rd vec)
2546 I2: interleave_low (1st vec, 3rd vec)
2547 I3: interleave_high (2nd vec, 4th vec)
2548 I4: interleave_low (2nd vec, 4th vec)
2550 The output for the first stage is:
2552 I1: 0 16 1 17 2 18 3 19
2553 I2: 4 20 5 21 6 22 7 23
2554 I3: 8 24 9 25 10 26 11 27
2555 I4: 12 28 13 29 14 30 15 31
2557 The output of the second stage, i.e. the final result is:
2559 I1: 0 8 16 24 1 9 17 25
2560 I2: 2 10 18 26 3 11 19 27
I3: 4 12 20 28 5 13 21 29
2562 I4: 6 14 22 30 7 15 23 31. */
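/* A rough scalar model of one interleave stage (illustrative only;
   vect1, vect2, high and low stand for nunits-element arrays):

       for (k = 0; k < nunits / 2; k++)
         {
           high[2*k]     = vect1[k];               // interleave_high
           high[2*k + 1] = vect2[k];
           low[2*k]      = vect1[k + nunits/2];    // interleave_low
           low[2*k + 1]  = vect2[k + nunits/2];
         }
*/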
2565 vect_permute_store_chain (VEC(tree,heap) *dr_chain,
2566 unsigned int length,
tree stmt,
block_stmt_iterator *bsi,
2569 VEC(tree,heap) **result_chain)
2571 tree perm_dest, perm_stmt, vect1, vect2, high, low;
2572 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
2576 VEC(tree,heap) *first, *second;
2578 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
2579 first = VEC_alloc (tree, heap, length/2);
2580 second = VEC_alloc (tree, heap, length/2);
2582 /* Check that the operation is supported. */
2583 if (!vect_strided_store_supported (vectype))
2586 *result_chain = VEC_copy (tree, heap, dr_chain);
2588 for (i = 0; i < exact_log2 (length); i++)
2590 for (j = 0; j < length/2; j++)
2592 vect1 = VEC_index (tree, dr_chain, j);
2593 vect2 = VEC_index (tree, dr_chain, j+length/2);
2595 /* Create interleaving stmt:
2596 in the case of big endian:
2597 high = interleave_high (vect1, vect2)
2598 and in the case of little endian:
2599 high = interleave_low (vect1, vect2). */
2600 perm_dest = create_tmp_var (vectype, "vect_inter_high");
2601 add_referenced_var (perm_dest);
2602 if (BYTES_BIG_ENDIAN)
2603 perm_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node, perm_dest,
build2 (VEC_INTERLEAVE_HIGH_EXPR, vectype, vect1, vect2));
else
2607 perm_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node, perm_dest,
build2 (VEC_INTERLEAVE_LOW_EXPR, vectype, vect1, vect2));
2610 high = make_ssa_name (perm_dest, perm_stmt);
2611 GIMPLE_STMT_OPERAND (perm_stmt, 0) = high;
2612 vect_finish_stmt_generation (stmt, perm_stmt, bsi);
2613 VEC_replace (tree, *result_chain, 2*j, high);
2615 /* Create interleaving stmt:
2616 in the case of big endian:
2617 low = interleave_low (vect1, vect2)
2618 and in the case of little endian:
2619 low = interleave_high (vect1, vect2). */
2620 perm_dest = create_tmp_var (vectype, "vect_inter_low");
2621 add_referenced_var (perm_dest);
2622 if (BYTES_BIG_ENDIAN)
2623 perm_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node, perm_dest,
build2 (VEC_INTERLEAVE_LOW_EXPR, vectype, vect1, vect2));
else
2627 perm_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node, perm_dest,
build2 (VEC_INTERLEAVE_HIGH_EXPR, vectype, vect1, vect2));
2630 low = make_ssa_name (perm_dest, perm_stmt);
2631 GIMPLE_STMT_OPERAND (perm_stmt, 0) = low;
2632 vect_finish_stmt_generation (stmt, perm_stmt, bsi);
2633 VEC_replace (tree, *result_chain, 2*j+1, low);
2635 dr_chain = VEC_copy (tree, heap, *result_chain);
2641 /* Function vectorizable_store.
Check if STMT defines a non-scalar data-ref (array/pointer/structure) that
can be vectorized.
2645 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
2646 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
2647 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
2650 vectorizable_store (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
2655 tree vec_oprnd = NULL_TREE;
2656 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2657 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info), *first_dr = NULL;
2658 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2659 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2660 enum machine_mode vec_mode;
enum dr_alignment_support alignment_support_scheme;
2664 def_operand_p def_p;
2666 enum vect_def_type dt;
2667 stmt_vec_info prev_stmt_info = NULL;
2668 tree dataref_ptr = NULL_TREE;
2669 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
2670 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
2672 tree next_stmt, first_stmt;
2673 bool strided_store = false;
2674 unsigned int group_size, i;
2675 VEC(tree,heap) *dr_chain = NULL, *oprnds = NULL, *result_chain = NULL;
2676 gcc_assert (ncopies >= 1);
2678 /* Is vectorizable store? */
2680 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
2683 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
2684 if (TREE_CODE (scalar_dest) != ARRAY_REF
2685 && TREE_CODE (scalar_dest) != INDIRECT_REF
2686 && !DR_GROUP_FIRST_DR (stmt_info))
2689 op = GIMPLE_STMT_OPERAND (stmt, 1);
2690 if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt))
2692 if (vect_print_dump_info (REPORT_DETAILS))
2693 fprintf (vect_dump, "use not simple.");
2697 vec_mode = TYPE_MODE (vectype);
2698 /* FORNOW. In some cases can vectorize even if data-type not supported
2699 (e.g. - array initialization with 0). */
2700 if (mov_optab->handlers[(int)vec_mode].insn_code == CODE_FOR_nothing)
2703 if (!STMT_VINFO_DATA_REF (stmt_info))
2706 if (DR_GROUP_FIRST_DR (stmt_info))
2708 strided_store = true;
2709 if (!vect_strided_store_supported (vectype))
2713 if (!vec_stmt) /* transformation not required. */
2715 STMT_VINFO_TYPE (stmt_info) = store_vec_info_type;
2721 if (vect_print_dump_info (REPORT_DETAILS))
fprintf (vect_dump, "transform store. ncopies = %d", ncopies);
2726 first_stmt = DR_GROUP_FIRST_DR (stmt_info);
2727 first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2728 group_size = DR_GROUP_SIZE (vinfo_for_stmt (first_stmt));
2730 DR_GROUP_STORE_COUNT (vinfo_for_stmt (first_stmt))++;
2732 /* We vectorize all the stmts of the interleaving group when we
2733 reach the last stmt in the group. */
2734 if (DR_GROUP_STORE_COUNT (vinfo_for_stmt (first_stmt))
2735 < DR_GROUP_SIZE (vinfo_for_stmt (first_stmt)))
2737 *vec_stmt = NULL_TREE;
2748 dr_chain = VEC_alloc (tree, heap, group_size);
2749 oprnds = VEC_alloc (tree, heap, group_size);
alignment_support_scheme = vect_supportable_dr_alignment (first_dr);
gcc_assert (alignment_support_scheme);
gcc_assert (alignment_support_scheme == dr_aligned); /* FORNOW */
2755 /* In case the vectorization factor (VF) is bigger than the number
2756 of elements that we can fit in a vectype (nunits), we have to generate
2757 more than one vector stmt - i.e - we need to "unroll" the
2758 vector stmt by a factor VF/nunits. For more details see documentation in
2759 vect_get_vec_def_for_copy_stmt. */
2761 /* In case of interleaving (non-unit strided access):
We create vectorized stores starting from the base address (the access of
the first stmt in the chain, S2 in the above example) when the last store stmt
2770 of the chain (S4) is reached:
2773 VS2: &base + vec_size*1 = vx0
2774 VS3: &base + vec_size*2 = vx1
2775 VS4: &base + vec_size*3 = vx3
2777 Then permutation statements are generated:
2779 VS5: vx5 = VEC_INTERLEAVE_HIGH_EXPR < vx0, vx3 >
2780 VS6: vx6 = VEC_INTERLEAVE_LOW_EXPR < vx0, vx3 >
2783 And they are put in STMT_VINFO_VEC_STMT of the corresponding scalar stmts
2784 (the order of the data-refs in the output of vect_permute_store_chain
2785 corresponds to the order of scalar stmts in the interleaving chain - see
2786 the documentation of vect_permute_store_chain()).
2788 In case of both multiple types and interleaving, above vector stores and
2789 permutation stmts are created for every copy. The result vector stmts are
2790 put in STMT_VINFO_VEC_STMT for the first copy and in the corresponding
STMT_VINFO_RELATED_STMT for the next copies.  */
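/* For illustration (not from the original sources): such a chain
   typically originates from a scalar loop storing to interleaved
   locations, e.g.

       for (i = 0; i < N; i++)
         {
           out[2*i]     = a[i];
           out[2*i + 1] = b[i];
         }

   The defs of both stores are collected in DR_CHAIN, reordered by
   vect_permute_store_chain, and emitted as consecutive vector stores
   from the base address.  */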
2794 prev_stmt_info = NULL;
2795 for (j = 0; j < ncopies; j++)
2802 /* For interleaved stores we collect vectorized defs for all the
2803 stores in the group in DR_CHAIN and OPRNDS. DR_CHAIN is then used
2804 as an input to vect_permute_store_chain(), and OPRNDS as an input
2805 to vect_get_vec_def_for_stmt_copy() for the next copy.
2806 If the store is not strided, GROUP_SIZE is 1, and DR_CHAIN and
OPRNDS are of size 1.  */
2809 next_stmt = first_stmt;
2810 for (i = 0; i < group_size; i++)
2812 /* Since gaps are not supported for interleaved stores, GROUP_SIZE
2813 is the exact number of stmts in the chain. Therefore, NEXT_STMT
2814 can't be NULL_TREE. In case that there is no interleaving,
GROUP_SIZE is 1, and only one iteration of the loop will be executed.  */
2818 gcc_assert (next_stmt);
2819 op = GIMPLE_STMT_OPERAND (next_stmt, 1);
2820 vec_oprnd = vect_get_vec_def_for_operand (op, next_stmt, NULL);
2821 VEC_quick_push(tree, dr_chain, vec_oprnd);
2822 VEC_quick_push(tree, oprnds, vec_oprnd);
2823 next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt));
2825 dataref_ptr = vect_create_data_ref_ptr (first_stmt, bsi, NULL_TREE,
2826 &dummy, &ptr_incr, false,
2827 TREE_TYPE (vec_oprnd));
2831 /* For interleaved stores we created vectorized defs for all the
2832 defs stored in OPRNDS in the previous iteration (previous copy).
2833 DR_CHAIN is then used as an input to vect_permute_store_chain(),
2834 and OPRNDS as an input to vect_get_vec_def_for_stmt_copy() for the
2836 If the store is not strided, GROUP_SIZE is 1, and DR_CHAIN and
OPRNDS are of size 1.  */
2839 for (i = 0; i < group_size; i++)
2841 vec_oprnd = vect_get_vec_def_for_stmt_copy (dt,
2842 VEC_index (tree, oprnds, i));
2843 VEC_replace(tree, dr_chain, i, vec_oprnd);
2844 VEC_replace(tree, oprnds, i, vec_oprnd);
2846 dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, bsi, stmt);
2851 result_chain = VEC_alloc (tree, heap, group_size);
2853 if (!vect_permute_store_chain (dr_chain, group_size, stmt, bsi,
2858 next_stmt = first_stmt;
2859 for (i = 0; i < group_size; i++)
2861 /* For strided stores vectorized defs are interleaved in
2862 vect_permute_store_chain(). */
2864 vec_oprnd = VEC_index(tree, result_chain, i);
2866 data_ref = build_fold_indirect_ref (dataref_ptr);
2867 /* Arguments are ready. Create the new vector stmt. */
new_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node, data_ref, vec_oprnd);
2870 vect_finish_stmt_generation (stmt, new_stmt, bsi);
2872 /* Set the VDEFs for the vector pointer. If this virtual def
2873 has a use outside the loop and a loop peel is performed
2874 then the def may be renamed by the peel. Mark it for
2875 renaming so the later use will also be renamed. */
2876 copy_virtual_operands (new_stmt, next_stmt);
/* The original store is deleted so the same SSA_NAMEs can be used.  */
2881 FOR_EACH_SSA_TREE_OPERAND (def, next_stmt, iter, SSA_OP_VDEF)
2883 SSA_NAME_DEF_STMT (def) = new_stmt;
2884 mark_sym_for_renaming (SSA_NAME_VAR (def));
2887 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
2891 /* Create new names for all the definitions created by COPY and
2892 add replacement mappings for each new name. */
2893 FOR_EACH_SSA_DEF_OPERAND (def_p, new_stmt, iter, SSA_OP_VDEF)
2895 create_new_def_for (DEF_FROM_PTR (def_p), new_stmt, def_p);
2896 mark_sym_for_renaming (SSA_NAME_VAR (DEF_FROM_PTR (def_p)));
2899 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
2902 prev_stmt_info = vinfo_for_stmt (new_stmt);
2903 next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt));
2906 /* Bump the vector pointer. */
2907 dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, bsi, stmt);
2915 /* Function vect_setup_realignment
2917 This function is called when vectorizing an unaligned load using
2918 the dr_unaligned_software_pipeline scheme.
2919 This function generates the following code at the loop prolog:
2922 msq_init = *(floor(p)); # prolog load
2923 realignment_token = call target_builtin;
2925 msq = phi (msq_init, ---)
2927 The code above sets up a new (vector) pointer, pointing to the first
2928 location accessed by STMT, and a "floor-aligned" load using that pointer.
2929 It also generates code to compute the "realignment-token" (if the relevant
2930 target hook was defined), and creates a phi-node at the loop-header bb
2931 whose arguments are the result of the prolog-load (created by this
2932 function) and the result of a load that takes place in the loop (to be
2933 created by the caller to this function).
2934 The caller to this function uses the phi-result (msq) to create the
2935 realignment code inside the loop, and sets up the missing phi argument,
2939 msq = phi (msq_init, lsq)
2940 lsq = *(floor(p')); # load in loop
2941 result = realign_load (msq, lsq, realignment_token);
2944 STMT - (scalar) load stmt to be vectorized. This load accesses
2945 a memory location that may be unaligned.
2946 BSI - place where new code is to be inserted.
2949 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
2950 target hook, if defined.
2951 Return value - the result of the loop-header phi node. */
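/* A worked example (illustrative only): suppose p is 4 bytes past a
   16-byte boundary and VS is 16 bytes. Then msq = *(floor(p)) loads
   bytes [p-4, p+12) in the prolog, the in-loop lsq loads the next
   aligned vector, and realign_load shifts the <msq, lsq> pair by the
   misalignment (encoded in realignment_token) to produce the desired
   unaligned value [p, p+16).  */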
2954 vect_setup_realignment (tree stmt, block_stmt_iterator *bsi,
2955 tree *realignment_token)
2957 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2958 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2959 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2960 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2961 edge pe = loop_preheader_edge (loop);
2962 tree scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
2975 /* 1. Create msq_init = *(floor(p1)) in the loop preheader */
2976 vec_dest = vect_create_destination_var (scalar_dest, vectype);
ptr = vect_create_data_ref_ptr (stmt, bsi, NULL_TREE, &init_addr, &inc, true, NULL_TREE);
2979 data_ref = build1 (ALIGN_INDIRECT_REF, vectype, ptr);
2980 new_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node, vec_dest, data_ref);
2981 new_temp = make_ssa_name (vec_dest, new_stmt);
2982 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
2983 new_bb = bsi_insert_on_edge_immediate (pe, new_stmt);
2984 gcc_assert (!new_bb);
2985 msq_init = GIMPLE_STMT_OPERAND (new_stmt, 0);
2986 copy_virtual_operands (new_stmt, stmt);
2987 update_vuses_to_preheader (new_stmt, loop);
2989 /* 2. Create permutation mask, if required, in loop preheader. */
2990 if (targetm.vectorize.builtin_mask_for_load)
2993 tree params = build_tree_list (NULL_TREE, init_addr);
2995 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
2996 new_stmt = build_function_call_expr (builtin_decl, params);
2997 vec_dest = vect_create_destination_var (scalar_dest,
2998 TREE_TYPE (new_stmt));
new_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node, vec_dest, new_stmt);
3001 new_temp = make_ssa_name (vec_dest, new_stmt);
3002 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
3003 new_bb = bsi_insert_on_edge_immediate (pe, new_stmt);
3004 gcc_assert (!new_bb);
3005 *realignment_token = GIMPLE_STMT_OPERAND (new_stmt, 0);
3007 /* The result of the CALL_EXPR to this builtin is determined from
3008 the value of the parameter and no global variables are touched
3009 which makes the builtin a "const" function. Requiring the
3010 builtin to have the "const" attribute makes it unnecessary
3011 to call mark_call_clobbered. */
3012 gcc_assert (TREE_READONLY (builtin_decl));
3015 /* 3. Create msq = phi <msq_init, lsq> in loop */
3016 vec_dest = vect_create_destination_var (scalar_dest, vectype);
3017 msq = make_ssa_name (vec_dest, NULL_TREE);
3018 phi_stmt = create_phi_node (msq, loop->header);
3019 SSA_NAME_DEF_STMT (msq) = phi_stmt;
3020 add_phi_arg (phi_stmt, msq_init, loop_preheader_edge (loop));
3026 /* Function vect_strided_load_supported.
Returns TRUE if EXTRACT_EVEN and EXTRACT_ODD operations are supported,
3029 and FALSE otherwise. */
3032 vect_strided_load_supported (tree vectype)
3034 optab perm_even_optab, perm_odd_optab;
3037 mode = (int) TYPE_MODE (vectype);
3039 perm_even_optab = optab_for_tree_code (VEC_EXTRACT_EVEN_EXPR, vectype);
3040 if (!perm_even_optab)
3042 if (vect_print_dump_info (REPORT_DETAILS))
3043 fprintf (vect_dump, "no optab for perm_even.");
3047 if (perm_even_optab->handlers[mode].insn_code == CODE_FOR_nothing)
3049 if (vect_print_dump_info (REPORT_DETAILS))
3050 fprintf (vect_dump, "perm_even op not supported by target.");
3054 perm_odd_optab = optab_for_tree_code (VEC_EXTRACT_ODD_EXPR, vectype);
3055 if (!perm_odd_optab)
3057 if (vect_print_dump_info (REPORT_DETAILS))
3058 fprintf (vect_dump, "no optab for perm_odd.");
3062 if (perm_odd_optab->handlers[mode].insn_code == CODE_FOR_nothing)
3064 if (vect_print_dump_info (REPORT_DETAILS))
3065 fprintf (vect_dump, "perm_odd op not supported by target.");
3072 /* Function vect_permute_load_chain.
3074 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
3075 a power of 2, generate extract_even/odd stmts to reorder the input data
3076 correctly. Return the final references for loads in RESULT_CHAIN.
3078 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
3079 The input is 4 vectors each containing 8 elements. We assign a number to each
3080 element, the input sequence is:
3082 1st vec: 0 1 2 3 4 5 6 7
3083 2nd vec: 8 9 10 11 12 13 14 15
3084 3rd vec: 16 17 18 19 20 21 22 23
3085 4th vec: 24 25 26 27 28 29 30 31
3087 The output sequence should be:
3089 1st vec: 0 4 8 12 16 20 24 28
3090 2nd vec: 1 5 9 13 17 21 25 29
3091 3rd vec: 2 6 10 14 18 22 26 30
3092 4th vec: 3 7 11 15 19 23 27 31
3094 i.e., the first output vector should contain the first elements of each
3095 interleaving group, etc.
3097 We use extract_even/odd instructions to create such output. The input of each
extract_even/odd operation is two vectors:

1st vec: 0 1 2 3
2nd vec: 4 5 6 7
3102 and the output is the vector of extracted even/odd elements. The output of
3103 extract_even will be: 0 2 4 6
3104 and of extract_odd: 1 3 5 7
3107 The permutation is done in log LENGTH stages. In each stage extract_even and
3108 extract_odd stmts are created for each pair of vectors in DR_CHAIN in their
3109 order. In our example,
3111 E1: extract_even (1st vec, 2nd vec)
3112 E2: extract_odd (1st vec, 2nd vec)
3113 E3: extract_even (3rd vec, 4th vec)
3114 E4: extract_odd (3rd vec, 4th vec)
3116 The output for the first stage will be:
3118 E1: 0 2 4 6 8 10 12 14
3119 E2: 1 3 5 7 9 11 13 15
3120 E3: 16 18 20 22 24 26 28 30
3121 E4: 17 19 21 23 25 27 29 31
3123 In order to proceed and create the correct sequence for the next stage (or
3124 for the correct output, if the second stage is the last one, as in our
3125 example), we first put the output of extract_even operation and then the
3126 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
3127 The input for the second stage is:
3129 1st vec (E1): 0 2 4 6 8 10 12 14
3130 2nd vec (E3): 16 18 20 22 24 26 28 30
3131 3rd vec (E2): 1 3 5 7 9 11 13 15
3132 4th vec (E4): 17 19 21 23 25 27 29 31
3134 The output of the second stage:
3136 E1: 0 4 8 12 16 20 24 28
3137 E2: 2 6 10 14 18 22 26 30
3138 E3: 1 5 9 13 17 21 25 29
3139 E4: 3 7 11 15 19 23 27 31
3141 And RESULT_CHAIN after reordering:
3143 1st vec (E1): 0 4 8 12 16 20 24 28
3144 2nd vec (E3): 1 5 9 13 17 21 25 29
3145 3rd vec (E2): 2 6 10 14 18 22 26 30
3146 4th vec (E4): 3 7 11 15 19 23 27 31. */
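/* A rough scalar model of one extract stage (illustrative only; concat
   stands for the 2*nunits-element concatenation of the two inputs):

       for (k = 0; k < nunits; k++)
         {
           even[k] = concat[2*k];        // extract_even
           odd[k]  = concat[2*k + 1];    // extract_odd
         }
*/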
3149 vect_permute_load_chain (VEC(tree,heap) *dr_chain,
3150 unsigned int length,
tree stmt,
block_stmt_iterator *bsi,
3153 VEC(tree,heap) **result_chain)
3155 tree perm_dest, perm_stmt, data_ref, first_vect, second_vect;
3156 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
3160 /* Check that the operation is supported. */
3161 if (!vect_strided_load_supported (vectype))
3164 *result_chain = VEC_copy (tree, heap, dr_chain);
3165 for (i = 0; i < exact_log2 (length); i++)
for (j = 0; j < length; j += 2)
3169 first_vect = VEC_index (tree, dr_chain, j);
3170 second_vect = VEC_index (tree, dr_chain, j+1);
3172 /* data_ref = permute_even (first_data_ref, second_data_ref); */
3173 perm_dest = create_tmp_var (vectype, "vect_perm_even");
3174 add_referenced_var (perm_dest);
3176 perm_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node, perm_dest,
3177 build2 (VEC_EXTRACT_EVEN_EXPR, vectype,
3178 first_vect, second_vect));
3180 data_ref = make_ssa_name (perm_dest, perm_stmt);
3181 GIMPLE_STMT_OPERAND (perm_stmt, 0) = data_ref;
3182 vect_finish_stmt_generation (stmt, perm_stmt, bsi);
3183 mark_symbols_for_renaming (perm_stmt);
3185 VEC_replace (tree, *result_chain, j/2, data_ref);
3187 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
3188 perm_dest = create_tmp_var (vectype, "vect_perm_odd");
3189 add_referenced_var (perm_dest);
3191 perm_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node, perm_dest,
3192 build2 (VEC_EXTRACT_ODD_EXPR, vectype,
3193 first_vect, second_vect));
3194 data_ref = make_ssa_name (perm_dest, perm_stmt);
3195 GIMPLE_STMT_OPERAND (perm_stmt, 0) = data_ref;
3196 vect_finish_stmt_generation (stmt, perm_stmt, bsi);
3197 mark_symbols_for_renaming (perm_stmt);
3199 VEC_replace (tree, *result_chain, j/2+length/2, data_ref);
3201 dr_chain = VEC_copy (tree, heap, *result_chain);
3207 /* Function vect_transform_strided_load.
3209 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
3210 to perform their permutation and ascribe the result vectorized statements to
the scalar statements.  */
3215 vect_transform_strided_load (tree stmt, VEC(tree,heap) *dr_chain, int size,
3216 block_stmt_iterator *bsi)
3218 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3219 tree first_stmt = DR_GROUP_FIRST_DR (stmt_info);
3220 tree next_stmt, new_stmt;
3221 VEC(tree,heap) *result_chain = NULL;
3222 unsigned int i, gap_count;
3225 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
3226 RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
3227 vectors, that are ready for vector computation. */
3228 result_chain = VEC_alloc (tree, heap, size);
3230 if (!vect_permute_load_chain (dr_chain, size, stmt, bsi, &result_chain))
3233 /* Put a permuted data-ref in the VECTORIZED_STMT field.
Since we scan the chain starting from its first node, their order
corresponds to the order of data-refs in RESULT_CHAIN.  */
3236 next_stmt = first_stmt;
3238 for (i = 0; VEC_iterate(tree, result_chain, i, tmp_data_ref); i++)
/* Skip the gaps. Loads created for the gaps will be removed by the dead
code elimination pass later.
3245 DR_GROUP_GAP is the number of steps in elements from the previous
3246 access (if there is no gap DR_GROUP_GAP is 1). We skip loads that
correspond to the gaps.  */
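/* For illustration (a hypothetical access pattern, not from the original
   sources): if the chain loads only a[4i] and a[4i+2] out of every four
   consecutive elements, the permuted vectors corresponding to the unused
   a[4i+1] and a[4i+3] are skipped here, and the loads that produced them
   are left for dce to remove.  */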
3249 if (gap_count < DR_GROUP_GAP (vinfo_for_stmt (next_stmt)))
3257 new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
3258 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
copies, and we put the new vector statement in the first available
RELATED_STMT field.  */
3261 if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
3262 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
3265 tree prev_stmt = STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
3266 tree rel_stmt = STMT_VINFO_RELATED_STMT (
3267 vinfo_for_stmt (prev_stmt));
3270 prev_stmt = rel_stmt;
3271 rel_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
3273 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) = new_stmt;
3275 next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt));
3277 /* If NEXT_STMT accesses the same DR as the previous statement,
3278 put the same TMP_DATA_REF as its vectorized statement; otherwise
3279 get the next data-ref from RESULT_CHAIN. */
3280 if (!next_stmt || !DR_GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
3288 /* vectorizable_load.
Check if STMT reads a non-scalar data-ref (array/pointer/structure) that
can be vectorized.
3292 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
3293 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
3294 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
3297 vectorizable_load (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
3300 tree vec_dest = NULL;
3301 tree data_ref = NULL;
3303 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3304 stmt_vec_info prev_stmt_info;
3305 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3306 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3307 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info), *first_dr;
3308 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3311 tree new_stmt = NULL_TREE;
enum dr_alignment_support alignment_support_scheme;
3314 tree dataref_ptr = NULL_TREE;
3316 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
3317 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
3318 int i, j, group_size;
3319 tree msq = NULL_TREE, lsq;
3320 tree offset = NULL_TREE;
3321 tree realignment_token = NULL_TREE;
3322 tree phi_stmt = NULL_TREE;
3323 VEC(tree,heap) *dr_chain = NULL;
3324 bool strided_load = false;
3327 /* Is vectorizable load? */
3328 if (!STMT_VINFO_RELEVANT_P (stmt_info))
3331 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_loop_def);
3333 if (STMT_VINFO_LIVE_P (stmt_info))
3335 /* FORNOW: not yet supported. */
3336 if (vect_print_dump_info (REPORT_DETAILS))
3337 fprintf (vect_dump, "value used after loop.");
3341 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
3344 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
3345 if (TREE_CODE (scalar_dest) != SSA_NAME)
3348 op = GIMPLE_STMT_OPERAND (stmt, 1);
3349 if (TREE_CODE (op) != ARRAY_REF
3350 && TREE_CODE (op) != INDIRECT_REF
3351 && !DR_GROUP_FIRST_DR (stmt_info))
3354 if (!STMT_VINFO_DATA_REF (stmt_info))
3357 mode = (int) TYPE_MODE (vectype);
3359 /* FORNOW. In some cases can vectorize even if data-type not supported
3360 (e.g. - data copies). */
3361 if (mov_optab->handlers[mode].insn_code == CODE_FOR_nothing)
3363 if (vect_print_dump_info (REPORT_DETAILS))
3364 fprintf (vect_dump, "Aligned load, but unsupported type.");
3368 /* Check if the load is a part of an interleaving chain. */
3369 if (DR_GROUP_FIRST_DR (stmt_info))
3371 strided_load = true;
3373 /* Check if interleaving is supported. */
3374 if (!vect_strided_load_supported (vectype))
3378 if (!vec_stmt) /* transformation not required. */
3380 STMT_VINFO_TYPE (stmt_info) = load_vec_info_type;
3386 if (vect_print_dump_info (REPORT_DETAILS))
3387 fprintf (vect_dump, "transform load.");
3391 first_stmt = DR_GROUP_FIRST_DR (stmt_info);
3392 /* Check if the chain of loads is already vectorized. */
3393 if (STMT_VINFO_VEC_STMT (vinfo_for_stmt (first_stmt)))
3395 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
3398 first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
3399 group_size = DR_GROUP_SIZE (vinfo_for_stmt (first_stmt));
3400 dr_chain = VEC_alloc (tree, heap, group_size);
alignment_support_scheme = vect_supportable_dr_alignment (first_dr);
gcc_assert (alignment_support_scheme);
3413 /* In case the vectorization factor (VF) is bigger than the number
3414 of elements that we can fit in a vectype (nunits), we have to generate
3415 more than one vector stmt - i.e - we need to "unroll" the
3416 vector stmt by a factor VF/nunits. In doing so, we record a pointer
3417 from one copy of the vector stmt to the next, in the field
3418 STMT_VINFO_RELATED_STMT. This is necessary in order to allow following
3419 stages to find the correct vector defs to be used when vectorizing
3420 stmts that use the defs of the current stmt. The example below illustrates
3421 the vectorization process when VF=16 and nunits=4 (i.e - we need to create
3422 4 vectorized stmts):
3424 before vectorization:
RELATED_STMT VEC_STMT
S1: x = load - -
S2: z = x + 1 - -
3429 step 1: vectorize stmt S1:
3430 We first create the vector stmt VS1_0, and, as usual, record a
3431 pointer to it in the STMT_VINFO_VEC_STMT of the scalar stmt S1.
3432 Next, we create the vector stmt VS1_1, and record a pointer to
3433 it in the STMT_VINFO_RELATED_STMT of the vector stmt VS1_0.
3434 Similarly, for VS1_2 and VS1_3. This is the resulting chain of
3436 RELATED_STMT VEC_STMT
3437 VS1_0: vx0 = memref0 VS1_1 -
3438 VS1_1: vx1 = memref1 VS1_2 -
3439 VS1_2: vx2 = memref2 VS1_3 -
3440 VS1_3: vx3 = memref3 - -
3441 S1: x = load - VS1_0
3444 See in documentation in vect_get_vec_def_for_stmt_copy for how the
information we recorded in RELATED_STMT field is used to vectorize stmt S2.  */
3448 /* In case of interleaving (non-unit strided access):
3455 Vectorized loads are created in the order of memory accesses
3456 starting from the access of the first stmt of the chain:
3459 VS2: vx1 = &base + vec_size*1
3460 VS3: vx3 = &base + vec_size*2
3461 VS4: vx4 = &base + vec_size*3
3463 Then permutation statements are generated:
3465 VS5: vx5 = VEC_EXTRACT_EVEN_EXPR < vx0, vx1 >
3466 VS6: vx6 = VEC_EXTRACT_ODD_EXPR < vx0, vx1 >
3469 And they are put in STMT_VINFO_VEC_STMT of the corresponding scalar stmts
3470 (the order of the data-refs in the output of vect_permute_load_chain
3471 corresponds to the order of scalar stmts in the interleaving chain - see
3472 the documentation of vect_permute_load_chain()).
3473 The generation of permutation stmts and recording them in
3474 STMT_VINFO_VEC_STMT is done in vect_transform_strided_load().
3476 In case of both multiple types and interleaving, the vector loads and
3477 permutation stmts above are created for every copy. The result vector stmts
3478 are put in STMT_VINFO_VEC_STMT for the first copy and in the corresponding
3479 STMT_VINFO_RELATED_STMT for the next copies. */
3481 /* If the data reference is aligned (dr_aligned) or potentially unaligned
3482 on a target that supports unaligned accesses (dr_unaligned_supported)
3483 we generate the following code:
3487 p = p + indx * vectype_size;
3492 Otherwise, the data reference is potentially unaligned on a target that
3493 does not support unaligned accesses (dr_unaligned_software_pipeline) -
3494 then generate the following code, in which the data in each iteration is
3495 obtained by two vector loads, one from the previous iteration, and one
3496 from the current iteration:
3498 msq_init = *(floor(p1))
3499 p2 = initial_addr + VS - 1;
3500 realignment_token = call target_builtin;
3503 p2 = p2 + indx * vectype_size
vec_dest = realign_load (msq, lsq, realignment_token)
msq = lsq;
}  */
if (alignment_support_scheme == dr_unaligned_software_pipeline)
3512 msq = vect_setup_realignment (first_stmt, bsi, &realignment_token);
3513 phi_stmt = SSA_NAME_DEF_STMT (msq);
3514 offset = size_int (TYPE_VECTOR_SUBPARTS (vectype) - 1);
3517 prev_stmt_info = NULL;
3518 for (j = 0; j < ncopies; j++)
3520 /* 1. Create the vector pointer update chain. */
3522 dataref_ptr = vect_create_data_ref_ptr (first_stmt, bsi, offset, &dummy,
3523 &ptr_incr, false, NULL_TREE);
3525 dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, bsi, stmt);
3527 for (i = 0; i < group_size; i++)
3529 /* 2. Create the vector-load in the loop. */
switch (alignment_support_scheme)
3533 gcc_assert (aligned_access_p (first_dr));
3534 data_ref = build_fold_indirect_ref (dataref_ptr);
3536 case dr_unaligned_supported:
3538 int mis = DR_MISALIGNMENT (first_dr);
3539 tree tmis = (mis == -1 ? size_zero_node : size_int (mis));
3541 gcc_assert (!aligned_access_p (first_dr));
3542 tmis = size_binop (MULT_EXPR, tmis, size_int(BITS_PER_UNIT));
data_ref = build2 (MISALIGNED_INDIRECT_REF, vectype, dataref_ptr, tmis);
3547 case dr_unaligned_software_pipeline:
3548 gcc_assert (!aligned_access_p (first_dr));
3549 data_ref = build1 (ALIGN_INDIRECT_REF, vectype, dataref_ptr);
3554 vec_dest = vect_create_destination_var (scalar_dest, vectype);
new_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node, vec_dest, data_ref);
3557 new_temp = make_ssa_name (vec_dest, new_stmt);
3558 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
3559 vect_finish_stmt_generation (stmt, new_stmt, bsi);
3560 copy_virtual_operands (new_stmt, stmt);
3561 mark_symbols_for_renaming (new_stmt);
3563 /* 3. Handle explicit realignment if necessary/supported. */
if (alignment_support_scheme == dr_unaligned_software_pipeline)
/* Create:
<vec_dest = realign_load (msq, lsq, realignment_token)>  */
3568 lsq = GIMPLE_STMT_OPERAND (new_stmt, 0);
3569 if (!realignment_token)
3570 realignment_token = dataref_ptr;
3571 vec_dest = vect_create_destination_var (scalar_dest, vectype);
new_stmt = build3 (REALIGN_LOAD_EXPR, vectype, msq, lsq, realignment_token);
new_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node, vec_dest, new_stmt);
3576 new_temp = make_ssa_name (vec_dest, new_stmt);
3577 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
3578 vect_finish_stmt_generation (stmt, new_stmt, bsi);
3579 if (i == group_size - 1 && j == ncopies - 1)
3580 add_phi_arg (phi_stmt, lsq, loop_latch_edge (loop));
3584 VEC_quick_push (tree, dr_chain, new_temp);
3585 if (i < group_size - 1)
3586 dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, bsi, stmt);
3591 if (!vect_transform_strided_load (stmt, dr_chain, group_size, bsi))
3593 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
3594 dr_chain = VEC_alloc (tree, heap, group_size);
3599 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
3601 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3602 prev_stmt_info = vinfo_for_stmt (new_stmt);
3610 /* Function vectorizable_live_operation.
3612 STMT computes a value that is used outside the loop. Check if
3613 it can be supported. */
3616 vectorizable_live_operation (tree stmt,
3617 block_stmt_iterator *bsi ATTRIBUTE_UNUSED,
3618 tree *vec_stmt ATTRIBUTE_UNUSED)
3621 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3622 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3624 enum tree_code code;
3628 enum vect_def_type dt;
3630 if (!STMT_VINFO_LIVE_P (stmt_info))
3633 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
3636 if (TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 0)) != SSA_NAME)
3639 operation = GIMPLE_STMT_OPERAND (stmt, 1);
3640 code = TREE_CODE (operation);
3642 op_type = TREE_CODE_LENGTH (code);
3644 /* FORNOW: support only if all uses are invariant. This means
3645 that the scalar operations can remain in place, unvectorized.
The original last scalar value that they compute will be used.  */
3648 for (i = 0; i < op_type; i++)
3650 op = TREE_OPERAND (operation, i);
3651 if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt))
3653 if (vect_print_dump_info (REPORT_DETAILS))
3654 fprintf (vect_dump, "use not simple.");
3658 if (dt != vect_invariant_def && dt != vect_constant_def)
3662 /* No transformation is required for the cases we currently support. */
3667 /* Function vect_is_simple_cond.
3670 LOOP - the loop that is being vectorized.
3671 COND - Condition that is checked for simple use.
3673 Returns whether a COND can be vectorized. Checks whether
condition operands are supportable using vect_is_simple_use. */
3677 vect_is_simple_cond (tree cond, loop_vec_info loop_vinfo)
3681 enum vect_def_type dt;
3683 if (!COMPARISON_CLASS_P (cond))
3686 lhs = TREE_OPERAND (cond, 0);
3687 rhs = TREE_OPERAND (cond, 1);
3689 if (TREE_CODE (lhs) == SSA_NAME)
3691 tree lhs_def_stmt = SSA_NAME_DEF_STMT (lhs);
3692 if (!vect_is_simple_use (lhs, loop_vinfo, &lhs_def_stmt, &def, &dt))
3695 else if (TREE_CODE (lhs) != INTEGER_CST && TREE_CODE (lhs) != REAL_CST)
3698 if (TREE_CODE (rhs) == SSA_NAME)
3700 tree rhs_def_stmt = SSA_NAME_DEF_STMT (rhs);
3701 if (!vect_is_simple_use (rhs, loop_vinfo, &rhs_def_stmt, &def, &dt))
3704 else if (TREE_CODE (rhs) != INTEGER_CST && TREE_CODE (rhs) != REAL_CST)
3710 /* vectorizable_condition.
Check if STMT is a conditional modify expression that can be vectorized.
3713 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
stmt using VEC_COND_EXPR to replace it, put it in VEC_STMT, and insert it at BSI.
3717 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
3720 vectorizable_condition (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
3722 tree scalar_dest = NULL_TREE;
3723 tree vec_dest = NULL_TREE;
3724 tree op = NULL_TREE;
3725 tree cond_expr, then_clause, else_clause;
3726 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3727 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3728 tree vec_cond_lhs, vec_cond_rhs, vec_then_clause, vec_else_clause;
3729 tree vec_compare, vec_cond_expr;
3731 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3732 enum machine_mode vec_mode;
3734 enum vect_def_type dt;
3735 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
3736 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
3738 gcc_assert (ncopies >= 1);
if (ncopies > 1)
  return false; /* FORNOW */
3742 if (!STMT_VINFO_RELEVANT_P (stmt_info))
3745 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_loop_def);
3747 if (STMT_VINFO_LIVE_P (stmt_info))
3749 /* FORNOW: not yet supported. */
3750 if (vect_print_dump_info (REPORT_DETAILS))
3751 fprintf (vect_dump, "value used after loop.");
3755 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
3758 op = GIMPLE_STMT_OPERAND (stmt, 1);
3760 if (TREE_CODE (op) != COND_EXPR)
3763 cond_expr = TREE_OPERAND (op, 0);
3764 then_clause = TREE_OPERAND (op, 1);
3765 else_clause = TREE_OPERAND (op, 2);
3767 if (!vect_is_simple_cond (cond_expr, loop_vinfo))
/* We do not handle two different vector types for the condition
and the values.  */
3772 if (TREE_TYPE (TREE_OPERAND (cond_expr, 0)) != TREE_TYPE (vectype))
3775 if (TREE_CODE (then_clause) == SSA_NAME)
3777 tree then_def_stmt = SSA_NAME_DEF_STMT (then_clause);
3778 if (!vect_is_simple_use (then_clause, loop_vinfo,
3779 &then_def_stmt, &def, &dt))
3782 else if (TREE_CODE (then_clause) != INTEGER_CST
3783 && TREE_CODE (then_clause) != REAL_CST)
3786 if (TREE_CODE (else_clause) == SSA_NAME)
3788 tree else_def_stmt = SSA_NAME_DEF_STMT (else_clause);
3789 if (!vect_is_simple_use (else_clause, loop_vinfo,
3790 &else_def_stmt, &def, &dt))
3791 return false;
3793 else if (TREE_CODE (else_clause) != INTEGER_CST
3794 && TREE_CODE (else_clause) != REAL_CST)
3795 return false;
3798 vec_mode = TYPE_MODE (vectype);
3800 if (!vec_stmt)
3801 {
3802 STMT_VINFO_TYPE (stmt_info) = condition_vec_info_type;
3803 return expand_vec_cond_expr_p (op, vec_mode);
3804 }
3809 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
3810 vec_dest = vect_create_destination_var (scalar_dest, vectype);
3812 /* Handle cond expr. */
3813 vec_cond_lhs =
3814 vect_get_vec_def_for_operand (TREE_OPERAND (cond_expr, 0), stmt, NULL);
3815 vec_cond_rhs =
3816 vect_get_vec_def_for_operand (TREE_OPERAND (cond_expr, 1), stmt, NULL);
3817 vec_then_clause = vect_get_vec_def_for_operand (then_clause, stmt, NULL);
3818 vec_else_clause = vect_get_vec_def_for_operand (else_clause, stmt, NULL);
3820 /* Arguments are ready. Create the new vector stmt. */
3821 vec_compare = build2 (TREE_CODE (cond_expr), vectype,
3822 vec_cond_lhs, vec_cond_rhs);
3823 vec_cond_expr = build3 (VEC_COND_EXPR, vectype,
3824 vec_compare, vec_then_clause, vec_else_clause);
3826 *vec_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node, vec_dest,
3827 vec_cond_expr);
3828 new_temp = make_ssa_name (vec_dest, *vec_stmt);
3829 GIMPLE_STMT_OPERAND (*vec_stmt, 0) = new_temp;
3830 vect_finish_stmt_generation (stmt, *vec_stmt, bsi);
3832 return true;
3835 /* Function vect_transform_stmt.
3837 Create a vectorized stmt to replace STMT, and insert it at BSI. */
3839 static bool
3840 vect_transform_stmt (tree stmt, block_stmt_iterator *bsi, bool *strided_store)
3842 bool is_store = false;
3843 tree vec_stmt = NULL_TREE;
3844 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3845 tree orig_stmt_in_pattern;
3846 bool done;
3848 if (STMT_VINFO_RELEVANT_P (stmt_info))
3850 switch (STMT_VINFO_TYPE (stmt_info))
3852 case type_demotion_vec_info_type:
3853 done = vectorizable_type_demotion (stmt, bsi, &vec_stmt);
3854 gcc_assert (done);
3855 break;
3857 case type_promotion_vec_info_type:
3858 done = vectorizable_type_promotion (stmt, bsi, &vec_stmt);
3859 gcc_assert (done);
3860 break;
3862 case op_vec_info_type:
3863 done = vectorizable_operation (stmt, bsi, &vec_stmt);
3864 gcc_assert (done);
3865 break;
3867 case assignment_vec_info_type:
3868 done = vectorizable_assignment (stmt, bsi, &vec_stmt);
3869 gcc_assert (done);
3870 break;
3872 case load_vec_info_type:
3873 done = vectorizable_load (stmt, bsi, &vec_stmt);
3874 gcc_assert (done);
3875 break;
3877 case store_vec_info_type:
3878 done = vectorizable_store (stmt, bsi, &vec_stmt);
3879 gcc_assert (done);
3880 if (DR_GROUP_FIRST_DR (stmt_info))
3882 /* In case of interleaving, the whole chain is vectorized when the
3883 last store in the chain is reached. Store stmts before the last
3884 one are skipped, and their vec_stmt_info shouldn't be freed
3885 meanwhile. */
3886 *strided_store = true;
3887 if (STMT_VINFO_VEC_STMT (stmt_info))
3888 is_store = true;
3894 case condition_vec_info_type:
3895 done = vectorizable_condition (stmt, bsi, &vec_stmt);
3896 gcc_assert (done);
3897 break;
3899 case call_vec_info_type:
3900 done = vectorizable_call (stmt, bsi, &vec_stmt);
3901 break;
3903 default:
3904 if (vect_print_dump_info (REPORT_DETAILS))
3905 fprintf (vect_dump, "stmt not supported.");
3906 gcc_assert (0);
3909 gcc_assert (vec_stmt || *strided_store);
3910 if (vec_stmt)
3911 {
3912 STMT_VINFO_VEC_STMT (stmt_info) = vec_stmt;
3913 orig_stmt_in_pattern = STMT_VINFO_RELATED_STMT (stmt_info);
3914 if (orig_stmt_in_pattern)
3916 stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt_in_pattern);
3917 if (STMT_VINFO_IN_PATTERN_P (stmt_vinfo))
3919 gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
3921 /* STMT was inserted by the vectorizer to replace a
3922 computation idiom. ORIG_STMT_IN_PATTERN is a stmt in the
3923 original sequence that computed this idiom. We need to
3924 record a pointer to VEC_STMT in the stmt_info of
3925 ORIG_STMT_IN_PATTERN. See more details in the
3926 documentation of vect_pattern_recog. */
3928 STMT_VINFO_VEC_STMT (stmt_vinfo) = vec_stmt;
3934 if (STMT_VINFO_LIVE_P (stmt_info))
3936 switch (STMT_VINFO_TYPE (stmt_info))
3938 case reduc_vec_info_type:
3939 done = vectorizable_reduction (stmt, bsi, &vec_stmt);
3940 gcc_assert (done);
3941 break;
3943 default:
3944 done = vectorizable_live_operation (stmt, bsi, &vec_stmt);
3945 gcc_assert (done);
3949 return is_store;
3953 /* This function builds ni_name = number of iterations loop executes
3954 on the loop preheader. */
3956 static tree
3957 vect_build_loop_niters (loop_vec_info loop_vinfo)
3959 tree ni_name, stmt, var;
3960 edge pe;
3961 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3962 tree ni = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo));
3964 var = create_tmp_var (TREE_TYPE (ni), "niters");
3965 add_referenced_var (var);
3966 ni_name = force_gimple_operand (ni, &stmt, false, var);
3968 pe = loop_preheader_edge (loop);
3969 if (stmt)
3970 {
3971 basic_block new_bb = bsi_insert_on_edge_immediate (pe, stmt);
3972 gcc_assert (!new_bb);
3973 }
3975 return ni_name;
3979 /* This function generates the following statements:
3981 ni_name = number of iterations loop executes
3982 ratio = ni_name / vf
3983 ratio_mult_vf_name = ratio * vf
3985 and places them at the loop preheader edge. */
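/* Illustrative sketch only (hypothetical names), not part of the vectorizer:
   the scalar arithmetic performed by the statements generated below. VF is
   a power of two, so the division and multiplication become shifts. */
static void
niters_ratio_example (unsigned long ni, int log_vf,
                      unsigned long *ratio, unsigned long *ratio_mult_vf)
{
  *ratio = ni >> log_vf;              /* ratio = ni / vf */
  *ratio_mult_vf = *ratio << log_vf;  /* ratio_mult_vf = ratio * vf */
}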
3987 static void
3988 vect_generate_tmps_on_preheader (loop_vec_info loop_vinfo,
3989 tree *ni_name_ptr,
3990 tree *ratio_mult_vf_name_ptr,
3991 tree *ratio_name_ptr)
3993 edge pe;
3994 basic_block new_bb;
3995 tree stmt, ni_name;
3996 tree var;
3997 tree ratio_name;
3999 tree ratio_mult_vf_name;
4000 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4001 tree ni = LOOP_VINFO_NITERS (loop_vinfo);
4002 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
4003 tree log_vf;
4005 pe = loop_preheader_edge (loop);
4007 /* Generate temporary variable that contains
4008 number of iterations loop executes. */
4010 ni_name = vect_build_loop_niters (loop_vinfo);
4011 log_vf = build_int_cst (TREE_TYPE (ni), exact_log2 (vf));
4013 /* Create: ratio = ni >> log2(vf) */
4015 ratio_name = fold_build2 (RSHIFT_EXPR, TREE_TYPE (ni_name), ni_name, log_vf);
4016 if (!is_gimple_val (ratio_name))
4018 var = create_tmp_var (TREE_TYPE (ni), "bnd");
4019 add_referenced_var (var);
4021 ratio_name = force_gimple_operand (ratio_name, &stmt, true, var);
4022 pe = loop_preheader_edge (loop);
4023 new_bb = bsi_insert_on_edge_immediate (pe, stmt);
4024 gcc_assert (!new_bb);
4027 /* Create: ratio_mult_vf = ratio << log2 (vf). */
4029 ratio_mult_vf_name = fold_build2 (LSHIFT_EXPR, TREE_TYPE (ratio_name),
4030 ratio_name, log_vf);
4031 if (!is_gimple_val (ratio_mult_vf_name))
4033 var = create_tmp_var (TREE_TYPE (ni), "ratio_mult_vf");
4034 add_referenced_var (var);
4036 ratio_mult_vf_name = force_gimple_operand (ratio_mult_vf_name, &stmt,
4037 true, var);
4038 pe = loop_preheader_edge (loop);
4039 new_bb = bsi_insert_on_edge_immediate (pe, stmt);
4040 gcc_assert (!new_bb);
4043 *ni_name_ptr = ni_name;
4044 *ratio_mult_vf_name_ptr = ratio_mult_vf_name;
4045 *ratio_name_ptr = ratio_name;
4051 /* Function update_vuses_to_preheader.
4053 Input:
4054 STMT - a statement with potential VUSEs.
4055 LOOP - the loop whose preheader will contain STMT.
4057 It's possible to vectorize a loop even though an SSA_NAME from a VUSE
4058 appears to be defined in a VDEF in another statement in a loop.
4059 One such case is when the VUSE is at the dereference of a __restrict__
4060 pointer in a load and the VDEF is at the dereference of a different
4061 __restrict__ pointer in a store. Vectorization may result in
4062 copy_virtual_uses being called to copy the problematic VUSE to a new
4063 statement that is being inserted in the loop preheader. This procedure
4064 is called to change the SSA_NAME in the new statement's VUSE from the
4065 SSA_NAME updated in the loop to the related SSA_NAME available on the
4066 path entering the loop.
4068 When this function is called, we have the following situation:
4073 # name1 = phi < name0 , name2>
4078 # name2 = vdef <name1>
4083 Stmt S1 was created in the loop preheader block as part of misaligned-load
4084 handling. This function fixes the name of the vuse of S1 from 'name1' to
4085 'name0'. */
4087 static void
4088 update_vuses_to_preheader (tree stmt, struct loop *loop)
4090 basic_block header_bb = loop->header;
4091 edge preheader_e = loop_preheader_edge (loop);
4092 tree phi;
4093 use_operand_p use_p;
4094 ssa_op_iter iter;
4095 FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_VUSE)
4097 tree ssa_name = USE_FROM_PTR (use_p);
4098 tree def_stmt = SSA_NAME_DEF_STMT (ssa_name);
4099 tree name_var = SSA_NAME_VAR (ssa_name);
4100 basic_block bb = bb_for_stmt (def_stmt);
4102 /* For a use before any definitions, def_stmt is a NOP_EXPR. */
4103 if (!IS_EMPTY_STMT (def_stmt)
4104 && flow_bb_inside_loop_p (loop, bb))
4106 /* If the block containing the statement defining the SSA_NAME
4107 is in the loop then it's necessary to find the definition
4108 outside the loop using the PHI nodes of the header. */
4110 bool updated = false;
4112 for (phi = phi_nodes (header_bb); phi; phi = TREE_CHAIN (phi))
4114 if (SSA_NAME_VAR (PHI_RESULT (phi)) == name_var)
4116 SET_USE (use_p, PHI_ARG_DEF (phi, preheader_e->dest_idx));
4117 updated = true;
4118 break;
4121 gcc_assert (updated);
4127 /* Function vect_update_ivs_after_vectorizer.
4129 "Advance" the induction variables of LOOP to the value they should take
4130 after the execution of LOOP. This is currently necessary because the
4131 vectorizer does not handle induction variables that are used after the
4132 loop. Such a situation occurs when the last iterations of LOOP are
4133 peeled, because:
4134 1. We introduced new uses after LOOP for IVs that were not originally used
4135 after LOOP: the IVs of LOOP are now used by an epilog loop.
4136 2. LOOP is going to be vectorized; this means that it will iterate N/VF
4137 times, whereas the loop IVs should be bumped N times.
4139 Input:
4140 - LOOP - a loop that is going to be vectorized. The last few iterations
4141 of LOOP were peeled.
4142 - NITERS - the number of iterations that LOOP executes (before it is
4143 vectorized). i.e., the number of times the ivs should be bumped.
4144 - UPDATE_E - a successor edge of LOOP->exit that is on the (only) path
4145 coming out from LOOP on which there are uses of the LOOP ivs
4146 (this is the path from LOOP->exit to epilog_loop->preheader).
4148 The new definitions of the ivs are placed in LOOP->exit.
4149 The phi args associated with the edge UPDATE_E in the bb
4150 UPDATE_E->dest are updated accordingly.
4152 Assumption 1: Like the rest of the vectorizer, this function assumes
4153 a single loop exit that has a single predecessor.
4155 Assumption 2: The phi nodes in the LOOP header and in update_bb are
4156 organized in the same order.
4158 Assumption 3: The access function of the ivs is simple enough (see
4159 vect_can_advance_ivs_p). This assumption will be relaxed in the future.
4161 Assumption 4: Exactly one of the successors of LOOP exit-bb is on a path
4162 coming out of LOOP on which the ivs of LOOP are used (this is the path
4163 that leads to the epilog loop; other paths skip the epilog loop). This
4164 path starts with the edge UPDATE_E, and its destination (denoted update_bb)
4165 needs to have its phis updated. */
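/* Illustrative sketch only, not GCC code: the value each IV is advanced to.
   For an IV whose access function is init + k * step, the value after NITERS
   scalar iterations is init + NITERS * step; the GIMPLE built below computes
   exactly this expression on the loop-exit path. */
static long
advanced_iv_value_example (long init, long step, long niters)
{
  return init + niters * step;
}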
4168 static void
4169 vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo, tree niters,
4170 edge update_e)
4172 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4173 basic_block exit_bb = single_exit (loop)->dest;
4174 tree phi, phi1;
4175 basic_block update_bb = update_e->dest;
4177 /* gcc_assert (vect_can_advance_ivs_p (loop_vinfo)); */
4179 /* Make sure there exists a single-predecessor exit bb: */
4180 gcc_assert (single_pred_p (exit_bb));
4182 for (phi = phi_nodes (loop->header), phi1 = phi_nodes (update_bb);
4183 phi && phi1;
4184 phi = PHI_CHAIN (phi), phi1 = PHI_CHAIN (phi1))
4186 tree access_fn = NULL;
4187 tree evolution_part;
4188 tree init_expr;
4189 tree step_expr;
4190 tree var, stmt, ni, ni_name;
4191 block_stmt_iterator last_bsi;
4193 if (vect_print_dump_info (REPORT_DETAILS))
4195 fprintf (vect_dump, "vect_update_ivs_after_vectorizer: phi: ");
4196 print_generic_expr (vect_dump, phi, TDF_SLIM);
4199 /* Skip virtual phi's. */
4200 if (!is_gimple_reg (SSA_NAME_VAR (PHI_RESULT (phi))))
4202 if (vect_print_dump_info (REPORT_DETAILS))
4203 fprintf (vect_dump, "virtual phi. skip.");
4207 /* Skip reduction phis. */
4208 if (STMT_VINFO_DEF_TYPE (vinfo_for_stmt (phi)) == vect_reduction_def)
4210 if (vect_print_dump_info (REPORT_DETAILS))
4211 fprintf (vect_dump, "reduc phi. skip.");
4215 access_fn = analyze_scalar_evolution (loop, PHI_RESULT (phi));
4216 gcc_assert (access_fn);
4217 evolution_part =
4218 unshare_expr (evolution_part_in_loop_num (access_fn, loop->num));
4219 gcc_assert (evolution_part != NULL_TREE);
4221 /* FORNOW: We do not support IVs whose evolution function is a polynomial
4222 of degree >= 2 or exponential. */
4223 gcc_assert (!tree_is_chrec (evolution_part));
4225 step_expr = evolution_part;
4226 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn,
4227 loop->num));
4229 ni = fold_build2 (PLUS_EXPR, TREE_TYPE (init_expr),
4230 fold_build2 (MULT_EXPR, TREE_TYPE (init_expr),
4231 fold_convert (TREE_TYPE (init_expr),
4232 niters),
4233 step_expr),
4234 init_expr);
4236 var = create_tmp_var (TREE_TYPE (init_expr), "tmp");
4237 add_referenced_var (var);
4239 ni_name = force_gimple_operand (ni, &stmt, false, var);
4241 /* Insert stmt into exit_bb. */
4242 last_bsi = bsi_last (exit_bb);
4243 if (stmt)
4244 bsi_insert_before (&last_bsi, stmt, BSI_SAME_STMT);
4246 /* Fix phi expressions in the successor bb. */
4247 SET_PHI_ARG_DEF (phi1, update_e->dest_idx, ni_name);
4252 /* Function vect_do_peeling_for_loop_bound
4254 Peel the last iterations of the loop represented by LOOP_VINFO.
4255 The peeled iterations form a new epilog loop. Given that the loop now
4256 iterates NITERS times, the new epilog loop iterates
4257 NITERS % VECTORIZATION_FACTOR times.
4259 The original loop will later be made to iterate
4260 NITERS / VECTORIZATION_FACTOR times (this value is placed into RATIO). */
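/* Illustrative sketch (hypothetical names), not part of the vectorizer: how
   the iteration count is split between the vectorized loop and the scalar
   epilog loop created by this peeling. */
static void
peel_split_example (unsigned long niters, unsigned long vf,
                    unsigned long *vect_iters, unsigned long *epilog_iters)
{
  *vect_iters = niters / vf;    /* iterations of the vectorized loop */
  *epilog_iters = niters % vf;  /* iterations of the peeled epilog loop */
}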
4262 static void
4263 vect_do_peeling_for_loop_bound (loop_vec_info loop_vinfo, tree *ratio)
4265 tree ni_name, ratio_mult_vf_name;
4266 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4267 struct loop *new_loop;
4268 edge update_e;
4269 basic_block preheader;
4270 int loop_num;
4272 if (vect_print_dump_info (REPORT_DETAILS))
4273 fprintf (vect_dump, "=== vect_do_peeling_for_loop_bound ===");
4275 initialize_original_copy_tables ();
4277 /* Generate the following variables on the preheader of original loop:
4279 ni_name = number of iterations the original loop executes
4280 ratio = ni_name / vf
4281 ratio_mult_vf_name = ratio * vf */
4282 vect_generate_tmps_on_preheader (loop_vinfo, &ni_name,
4283 &ratio_mult_vf_name, ratio);
4285 loop_num = loop->num;
4286 new_loop = slpeel_tree_peel_loop_to_edge (loop, single_exit (loop),
4287 ratio_mult_vf_name, ni_name, false);
4288 gcc_assert (new_loop);
4289 gcc_assert (loop_num == loop->num);
4290 #ifdef ENABLE_CHECKING
4291 slpeel_verify_cfg_after_peeling (loop, new_loop);
4292 #endif
4294 /* A guard that controls whether the new_loop is to be executed or skipped
4295 is placed in LOOP->exit. LOOP->exit therefore has two successors - one
4296 is the preheader of NEW_LOOP, where the IVs from LOOP are used. The other
4297 is a bb after NEW_LOOP, where these IVs are not used. Find the edge that
4298 is on the path where the LOOP IVs are used and need to be updated. */
4300 preheader = loop_preheader_edge (new_loop)->src;
4301 if (EDGE_PRED (preheader, 0)->src == single_exit (loop)->dest)
4302 update_e = EDGE_PRED (preheader, 0);
4303 else
4304 update_e = EDGE_PRED (preheader, 1);
4306 /* Update IVs of original loop as if they were advanced
4307 by ratio_mult_vf_name steps. */
4308 vect_update_ivs_after_vectorizer (loop_vinfo, ratio_mult_vf_name, update_e);
4310 /* After peeling we have to reset scalar evolution analyzer. */
4311 scev_reset ();
4313 free_original_copy_tables ();
4317 /* Function vect_gen_niters_for_prolog_loop
4319 Set the number of iterations for the loop represented by LOOP_VINFO
4320 to the minimum between LOOP_NITERS (the original iteration count of the loop)
4321 and the misalignment of DR - the data reference recorded in
4322 LOOP_VINFO_UNALIGNED_DR (LOOP_VINFO). As a result, after the execution of
4323 this loop, the data reference DR will refer to an aligned location.
4325 The following computation is generated:
4327 If the misalignment of DR is known at compile time:
4328 addr_mis = DR_MISALIGNMENT (dr);
4329 Else, compute address misalignment in bytes:
4330 addr_mis = addr & (vectype_size - 1)
4332 prolog_niters = min ( LOOP_NITERS , (VF - addr_mis/elem_size)&(VF-1) )
4334 (elem_size = element type size; an element is the scalar element
4335 whose type is the inner type of the vectype)
4337 For interleaving,
4339 prolog_niters = min ( LOOP_NITERS ,
4340 (VF/group_size - addr_mis/elem_size)&(VF/group_size-1) )
4341 where group_size is the size of the interleaved group.
4342 */
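/* A minimal sketch, not GCC code, of the formula above for the
   non-interleaved case (group_size == 1), assuming the misalignment
   ADDR_MIS is already known in bytes; all names are hypothetical. */
static unsigned long
prolog_niters_example (unsigned long loop_niters, unsigned long vf,
                       unsigned long addr_mis, unsigned long elem_size)
{
  unsigned long iters = (vf - addr_mis / elem_size) & (vf - 1);
  return iters < loop_niters ? iters : loop_niters;  /* min */
}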
4344 static tree
4345 vect_gen_niters_for_prolog_loop (loop_vec_info loop_vinfo, tree loop_niters)
4347 struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
4348 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
4349 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4350 tree var, stmt;
4351 tree iters, iters_name;
4352 edge pe;
4353 basic_block new_bb;
4354 tree dr_stmt = DR_STMT (dr);
4355 stmt_vec_info stmt_info = vinfo_for_stmt (dr_stmt);
4356 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4357 int vectype_align = TYPE_ALIGN (vectype) / BITS_PER_UNIT;
4358 tree niters_type = TREE_TYPE (loop_niters);
4359 int group_size = 1;
4360 int element_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
4362 if (DR_GROUP_FIRST_DR (stmt_info))
4364 /* For interleaved access, the element size must be multiplied by the
4365 size of the interleaved group. */
4366 group_size = DR_GROUP_SIZE (vinfo_for_stmt (
4367 DR_GROUP_FIRST_DR (stmt_info)));
4368 element_size *= group_size;
4371 pe = loop_preheader_edge (loop);
4373 if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
4375 int byte_misalign = LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo);
4376 int elem_misalign = byte_misalign / element_size;
4378 if (vect_print_dump_info (REPORT_DETAILS))
4379 fprintf (vect_dump, "known misalignment = %d.", byte_misalign);
4380 iters = build_int_cst (niters_type,
4381 (vf - elem_misalign)&(vf/group_size-1));
4385 tree new_stmts = NULL_TREE;
4386 tree start_addr =
4387 vect_create_addr_base_for_vector_ref (dr_stmt, &new_stmts, NULL_TREE);
4388 tree ptr_type = TREE_TYPE (start_addr);
4389 tree size = TYPE_SIZE (ptr_type);
4390 tree type = lang_hooks.types.type_for_size (tree_low_cst (size, 1), 1);
4391 tree vectype_size_minus_1 = build_int_cst (type, vectype_align - 1);
4392 tree elem_size_log =
4393 build_int_cst (type, exact_log2 (vectype_align/vf));
4394 tree vf_minus_1 = build_int_cst (type, vf - 1);
4395 tree vf_tree = build_int_cst (type, vf);
4396 tree byte_misalign;
4397 tree elem_misalign;
4399 new_bb = bsi_insert_on_edge_immediate (pe, new_stmts);
4400 gcc_assert (!new_bb);
4402 /* Create: byte_misalign = addr & (vectype_size - 1) */
4403 byte_misalign =
4404 fold_build2 (BIT_AND_EXPR, type, start_addr, vectype_size_minus_1);
4406 /* Create: elem_misalign = byte_misalign / element_size */
4407 elem_misalign =
4408 fold_build2 (RSHIFT_EXPR, type, byte_misalign, elem_size_log);
4410 /* Create: (niters_type) (VF - elem_misalign)&(VF - 1) */
4411 iters = fold_build2 (MINUS_EXPR, type, vf_tree, elem_misalign);
4412 iters = fold_build2 (BIT_AND_EXPR, type, iters, vf_minus_1);
4413 iters = fold_convert (niters_type, iters);
4416 /* Create: prolog_loop_niters = min (iters, loop_niters) */
4417 /* If the loop bound is known at compile time we already verified that it is
4418 greater than vf; since the misalignment ('iters') is at most vf, there's
4419 no need to generate the MIN_EXPR in this case. */
4420 if (TREE_CODE (loop_niters) != INTEGER_CST)
4421 iters = fold_build2 (MIN_EXPR, niters_type, iters, loop_niters);
4423 if (vect_print_dump_info (REPORT_DETAILS))
4425 fprintf (vect_dump, "niters for prolog loop: ");
4426 print_generic_expr (vect_dump, iters, TDF_SLIM);
4429 var = create_tmp_var (niters_type, "prolog_loop_niters");
4430 add_referenced_var (var);
4431 iters_name = force_gimple_operand (iters, &stmt, false, var);
4433 /* Insert stmt on loop preheader edge. */
4434 if (stmt)
4435 {
4436 basic_block new_bb = bsi_insert_on_edge_immediate (pe, stmt);
4437 gcc_assert (!new_bb);
4438 }
4440 return iters_name;
4444 /* Function vect_update_init_of_dr
4446 NITERS iterations were peeled from LOOP. DR represents a data reference
4447 in LOOP. This function updates the information recorded in DR to
4448 account for the fact that the first NITERS iterations had already been
4449 executed. Specifically, it updates the OFFSET field of DR. */
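/* Illustrative sketch only, not GCC code: the offset adjustment performed
   below. After NITERS peeled iterations, a reference that advances by STEP
   per iteration starts NITERS * STEP further along. */
static long
peeled_dr_offset_example (long offset, long step, long niters)
{
  return offset + niters * step;
}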
4451 static void
4452 vect_update_init_of_dr (struct data_reference *dr, tree niters)
4454 tree offset = DR_OFFSET (dr);
4456 niters = fold_build2 (MULT_EXPR, TREE_TYPE (niters), niters, DR_STEP (dr));
4457 offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset), offset, niters);
4458 DR_OFFSET (dr) = offset;
4462 /* Function vect_update_inits_of_drs
4464 NITERS iterations were peeled from the loop represented by LOOP_VINFO.
4465 This function updates the information recorded for the data references in
4466 the loop to account for the fact that the first NITERS iterations had
4467 already been executed. Specifically, it updates the initial_condition of the
4468 access_function of all the data_references in the loop. */
4470 static void
4471 vect_update_inits_of_drs (loop_vec_info loop_vinfo, tree niters)
4473 unsigned int i;
4474 VEC (data_reference_p, heap) *datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
4475 struct data_reference *dr;
4477 if (vect_dump && (dump_flags & TDF_DETAILS))
4478 fprintf (vect_dump, "=== vect_update_inits_of_drs ===");
4480 for (i = 0; VEC_iterate (data_reference_p, datarefs, i, dr); i++)
4481 vect_update_init_of_dr (dr, niters);
4485 /* Function vect_do_peeling_for_alignment
4487 Peel the first 'niters' iterations of the loop represented by LOOP_VINFO.
4488 'niters' is set to the misalignment of one of the data references in the
4489 loop, thereby forcing it to refer to an aligned location at the beginning
4490 of the execution of this loop. The data reference for which we are
4491 peeling is recorded in LOOP_VINFO_UNALIGNED_DR. */
4493 static void
4494 vect_do_peeling_for_alignment (loop_vec_info loop_vinfo)
4496 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4497 tree niters_of_prolog_loop, ni_name;
4498 tree n_iters;
4499 struct loop *new_loop;
4501 if (vect_print_dump_info (REPORT_DETAILS))
4502 fprintf (vect_dump, "=== vect_do_peeling_for_alignment ===");
4504 initialize_original_copy_tables ();
4506 ni_name = vect_build_loop_niters (loop_vinfo);
4507 niters_of_prolog_loop = vect_gen_niters_for_prolog_loop (loop_vinfo, ni_name);
4509 /* Peel the prolog loop and iterate it niters_of_prolog_loop times. */
4510 new_loop =
4511 slpeel_tree_peel_loop_to_edge (loop, loop_preheader_edge (loop),
4512 niters_of_prolog_loop, ni_name, true);
4513 gcc_assert (new_loop);
4514 #ifdef ENABLE_CHECKING
4515 slpeel_verify_cfg_after_peeling (new_loop, loop);
4516 #endif
4518 /* Update number of times loop executes. */
4519 n_iters = LOOP_VINFO_NITERS (loop_vinfo);
4520 LOOP_VINFO_NITERS (loop_vinfo) = fold_build2 (MINUS_EXPR,
4521 TREE_TYPE (n_iters), n_iters, niters_of_prolog_loop);
4523 /* Update the init conditions of the access functions of all data refs. */
4524 vect_update_inits_of_drs (loop_vinfo, niters_of_prolog_loop);
4526 /* After peeling we have to reset scalar evolution analyzer. */
4527 scev_reset ();
4529 free_original_copy_tables ();
4533 /* Function vect_create_cond_for_align_checks.
4535 Create a conditional expression that represents the alignment checks for
4536 all of the data references (array element references) whose alignment must be
4537 checked at runtime.
4539 Input:
4540 LOOP_VINFO - two fields of the loop information are used.
4541 LOOP_VINFO_PTR_MASK is the mask used to check the alignment.
4542 LOOP_VINFO_MAY_MISALIGN_STMTS contains the refs to be checked.
4544 Output:
4545 COND_EXPR_STMT_LIST - statements needed to construct the conditional
4546 expression.
4547 The returned value is the conditional expression to be used in the if
4548 statement that controls which version of the loop gets executed at runtime.
4550 The algorithm makes two assumptions:
4551 1) The number of bytes "n" in a vector is a power of 2.
4552 2) An address "a" is aligned if a%n is zero, so the
4553 test can be done as a&(n-1) == 0. For example, for 16
4554 byte vectors the test is a&0xf == 0. */
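/* A minimal sketch, not GCC code, of the runtime test built below, shown for
   two data references (hypothetical names): the addresses are ORed together
   and masked, so a single compare against zero checks the alignment of every
   pointer at once. */
static int
all_aligned_example (unsigned long addr1, unsigned long addr2,
                     unsigned long mask)
{
  /* mask == vector_size - 1; a nonzero result bit means some address
     is unaligned. */
  return ((addr1 | addr2) & mask) == 0;
}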
4556 static tree
4557 vect_create_cond_for_align_checks (loop_vec_info loop_vinfo,
4558 tree *cond_expr_stmt_list)
4560 VEC(tree,heap) *may_misalign_stmts
4561 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
4562 tree ref_stmt;
4563 int mask = LOOP_VINFO_PTR_MASK (loop_vinfo);
4564 tree mask_cst;
4565 unsigned int i;
4566 tree psize;
4567 tree int_ptrsize_type;
4568 char tmp_name[20];
4569 tree or_tmp_name = NULL_TREE;
4570 tree and_tmp, and_tmp_name, and_stmt;
4571 tree ptrsize_zero;
4573 /* Check that mask is one less than a power of 2, i.e., mask is
4574 all zeros followed by all ones. */
4575 gcc_assert ((mask != 0) && ((mask & (mask+1)) == 0));
4577 /* CHECKME: what is the best integer or unsigned type to use to hold a
4578 cast from a pointer value? */
4579 psize = TYPE_SIZE (ptr_type_node);
4581 = lang_hooks.types.type_for_size (tree_low_cst (psize, 1), 0);
4583 /* Create expression (mask & (dr_1 || ... || dr_n)) where dr_i is the address
4584 of the first vector of the i'th data reference. */
4586 for (i = 0; VEC_iterate (tree, may_misalign_stmts, i, ref_stmt); i++)
4588 tree new_stmt_list = NULL_TREE;
4590 tree addr_tmp, addr_tmp_name, addr_stmt;
4591 tree or_tmp, new_or_tmp_name, or_stmt;
4593 /* create: addr_tmp = (int)(address_of_first_vector) */
4594 addr_base = vect_create_addr_base_for_vector_ref (ref_stmt,
4598 if (new_stmt_list != NULL_TREE)
4599 append_to_statement_list_force (new_stmt_list, cond_expr_stmt_list);
4601 sprintf (tmp_name, "%s%d", "addr2int", i);
4602 addr_tmp = create_tmp_var (int_ptrsize_type, tmp_name);
4603 add_referenced_var (addr_tmp);
4604 addr_tmp_name = make_ssa_name (addr_tmp, NULL_TREE);
4605 addr_stmt = fold_convert (int_ptrsize_type, addr_base);
4606 addr_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node,
4607 addr_tmp_name, addr_stmt);
4608 SSA_NAME_DEF_STMT (addr_tmp_name) = addr_stmt;
4609 append_to_statement_list_force (addr_stmt, cond_expr_stmt_list);
4611 /* The addresses are ORed together. */
4613 if (or_tmp_name != NULL_TREE)
4615 /* create: or_tmp = or_tmp | addr_tmp */
4616 sprintf (tmp_name, "%s%d", "orptrs", i);
4617 or_tmp = create_tmp_var (int_ptrsize_type, tmp_name);
4618 add_referenced_var (or_tmp);
4619 new_or_tmp_name = make_ssa_name (or_tmp, NULL_TREE);
4620 or_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node,
4621 new_or_tmp_name,
4622 build2 (BIT_IOR_EXPR, int_ptrsize_type,
4623 or_tmp_name,
4624 addr_tmp_name));
4625 SSA_NAME_DEF_STMT (new_or_tmp_name) = or_stmt;
4626 append_to_statement_list_force (or_stmt, cond_expr_stmt_list);
4627 or_tmp_name = new_or_tmp_name;
4629 else
4630 or_tmp_name = addr_tmp_name;
4634 mask_cst = build_int_cst (int_ptrsize_type, mask);
4636 /* create: and_tmp = or_tmp & mask */
4637 and_tmp = create_tmp_var (int_ptrsize_type, "andmask" );
4638 add_referenced_var (and_tmp);
4639 and_tmp_name = make_ssa_name (and_tmp, NULL_TREE);
4641 and_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node,
4642 and_tmp_name,
4643 build2 (BIT_AND_EXPR, int_ptrsize_type,
4644 or_tmp_name, mask_cst));
4645 SSA_NAME_DEF_STMT (and_tmp_name) = and_stmt;
4646 append_to_statement_list_force (and_stmt, cond_expr_stmt_list);
4648 /* Make and_tmp the left operand of the conditional test against zero.
4649 If and_tmp has a nonzero bit, then some address is unaligned. */
4650 ptrsize_zero = build_int_cst (int_ptrsize_type, 0);
4651 return build2 (EQ_EXPR, boolean_type_node,
4652 and_tmp_name, ptrsize_zero);
4656 /* Function vect_transform_loop.
4658 The analysis phase has determined that the loop is vectorizable.
4659 Vectorize the loop - create vectorized stmts to replace the scalar
4660 stmts in the loop, and update the loop exit condition. */
4662 void
4663 vect_transform_loop (loop_vec_info loop_vinfo)
4665 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4666 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
4667 int nbbs = loop->num_nodes;
4668 block_stmt_iterator si;
4669 int i;
4670 tree ratio = NULL;
4671 int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
4672 bool strided_store;
4674 if (vect_print_dump_info (REPORT_DETAILS))
4675 fprintf (vect_dump, "=== vect_transform_loop ===");
4677 /* If the loop has data references that may or may not be aligned then
4678 two versions of the loop need to be generated, one which is vectorized
4679 and one which isn't. A test is then generated to control which of the
4680 loops is executed. The test checks for the alignment of all of the
4681 data references that may or may not be aligned. */
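/* Conceptually (pseudo-code, hypothetical names), the versioned structure
   built here is:

       if (all_possibly_misaligned_refs_aligned_p ())
         vectorized_loop ();
       else
         scalar_loop ();
*/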
4683 if (VEC_length (tree, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo)))
4685 struct loop *nloop;
4686 tree cond_expr;
4687 tree cond_expr_stmt_list = NULL_TREE;
4688 basic_block condition_bb;
4689 block_stmt_iterator cond_exp_bsi;
4690 basic_block merge_bb;
4691 basic_block new_exit_bb;
4692 edge new_exit_e, e;
4693 tree orig_phi, new_phi, arg;
4695 cond_expr = vect_create_cond_for_align_checks (loop_vinfo,
4696 &cond_expr_stmt_list);
4697 initialize_original_copy_tables ();
4698 nloop = loop_version (loop, cond_expr, &condition_bb, true);
4699 free_original_copy_tables();
4701 /** Loop versioning violates an assumption we try to maintain during
4702 vectorization - that the loop exit block has a single predecessor.
4703 After versioning, the exit block of both loop versions is the same
4704 basic block (i.e. it has two predecessors). Just in order to simplify
4705 following transformations in the vectorizer, we fix this situation
4706 here by adding a new (empty) block on the exit-edge of the loop,
4707 with the proper loop-exit phis to maintain loop-closed-form. **/
4709 merge_bb = single_exit (loop)->dest;
4710 gcc_assert (EDGE_COUNT (merge_bb->preds) == 2);
4711 new_exit_bb = split_edge (single_exit (loop));
4712 new_exit_e = single_exit (loop);
4713 e = EDGE_SUCC (new_exit_bb, 0);
4715 for (orig_phi = phi_nodes (merge_bb); orig_phi;
4716 orig_phi = PHI_CHAIN (orig_phi))
4718 new_phi = create_phi_node (SSA_NAME_VAR (PHI_RESULT (orig_phi)),
4719 new_exit_bb);
4720 arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, e);
4721 add_phi_arg (new_phi, arg, new_exit_e);
4722 SET_PHI_ARG_DEF (orig_phi, e->dest_idx, PHI_RESULT (new_phi));
4725 /** end loop-exit-fixes after versioning **/
4727 update_ssa (TODO_update_ssa);
4728 cond_exp_bsi = bsi_last (condition_bb);
4729 bsi_insert_before (&cond_exp_bsi, cond_expr_stmt_list, BSI_SAME_STMT);
4732 /* CHECKME: we wouldn't need this if we called update_ssa once
4733 for all loops. */
4734 bitmap_zero (vect_memsyms_to_rename);
4736 /* Peel the loop if there are data refs with unknown alignment.
4737 Only one data ref with unknown alignment is allowed. */
4739 if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
4740 vect_do_peeling_for_alignment (loop_vinfo);
4742 /* If the loop has a symbolic number of iterations 'n' (i.e. it's not a
4743 compile time constant), or it is a constant that doesn't divide by the
4744 vectorization factor, then an epilog loop needs to be created.
4745 We therefore duplicate the loop: the original loop will be vectorized,
4746 and will compute the first (n/VF) iterations. The second copy of the loop
4747 will remain scalar and will compute the remaining (n%VF) iterations.
4748 (VF is the vectorization factor). */
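/* Conceptually (pseudo-code), the resulting split is:

       for (i = 0; i < (n / VF) * VF; i += VF)   vectorized loop
       for (     ; i < n; i++)                   scalar epilog loop
*/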
4750 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
4751 || (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
4752 && LOOP_VINFO_INT_NITERS (loop_vinfo) % vectorization_factor != 0))
4753 vect_do_peeling_for_loop_bound (loop_vinfo, &ratio);
4754 else
4755 ratio = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
4756 LOOP_VINFO_INT_NITERS (loop_vinfo) / vectorization_factor);
4758 /* 1) Make sure the loop header has exactly two entries
4759 2) Make sure we have a preheader basic block. */
4761 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
4763 split_edge (loop_preheader_edge (loop));
4765 /* FORNOW: the vectorizer supports only loops whose body consists
4766 of one basic block (header + empty latch). When the vectorizer
4767 supports more involved loop forms, the order by which the BBs are
4768 traversed needs to be reconsidered. */
4770 for (i = 0; i < nbbs; i++)
4772 basic_block bb = bbs[i];
4774 for (si = bsi_start (bb); !bsi_end_p (si);)
4776 tree stmt = bsi_stmt (si);
4777 stmt_vec_info stmt_info;
4778 bool is_store;
4780 if (vect_print_dump_info (REPORT_DETAILS))
4782 fprintf (vect_dump, "------>vectorizing statement: ");
4783 print_generic_expr (vect_dump, stmt, TDF_SLIM);
4785 stmt_info = vinfo_for_stmt (stmt);
4786 gcc_assert (stmt_info);
4787 if (!STMT_VINFO_RELEVANT_P (stmt_info)
4788 && !STMT_VINFO_LIVE_P (stmt_info))
4789 {
4790 bsi_next (&si);
4791 continue;
4792 }
4794 if ((TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
4795 != (unsigned HOST_WIDE_INT) vectorization_factor)
4796 && vect_print_dump_info (REPORT_DETAILS))
4797 fprintf (vect_dump, "multiple-types.");
4799 /* -------- vectorize statement ------------ */
4800 if (vect_print_dump_info (REPORT_DETAILS))
4801 fprintf (vect_dump, "transform statement.");
4803 strided_store = false;
4804 is_store = vect_transform_stmt (stmt, &si, &strided_store);
4805 if (is_store)
4806 {
4807 stmt_ann_t ann;
4808 if (DR_GROUP_FIRST_DR (stmt_info))
4810 /* Interleaving. If IS_STORE is TRUE, the vectorization of the
4811 interleaving chain was completed - free all the stores in
4812 the chain. */
4813 tree next = DR_GROUP_FIRST_DR (stmt_info);
4814 tree tmp;
4815 stmt_vec_info next_stmt_info;
4817 while (next)
4819 next_stmt_info = vinfo_for_stmt (next);
4820 /* Free the attached stmt_vec_info and remove the stmt. */
4821 ann = stmt_ann (next);
4822 tmp = DR_GROUP_NEXT_DR (next_stmt_info);
4823 free (next_stmt_info);
4824 set_stmt_info (ann, NULL);
4825 next = tmp;
4827 bsi_remove (&si, true);
4832 /* Free the attached stmt_vec_info and remove the stmt. */
4833 ann = stmt_ann (stmt);
4834 free (stmt_info);
4835 set_stmt_info (ann, NULL);
4836 bsi_remove (&si, true);
4844 /* This is the case of a skipped interleaved store. We don't free
4845 its stmt_vec_info. */
4846 bsi_remove (&si, true);
4854 slpeel_make_loop_iterate_ntimes (loop, ratio);
4856 mark_set_for_renaming (vect_memsyms_to_rename);
4858 /* The memory tags and pointers in vectorized statements need to
4859 have their SSA forms updated. FIXME, why can't this be delayed
4860 until all the loops have been transformed? */
4861 update_ssa (TODO_update_ssa);
4863 if (vect_print_dump_info (REPORT_VECTORIZED_LOOPS))
4864 fprintf (vect_dump, "LOOP VECTORIZED.");