1 /* Transformation Utilities for Loop Vectorization.
2 Copyright (C) 2003,2004,2005,2006 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
5 This file is part of GCC.
7 GCC is free software; you can redistribute it and/or modify it under
8 the terms of the GNU General Public License as published by the Free
9 Software Foundation; either version 2, or (at your option) any later
12 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
13 WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING. If not, write to the Free
19 Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA
24 #include "coretypes.h"
30 #include "basic-block.h"
31 #include "diagnostic.h"
32 #include "tree-flow.h"
33 #include "tree-dump.h"
39 #include "tree-data-ref.h"
40 #include "tree-chrec.h"
41 #include "tree-scalar-evolution.h"
42 #include "tree-vectorizer.h"
43 #include "langhooks.h"
44 #include "tree-pass.h"
48 /* Utility functions for the code transformation. */
/* File-local prototypes: statement-transformation helpers used by the
   vectorizer's code-generation phase below.  */
49 static bool vect_transform_stmt (tree, block_stmt_iterator *, bool *);
50 static tree vect_create_destination_var (tree, tree);
51 static tree vect_create_data_ref_ptr
52 (tree, block_stmt_iterator *, tree, tree *, tree *, bool, tree);
53 static tree vect_create_addr_base_for_vector_ref (tree, tree *, tree);
54 static tree vect_setup_realignment (tree, block_stmt_iterator *, tree *);
55 static tree vect_get_new_vect_var (tree, enum vect_var_kind, const char *);
56 static tree vect_get_vec_def_for_operand (tree, tree, tree *);
57 static tree vect_init_vector (tree, tree, tree);
58 static void vect_finish_stmt_generation
59 (tree stmt, tree vec_stmt, block_stmt_iterator *bsi);
60 static bool vect_is_simple_cond (tree, loop_vec_info);
61 static void update_vuses_to_preheader (tree, struct loop*);
62 static void vect_create_epilog_for_reduction (tree, tree, enum tree_code, tree);
63 static tree get_initial_def_for_reduction (tree, tree, tree *);
65 /* Utility function dealing with loop peeling (not peeling itself). */
/* Prototypes for prolog/epilog-loop bookkeeping: niters computation,
   IV and data-ref updates after peeling.  */
66 static void vect_generate_tmps_on_preheader
67 (loop_vec_info, tree *, tree *, tree *);
68 static tree vect_build_loop_niters (loop_vec_info);
69 static void vect_update_ivs_after_vectorizer (loop_vec_info, tree, edge);
70 static tree vect_gen_niters_for_prolog_loop (loop_vec_info, tree);
71 static void vect_update_init_of_dr (struct data_reference *, tree niters);
72 static void vect_update_inits_of_drs (loop_vec_info, tree);
73 static int vect_min_worthwhile_factor (enum tree_code);
76 /* Function vect_get_new_vect_var.
78 Returns a name for a new variable. The current naming scheme appends the
79 prefix "vect_" or "vect_p" (depending on the value of VAR_KIND) to
80 the name of vectorizer generated variables, and appends that to NAME if
/* NOTE(review): this listing is incomplete -- intervening source lines
   (braces, the switch header, declarations) are missing from this view;
   do not edit logic from here without the full file.  */
84 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
97 case vect_pointer_var:
/* NAME supplied: the temporary's name is PREFIX concatenated with NAME.  */
105 new_vect_var = create_tmp_var (type, concat (prefix, name, NULL));
/* No NAME: fall back to the bare prefix.  */
107 new_vect_var = create_tmp_var (type, prefix);
113 /* Function vect_create_addr_base_for_vector_ref.
115 Create an expression that computes the address of the first memory location
116 that will be accessed for a data reference.
119 STMT: The statement containing the data reference.
120 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
121 OFFSET: Optional. If supplied, it is to be added to the initial address.
124 1. Return an SSA_NAME whose value is the address of the memory location of
125 the first vector of the data reference.
126 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
127 these statement(s) which define the returned SSA_NAME.
129 FORNOW: We are only handling array accesses with step 1. */
/* NOTE(review): listing incomplete -- several source lines are missing
   from this view (function body braces, some declarations).  */
132 vect_create_addr_base_for_vector_ref (tree stmt,
136 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
137 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
138 tree data_ref_base = unshare_expr (DR_BASE_ADDRESS (dr));
139 tree base_name = build_fold_indirect_ref (data_ref_base);
141 tree addr_base, addr_expr;
143 tree base_offset = unshare_expr (DR_OFFSET (dr));
144 tree init = unshare_expr (DR_INIT (dr));
145 tree vect_ptr_type, addr_expr2;
147 /* Create base_offset */
/* base_offset = DR_OFFSET + DR_INIT, gimplified into NEW_STMT_LIST.  */
148 base_offset = size_binop (PLUS_EXPR, base_offset, init);
149 dest = create_tmp_var (TREE_TYPE (base_offset), "base_off");
150 add_referenced_var (dest);
151 base_offset = force_gimple_operand (base_offset, &new_stmt, false, dest);
152 append_to_statement_list_force (new_stmt, new_stmt_list);
156 tree tmp = create_tmp_var (TREE_TYPE (base_offset), "offset");
159 /* For interleaved access step we divide STEP by the size of the
160 interleaving group. */
161 if (DR_GROUP_SIZE (stmt_info))
162 step = fold_build2 (TRUNC_DIV_EXPR, TREE_TYPE (offset), DR_STEP (dr),
163 build_int_cst (TREE_TYPE (offset),
164 DR_GROUP_SIZE (stmt_info)));
168 add_referenced_var (tmp);
/* Scale the caller-supplied OFFSET by the (possibly divided) step and
   fold it into base_offset.  */
169 offset = fold_build2 (MULT_EXPR, TREE_TYPE (offset), offset, step);
170 base_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (base_offset),
171 base_offset, offset);
172 base_offset = force_gimple_operand (base_offset, &new_stmt, false, tmp);
173 append_to_statement_list_force (new_stmt, new_stmt_list);
176 /* base + base_offset */
177 addr_base = fold_build2 (PLUS_EXPR, TREE_TYPE (data_ref_base), data_ref_base,
180 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
182 /* addr_expr = addr_base */
183 addr_expr = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var,
184 get_name (base_name));
185 add_referenced_var (addr_expr);
/* Convert the computed address to the vector-pointer type and gimplify
   it; any statements produced are appended for the caller to insert.  */
186 vec_stmt = fold_convert (vect_ptr_type, addr_base);
187 addr_expr2 = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var,
188 get_name (base_name));
189 add_referenced_var (addr_expr2);
190 vec_stmt = force_gimple_operand (vec_stmt, &new_stmt, false, addr_expr2);
191 append_to_statement_list_force (new_stmt, new_stmt_list);
193 if (vect_print_dump_info (REPORT_DETAILS))
195 fprintf (vect_dump, "created ");
196 print_generic_expr (vect_dump, vec_stmt, TDF_SLIM);
202 /* Function vect_create_data_ref_ptr.
204 Create a new pointer to vector type (vp), that points to the first location
205 accessed in the loop by STMT, along with the def-use update chain to
206 appropriately advance the pointer through the loop iterations. Also set
207 aliasing information for the pointer. This vector pointer is used by the
208 callers to this function to create a memory reference expression for vector
212 1. STMT: a stmt that references memory. Expected to be of the form
213 MODIFY_EXPR <name, data-ref> or MODIFY_EXPR <data-ref, name>.
214 2. BSI: block_stmt_iterator where new stmts can be added.
215 3. OFFSET (optional): an offset to be added to the initial address accessed
216 by the data-ref in STMT.
217 4. ONLY_INIT: indicate if vp is to be updated in the loop, or remain
218 pointing to the initial address.
219 5. TYPE: if not NULL indicates the required type of the data-ref
222 1. Declare a new ptr to vector_type, and have it point to the base of the
223 data reference (initial addressed accessed by the data reference).
224 For example, for vector of type V8HI, the following code is generated:
227 vp = (v8hi *)initial_address;
229 if OFFSET is not supplied:
230 initial_address = &a[init];
231 if OFFSET is supplied:
232 initial_address = &a[init + OFFSET];
234 Return the initial_address in INITIAL_ADDRESS.
236 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
237 update the pointer in each iteration of the loop.
239 Return the increment stmt that updates the pointer in PTR_INCR.
241 3. Return the pointer. */
/* NOTE(review): listing incomplete -- several source lines are missing
   from this view; treat this as read-only documentation context.  */
244 vect_create_data_ref_ptr (tree stmt,
245 block_stmt_iterator *bsi ATTRIBUTE_UNUSED,
246 tree offset, tree *initial_address, tree *ptr_incr,
247 bool only_init, tree type)
250 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
251 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
252 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
253 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
259 tree new_stmt_list = NULL_TREE;
260 edge pe = loop_preheader_edge (loop);
263 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
265 base_name = build_fold_indirect_ref (unshare_expr (DR_BASE_ADDRESS (dr)));
267 if (vect_print_dump_info (REPORT_DETAILS))
269 tree data_ref_base = base_name;
270 fprintf (vect_dump, "create vector-pointer variable to type: ");
271 print_generic_expr (vect_dump, vectype, TDF_SLIM);
272 if (TREE_CODE (data_ref_base) == VAR_DECL)
273 fprintf (vect_dump, " vectorizing a one dimensional array ref: ");
274 else if (TREE_CODE (data_ref_base) == ARRAY_REF)
275 fprintf (vect_dump, " vectorizing a multidimensional array ref: ");
276 else if (TREE_CODE (data_ref_base) == COMPONENT_REF)
277 fprintf (vect_dump, " vectorizing a record based array ref: ");
278 else if (TREE_CODE (data_ref_base) == SSA_NAME)
279 fprintf (vect_dump, " vectorizing a pointer ref: ");
280 print_generic_expr (vect_dump, base_name, TDF_SLIM);
283 /** (1) Create the new vector-pointer variable: **/
/* Use the caller-requested TYPE when given, otherwise the stmt's vectype.  */
285 vect_ptr_type = build_pointer_type (type);
287 vect_ptr_type = build_pointer_type (vectype);
288 vect_ptr = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var,
289 get_name (base_name));
290 add_referenced_var (vect_ptr);
292 /** (2) Add aliasing information to the new vector-pointer:
293 (The points-to info (DR_PTR_INFO) may be defined later.) **/
295 tag = DR_MEMTAG (dr);
298 /* If tag is a variable (and NOT_A_TAG) than a new symbol memory
299 tag must be created with tag added to its may alias list. */
301 new_type_alias (vect_ptr, tag, DR_REF (dr));
303 var_ann (vect_ptr)->symbol_mem_tag = tag;
305 var_ann (vect_ptr)->subvars = DR_SUBVARS (dr);
307 /** (3) Calculate the initial address the vector-pointer, and set
308 the vector-pointer to point to it before the loop: **/
310 /* Create: (&(base[init_val+offset]) in the loop preheader. */
311 new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
/* Insert the address computation on the preheader edge; the assert
   checks the edge insertion did not have to split a new block.  */
313 pe = loop_preheader_edge (loop);
314 new_bb = bsi_insert_on_edge_immediate (pe, new_stmt_list);
315 gcc_assert (!new_bb);
316 *initial_address = new_temp;
318 /* Create: p = (vectype *) initial_base */
319 vec_stmt = fold_convert (vect_ptr_type, new_temp);
320 vec_stmt = build2 (MODIFY_EXPR, void_type_node, vect_ptr, vec_stmt);
321 vect_ptr_init = make_ssa_name (vect_ptr, vec_stmt);
322 TREE_OPERAND (vec_stmt, 0) = vect_ptr_init;
323 new_bb = bsi_insert_on_edge_immediate (pe, vec_stmt);
324 gcc_assert (!new_bb);
327 /** (4) Handle the updating of the vector-pointer inside the loop: **/
329 if (only_init) /* No update in loop is required. */
331 /* Copy the points-to information if it exists. */
332 if (DR_PTR_INFO (dr))
333 duplicate_ssa_name_ptr_info (vect_ptr_init, DR_PTR_INFO (dr))
334 return vect_ptr_init;
338 block_stmt_iterator incr_bsi;
340 tree indx_before_incr, indx_after_incr;
/* Build an IV that advances the pointer by the vector size each
   iteration, placed at the loop's standard increment position.  */
343 standard_iv_increment_position (loop, &incr_bsi, &insert_after);
344 create_iv (vect_ptr_init,
345 fold_convert (vect_ptr_type, TYPE_SIZE_UNIT (vectype)),
346 NULL_TREE, loop, &incr_bsi, insert_after,
347 &indx_before_incr, &indx_after_incr);
348 incr = bsi_stmt (incr_bsi);
349 set_stmt_info (stmt_ann (incr),
350 new_stmt_vec_info (incr, loop_vinfo));
352 /* Copy the points-to information if it exists. */
353 if (DR_PTR_INFO (dr))
355 duplicate_ssa_name_ptr_info (indx_before_incr, DR_PTR_INFO (dr));
356 duplicate_ssa_name_ptr_info (indx_after_incr, DR_PTR_INFO (dr));
358 merge_alias_info (vect_ptr_init, indx_before_incr);
359 merge_alias_info (vect_ptr_init, indx_after_incr);
363 return indx_before_incr;
368 /* Function bump_vector_ptr
370 Increment a pointer (to a vector type) by vector-size. Connect the new
371 increment stmt to the existing def-use update-chain of the pointer.
373 The pointer def-use update-chain before this function:
374 DATAREF_PTR = phi (p_0, p_2)
376 PTR_INCR: p_2 = DATAREF_PTR + step
378 The pointer def-use update-chain after this function:
379 DATAREF_PTR = phi (p_0, p_2)
381 NEW_DATAREF_PTR = DATAREF_PTR + vector_size
383 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
386 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
388 PTR_INCR - the stmt that updates the pointer in each iteration of the loop.
389 The increment amount across iterations is also expected to be
391 BSI - location where the new update stmt is to be placed.
392 STMT - the original scalar memory-access stmt that is being vectorized.
394 Output: Return NEW_DATAREF_PTR as illustrated above.
/* NOTE(review): listing incomplete -- some source lines are missing here.  */
399 bump_vector_ptr (tree dataref_ptr, tree ptr_incr, block_stmt_iterator *bsi,
402 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
403 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
404 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
405 tree vptr_type = TREE_TYPE (dataref_ptr);
406 tree ptr_var = SSA_NAME_VAR (dataref_ptr);
/* The bump amount is the byte size of one vector.  */
407 tree update = fold_convert (vptr_type, TYPE_SIZE_UNIT (vectype));
411 tree new_dataref_ptr;
/* NEW_DATAREF_PTR = DATAREF_PTR + vector_size, inserted at BSI.  */
413 incr_stmt = build2 (MODIFY_EXPR, void_type_node, ptr_var,
414 build2 (PLUS_EXPR, vptr_type, dataref_ptr, update));
415 new_dataref_ptr = make_ssa_name (ptr_var, incr_stmt);
416 TREE_OPERAND (incr_stmt, 0) = new_dataref_ptr;
417 vect_finish_stmt_generation (stmt, incr_stmt, bsi);
419 /* Update the vector-pointer's cross-iteration increment. */
420 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
422 tree use = USE_FROM_PTR (use_p);
/* Redirect PTR_INCR's use of the old pointer to the bumped pointer.  */
424 if (use == dataref_ptr)
425 SET_USE (use_p, new_dataref_ptr);
427 gcc_assert (tree_int_cst_compare (use, update) == 0);
430 /* Copy the points-to information if it exists. */
431 if (DR_PTR_INFO (dr))
432 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
433 merge_alias_info (new_dataref_ptr, dataref_ptr);
435 return new_dataref_ptr;
439 /* Function vect_create_destination_var.
441 Create a new temporary of type VECTYPE. */
444 vect_create_destination_var (tree scalar_dest, tree vectype)
447 const char *new_name;
449 enum vect_var_kind kind;
451 kind = vectype ? vect_simple_var : vect_scalar_var;
452 type = vectype ? vectype : TREE_TYPE (scalar_dest);
454 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
456 new_name = get_name (scalar_dest);
459 vec_dest = vect_get_new_vect_var (type, vect_simple_var, new_name);
460 add_referenced_var (vec_dest);
466 /* Function vect_init_vector.
468 Insert a new stmt (INIT_STMT) that initializes a new vector variable with
469 the vector elements of VECTOR_VAR. Return the DEF of INIT_STMT. It will be
470 used in the vectorization of STMT. */
/* NOTE(review): listing incomplete -- some declarations/braces are missing
   from this view.  */
473 vect_init_vector (tree stmt, tree vector_var, tree vector_type)
475 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
476 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
477 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
485 new_var = vect_get_new_vect_var (vector_type, vect_simple_var, "cst_");
486 add_referenced_var (new_var);
/* Build: new_var = VECTOR_VAR, and give the result a fresh SSA name.  */
488 init_stmt = build2 (MODIFY_EXPR, void_type_node, new_var, vector_var);
489 new_temp = make_ssa_name (new_var, init_stmt);
490 TREE_OPERAND (init_stmt, 0) = new_temp;
/* Insert on the preheader edge so the init executes once before the
   loop; the assert checks no new block had to be created.  */
492 pe = loop_preheader_edge (loop);
493 new_bb = bsi_insert_on_edge_immediate (pe, init_stmt);
494 gcc_assert (!new_bb);
496 if (vect_print_dump_info (REPORT_DETAILS))
498 fprintf (vect_dump, "created new init_stmt: ");
499 print_generic_expr (vect_dump, init_stmt, TDF_SLIM);
/* The returned def is the SSA name defined by INIT_STMT.  */
502 vec_oprnd = TREE_OPERAND (init_stmt, 0);
507 /* Function vect_get_vec_def_for_operand.
509 OP is an operand in STMT. This function returns a (vector) def that will be
510 used in the vectorized stmt for STMT.
512 In the case that OP is an SSA_NAME which is defined in the loop, then
513 STMT_VINFO_VEC_STMT of the defining stmt holds the relevant def.
515 In case OP is an invariant or constant, a new stmt that creates a vector def
516 needs to be introduced. */
/* NOTE(review): listing incomplete -- switch header, braces and some
   declarations are missing from this view.  */
519 vect_get_vec_def_for_operand (tree op, tree stmt, tree *scalar_def)
524 stmt_vec_info def_stmt_info = NULL;
525 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
526 tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
527 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
528 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
529 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
535 enum vect_def_type dt;
539 if (vect_print_dump_info (REPORT_DETAILS))
541 fprintf (vect_dump, "vect_get_vec_def_for_operand: ");
542 print_generic_expr (vect_dump, op, TDF_SLIM);
/* Classify OP (constant / invariant / loop def / reduction / induction);
   the classification drives the case dispatch below.  */
545 is_simple_use = vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt);
546 gcc_assert (is_simple_use);
547 if (vect_print_dump_info (REPORT_DETAILS))
551 fprintf (vect_dump, "def = ");
552 print_generic_expr (vect_dump, def, TDF_SLIM);
556 fprintf (vect_dump, " def_stmt = ");
557 print_generic_expr (vect_dump, def_stmt, TDF_SLIM);
563 /* Case 1: operand is a constant. */
564 case vect_constant_def:
569 /* Create 'vect_cst_ = {cst,cst,...,cst}' */
570 if (vect_print_dump_info (REPORT_DETAILS))
571 fprintf (vect_dump, "Create vector_cst. nunits = %d", nunits)
/* Build the element list back-to-front: NUNITS copies of OP.  */
573 for (i = nunits - 1; i >= 0; --i)
575 t = tree_cons (NULL_TREE, op, t);
577 vector_type = get_vectype_for_scalar_type (TREE_TYPE (op));
578 vec_cst = build_vector (vector_type, t);
580 return vect_init_vector (stmt, vec_cst, vector_type);
583 /* Case 2: operand is defined outside the loop - loop invariant. */
584 case vect_invariant_def:
589 /* Create 'vec_inv = {inv,inv,..,inv}' */
590 if (vect_print_dump_info (REPORT_DETAILS))
591 fprintf (vect_dump, "Create vector_inv.");
593 for (i = nunits - 1; i >= 0; --i)
595 t = tree_cons (NULL_TREE, def, t);
598 /* FIXME: use build_constructor directly. */
599 vector_type = get_vectype_for_scalar_type (TREE_TYPE (def));
600 vec_inv = build_constructor_from_list (vector_type, t);
601 return vect_init_vector (stmt, vec_inv, vector_type);
604 /* Case 3: operand is defined inside the loop. */
608 *scalar_def = def_stmt;
610 /* Get the def from the vectorized stmt. */
611 def_stmt_info = vinfo_for_stmt (def_stmt);
612 vec_stmt = STMT_VINFO_VEC_STMT (def_stmt_info);
613 gcc_assert (vec_stmt);
614 vec_oprnd = TREE_OPERAND (vec_stmt, 0);
618 /* Case 4: operand is defined by a loop header phi - reduction */
619 case vect_reduction_def:
621 gcc_assert (TREE_CODE (def_stmt) == PHI_NODE);
623 /* Get the def before the loop */
624 op = PHI_ARG_DEF_FROM_EDGE (def_stmt, loop_preheader_edge (loop));
625 return get_initial_def_for_reduction (stmt, op, scalar_def);
628 /* Case 5: operand is defined by loop-header phi - induction. */
629 case vect_induction_def:
631 if (vect_print_dump_info (REPORT_DETAILS))
632 fprintf (vect_dump, "induction - unsupported.");
633 internal_error ("no support for induction"); /* FORNOW */
642 /* Function vect_get_vec_def_for_stmt_copy
644 Return a vector-def for an operand. This function is used when the
645 vectorized stmt to be created (by the caller to this function) is a "copy"
646 created in case the vectorized result cannot fit in one vector, and several
647 copies of the vector-stmt are required. In this case the vector-def is
648 retrieved from the vector stmt recorded in the STMT_VINFO_RELATED_STMT field
649 of the stmt that defines VEC_OPRND.
650 DT is the type of the vector def VEC_OPRND.
653 In case the vectorization factor (VF) is bigger than the number
654 of elements that can fit in a vectype (nunits), we have to generate
655 more than one vector stmt to vectorize the scalar stmt. This situation
656 arises when there are multiple data-types operated upon in the loop; the
657 smallest data-type determines the VF, and as a result, when vectorizing
658 stmts operating on wider types we need to create 'VF/nunits' "copies" of the
659 vector stmt (each computing a vector of 'nunits' results, and together
660 computing 'VF' results in each iteration). This function is called when
661 vectorizing such a stmt (e.g. vectorizing S2 in the illustration below, in
662 which VF=16 and nunits=4, so the number of copies required is 4):
664 scalar stmt: vectorized into: STMT_VINFO_RELATED_STMT
666 S1: x = load VS1.0: vx.0 = memref0 VS1.1
667 VS1.1: vx.1 = memref1 VS1.2
668 VS1.2: vx.2 = memref2 VS1.3
669 VS1.3: vx.3 = memref3
671 S2: z = x + ... VSnew.0: vz0 = vx.0 + ... VSnew.1
672 VSnew.1: vz1 = vx.1 + ... VSnew.2
673 VSnew.2: vz2 = vx.2 + ... VSnew.3
674 VSnew.3: vz3 = vx.3 + ...
676 The vectorization of S1 is explained in vectorizable_load.
677 The vectorization of S2:
678 To create the first vector-stmt out of the 4 copies - VSnew.0 -
679 the function 'vect_get_vec_def_for_operand' is called to
680 get the relevant vector-def for each operand of S2. For operand x it
681 returns the vector-def 'vx.0'.
683 To create the remaining copies of the vector-stmt (VSnew.j), this
684 function is called to get the relevant vector-def for each operand. It is
685 obtained from the respective VS1.j stmt, which is recorded in the
686 STMT_VINFO_RELATED_STMT field of the stmt that defines VEC_OPRND.
688 For example, to obtain the vector-def 'vx.1' in order to create the
689 vector stmt 'VSnew.1', this function is called with VEC_OPRND='vx.0'.
690 Given 'vx0' we obtain the stmt that defines it ('VS1.0'); from the
691 STMT_VINFO_RELATED_STMT field of 'VS1.0' we obtain the next copy - 'VS1.1',
692 and return its def ('vx.1').
693 Overall, to create the above sequence this function will be called 3 times:
694 vx.1 = vect_get_vec_def_for_stmt_copy (dt, vx.0);
695 vx.2 = vect_get_vec_def_for_stmt_copy (dt, vx.1);
696 vx.3 = vect_get_vec_def_for_stmt_copy (dt, vx.2); */
/* NOTE(review): listing incomplete -- some lines (braces, the final
   return) are missing from this view.  */
699 vect_get_vec_def_for_stmt_copy (enum vect_def_type dt, tree vec_oprnd)
701 tree vec_stmt_for_operand;
702 stmt_vec_info def_stmt_info;
/* Constants and invariants have a single vector def that every copy
   can reuse unchanged.  */
704 if (dt == vect_invariant_def || dt == vect_constant_def)
706 /* Do nothing; can reuse same def. */ ;
/* Otherwise walk to the next copy via STMT_VINFO_RELATED_STMT of the
   stmt that defines VEC_OPRND.  */
710 vec_stmt_for_operand = SSA_NAME_DEF_STMT (vec_oprnd);
711 def_stmt_info = vinfo_for_stmt (vec_stmt_for_operand);
712 gcc_assert (def_stmt_info);
713 vec_stmt_for_operand = STMT_VINFO_RELATED_STMT (def_stmt_info);
714 gcc_assert (vec_stmt_for_operand);
715 vec_oprnd = TREE_OPERAND (vec_stmt_for_operand, 0);
721 /* Function vect_finish_stmt_generation.
723 Insert a new stmt. */
/* Inserts VEC_STMT just before STMT at BSI, attaches vectorizer info to
   it, and copies STMT's source location onto it.
   NOTE(review): listing incomplete -- braces/#else lines are missing
   from this view.  */
726 vect_finish_stmt_generation (tree stmt, tree vec_stmt,
727 block_stmt_iterator *bsi)
729 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
730 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
/* Insert before the scalar stmt being vectorized, leaving BSI in place.  */
732 bsi_insert_before (bsi, vec_stmt, BSI_SAME_STMT);
733 set_stmt_info (get_stmt_ann (vec_stmt),
734 new_stmt_vec_info (vec_stmt, loop_vinfo));
736 if (vect_print_dump_info (REPORT_DETAILS))
738 fprintf (vect_dump, "add new stmt: ");
739 print_generic_expr (vect_dump, vec_stmt, TDF_SLIM);
742 /* Make sure bsi points to the stmt that is being vectorized. */
743 gcc_assert (stmt == bsi_stmt (*bsi));
/* Propagate the scalar stmt's location to the new vector stmt so debug
   info points at the original source line.  */
745 #ifdef USE_MAPPED_LOCATION
746 SET_EXPR_LOCATION (vec_stmt, EXPR_LOCATION (stmt));
748 SET_EXPR_LOCUS (vec_stmt, EXPR_LOCUS (stmt));
/* When defined, reductions are initialized with a neutral vector and the
   scalar init value is folded in at the loop epilog (Option1 below).  */
753 #define ADJUST_IN_EPILOG 1
755 /* Function get_initial_def_for_reduction
758 STMT - a stmt that performs a reduction operation in the loop.
759 INIT_VAL - the initial value of the reduction variable
762 SCALAR_DEF - a tree that holds a value to be added to the final result
763 of the reduction (used for "ADJUST_IN_EPILOG" - see below).
764 Return a vector variable, initialized according to the operation that STMT
765 performs. This vector will be used as the initial value of the
766 vector of partial results.
768 Option1 ("ADJUST_IN_EPILOG"): Initialize the vector as follows:
771 min/max: [init_val,init_val,..,init_val,init_val]
772 bit and/or: [init_val,init_val,..,init_val,init_val]
773 and when necessary (e.g. add/mult case) let the caller know
774 that it needs to adjust the result by init_val.
776 Option2: Initialize the vector as follows:
777 add: [0,0,...,0,init_val]
778 mult: [1,1,...,1,init_val]
779 min/max: [init_val,init_val,...,init_val]
780 bit and/or: [init_val,init_val,...,init_val]
781 and no adjustments are needed.
783 For example, for the following code:
789 STMT is 's = s + a[i]', and the reduction variable is 's'.
790 For a vector of 4 units, we want to return either [0,0,0,init_val],
791 or [0,0,0,0] and let the caller know that it needs to adjust
792 the result at the end by 'init_val'.
794 FORNOW: We use the "ADJUST_IN_EPILOG" scheme.
795 TODO: Use some cost-model to estimate which scheme is more profitable.
/* NOTE(review): listing incomplete -- the switch on CODE and several
   branch lines are missing from this view.  */
799 get_initial_def_for_reduction (tree stmt, tree init_val, tree *scalar_def)
801 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
802 tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
803 int nunits = GET_MODE_NUNITS (TYPE_MODE (vectype));
805 enum tree_code code = TREE_CODE (TREE_OPERAND (stmt, 1));
806 tree type = TREE_TYPE (init_val);
808 tree vec, t = NULL_TREE;
809 bool need_epilog_adjust;
813 gcc_assert (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type));
/* Neutral element 0 for the reduction, in the right scalar type.  */
820 if (INTEGRAL_TYPE_P (type))
821 def = build_int_cst (type, 0);
823 def = build_real (type, dconst0);
825 #ifdef ADJUST_IN_EPILOG
826 /* All the 'nunits' elements are set to 0. The final result will be
827 adjusted by 'init_val' at the loop epilog. */
829 need_epilog_adjust = true;
831 /* 'nunits - 1' elements are set to 0; The last element is set to
832 'init_val'. No further adjustments at the epilog are needed. */
833 nelements = nunits - 1;
834 need_epilog_adjust = false;
842 need_epilog_adjust = false;
/* Build the element list: NELEMENTS copies of DEF...  */
849 for (i = nelements - 1; i >= 0; --i)
850 t = tree_cons (NULL_TREE, def, t);
852 if (nelements == nunits - 1)
854 /* Set the last element of the vector. */
855 t = tree_cons (NULL_TREE, init_val, t);
858 gcc_assert (nelements == nunits);
/* Constants can use VECTOR_CST; otherwise a CONSTRUCTOR is needed.  */
860 vector_type = get_vectype_for_scalar_type (TREE_TYPE (def));
861 if (TREE_CODE (init_val) == INTEGER_CST || TREE_CODE (init_val) == REAL_CST)
862 vec = build_vector (vector_type, t);
864 vec = build_constructor_from_list (vector_type, t);
866 if (!need_epilog_adjust)
867 *scalar_def = NULL_TREE;
869 *scalar_def = init_val;
871 return vect_init_vector (stmt, vec, vector_type);
875 /* Function vect_create_epilog_for_reduction
877 Create code at the loop-epilog to finalize the result of a reduction
880 VECT_DEF is a vector of partial results.
881 REDUC_CODE is the tree-code for the epilog reduction.
882 STMT is the scalar reduction stmt that is being vectorized.
883 REDUCTION_PHI is the phi-node that carries the reduction computation.
886 1. Creates the reduction def-use cycle: sets the arguments for
888 The loop-entry argument is the vectorized initial-value of the reduction.
889 The loop-latch argument is VECT_DEF - the vector of partial sums.
890 2. "Reduces" the vector of partial results VECT_DEF into a single result,
891 by applying the operation specified by REDUC_CODE if available, or by
892 other means (whole-vector shifts or a scalar loop).
893 The function also creates a new phi node at the loop exit to preserve
894 loop-closed form, as illustrated below.
896 The flow at the entry to this function:
899 vec_def = phi <null, null> # REDUCTION_PHI
900 VECT_DEF = vector_stmt # vectorized form of STMT
901 s_loop = scalar_stmt # (scalar) STMT
903 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
907 The above is transformed by this function into:
910 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
911 VECT_DEF = vector_stmt # vectorized form of STMT
912 s_loop = scalar_stmt # (scalar) STMT
914 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
915 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
916 v_out2 = reduce <v_out1>
917 s_out3 = extract_field <v_out2, 0>
918 s_out4 = adjust_result <s_out3>
924 vect_create_epilog_for_reduction (tree vect_def, tree stmt,
925 enum tree_code reduc_code, tree reduction_phi)
927 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
929 enum machine_mode mode;
930 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
931 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
936 block_stmt_iterator exit_bsi;
941 tree new_scalar_dest, exit_phi;
942 tree bitsize, bitpos, bytesize;
943 enum tree_code code = TREE_CODE (TREE_OPERAND (stmt, 1));
944 tree scalar_initial_def;
945 tree vec_initial_def;
947 imm_use_iterator imm_iter;
949 bool extract_scalar_result;
953 tree operation = TREE_OPERAND (stmt, 1);
956 op_type = TREE_CODE_LENGTH (TREE_CODE (operation));
957 reduction_op = TREE_OPERAND (operation, op_type-1);
958 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
959 mode = TYPE_MODE (vectype);
961 /*** 1. Create the reduction def-use cycle ***/
963 /* 1.1 set the loop-entry arg of the reduction-phi: */
964 /* For the case of reduction, vect_get_vec_def_for_operand returns
965 the scalar def before the loop, that defines the initial value
966 of the reduction variable. */
967 vec_initial_def = vect_get_vec_def_for_operand (reduction_op, stmt,
968 &scalar_initial_def);
969 add_phi_arg (reduction_phi, vec_initial_def, loop_preheader_edge (loop));
971 /* 1.2 set the loop-latch arg for the reduction-phi: */
972 add_phi_arg (reduction_phi, vect_def, loop_latch_edge (loop));
974 if (vect_print_dump_info (REPORT_DETAILS))
976 fprintf (vect_dump, "transform reduction: created def-use cycle:");
977 print_generic_expr (vect_dump, reduction_phi, TDF_SLIM);
978 fprintf (vect_dump, "\n");
979 print_generic_expr (vect_dump, SSA_NAME_DEF_STMT (vect_def), TDF_SLIM);
983 /*** 2. Create epilog code
984 The reduction epilog code operates across the elements of the vector
985 of partial results computed by the vectorized loop.
986 The reduction epilog code consists of:
987 step 1: compute the scalar result in a vector (v_out2)
988 step 2: extract the scalar result (s_out3) from the vector (v_out2)
989 step 3: adjust the scalar result (s_out3) if needed.
991 Step 1 can be accomplished using one the following three schemes:
992 (scheme 1) using reduc_code, if available.
993 (scheme 2) using whole-vector shifts, if available.
994 (scheme 3) using a scalar loop. In this case steps 1+2 above are
997 The overall epilog code looks like this:
999 s_out0 = phi <s_loop> # original EXIT_PHI
1000 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
1001 v_out2 = reduce <v_out1> # step 1
1002 s_out3 = extract_field <v_out2, 0> # step 2
1003 s_out4 = adjust_result <s_out3> # step 3
1005 (step 3 is optional, and steps 1 and 2 may be combined).
1006 Lastly, the uses of s_out0 are replaced by s_out4.
1010 /* 2.1 Create new loop-exit-phi to preserve loop-closed form:
1011 v_out1 = phi <v_loop> */
1013 exit_bb = single_exit (loop)->dest;
1014 new_phi = create_phi_node (SSA_NAME_VAR (vect_def), exit_bb);
1015 SET_PHI_ARG_DEF (new_phi, single_exit (loop)->dest_idx, vect_def);
1016 exit_bsi = bsi_start (exit_bb);
1018 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
1019 (i.e. when reduc_code is not available) and in the final adjustment code
1020 (if needed). Also get the original scalar reduction variable as
1021 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
1022 represents a reduction pattern), the tree-code and scalar-def are
1023 taken from the original stmt that the pattern-stmt (STMT) replaces.
1024 Otherwise (it is a regular reduction) - the tree-code and scalar-def
1025 are taken from STMT. */
1027 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
1030 /* Regular reduction */
1035 /* Reduction pattern */
1036 stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
1037 gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
1038 gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
1040 code = TREE_CODE (TREE_OPERAND (orig_stmt, 1));
1041 scalar_dest = TREE_OPERAND (orig_stmt, 0);
1042 scalar_type = TREE_TYPE (scalar_dest);
1043 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
1044 bitsize = TYPE_SIZE (scalar_type);
1045 bytesize = TYPE_SIZE_UNIT (scalar_type);
1047 /* 2.3 Create the reduction code, using one of the three schemes described
1050 if (reduc_code < NUM_TREE_CODES)
1052 /*** Case 1: Create:
1053 v_out2 = reduc_expr <v_out1> */
1055 if (vect_print_dump_info (REPORT_DETAILS))
1056 fprintf (vect_dump, "Reduce using direct vector reduction.");
1058 vec_dest = vect_create_destination_var (scalar_dest, vectype);
1059 epilog_stmt = build2 (MODIFY_EXPR, void_type_node, vec_dest,
1060 build1 (reduc_code, vectype, PHI_RESULT (new_phi)));
1061 new_temp = make_ssa_name (vec_dest, epilog_stmt);
1062 TREE_OPERAND (epilog_stmt, 0) = new_temp;
1063 bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT);
1065 extract_scalar_result = true;
1069 enum tree_code shift_code = 0;
1070 bool have_whole_vector_shift = true;
1072 int element_bitsize = tree_low_cst (bitsize, 1);
1073 int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
1076 if (vec_shr_optab->handlers[mode].insn_code != CODE_FOR_nothing)
1077 shift_code = VEC_RSHIFT_EXPR;
1079 have_whole_vector_shift = false;
1081 /* Regardless of whether we have a whole vector shift, if we're
1082 emulating the operation via tree-vect-generic, we don't want
1083 to use it. Only the first round of the reduction is likely
1084 to still be profitable via emulation. */
1085 /* ??? It might be better to emit a reduction tree code here, so that
1086 tree-vect-generic can expand the first round via bit tricks. */
1087 if (!VECTOR_MODE_P (mode))
1088 have_whole_vector_shift = false;
1091 optab optab = optab_for_tree_code (code, vectype);
1092 if (optab->handlers[mode].insn_code == CODE_FOR_nothing)
1093 have_whole_vector_shift = false;
1096 if (have_whole_vector_shift)
1098 /*** Case 2: Create:
1099 for (offset = VS/2; offset >= element_size; offset/=2)
1101 Create: va' = vec_shift <va, offset>
1102 Create: va = vop <va, va'>
1105 if (vect_print_dump_info (REPORT_DETAILS))
1106 fprintf (vect_dump, "Reduce using vector shifts");
1108 vec_dest = vect_create_destination_var (scalar_dest, vectype);
1109 new_temp = PHI_RESULT (new_phi);
1111 for (bit_offset = vec_size_in_bits/2;
1112 bit_offset >= element_bitsize;
1115 tree bitpos = size_int (bit_offset);
1117 epilog_stmt = build2 (MODIFY_EXPR, void_type_node, vec_dest,
1118 build2 (shift_code, vectype,
1120 new_name = make_ssa_name (vec_dest, epilog_stmt);
1121 TREE_OPERAND (epilog_stmt, 0) = new_name;
1122 bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT);
1124 epilog_stmt = build2 (MODIFY_EXPR, void_type_node, vec_dest,
1125 build2 (code, vectype,
1126 new_name, new_temp));
1127 new_temp = make_ssa_name (vec_dest, epilog_stmt);
1128 TREE_OPERAND (epilog_stmt, 0) = new_temp;
1129 bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT);
1132 extract_scalar_result = true;
1138 /*** Case 3: Create:
1139 s = extract_field <v_out2, 0>
1140 for (offset = element_size;
1141 offset < vector_size;
1142 offset += element_size;)
1144 Create: s' = extract_field <v_out2, offset>
1145 Create: s = op <s, s'>
1148 if (vect_print_dump_info (REPORT_DETAILS))
1149 fprintf (vect_dump, "Reduce using scalar code. ");
1151 vec_temp = PHI_RESULT (new_phi);
1152 vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
1153 rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
1155 BIT_FIELD_REF_UNSIGNED (rhs) = TYPE_UNSIGNED (scalar_type);
1156 epilog_stmt = build2 (MODIFY_EXPR, void_type_node, new_scalar_dest, rhs);
1157 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
1158 TREE_OPERAND (epilog_stmt, 0) = new_temp;
1159 bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT);
1161 for (bit_offset = element_bitsize;
1162 bit_offset < vec_size_in_bits;
1163 bit_offset += element_bitsize)
1165 tree bitpos = bitsize_int (bit_offset);
1166 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
1169 BIT_FIELD_REF_UNSIGNED (rhs) = TYPE_UNSIGNED (scalar_type);
1170 epilog_stmt = build2 (MODIFY_EXPR, void_type_node, new_scalar_dest,
1172 new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
1173 TREE_OPERAND (epilog_stmt, 0) = new_name;
1174 bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT);
1176 epilog_stmt = build2 (MODIFY_EXPR, void_type_node, new_scalar_dest,
1177 build2 (code, scalar_type, new_name, new_temp));
1178 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
1179 TREE_OPERAND (epilog_stmt, 0) = new_temp;
1180 bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT);
1183 extract_scalar_result = false;
1187 /* 2.4 Extract the final scalar result. Create:
1188 s_out3 = extract_field <v_out2, bitpos> */
1190 if (extract_scalar_result)
1194 if (vect_print_dump_info (REPORT_DETAILS))
1195 fprintf (vect_dump, "extract scalar result");
1197 if (BYTES_BIG_ENDIAN)
1198 bitpos = size_binop (MULT_EXPR,
1199 bitsize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1),
1200 TYPE_SIZE (scalar_type));
1202 bitpos = bitsize_zero_node;
1204 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp, bitsize, bitpos);
1205 BIT_FIELD_REF_UNSIGNED (rhs) = TYPE_UNSIGNED (scalar_type);
1206 epilog_stmt = build2 (MODIFY_EXPR, void_type_node, new_scalar_dest, rhs);
1207 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
1208 TREE_OPERAND (epilog_stmt, 0) = new_temp;
1209 bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT);
1212 /* 2.4 Adjust the final result by the initial value of the reduction
1213 variable. (When such adjustment is not needed, then
1214 'scalar_initial_def' is zero).
1217 s_out4 = scalar_expr <s_out3, scalar_initial_def> */
1219 if (scalar_initial_def)
1221 epilog_stmt = build2 (MODIFY_EXPR, void_type_node, new_scalar_dest,
1222 build2 (code, scalar_type, new_temp, scalar_initial_def));
1223 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
1224 TREE_OPERAND (epilog_stmt, 0) = new_temp;
1225 bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT);
1228 /* 2.6 Replace uses of s_out0 with uses of s_out3 */
1230 /* Find the loop-closed-use at the loop exit of the original scalar result.
1231 (The reduction result is expected to have two immediate uses - one at the
1232 latch block, and one at the loop exit). */
1234 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
1236 if (!flow_bb_inside_loop_p (loop, bb_for_stmt (USE_STMT (use_p))))
1238 exit_phi = USE_STMT (use_p);
1242 /* We expect to have found an exit_phi because of loop-closed-ssa form. */
1243 gcc_assert (exit_phi);
1244 /* Replace the uses: */
1245 orig_name = PHI_RESULT (exit_phi);
1246 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
1247 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
1248 SET_USE (use_p, new_temp);
1252 /* Function vectorizable_reduction.
1254 Check if STMT performs a reduction operation that can be vectorized.
1255 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
1256 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
1257 Return FALSE if not a vectorizable STMT, TRUE otherwise.
1259 This function also handles reduction idioms (patterns) that have been
1260 recognized in advance during vect_pattern_recog. In this case, STMT may be
1262 X = pattern_expr (arg0, arg1, ..., X)
1263 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
1264 sequence that had been detected and replaced by the pattern-stmt (STMT).
1266 In some cases of reduction patterns, the type of the reduction variable X is
1267 different than the type of the other arguments of STMT.
1268 In such cases, the vectype that is used when transforming STMT into a vector
1269 stmt is different than the vectype that is used to determine the
1270 vectorization factor, because it consists of a different number of elements
1271 than the actual number of elements that are being operated upon in parallel.
1273 For example, consider an accumulation of shorts into an int accumulator.
1274 On some targets it's possible to vectorize this pattern operating on 8
1275 shorts at a time (hence, the vectype for purposes of determining the
1276 vectorization factor should be V8HI); on the other hand, the vectype that
1277 is used to create the vector form is actually V4SI (the type of the result).
1279 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
1280 indicates what is the actual level of parallelism (V8HI in the example), so
1281 that the right vectorization factor would be derived. This vectype
1282 corresponds to the type of arguments to the reduction stmt, and should *NOT*
1283 be used to create the vectorized stmt. The right vectype for the vectorized
1284 stmt is obtained from the type of the result X:
1285 get_vectype_for_scalar_type (TREE_TYPE (X))
1287 This means that, contrary to "regular" reductions (or "regular" stmts in
1288 general), the following equation:
1289 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
1290 does *NOT* necessarily hold for reduction patterns. */
1293 vectorizable_reduction (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
1298 tree loop_vec_def0 = NULL_TREE, loop_vec_def1 = NULL_TREE;
1299 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1300 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1301 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1302 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1304 enum tree_code code, orig_code, epilog_reduc_code = 0;
1305 enum machine_mode vec_mode;
1307 optab optab, reduc_optab;
1308 tree new_temp = NULL_TREE;
1310 enum vect_def_type dt;
1315 stmt_vec_info orig_stmt_info;
1316 tree expr = NULL_TREE;
1318 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
1319 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
1320 stmt_vec_info prev_stmt_info;
1322 tree new_stmt = NULL_TREE;
1325 gcc_assert (ncopies >= 1);
1327 /* 1. Is vectorizable reduction? */
1329 /* Not supportable if the reduction variable is used in the loop. */
1330 if (STMT_VINFO_RELEVANT_P (stmt_info))
1333 if (!STMT_VINFO_LIVE_P (stmt_info))
1336 /* Make sure it was already recognized as a reduction computation. */
1337 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def)
1340 /* 2. Has this been recognized as a reduction pattern?
1342 Check if STMT represents a pattern that has been recognized
1343 in earlier analysis stages. For stmts that represent a pattern,
1344 the STMT_VINFO_RELATED_STMT field records the last stmt in
1345 the original sequence that constitutes the pattern. */
1347 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
1350 orig_stmt_info = vinfo_for_stmt (orig_stmt);
1351 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt);
1352 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
1353 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
1356 /* 3. Check the operands of the operation. The first operands are defined
1357 inside the loop body. The last operand is the reduction variable,
1358 which is defined by the loop-header-phi. */
1360 gcc_assert (TREE_CODE (stmt) == MODIFY_EXPR);
1362 operation = TREE_OPERAND (stmt, 1);
1363 code = TREE_CODE (operation);
1364 op_type = TREE_CODE_LENGTH (code);
1365 if (op_type != binary_op && op_type != ternary_op)
1367 scalar_dest = TREE_OPERAND (stmt, 0);
1368 scalar_type = TREE_TYPE (scalar_dest);
1370 /* All uses but the last are expected to be defined in the loop.
1371 The last use is the reduction variable. */
1372 for (i = 0; i < op_type-1; i++)
1374 op = TREE_OPERAND (operation, i);
1375 is_simple_use = vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt);
1376 gcc_assert (is_simple_use);
1377 gcc_assert (dt == vect_loop_def || dt == vect_invariant_def ||
1378 dt == vect_constant_def);
1381 op = TREE_OPERAND (operation, i);
1382 is_simple_use = vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt);
1383 gcc_assert (is_simple_use);
1384 gcc_assert (dt == vect_reduction_def);
1385 gcc_assert (TREE_CODE (def_stmt) == PHI_NODE);
1387 gcc_assert (orig_stmt == vect_is_simple_reduction (loop, def_stmt));
1389 gcc_assert (stmt == vect_is_simple_reduction (loop, def_stmt));
1391 if (STMT_VINFO_LIVE_P (vinfo_for_stmt (def_stmt)))
1394 /* 4. Supportable by target? */
1396 /* 4.1. check support for the operation in the loop */
1397 optab = optab_for_tree_code (code, vectype);
1400 if (vect_print_dump_info (REPORT_DETAILS))
1401 fprintf (vect_dump, "no optab.");
1404 vec_mode = TYPE_MODE (vectype);
1405 if (optab->handlers[(int) vec_mode].insn_code == CODE_FOR_nothing)
1407 if (vect_print_dump_info (REPORT_DETAILS))
1408 fprintf (vect_dump, "op not supported by target.");
1409 if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
1410 || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
1411 < vect_min_worthwhile_factor (code))
1413 if (vect_print_dump_info (REPORT_DETAILS))
1414 fprintf (vect_dump, "proceeding using word mode.");
1417 /* Worthwhile without SIMD support? */
1418 if (!VECTOR_MODE_P (TYPE_MODE (vectype))
1419 && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
1420 < vect_min_worthwhile_factor (code))
1422 if (vect_print_dump_info (REPORT_DETAILS))
1423 fprintf (vect_dump, "not worthwhile without SIMD support.");
1427 /* 4.2. Check support for the epilog operation.
1429 If STMT represents a reduction pattern, then the type of the
1430 reduction variable may be different than the type of the rest
1431 of the arguments. For example, consider the case of accumulation
1432 of shorts into an int accumulator; The original code:
1433 S1: int_a = (int) short_a;
1434 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
1437 STMT: int_acc = widen_sum <short_a, int_acc>
1440 1. The tree-code that is used to create the vector operation in the
1441 epilog code (that reduces the partial results) is not the
1442 tree-code of STMT, but is rather the tree-code of the original
1443 stmt from the pattern that STMT is replacing. I.e, in the example
1444 above we want to use 'widen_sum' in the loop, but 'plus' in the
1446 2. The type (mode) we use to check available target support
1447 for the vector operation to be created in the *epilog*, is
1448 determined by the type of the reduction variable (in the example
1449 above we'd check this: plus_optab[vect_int_mode]).
1450 However the type (mode) we use to check available target support
1451 for the vector operation to be created *inside the loop*, is
1452 determined by the type of the other arguments to STMT (in the
1453 example we'd check this: widen_sum_optab[vect_short_mode]).
1455 This is contrary to "regular" reductions, in which the types of all
1456 the arguments are the same as the type of the reduction variable.
1457 For "regular" reductions we can therefore use the same vector type
1458 (and also the same tree-code) when generating the epilog code and
1459 when generating the code inside the loop. */
1463 /* This is a reduction pattern: get the vectype from the type of the
1464 reduction variable, and get the tree-code from orig_stmt. */
1465 orig_code = TREE_CODE (TREE_OPERAND (orig_stmt, 1));
1466 vectype = get_vectype_for_scalar_type (TREE_TYPE (def));
1467 vec_mode = TYPE_MODE (vectype);
1471 /* Regular reduction: use the same vectype and tree-code as used for
1472 the vector code inside the loop can be used for the epilog code. */
1476 if (!reduction_code_for_scalar_code (orig_code, &epilog_reduc_code))
1478 reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype);
1481 if (vect_print_dump_info (REPORT_DETAILS))
1482 fprintf (vect_dump, "no optab for reduction.");
1483 epilog_reduc_code = NUM_TREE_CODES;
1485 if (reduc_optab->handlers[(int) vec_mode].insn_code == CODE_FOR_nothing)
1487 if (vect_print_dump_info (REPORT_DETAILS))
1488 fprintf (vect_dump, "reduc op not supported by target.");
1489 epilog_reduc_code = NUM_TREE_CODES;
1492 if (!vec_stmt) /* transformation not required. */
1494 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
1500 if (vect_print_dump_info (REPORT_DETAILS))
1501 fprintf (vect_dump, "transform reduction.");
1503 /* Create the destination vector */
1504 vec_dest = vect_create_destination_var (scalar_dest, vectype);
1506 /* Create the reduction-phi that defines the reduction-operand. */
1507 new_phi = create_phi_node (vec_dest, loop->header);
1509 /* In case the vectorization factor (VF) is bigger than the number
1510 of elements that we can fit in a vectype (nunits), we have to generate
1511 more than one vector stmt - i.e - we need to "unroll" the
1512 vector stmt by a factor VF/nunits. For more details see documentation
1513 in vectorizable_operation. */
1515 prev_stmt_info = NULL;
1516 for (j = 0; j < ncopies; j++)
1521 op = TREE_OPERAND (operation, 0);
1522 loop_vec_def0 = vect_get_vec_def_for_operand (op, stmt, NULL);
1523 if (op_type == ternary_op)
1525 op = TREE_OPERAND (operation, 1);
1526 loop_vec_def1 = vect_get_vec_def_for_operand (op, stmt, NULL);
1529 /* Get the vector def for the reduction variable from the phi node */
1530 reduc_def = PHI_RESULT (new_phi);
1534 enum vect_def_type dt = vect_unknown_def_type; /* Dummy */
1535 loop_vec_def0 = vect_get_vec_def_for_stmt_copy (dt, loop_vec_def0);
1536 if (op_type == ternary_op)
1537 loop_vec_def1 = vect_get_vec_def_for_stmt_copy (dt, loop_vec_def1);
1539 /* Get the vector def for the reduction variable from the vectorized
1540 reduction operation generated in the previous iteration (j-1) */
1541 reduc_def = TREE_OPERAND (new_stmt ,0);
1544 /* Arguments are ready. create the new vector stmt. */
1546 if (op_type == binary_op)
1547 expr = build2 (code, vectype, loop_vec_def0, reduc_def);
1549 expr = build3 (code, vectype, loop_vec_def0, loop_vec_def1,
1551 new_stmt = build2 (MODIFY_EXPR, void_type_node, vec_dest, expr);
1552 new_temp = make_ssa_name (vec_dest, new_stmt);
1553 TREE_OPERAND (new_stmt, 0) = new_temp;
1554 vect_finish_stmt_generation (stmt, new_stmt, bsi);
1557 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
1559 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
1560 prev_stmt_info = vinfo_for_stmt (new_stmt);
1563 /* Finalize the reduction-phi (set it's arguments) and create the
1564 epilog reduction code. */
1565 vect_create_epilog_for_reduction (new_temp, stmt, epilog_reduc_code, new_phi);
1570 /* Function vectorizable_assignment.
1572 Check if STMT performs an assignment (copy) that can be vectorized.
1573 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
1574 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
1575 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
1578 vectorizable_assignment (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
1584 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1585 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1586 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1589 enum vect_def_type dt;
1590 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
1591 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
1593 gcc_assert (ncopies >= 1);
1595 return false; /* FORNOW */
1597 /* Is vectorizable assignment? */
1598 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1601 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_loop_def);
1603 if (TREE_CODE (stmt) != MODIFY_EXPR)
1606 scalar_dest = TREE_OPERAND (stmt, 0);
1607 if (TREE_CODE (scalar_dest) != SSA_NAME)
1610 op = TREE_OPERAND (stmt, 1);
1611 if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt))
1613 if (vect_print_dump_info (REPORT_DETAILS))
1614 fprintf (vect_dump, "use not simple.");
1618 if (!vec_stmt) /* transformation not required. */
1620 STMT_VINFO_TYPE (stmt_info) = assignment_vec_info_type;
1625 if (vect_print_dump_info (REPORT_DETAILS))
1626 fprintf (vect_dump, "transform assignment.");
1629 vec_dest = vect_create_destination_var (scalar_dest, vectype);
1632 op = TREE_OPERAND (stmt, 1);
1633 vec_oprnd = vect_get_vec_def_for_operand (op, stmt, NULL);
1635 /* Arguments are ready. create the new vector stmt. */
1636 *vec_stmt = build2 (MODIFY_EXPR, void_type_node, vec_dest, vec_oprnd);
1637 new_temp = make_ssa_name (vec_dest, *vec_stmt);
1638 TREE_OPERAND (*vec_stmt, 0) = new_temp;
1639 vect_finish_stmt_generation (stmt, *vec_stmt, bsi);
1645 /* Function vect_min_worthwhile_factor.
1647 For a loop where we could vectorize the operation indicated by CODE,
1648 return the minimum vectorization factor that makes it worthwhile
1649 to use generic vectors. */
1651 vect_min_worthwhile_factor (enum tree_code code)
1672 /* Function vectorizable_operation.
1674 Check if STMT performs a binary or unary operation that can be vectorized.
1675 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
1676 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
1677 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
1680 vectorizable_operation (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
1685 tree op0, op1 = NULL;
1686 tree vec_oprnd0 = NULL_TREE, vec_oprnd1 = NULL_TREE;
1687 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1688 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1689 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1690 enum tree_code code;
1691 enum machine_mode vec_mode;
1696 enum machine_mode optab_op2_mode;
1698 enum vect_def_type dt0, dt1;
1700 stmt_vec_info prev_stmt_info;
1701 int nunits_in = TYPE_VECTOR_SUBPARTS (vectype);
1704 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in;
1707 gcc_assert (ncopies >= 1);
1709 /* Is STMT a vectorizable binary/unary operation? */
1710 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1713 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_loop_def);
1715 if (STMT_VINFO_LIVE_P (stmt_info))
1717 /* FORNOW: not yet supported. */
1718 if (vect_print_dump_info (REPORT_DETAILS))
1719 fprintf (vect_dump, "value used after loop.");
1723 if (TREE_CODE (stmt) != MODIFY_EXPR)
1726 if (TREE_CODE (TREE_OPERAND (stmt, 0)) != SSA_NAME)
1729 scalar_dest = TREE_OPERAND (stmt, 0);
1730 vectype_out = get_vectype_for_scalar_type (TREE_TYPE (scalar_dest));
1731 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
1732 if (nunits_out != nunits_in)
1735 operation = TREE_OPERAND (stmt, 1);
1736 code = TREE_CODE (operation);
1737 optab = optab_for_tree_code (code, vectype);
1739 /* Support only unary or binary operations. */
1740 op_type = TREE_CODE_LENGTH (code);
1741 if (op_type != unary_op && op_type != binary_op)
1743 if (vect_print_dump_info (REPORT_DETAILS))
1744 fprintf (vect_dump, "num. args = %d (not unary/binary op).", op_type);
1748 op0 = TREE_OPERAND (operation, 0);
1749 if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt0))
1751 if (vect_print_dump_info (REPORT_DETAILS))
1752 fprintf (vect_dump, "use not simple.");
1756 if (op_type == binary_op)
1758 op1 = TREE_OPERAND (operation, 1);
1759 if (!vect_is_simple_use (op1, loop_vinfo, &def_stmt, &def, &dt1))
1761 if (vect_print_dump_info (REPORT_DETAILS))
1762 fprintf (vect_dump, "use not simple.");
1767 /* Supportable by target? */
1770 if (vect_print_dump_info (REPORT_DETAILS))
1771 fprintf (vect_dump, "no optab.");
1774 vec_mode = TYPE_MODE (vectype);
1775 icode = (int) optab->handlers[(int) vec_mode].insn_code;
1776 if (icode == CODE_FOR_nothing)
1778 if (vect_print_dump_info (REPORT_DETAILS))
1779 fprintf (vect_dump, "op not supported by target.");
1780 if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
1781 || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
1782 < vect_min_worthwhile_factor (code))
1784 if (vect_print_dump_info (REPORT_DETAILS))
1785 fprintf (vect_dump, "proceeding using word mode.");
1788 /* Worthwhile without SIMD support? */
1789 if (!VECTOR_MODE_P (TYPE_MODE (vectype))
1790 && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
1791 < vect_min_worthwhile_factor (code))
1793 if (vect_print_dump_info (REPORT_DETAILS))
1794 fprintf (vect_dump, "not worthwhile without SIMD support.");
1798 if (code == LSHIFT_EXPR || code == RSHIFT_EXPR)
1800 /* FORNOW: not yet supported. */
1801 if (!VECTOR_MODE_P (vec_mode))
1804 /* Invariant argument is needed for a vector shift
1805 by a scalar shift operand. */
1806 optab_op2_mode = insn_data[icode].operand[2].mode;
1807 if (! (VECTOR_MODE_P (optab_op2_mode)
1808 || dt1 == vect_constant_def
1809 || dt1 == vect_invariant_def))
1811 if (vect_print_dump_info (REPORT_DETAILS))
1812 fprintf (vect_dump, "operand mode requires invariant argument.");
1817 if (!vec_stmt) /* transformation not required. */
1819 STMT_VINFO_TYPE (stmt_info) = op_vec_info_type;
1825 if (vect_print_dump_info (REPORT_DETAILS))
1826 fprintf (vect_dump, "transform binary/unary operation.");
1829 vec_dest = vect_create_destination_var (scalar_dest, vectype);
1831 /* In case the vectorization factor (VF) is bigger than the number
1832 of elements that we can fit in a vectype (nunits), we have to generate
1833 more than one vector stmt - i.e - we need to "unroll" the
1834 vector stmt by a factor VF/nunits. In doing so, we record a pointer
1835 from one copy of the vector stmt to the next, in the field
1836 STMT_VINFO_RELATED_STMT. This is necessary in order to allow following
1837 stages to find the correct vector defs to be used when vectorizing
1838 stmts that use the defs of the current stmt. The example below illustrates
1839 the vectorization process when VF=16 and nunits=4 (i.e - we need to create
1840 4 vectorized stmts):
1842 before vectorization:
1843 RELATED_STMT VEC_STMT
1847 step 1: vectorize stmt S1 (done in vectorizable_load. See more details
1849 RELATED_STMT VEC_STMT
1850 VS1_0: vx0 = memref0 VS1_1 -
1851 VS1_1: vx1 = memref1 VS1_2 -
1852 VS1_2: vx2 = memref2 VS1_3 -
1853 VS1_3: vx3 = memref3 - -
1854 S1: x = load - VS1_0
1857 step2: vectorize stmt S2 (done here):
1858 To vectorize stmt S2 we first need to find the relevant vector
1859 def for the first operand 'x'. This is, as usual, obtained from
1860 the vector stmt recorded in the STMT_VINFO_VEC_STMT of the stmt
1861 that defines 'x' (S1). This way we find the stmt VS1_0, and the
1862 relevant vector def 'vx0'. Having found 'vx0' we can generate
1863 the vector stmt VS2_0, and as usual, record it in the
1864 STMT_VINFO_VEC_STMT of stmt S2.
1865 When creating the second copy (VS2_1), we obtain the relevant vector
1866 def from the vector stmt recorded in the STMT_VINFO_RELATED_STMT of
1867 stmt VS1_0. This way we find the stmt VS1_1 and the relevant
1868 vector def 'vx1'. Using 'vx1' we create stmt VS2_1 and record a
1869 pointer to it in the STMT_VINFO_RELATED_STMT of the vector stmt VS2_0.
1870 Similarly when creating stmts VS2_2 and VS2_3. This is the resulting
1871 chain of stmts and pointers:
1872 RELATED_STMT VEC_STMT
1873 VS1_0: vx0 = memref0 VS1_1 -
1874 VS1_1: vx1 = memref1 VS1_2 -
1875 VS1_2: vx2 = memref2 VS1_3 -
1876 VS1_3: vx3 = memref3 - -
1877 S1: x = load - VS1_0
1878 VS2_0: vz0 = vx0 + v1 VS2_1 -
1879 VS2_1: vz1 = vx1 + v1 VS2_2 -
1880 VS2_2: vz2 = vx2 + v1 VS2_3 -
1881 VS2_3: vz3 = vx3 + v1 - -
1882 S2: z = x + 1 - VS2_0 */
1884 prev_stmt_info = NULL;
1885 for (j = 0; j < ncopies; j++)
1890 vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
1891 if (op_type == binary_op)
1893 if (code == LSHIFT_EXPR || code == RSHIFT_EXPR)
1895 /* Vector shl and shr insn patterns can be defined with
1896 scalar operand 2 (shift operand). In this case, use
1897 constant or loop invariant op1 directly, without
1898 extending it to vector mode first. */
1899 optab_op2_mode = insn_data[icode].operand[2].mode;
1900 if (!VECTOR_MODE_P (optab_op2_mode))
1902 if (vect_print_dump_info (REPORT_DETAILS))
1903 fprintf (vect_dump, "operand 1 using scalar mode.");
1908 vec_oprnd1 = vect_get_vec_def_for_operand (op1, stmt, NULL);
1913 vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt0, vec_oprnd0);
1914 if (op_type == binary_op)
1915 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt1, vec_oprnd1);
1918 /* Arguments are ready. create the new vector stmt. */
1920 if (op_type == binary_op)
1921 new_stmt = build2 (MODIFY_EXPR, void_type_node, vec_dest,
1922 build2 (code, vectype, vec_oprnd0, vec_oprnd1));
1924 new_stmt = build2 (MODIFY_EXPR, void_type_node, vec_dest,
1925 build1 (code, vectype, vec_oprnd0));
1926 new_temp = make_ssa_name (vec_dest, new_stmt);
1927 TREE_OPERAND (new_stmt, 0) = new_temp;
1928 vect_finish_stmt_generation (stmt, new_stmt, bsi);
1931 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
1933 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
1934 prev_stmt_info = vinfo_for_stmt (new_stmt);
1941 /* Function vectorizable_type_demotion
1943 Check if STMT performs a binary or unary operation that involves
1944 type demotion, and if it can be vectorized.
1945 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
1946 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
1947 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
1950 vectorizable_type_demotion (tree stmt, block_stmt_iterator *bsi,
1957 tree vec_oprnd0=NULL, vec_oprnd1=NULL;
1958 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1959 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1960 enum tree_code code;
1963 enum vect_def_type dt0;
1965 stmt_vec_info prev_stmt_info;
1975 enum machine_mode vec_mode;
1977 /* Is STMT a vectorizable type-demotion operation? */
1979 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1982 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_loop_def);
1984 if (STMT_VINFO_LIVE_P (stmt_info))
1986 /* FORNOW: not yet supported. */
1987 if (vect_print_dump_info (REPORT_DETAILS))
1988 fprintf (vect_dump, "value used after loop.");
1992 if (TREE_CODE (stmt) != MODIFY_EXPR)
1995 if (TREE_CODE (TREE_OPERAND (stmt, 0)) != SSA_NAME)
1998 operation = TREE_OPERAND (stmt, 1);
1999 code = TREE_CODE (operation);
2000 if (code != NOP_EXPR && code != CONVERT_EXPR)
2003 op0 = TREE_OPERAND (operation, 0);
2004 vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op0));
2005 nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
2007 scalar_dest = TREE_OPERAND (stmt, 0);
2008 scalar_type = TREE_TYPE (scalar_dest);
2009 vectype_out = get_vectype_for_scalar_type (scalar_type);
2010 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
2011 if (nunits_in != nunits_out / 2) /* FORNOW */
2014 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_out;
2015 gcc_assert (ncopies >= 1);
2017 /* Check the operands of the operation. */
2018 if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt0))
2020 if (vect_print_dump_info (REPORT_DETAILS))
2021 fprintf (vect_dump, "use not simple.");
2025 /* Supportable by target? */
2026 code = VEC_PACK_MOD_EXPR;
2027 optab = optab_for_tree_code (VEC_PACK_MOD_EXPR, vectype_in);
2031 vec_mode = TYPE_MODE (vectype_in);
2032 if (optab->handlers[(int) vec_mode].insn_code == CODE_FOR_nothing)
2035 STMT_VINFO_VECTYPE (stmt_info) = vectype_in;
2037 if (!vec_stmt) /* transformation not required. */
2039 STMT_VINFO_TYPE (stmt_info) = type_demotion_vec_info_type;
2045 if (vect_print_dump_info (REPORT_DETAILS))
2046 fprintf (vect_dump, "transform type demotion operation. ncopies = %d.",
2050 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
2052 /* In case the vectorization factor (VF) is bigger than the number
2053 of elements that we can fit in a vectype (nunits), we have to generate
2054 more than one vector stmt - i.e - we need to "unroll" the
2055 vector stmt by a factor VF/nunits. */
2056 prev_stmt_info = NULL;
2057 for (j = 0; j < ncopies; j++)
2062 enum vect_def_type dt = vect_unknown_def_type; /* Dummy */
2063 vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
2064 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt, vec_oprnd0);
2068 vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt0, vec_oprnd1);
2069 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt0, vec_oprnd0);
2072 /* Arguments are ready. Create the new vector stmt. */
2073 expr = build2 (code, vectype_out, vec_oprnd0, vec_oprnd1);
2074 new_stmt = build2 (MODIFY_EXPR, void_type_node, vec_dest, expr);
2075 new_temp = make_ssa_name (vec_dest, new_stmt);
2076 TREE_OPERAND (new_stmt, 0) = new_temp;
2077 vect_finish_stmt_generation (stmt, new_stmt, bsi);
2080 STMT_VINFO_VEC_STMT (stmt_info) = new_stmt;
2082 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
2084 prev_stmt_info = vinfo_for_stmt (new_stmt);
2087 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
2092 /* Function vect_gen_widened_results_half
2094 Create a vector stmt whose code, type, number of arguments, and result
2095 variable are CODE, VECTYPE, OP_TYPE, and VEC_DEST, and its arguments are
2096 VEC_OPRND0 and VEC_OPRND1. The new vector stmt is to be inserted at BSI.
2097 In the case that CODE is a CALL_EXPR, this means that a call to DECL
2098 needs to be created (DECL is a function-decl of a target-builtin).
2099 STMT is the original scalar stmt that we are vectorizing. */
2102 vect_gen_widened_results_half (enum tree_code code, tree vectype, tree decl,
2103 tree vec_oprnd0, tree vec_oprnd1, int op_type,
2104 tree vec_dest, block_stmt_iterator *bsi,
2114 /* Generate half of the widened result: */
2115 if (code == CALL_EXPR)
2117 /* Target specific support */
/* Build the argument list for the builtin call.  tree_cons prepends,
   so for a binary op VEC_OPRND1 ends up first in the list.  */
2118 vec_params = build_tree_list (NULL_TREE, vec_oprnd0);
2119 if (op_type == binary_op)
2120 vec_params = tree_cons (NULL_TREE, vec_oprnd1, vec_params);
2121 expr = build_function_call_expr (decl, vec_params);
2125 /* Generic support */
2126 gcc_assert (op_type == TREE_CODE_LENGTH (code));
2127 if (op_type == binary_op)
2128 expr = build2 (code, vectype, vec_oprnd0, vec_oprnd1);
2130 expr = build1 (code, vectype, vec_oprnd0);
/* Wrap the widening expression in an assignment to a fresh SSA name of
   VEC_DEST and emit it at BSI.  */
2132 new_stmt = build2 (MODIFY_EXPR, void_type_node, vec_dest, expr);
2133 new_temp = make_ssa_name (vec_dest, new_stmt);
2134 TREE_OPERAND (new_stmt, 0) = new_temp;
2135 vect_finish_stmt_generation (stmt, new_stmt, bsi);
2137 if (code == CALL_EXPR)
/* The new stmt is a call: mark the symbols underlying all of its
   virtual operands for renaming so the SSA updater rebuilds them.  */
2139 FOR_EACH_SSA_TREE_OPERAND (sym, new_stmt, iter, SSA_OP_ALL_VIRTUALS)
2141 if (TREE_CODE (sym) == SSA_NAME)
2142 sym = SSA_NAME_VAR (sym);
2143 mark_sym_for_renaming (sym);
2151 /* Function vectorizable_type_promotion
2153 Check if STMT performs a binary or unary operation that involves
2154 type promotion, and if it can be vectorized.
2155 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
2156 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
2157 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
2160 vectorizable_type_promotion (tree stmt, block_stmt_iterator *bsi,
2166 tree op0, op1 = NULL;
2167 tree vec_oprnd0=NULL, vec_oprnd1=NULL;
2168 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2169 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
/* NOTE(review): CODE1/CODE2 are enum tree_code but are initialized with
   CODE_FOR_nothing, which is an enum insn_code value; this relies on an
   implicit enum/int conversion.  A neutral tree code (e.g. ERROR_MARK)
   would be cleaner — confirm against supportable_widening_operation.  */
2170 enum tree_code code, code1 = CODE_FOR_nothing, code2 = CODE_FOR_nothing;
2171 tree decl1 = NULL_TREE, decl2 = NULL_TREE;
2174 enum vect_def_type dt0, dt1;
2176 stmt_vec_info prev_stmt_info;
2184 /* Is STMT a vectorizable type-promotion operation? */
2186 if (!STMT_VINFO_RELEVANT_P (stmt_info))
2189 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_loop_def);
2191 if (STMT_VINFO_LIVE_P (stmt_info))
2193 /* FORNOW: not yet supported. */
2194 if (vect_print_dump_info (REPORT_DETAILS))
2195 fprintf (vect_dump, "value used after loop.");
2199 if (TREE_CODE (stmt) != MODIFY_EXPR)
2202 if (TREE_CODE (TREE_OPERAND (stmt, 0)) != SSA_NAME)
2205 operation = TREE_OPERAND (stmt, 1);
2206 code = TREE_CODE (operation);
2207 if (code != NOP_EXPR && code != WIDEN_MULT_EXPR)
/* The number of vector stmts needed is determined by the *input* vector
   type: ncopies = VF / nunits_in.  */
2210 op0 = TREE_OPERAND (operation, 0);
2211 vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op0));
2212 nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
2213 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in;
2214 gcc_assert (ncopies >= 1);
/* A promoted result holds half as many elements per vector as the input
   (FORNOW: only a 2x widening is handled).  */
2216 scalar_dest = TREE_OPERAND (stmt, 0);
2217 vectype_out = get_vectype_for_scalar_type (TREE_TYPE (scalar_dest));
2218 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
2219 if (nunits_out != nunits_in / 2) /* FORNOW */
2222 /* Check the operands of the operation. */
2223 if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt0))
2225 if (vect_print_dump_info (REPORT_DETAILS))
2226 fprintf (vect_dump, "use not simple.");
2230 op_type = TREE_CODE_LENGTH (code);
2231 if (op_type == binary_op)
2233 op1 = TREE_OPERAND (operation, 1);
2234 if (!vect_is_simple_use (op1, loop_vinfo, &def_stmt, &def, &dt1))
2236 if (vect_print_dump_info (REPORT_DETAILS))
2237 fprintf (vect_dump, "use not simple.");
2242 /* Supportable by target? */
2243 if (!supportable_widening_operation (code, stmt, vectype_in,
2244 &decl1, &decl2, &code1, &code2))
2247 STMT_VINFO_VECTYPE (stmt_info) = vectype_in;
2249 if (!vec_stmt) /* transformation not required. */
2251 STMT_VINFO_TYPE (stmt_info) = type_promotion_vec_info_type;
2257 if (vect_print_dump_info (REPORT_DETAILS))
2258 fprintf (vect_dump, "transform type promotion operation. ncopies = %d.",
2262 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
2264 /* In case the vectorization factor (VF) is bigger than the number
2265 of elements that we can fit in a vectype (nunits), we have to generate
2266 more than one vector stmt - i.e - we need to "unroll" the
2267 vector stmt by a factor VF/nunits. */
2269 prev_stmt_info = NULL;
2270 for (j = 0; j < ncopies; j++)
/* First copy: create vector defs directly from the scalar operands.  */
2275 vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
2276 if (op_type == binary_op)
2277 vec_oprnd1 = vect_get_vec_def_for_operand (op1, stmt, NULL);
/* Subsequent copies: advance each operand to the def of the next copy.  */
2281 vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt0, vec_oprnd0);
2282 if (op_type == binary_op)
2283 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt1, vec_oprnd1);
2286 /* Arguments are ready. Create the new vector stmt. We are creating
2287 two vector defs because the widened result does not fit in one vector.
2288 The vectorized stmt can be expressed as a call to a target builtin,
2289 or using a tree-code. */
2290 /* Generate first half of the widened result: */
2291 new_stmt = vect_gen_widened_results_half (code1, vectype_out, decl1,
2292 vec_oprnd0, vec_oprnd1, op_type, vec_dest, bsi, stmt);
2294 STMT_VINFO_VEC_STMT (stmt_info) = new_stmt;
2296 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
2297 prev_stmt_info = vinfo_for_stmt (new_stmt);
2299 /* Generate second half of the widened result: */
2300 new_stmt = vect_gen_widened_results_half (code2, vectype_out, decl2,
2301 vec_oprnd0, vec_oprnd1, op_type, vec_dest, bsi, stmt);
2302 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
2303 prev_stmt_info = vinfo_for_stmt (new_stmt);
2307 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
2312 /* Function vect_strided_store_supported.
2314 Returns TRUE if INTERLEAVE_HIGH and INTERLEAVE_LOW operations are supported,
2315 and FALSE otherwise. */
2318 vect_strided_store_supported (tree vectype)
2320 optab interleave_high_optab, interleave_low_optab;
2323 mode = (int) TYPE_MODE (vectype);
2325 /* Check that the operation is supported. */
/* First: does an optab exist at all for the interleave tree codes?  */
2326 interleave_high_optab = optab_for_tree_code (VEC_INTERLEAVE_HIGH_EXPR,
2328 interleave_low_optab = optab_for_tree_code (VEC_INTERLEAVE_LOW_EXPR,
2330 if (!interleave_high_optab || !interleave_low_optab)
2332 if (vect_print_dump_info (REPORT_DETAILS))
2333 fprintf (vect_dump, "no optab for interleave.");
/* Second: does the target provide an insn for this vector mode?  */
2337 if (interleave_high_optab->handlers[(int) mode].insn_code
2339 || interleave_low_optab->handlers[(int) mode].insn_code
2340 == CODE_FOR_nothing)
2342 if (vect_print_dump_info (REPORT_DETAILS))
2343 fprintf (vect_dump, "interleave op not supported by target.");
2350 /* Function vect_permute_store_chain.
2352 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
2353 a power of 2, generate interleave_high/low stmts to reorder the data
2354 correctly for the stores. Return the final references for stores in
2357 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
2358 The input is 4 vectors each containing 8 elements. We assign a number to each
2359 element, the input sequence is:
2361 1st vec: 0 1 2 3 4 5 6 7
2362 2nd vec: 8 9 10 11 12 13 14 15
2363 3rd vec: 16 17 18 19 20 21 22 23
2364 4th vec: 24 25 26 27 28 29 30 31
2366 The output sequence should be:
2368 1st vec: 0 8 16 24 1 9 17 25
2369 2nd vec: 2 10 18 26 3 11 19 27
2370 3rd vec: 4 12 20 28 5 13 21 29
2371 4th vec: 6 14 22 30 7 15 23 31
2373 i.e., we interleave the contents of the four vectors in their order.
2375 We use interleave_high/low instructions to create such output. The input of
2376 each interleave_high/low operation is two vectors:
2379 the even elements of the result vector are obtained left-to-right from the
2380 high/low elements of the first vector. The odd elements of the result are
2381 obtained left-to-right from the high/low elements of the second vector.
2382 The output of interleave_high will be: 0 4 1 5
2383 and of interleave_low: 2 6 3 7
2386 The permutation is done in log LENGTH stages. In each stage interleave_high
2387 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
2388 where the first argument is taken from the first half of DR_CHAIN and the
2389 second argument from its second half.
2392 I1: interleave_high (1st vec, 3rd vec)
2393 I2: interleave_low (1st vec, 3rd vec)
2394 I3: interleave_high (2nd vec, 4th vec)
2395 I4: interleave_low (2nd vec, 4th vec)
2397 The output for the first stage is:
2399 I1: 0 16 1 17 2 18 3 19
2400 I2: 4 20 5 21 6 22 7 23
2401 I3: 8 24 9 25 10 26 11 27
2402 I4: 12 28 13 29 14 30 15 31
2404 The output of the second stage, i.e. the final result is:
2406 I1: 0 8 16 24 1 9 17 25
2407 I2: 2 10 18 26 3 11 19 27
2408 I3: 4 12 20 28 5 13 21 29
2409 I4: 6 14 22 30 7 15 23 31. */
2412 vect_permute_store_chain (VEC(tree,heap) *dr_chain,
2413 unsigned int length,
2415 block_stmt_iterator *bsi,
2416 VEC(tree,heap) **result_chain)
2418 tree perm_dest, perm_stmt, vect1, vect2, high, low;
2419 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
2423 VEC(tree,heap) *first, *second;
2425 scalar_dest = TREE_OPERAND (stmt, 0);
2426 first = VEC_alloc (tree, heap, length/2);
2427 second = VEC_alloc (tree, heap, length/2);
2429 /* Check that the operation is supported. */
2430 if (!vect_strided_store_supported (vectype))
2433 *result_chain = VEC_copy (tree, heap, dr_chain);
/* log2(LENGTH) stages; each stage pairs element j of the first half of
   DR_CHAIN with element j + length/2 of the second half.  */
2435 for (i = 0; i < exact_log2 (length); i++)
2437 for (j = 0; j < length/2; j++)
2439 vect1 = VEC_index (tree, dr_chain, j);
2440 vect2 = VEC_index (tree, dr_chain, j+length/2);
2442 /* high = interleave_high (vect1, vect2); */
2443 perm_dest = create_tmp_var (vectype, "vect_inter_high");
2444 add_referenced_var (perm_dest);
2445 perm_stmt = build2 (MODIFY_EXPR, void_type_node, perm_dest,
2446 build2 (VEC_INTERLEAVE_HIGH_EXPR, vectype, vect1,
2448 high = make_ssa_name (perm_dest, perm_stmt);
2449 TREE_OPERAND (perm_stmt, 0) = high;
2450 vect_finish_stmt_generation (stmt, perm_stmt, bsi);
2451 VEC_replace (tree, *result_chain, 2*j, high);
2453 /* low = interleave_low (vect1, vect2); */
2454 perm_dest = create_tmp_var (vectype, "vect_inter_low");
2455 add_referenced_var (perm_dest);
2456 perm_stmt = build2 (MODIFY_EXPR, void_type_node, perm_dest,
2457 build2 (VEC_INTERLEAVE_LOW_EXPR, vectype, vect1,
2459 low = make_ssa_name (perm_dest, perm_stmt);
2460 TREE_OPERAND (perm_stmt, 0) = low;
2461 vect_finish_stmt_generation (stmt, perm_stmt, bsi);
2462 VEC_replace (tree, *result_chain, 2*j+1, low);
/* Feed this stage's output back in as the next stage's input.  */
2464 dr_chain = VEC_copy (tree, heap, *result_chain);
2470 /* Function vectorizable_store.
2472 Check if STMT defines a non scalar data-ref (array/pointer/structure) that
2474 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
2475 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
2476 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
2479 vectorizable_store (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
2484 tree vec_oprnd = NULL_TREE;
2485 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2486 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info), *first_dr = NULL;
2487 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2488 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2489 enum machine_mode vec_mode;
/* NOTE(review): 'alignment_support_cheme' is a long-standing misspelling
   of 'scheme'; renaming it would touch other functions in this file.  */
2491 enum dr_alignment_support alignment_support_cheme;
2493 def_operand_p def_p;
2495 enum vect_def_type dt;
2496 stmt_vec_info prev_stmt_info = NULL;
2497 tree dataref_ptr = NULL_TREE;
2498 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
2499 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
2501 tree next_stmt, first_stmt;
2502 bool strided_store = false;
2503 unsigned int group_size, i;
2504 VEC(tree,heap) *dr_chain = NULL, *oprnds = NULL, *result_chain = NULL;
2505 gcc_assert (ncopies >= 1);
2507 /* Is vectorizable store? */
2509 if (TREE_CODE (stmt) != MODIFY_EXPR)
2512 scalar_dest = TREE_OPERAND (stmt, 0);
2513 if (TREE_CODE (scalar_dest) != ARRAY_REF
2514 && TREE_CODE (scalar_dest) != INDIRECT_REF
2515 && !DR_GROUP_FIRST_DR (stmt_info))
2518 op = TREE_OPERAND (stmt, 1);
2519 if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt))
2521 if (vect_print_dump_info (REPORT_DETAILS))
2522 fprintf (vect_dump, "use not simple.");
2526 vec_mode = TYPE_MODE (vectype);
2527 /* FORNOW. In some cases can vectorize even if data-type not supported
2528 (e.g. - array initialization with 0). */
2529 if (mov_optab->handlers[(int)vec_mode].insn_code == CODE_FOR_nothing)
2532 if (!STMT_VINFO_DATA_REF (stmt_info))
/* Membership in an interleaving group marks this as a strided store;
   the group permutation must be supported by the target.  */
2535 if (DR_GROUP_FIRST_DR (stmt_info))
2537 strided_store = true;
2538 if (!vect_strided_store_supported (vectype))
2542 if (!vec_stmt) /* transformation not required. */
2544 STMT_VINFO_TYPE (stmt_info) = store_vec_info_type;
2550 if (vect_print_dump_info (REPORT_DETAILS))
2551 fprintf (vect_dump, "transform store. ncopies = %d",ncopies);
2555 first_stmt = DR_GROUP_FIRST_DR (stmt_info);
2556 first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2557 group_size = DR_GROUP_SIZE (vinfo_for_stmt (first_stmt));
2559 DR_GROUP_STORE_COUNT (vinfo_for_stmt (first_stmt))++;
2561 /* We vectorize all the stmts of the interleaving group when we
2562 reach the last stmt in the group. */
2563 if (DR_GROUP_STORE_COUNT (vinfo_for_stmt (first_stmt))
2564 < DR_GROUP_SIZE (vinfo_for_stmt (first_stmt)))
2566 *vec_stmt = NULL_TREE;
2577 dr_chain = VEC_alloc (tree, heap, group_size);
2578 oprnds = VEC_alloc (tree, heap, group_size);
2580 alignment_support_cheme = vect_supportable_dr_alignment (first_dr);
2581 gcc_assert (alignment_support_cheme);
2582 gcc_assert (alignment_support_cheme == dr_aligned); /* FORNOW */
2584 /* In case the vectorization factor (VF) is bigger than the number
2585 of elements that we can fit in a vectype (nunits), we have to generate
2586 more than one vector stmt - i.e - we need to "unroll" the
2587 vector stmt by a factor VF/nunits. For more details see documentation in
2588 vect_get_vec_def_for_copy_stmt. */
2590 /* In case of interleaving (non-unit strided access):
2597 We create vectorized stores starting from base address (the access of the
2598 first stmt in the chain (S2 in the above example), when the last store stmt
2599 of the chain (S4) is reached:
2602 VS2: &base + vec_size*1 = vx0
2603 VS3: &base + vec_size*2 = vx1
2604 VS4: &base + vec_size*3 = vx3
2606 Then permutation statements are generated:
2608 VS5: vx5 = VEC_INTERLEAVE_HIGH_EXPR < vx0, vx3 >
2609 VS6: vx6 = VEC_INTERLEAVE_LOW_EXPR < vx0, vx3 >
2612 And they are put in STMT_VINFO_VEC_STMT of the corresponding scalar stmts
2613 (the order of the data-refs in the output of vect_permute_store_chain
2614 corresponds to the order of scalar stmts in the interleaving chain - see
2615 the documentation of vect_permute_store_chain()).
2617 In case of both multiple types and interleaving, above vector stores and
2618 permutation stmts are created for every copy. The result vector stmts are
2619 put in STMT_VINFO_VEC_STMT for the first copy and in the corresponding
2620 STMT_VINFO_RELATED_STMT for the next copies.
2623 prev_stmt_info = NULL;
2624 for (j = 0; j < ncopies; j++)
2631 /* For interleaved stores we collect vectorized defs for all the
2632 stores in the group in DR_CHAIN and OPRNDS. DR_CHAIN is then used
2633 as an input to vect_permute_store_chain(), and OPRNDS as an input
2634 to vect_get_vec_def_for_stmt_copy() for the next copy.
2635 If the store is not strided, GROUP_SIZE is 1, and DR_CHAIN and
2636 OPRNDS are of size 1.
2638 next_stmt = first_stmt;
2639 for (i = 0; i < group_size; i++)
2641 /* Since gaps are not supported for interleaved stores, GROUP_SIZE
2642 is the exact number of stmts in the chain. Therefore, NEXT_STMT
2643 can't be NULL_TREE. In case that there is no interleaving,
2644 GROUP_SIZE is 1, and only one iteration of the loop will be
2647 gcc_assert (next_stmt);
2648 op = TREE_OPERAND (next_stmt, 1);
2649 vec_oprnd = vect_get_vec_def_for_operand (op, next_stmt, NULL);
2650 VEC_quick_push(tree, dr_chain, vec_oprnd);
2651 VEC_quick_push(tree, oprnds, vec_oprnd);
2652 next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt));
/* Create the data-ref pointer for the first copy; later copies only
   bump it (see the j > 0 path below).  */
2654 dataref_ptr = vect_create_data_ref_ptr (first_stmt, bsi, NULL_TREE,
2655 &dummy, &ptr_incr, false,
2656 TREE_TYPE (vec_oprnd));
2660 /* For interleaved stores we created vectorized defs for all the
2661 defs stored in OPRNDS in the previous iteration (previous copy).
2662 DR_CHAIN is then used as an input to vect_permute_store_chain(),
2663 and OPRNDS as an input to vect_get_vec_def_for_stmt_copy() for the
2665 If the store is not strided, GROUP_SIZE is 1, and DR_CHAIN and
2666 OPRNDS are of size 1.
2668 for (i = 0; i < group_size; i++)
2670 vec_oprnd = vect_get_vec_def_for_stmt_copy (dt,
2671 VEC_index (tree, oprnds, i));
2672 VEC_replace(tree, dr_chain, i, vec_oprnd);
2673 VEC_replace(tree, oprnds, i, vec_oprnd);
2675 dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, bsi, stmt);
2680 result_chain = VEC_alloc (tree, heap, group_size);
2682 if (!vect_permute_store_chain (dr_chain, group_size, stmt, bsi,
2687 next_stmt = first_stmt;
2688 for (i = 0; i < group_size; i++)
2690 /* For strided stores vectorized defs are interleaved in
2691 vect_permute_store_chain(). */
2693 vec_oprnd = VEC_index(tree, result_chain, i);
2695 data_ref = build_fold_indirect_ref (dataref_ptr);
2696 /* Arguments are ready. Create the new vector stmt. */
2697 new_stmt = build2 (MODIFY_EXPR, void_type_node, data_ref,
2699 vect_finish_stmt_generation (stmt, new_stmt, bsi);
2701 /* Set the V_MAY_DEFS for the vector pointer. If this virtual def has a
2702 use outside the loop and a loop peel is performed then the def may be
2703 renamed by the peel. Mark it for renaming so the later use will also
2705 copy_virtual_operands (new_stmt, next_stmt);
2708 /* The original store is deleted so the same SSA_NAMEs can be used.
2710 FOR_EACH_SSA_TREE_OPERAND (def, next_stmt, iter, SSA_OP_VMAYDEF)
2712 SSA_NAME_DEF_STMT (def) = new_stmt;
2713 mark_sym_for_renaming (SSA_NAME_VAR (def));
2716 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
2720 /* Create new names for all the definitions created by COPY and
2721 add replacement mappings for each new name. */
2722 FOR_EACH_SSA_DEF_OPERAND (def_p, new_stmt, iter, SSA_OP_VMAYDEF)
2724 create_new_def_for (DEF_FROM_PTR (def_p), new_stmt, def_p);
2725 mark_sym_for_renaming (SSA_NAME_VAR (DEF_FROM_PTR (def_p)));
2728 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
2731 prev_stmt_info = vinfo_for_stmt (new_stmt);
2732 next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt));
2735 /* Bump the vector pointer. */
2736 dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, bsi, stmt);
2744 /* Function vect_setup_realignment
2746 This function is called when vectorizing an unaligned load using
2747 the dr_unaligned_software_pipeline scheme.
2748 This function generates the following code at the loop prolog:
2751 msq_init = *(floor(p)); # prolog load
2752 realignment_token = call target_builtin;
2754 msq = phi (msq_init, ---)
2756 The code above sets up a new (vector) pointer, pointing to the first
2757 location accessed by STMT, and a "floor-aligned" load using that pointer.
2758 It also generates code to compute the "realignment-token" (if the relevant
2759 target hook was defined), and creates a phi-node at the loop-header bb
2760 whose arguments are the result of the prolog-load (created by this
2761 function) and the result of a load that takes place in the loop (to be
2762 created by the caller to this function).
2763 The caller to this function uses the phi-result (msq) to create the
2764 realignment code inside the loop, and sets up the missing phi argument,
2768 msq = phi (msq_init, lsq)
2769 lsq = *(floor(p')); # load in loop
2770 result = realign_load (msq, lsq, realignment_token);
2773 STMT - (scalar) load stmt to be vectorized. This load accesses
2774 a memory location that may be unaligned.
2775 BSI - place where new code is to be inserted.
2778 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
2779 target hook, if defined.
2780 Return value - the result of the loop-header phi node. */
2783 vect_setup_realignment (tree stmt, block_stmt_iterator *bsi,
2784 tree *realignment_token)
2786 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2787 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2788 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2789 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2790 edge pe = loop_preheader_edge (loop);
2791 tree scalar_dest = TREE_OPERAND (stmt, 0);
2804 /* 1. Create msq_init = *(floor(p1)) in the loop preheader */
2805 vec_dest = vect_create_destination_var (scalar_dest, vectype);
2806 ptr = vect_create_data_ref_ptr (stmt, bsi, NULL_TREE, &init_addr, &inc, true,
/* ALIGN_INDIRECT_REF implements the "floor" (address rounded down to a
   vector-aligned boundary) load.  */
2808 data_ref = build1 (ALIGN_INDIRECT_REF, vectype, ptr);
2809 new_stmt = build2 (MODIFY_EXPR, void_type_node, vec_dest, data_ref);
2810 new_temp = make_ssa_name (vec_dest, new_stmt);
2811 TREE_OPERAND (new_stmt, 0) = new_temp;
/* Inserting on the preheader edge must not split it into a new block.  */
2812 new_bb = bsi_insert_on_edge_immediate (pe, new_stmt);
2813 gcc_assert (!new_bb);
2814 msq_init = TREE_OPERAND (new_stmt, 0);
2815 copy_virtual_operands (new_stmt, stmt);
2816 update_vuses_to_preheader (new_stmt, loop);
2818 /* 2. Create permutation mask, if required, in loop preheader. */
2819 if (targetm.vectorize.builtin_mask_for_load)
2822 tree params = build_tree_list (NULL_TREE, init_addr);
2824 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
2825 new_stmt = build_function_call_expr (builtin_decl, params);
2826 vec_dest = vect_create_destination_var (scalar_dest,
2827 TREE_TYPE (new_stmt));
2828 new_stmt = build2 (MODIFY_EXPR, void_type_node, vec_dest, new_stmt);
2829 new_temp = make_ssa_name (vec_dest, new_stmt);
2830 TREE_OPERAND (new_stmt, 0) = new_temp;
2831 new_bb = bsi_insert_on_edge_immediate (pe, new_stmt);
2832 gcc_assert (!new_bb);
2833 *realignment_token = TREE_OPERAND (new_stmt, 0);
2835 /* The result of the CALL_EXPR to this builtin is determined from
2836 the value of the parameter and no global variables are touched
2837 which makes the builtin a "const" function. Requiring the
2838 builtin to have the "const" attribute makes it unnecessary
2839 to call mark_call_clobbered. */
2840 gcc_assert (TREE_READONLY (builtin_decl))
2843 /* 3. Create msq = phi <msq_init, lsq> in loop */
2844 vec_dest = vect_create_destination_var (scalar_dest, vectype);
2845 msq = make_ssa_name (vec_dest, NULL_TREE);
2846 phi_stmt = create_phi_node (msq, loop->header);
2847 SSA_NAME_DEF_STMT (msq) = phi_stmt;
/* The latch-edge argument (lsq) is filled in later by the caller.  */
2848 add_phi_arg (phi_stmt, msq_init, loop_preheader_edge (loop));
2854 /* Function vect_strided_load_supported.
2856 Returns TRUE if EXTRACT_EVEN and EXTRACT_ODD operations are supported,
2857 and FALSE otherwise. */
2860 vect_strided_load_supported (tree vectype)
2862 optab perm_even_optab, perm_odd_optab;
2865 mode = (int) TYPE_MODE (vectype);
/* Check the even-extraction optab and its insn for this vector mode.  */
2867 perm_even_optab = optab_for_tree_code (VEC_EXTRACT_EVEN_EXPR, vectype);
2868 if (!perm_even_optab)
2870 if (vect_print_dump_info (REPORT_DETAILS))
2871 fprintf (vect_dump, "no optab for perm_even.");
2875 if (perm_even_optab->handlers[mode].insn_code == CODE_FOR_nothing)
2877 if (vect_print_dump_info (REPORT_DETAILS))
2878 fprintf (vect_dump, "perm_even op not supported by target.");
/* Likewise for the odd extraction.  */
2882 perm_odd_optab = optab_for_tree_code (VEC_EXTRACT_ODD_EXPR, vectype);
2883 if (!perm_odd_optab)
2885 if (vect_print_dump_info (REPORT_DETAILS))
2886 fprintf (vect_dump, "no optab for perm_odd.");
2890 if (perm_odd_optab->handlers[mode].insn_code == CODE_FOR_nothing)
2892 if (vect_print_dump_info (REPORT_DETAILS))
2893 fprintf (vect_dump, "perm_odd op not supported by target.");
2900 /* Function vect_permute_load_chain.
2902 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
2903 a power of 2, generate extract_even/odd stmts to reorder the input data
2904 correctly. Return the final references for loads in RESULT_CHAIN.
2906 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
2907 The input is 4 vectors each containing 8 elements. We assign a number to each
2908 element, the input sequence is:
2910 1st vec: 0 1 2 3 4 5 6 7
2911 2nd vec: 8 9 10 11 12 13 14 15
2912 3rd vec: 16 17 18 19 20 21 22 23
2913 4th vec: 24 25 26 27 28 29 30 31
2915 The output sequence should be:
2917 1st vec: 0 4 8 12 16 20 24 28
2918 2nd vec: 1 5 9 13 17 21 25 29
2919 3rd vec: 2 6 10 14 18 22 26 30
2920 4th vec: 3 7 11 15 19 23 27 31
2922 i.e., the first output vector should contain the first elements of each
2923 interleaving group, etc.
2925 We use extract_even/odd instructions to create such output. The input of each
2926 extract_even/odd operation is two vectors
2930 and the output is the vector of extracted even/odd elements. The output of
2931 extract_even will be: 0 2 4 6
2932 and of extract_odd: 1 3 5 7
2935 The permutation is done in log LENGTH stages. In each stage extract_even and
2936 extract_odd stmts are created for each pair of vectors in DR_CHAIN in their
2937 order. In our example,
2939 E1: extract_even (1st vec, 2nd vec)
2940 E2: extract_odd (1st vec, 2nd vec)
2941 E3: extract_even (3rd vec, 4th vec)
2942 E4: extract_odd (3rd vec, 4th vec)
2944 The output for the first stage will be:
2946 E1: 0 2 4 6 8 10 12 14
2947 E2: 1 3 5 7 9 11 13 15
2948 E3: 16 18 20 22 24 26 28 30
2949 E4: 17 19 21 23 25 27 29 31
2951 In order to proceed and create the correct sequence for the next stage (or
2952 for the correct output, if the second stage is the last one, as in our
2953 example), we first put the output of extract_even operation and then the
2954 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
2955 The input for the second stage is:
2957 1st vec (E1): 0 2 4 6 8 10 12 14
2958 2nd vec (E3): 16 18 20 22 24 26 28 30
2959 3rd vec (E2): 1 3 5 7 9 11 13 15
2960 4th vec (E4): 17 19 21 23 25 27 29 31
2962 The output of the second stage:
2964 E1: 0 4 8 12 16 20 24 28
2965 E2: 2 6 10 14 18 22 26 30
2966 E3: 1 5 9 13 17 21 25 29
2967 E4: 3 7 11 15 19 23 27 31
2969 And RESULT_CHAIN after reordering:
2971 1st vec (E1): 0 4 8 12 16 20 24 28
2972 2nd vec (E3): 1 5 9 13 17 21 25 29
2973 3rd vec (E2): 2 6 10 14 18 22 26 30
2974 4th vec (E4): 3 7 11 15 19 23 27 31. */
2977 vect_permute_load_chain (VEC(tree,heap) *dr_chain,
2978 unsigned int length,
2980 block_stmt_iterator *bsi,
2981 VEC(tree,heap) **result_chain)
2983 tree perm_dest, perm_stmt, data_ref, first_vect, second_vect;
2984 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
2988 /* Check that the operation is supported. */
2989 if (!vect_strided_load_supported (vectype))
2992 *result_chain = VEC_copy (tree, heap, dr_chain);
/* log2(LENGTH) stages; each stage processes adjacent pairs of vectors.
   Even results go to the first half of RESULT_CHAIN (index j/2), odd
   results to the second half (index j/2 + length/2).  */
2993 for (i = 0; i < exact_log2 (length); i++)
2995 for (j = 0; j < length; j +=2)
2997 first_vect = VEC_index (tree, dr_chain, j);
2998 second_vect = VEC_index (tree, dr_chain, j+1);
3000 /* data_ref = permute_even (first_data_ref, second_data_ref); */
3001 perm_dest = create_tmp_var (vectype, "vect_perm_even");
3002 add_referenced_var (perm_dest);
3004 perm_stmt = build2 (MODIFY_EXPR, void_type_node, perm_dest,
3005 build2 (VEC_EXTRACT_EVEN_EXPR, vectype,
3006 first_vect, second_vect));
3008 data_ref = make_ssa_name (perm_dest, perm_stmt);
3009 TREE_OPERAND (perm_stmt, 0) = data_ref;
3010 vect_finish_stmt_generation (stmt, perm_stmt, bsi);
3011 mark_new_vars_to_rename (perm_stmt);
3013 VEC_replace (tree, *result_chain, j/2, data_ref);
3015 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
3016 perm_dest = create_tmp_var (vectype, "vect_perm_odd");
3017 add_referenced_var (perm_dest);
3019 perm_stmt = build2 (MODIFY_EXPR, void_type_node, perm_dest,
3020 build2 (VEC_EXTRACT_ODD_EXPR, vectype,
3021 first_vect, second_vect));
3022 data_ref = make_ssa_name (perm_dest, perm_stmt);
3023 TREE_OPERAND (perm_stmt, 0) = data_ref;
3024 vect_finish_stmt_generation (stmt, perm_stmt, bsi);
3025 mark_new_vars_to_rename (perm_stmt);
3027 VEC_replace (tree, *result_chain, j/2+length/2, data_ref);
/* Feed this stage's output back in as the next stage's input.  */
3029 dr_chain = VEC_copy (tree, heap, *result_chain);
3035 /* Function vect_transform_strided_load.
3037 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
3038 to perform their permutation and ascribe the result vectorized statements to
3039 the scalar statements.
3043 vect_transform_strided_load (tree stmt, VEC(tree,heap) *dr_chain, int size,
3044 block_stmt_iterator *bsi)
3046 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3047 tree first_stmt = DR_GROUP_FIRST_DR (stmt_info);
3048 tree next_stmt, new_stmt;
3049 VEC(tree,heap) *result_chain = NULL;
3050 unsigned int i, gap_count;
3053 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
3054 RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
3055 vectors, that are ready for vector computation. */
3056 result_chain = VEC_alloc (tree, heap, size);
3058 if (!vect_permute_load_chain (dr_chain, size, stmt, bsi, &result_chain))
3061 /* Put a permuted data-ref in the VECTORIZED_STMT field.
3062 Since we scan the chain starting from its first node, their order
3063 corresponds to the order of data-refs in RESULT_CHAIN. */
3064 next_stmt = first_stmt;
3066 for (i = 0; VEC_iterate(tree, result_chain, i, tmp_data_ref); i++)
3071 /* Skip the gaps. Loads created for the gaps will be removed by dead
3072 code elimination pass later.
3073 DR_GROUP_GAP is the number of steps in elements from the previous
3074 access (if there is no gap DR_GROUP_GAP is 1). We skip loads that
3075 correspond to the gaps.
3077 if (gap_count < DR_GROUP_GAP (vinfo_for_stmt (next_stmt)))
/* The permuted vector def is the stmt that defines TMP_DATA_REF.  */
3085 new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
3086 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
3087 copies, and we put the new vector statement in the first available
3089 if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
3090 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
/* Otherwise walk the RELATED_STMT chain to its tail and append there.  */
3093 tree prev_stmt = STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
3094 tree rel_stmt = STMT_VINFO_RELATED_STMT (
3095 vinfo_for_stmt (prev_stmt));
3098 prev_stmt = rel_stmt;
3099 rel_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
3101 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) = new_stmt;
3103 next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt));
3105 /* If NEXT_STMT accesses the same DR as the previous statement,
3106 put the same TMP_DATA_REF as its vectorized statement; otherwise
3107 get the next data-ref from RESULT_CHAIN. */
3108 if (!next_stmt || !DR_GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
3116 /* vectorizable_load.
3118 Check if STMT reads a non scalar data-ref (array/pointer/structure) that
3120 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
3121 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
3122 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
/* NOTE(review): this listing is non-contiguous (the leading original line
   numbers skip), so several statements, braces and early returns of this
   function are not visible here.  All comments below describe only the
   visible lines; verify against the complete source file.  */
3125 vectorizable_load (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
3128 tree vec_dest = NULL;
3129 tree data_ref = NULL;
3131 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3132 stmt_vec_info prev_stmt_info;
3133 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3134 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3135 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info), *first_dr;
3136 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3139 tree new_stmt = NULL_TREE;
3141 enum dr_alignment_support alignment_support_cheme;
3142 tree dataref_ptr = NULL_TREE;
/* NCOPIES = how many vector stmts are needed per scalar stmt when the
   vectorization factor exceeds the number of elements in one vector.  */
3144 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
3145 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
3146 int i, j, group_size;
/* MSQ/LSQ: the "most significant"/"least significant" quadword-style pair
   used by the software-pipelined realignment scheme below.  */
3147 tree msq = NULL_TREE, lsq;
3148 tree offset = NULL_TREE;
3149 tree realignment_token = NULL_TREE;
3150 tree phi_stmt = NULL_TREE;
3151 VEC(tree,heap) *dr_chain = NULL;
3152 bool strided_load = false;
3155 /* Is vectorizable load? */
3156 if (!STMT_VINFO_RELEVANT_P (stmt_info))
3159 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_loop_def);
3161 if (STMT_VINFO_LIVE_P (stmt_info))
3163 /* FORNOW: not yet supported. */
3164 if (vect_print_dump_info (REPORT_DETAILS))
3165 fprintf (vect_dump, "value used after loop.");
3169 if (TREE_CODE (stmt) != MODIFY_EXPR)
3172 scalar_dest = TREE_OPERAND (stmt, 0);
3173 if (TREE_CODE (scalar_dest) != SSA_NAME)
3176 op = TREE_OPERAND (stmt, 1);
3177 if (TREE_CODE (op) != ARRAY_REF
3178 && TREE_CODE (op) != INDIRECT_REF
3179 && !DR_GROUP_FIRST_DR (stmt_info))
3182 if (!STMT_VINFO_DATA_REF (stmt_info))
3185 mode = (int) TYPE_MODE (vectype);
3187 /* FORNOW. In some cases can vectorize even if data-type not supported
3188 (e.g. - data copies). */
3189 if (mov_optab->handlers[mode].insn_code == CODE_FOR_nothing)
3191 if (vect_print_dump_info (REPORT_DETAILS))
3192 fprintf (vect_dump, "Aligned load, but unsupported type.");
3196 /* Check if the load is a part of an interleaving chain. */
3197 if (DR_GROUP_FIRST_DR (stmt_info))
3199 strided_load = true;
3201 /* Check if interleaving is supported. */
3202 if (!vect_strided_load_supported (vectype))
3206 if (!vec_stmt) /* transformation not required. */
3208 STMT_VINFO_TYPE (stmt_info) = load_vec_info_type;
/* From here on: the transformation phase (VEC_STMT was passed).  */
3214 if (vect_print_dump_info (REPORT_DETAILS))
3215 fprintf (vect_dump, "transform load.");
3219 first_stmt = DR_GROUP_FIRST_DR (stmt_info);
3220 /* Check if the chain of loads is already vectorized. */
3221 if (STMT_VINFO_VEC_STMT (vinfo_for_stmt (first_stmt)))
3223 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
3226 first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
3227 group_size = DR_GROUP_SIZE (vinfo_for_stmt (first_stmt));
3228 dr_chain = VEC_alloc (tree, heap, group_size);
3237 alignment_support_cheme = vect_supportable_dr_alignment (first_dr);
3238 gcc_assert (alignment_support_cheme);
3241 /* In case the vectorization factor (VF) is bigger than the number
3242 of elements that we can fit in a vectype (nunits), we have to generate
3243 more than one vector stmt - i.e - we need to "unroll" the
3244 vector stmt by a factor VF/nunits. In doing so, we record a pointer
3245 from one copy of the vector stmt to the next, in the field
3246 STMT_VINFO_RELATED_STMT. This is necessary in order to allow following
3247 stages to find the correct vector defs to be used when vectorizing
3248 stmts that use the defs of the current stmt. The example below illustrates
3249 the vectorization process when VF=16 and nunits=4 (i.e - we need to create
3250 4 vectorized stmts):
3252 before vectorization:
3253 RELATED_STMT VEC_STMT
3257 step 1: vectorize stmt S1:
3258 We first create the vector stmt VS1_0, and, as usual, record a
3259 pointer to it in the STMT_VINFO_VEC_STMT of the scalar stmt S1.
3260 Next, we create the vector stmt VS1_1, and record a pointer to
3261 it in the STMT_VINFO_RELATED_STMT of the vector stmt VS1_0.
3262 Similarly, for VS1_2 and VS1_3. This is the resulting chain of
3264 RELATED_STMT VEC_STMT
3265 VS1_0: vx0 = memref0 VS1_1 -
3266 VS1_1: vx1 = memref1 VS1_2 -
3267 VS1_2: vx2 = memref2 VS1_3 -
3268 VS1_3: vx3 = memref3 - -
3269 S1: x = load - VS1_0
3272 See in documentation in vect_get_vec_def_for_stmt_copy for how the
3273 information we recorded in RELATED_STMT field is used to vectorize
3276 /* In case of interleaving (non-unit strided access):
3283 Vectorized loads are created in the order of memory accesses
3284 starting from the access of the first stmt of the chain:
3287 VS2: vx1 = &base + vec_size*1
3288 VS3: vx3 = &base + vec_size*2
3289 VS4: vx4 = &base + vec_size*3
3291 Then permutation statements are generated:
3293 VS5: vx5 = VEC_EXTRACT_EVEN_EXPR < vx0, vx1 >
3294 VS6: vx6 = VEC_EXTRACT_ODD_EXPR < vx0, vx1 >
3297 And they are put in STMT_VINFO_VEC_STMT of the corresponding scalar stmts
3298 (the order of the data-refs in the output of vect_permute_load_chain
3299 corresponds to the order of scalar stmts in the interleaving chain - see
3300 the documentaion of vect_permute_load_chain()).
3301 The generation of permutation stmts and recording them in
3302 STMT_VINFO_VEC_STMT is done in vect_transform_strided_load().
3304 In case of both multiple types and interleaving, the vector loads and
3305 permutation stmts above are created for every copy. The result vector stmts
3306 are put in STMT_VINFO_VEC_STMT for the first copy and in the corresponding
3307 STMT_VINFO_RELATED_STMT for the next copies. */
3309 /* If the data reference is aligned (dr_aligned) or potentially unaligned
3310 on a target that supports unaligned accesses (dr_unaligned_supported)
3311 we generate the following code:
3315 p = p + indx * vectype_size;
3320 Otherwise, the data reference is potentially unaligned on a target that
3321 does not support unaligned accesses (dr_unaligned_software_pipeline) -
3322 then generate the following code, in which the data in each iteration is
3323 obtained by two vector loads, one from the previous iteration, and one
3324 from the current iteration:
3326 msq_init = *(floor(p1))
3327 p2 = initial_addr + VS - 1;
3328 realignment_token = call target_builtin;
3331 p2 = p2 + indx * vectype_size
3333 vec_dest = realign_load (msq, lsq, realignment_token)
/* Set up the realignment prologue (MSQ phi + realignment token) once,
   before the copy loop below.  */
3338 if (alignment_support_cheme == dr_unaligned_software_pipeline)
3340 msq = vect_setup_realignment (first_stmt, bsi, &realignment_token);
3341 phi_stmt = SSA_NAME_DEF_STMT (msq);
3342 offset = size_int (TYPE_VECTOR_SUBPARTS (vectype) - 1);
/* Outer loop: one iteration per copy (NCOPIES, multiple-types case).  */
3345 prev_stmt_info = NULL;
3346 for (j = 0; j < ncopies; j++)
3348 /* 1. Create the vector pointer update chain. */
3350 dataref_ptr = vect_create_data_ref_ptr (first_stmt, bsi, offset, &dummy,
3351 &ptr_incr, false, NULL_TREE);
3353 dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, bsi, stmt);
/* Inner loop: one vector load per member of the interleaving group.  */
3355 for (i = 0; i < group_size; i++)
3357 /* 2. Create the vector-load in the loop. */
3358 switch (alignment_support_cheme)
3361 gcc_assert (aligned_access_p (first_dr));
3362 data_ref = build_fold_indirect_ref (dataref_ptr);
3364 case dr_unaligned_supported:
3366 int mis = DR_MISALIGNMENT (first_dr);
/* MIS == -1 means the misalignment is unknown at compile time.  */
3367 tree tmis = (mis == -1 ? size_zero_node : size_int (mis));
3369 gcc_assert (!aligned_access_p (first_dr));
3370 tmis = size_binop (MULT_EXPR, tmis, size_int(BITS_PER_UNIT));
3372 build2 (MISALIGNED_INDIRECT_REF, vectype, dataref_ptr, tmis);
3375 case dr_unaligned_software_pipeline:
3376 gcc_assert (!aligned_access_p (first_dr));
3377 data_ref = build1 (ALIGN_INDIRECT_REF, vectype, dataref_ptr);
3382 vec_dest = vect_create_destination_var (scalar_dest, vectype);
3383 new_stmt = build2 (MODIFY_EXPR, void_type_node, vec_dest, data_ref);
3384 new_temp = make_ssa_name (vec_dest, new_stmt);
3385 TREE_OPERAND (new_stmt, 0) = new_temp;
3386 vect_finish_stmt_generation (stmt, new_stmt, bsi);
3387 copy_virtual_operands (new_stmt, stmt);
3388 mark_new_vars_to_rename (new_stmt);
3390 /* 3. Handle explicit realignment if necessary/supported. */
3391 if (alignment_support_cheme == dr_unaligned_software_pipeline)
3394 <vec_dest = realign_load (msq, lsq, realignment_token)> */
3395 lsq = TREE_OPERAND (new_stmt, 0);
/* Fall back to the data-ref pointer when the target gave no token.  */
3396 if (!realignment_token)
3397 realignment_token = dataref_ptr;
3398 vec_dest = vect_create_destination_var (scalar_dest, vectype);
3400 build3 (REALIGN_LOAD_EXPR, vectype, msq, lsq, realignment_token);
3401 new_stmt = build2 (MODIFY_EXPR, void_type_node, vec_dest, new_stmt);
3402 new_temp = make_ssa_name (vec_dest, new_stmt);
3403 TREE_OPERAND (new_stmt, 0) = new_temp;
3404 vect_finish_stmt_generation (stmt, new_stmt, bsi);
/* On the very last generated load, complete the MSQ phi with LSQ on
   the latch edge (closes the software pipeline).  */
3405 if (i == group_size - 1 && j == ncopies - 1)
3406 add_phi_arg (phi_stmt, lsq, loop_latch_edge (loop));
3410 VEC_quick_push (tree, dr_chain, new_temp);
3411 if (i < group_size - 1)
3412 dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, bsi, stmt);
/* Strided case: emit the permutation stmts that de-interleave DR_CHAIN
   (see vect_transform_strided_load / vect_permute_load_chain above).  */
3417 if (!vect_transform_strided_load (stmt, dr_chain, group_size, bsi))
3419 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
3420 dr_chain = VEC_alloc (tree, heap, group_size);
/* Record the copy chain: first copy in VEC_STMT, subsequent copies
   linked through STMT_VINFO_RELATED_STMT.  */
3425 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
3427 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3428 prev_stmt_info = vinfo_for_stmt (new_stmt);
3436 /* Function vectorizable_live_operation.
3438 STMT computes a value that is used outside the loop. Check if
3439 it can be supported. */
/* NOTE(review): lines are missing from this listing (non-contiguous
   numbering) -- early-return bodies and the final return are not visible.  */
3442 vectorizable_live_operation (tree stmt,
3443 block_stmt_iterator *bsi ATTRIBUTE_UNUSED,
3444 tree *vec_stmt ATTRIBUTE_UNUSED)
3447 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3448 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3450 enum tree_code code;
3454 enum vect_def_type dt;
/* Only statements marked "live" (value used after the loop) are handled.  */
3456 if (!STMT_VINFO_LIVE_P (stmt_info))
3459 if (TREE_CODE (stmt) != MODIFY_EXPR)
3462 if (TREE_CODE (TREE_OPERAND (stmt, 0)) != SSA_NAME)
3465 operation = TREE_OPERAND (stmt, 1);
3466 code = TREE_CODE (operation);
/* Number of operands of the RHS operation, per its tree code.  */
3468 op_type = TREE_CODE_LENGTH (code);
3470 /* FORNOW: support only if all uses are invariant. This means
3471 that the scalar operations can remain in place, unvectorized.
3472 The original last scalar value that they compute will be used. */
3474 for (i = 0; i < op_type; i++)
3476 op = TREE_OPERAND (operation, i);
3477 if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt))
3479 if (vect_print_dump_info (REPORT_DETAILS))
3480 fprintf (vect_dump, "use not simple.");
/* Any loop-variant operand defeats the "leave it scalar" strategy.  */
3484 if (dt != vect_invariant_def && dt != vect_constant_def)
3488 /* No transformation is required for the cases we currently support. */
3493 /* Function vect_is_simple_cond.
3496 LOOP - the loop that is being vectorized.
3497 COND - Condition that is checked for simple use.
3499 Returns whether a COND can be vectorized. Checks whether
3500 condition operands are supportable using vec_is_simple_use. */
/* NOTE(review): the listing drops lines here (return statements and braces
   between the visible checks are not shown).  */
3503 vect_is_simple_cond (tree cond, loop_vec_info loop_vinfo)
3507 enum vect_def_type dt;
/* COND must be a comparison expression (e.g. LT_EXPR, EQ_EXPR, ...).  */
3509 if (!COMPARISON_CLASS_P (cond))
3512 lhs = TREE_OPERAND (cond, 0);
3513 rhs = TREE_OPERAND (cond, 1);
/* Each side must be either a "simple use" SSA name or a literal constant.  */
3515 if (TREE_CODE (lhs) == SSA_NAME)
3517 tree lhs_def_stmt = SSA_NAME_DEF_STMT (lhs);
3518 if (!vect_is_simple_use (lhs, loop_vinfo, &lhs_def_stmt, &def, &dt))
3521 else if (TREE_CODE (lhs) != INTEGER_CST && TREE_CODE (lhs) != REAL_CST)
3524 if (TREE_CODE (rhs) == SSA_NAME)
3526 tree rhs_def_stmt = SSA_NAME_DEF_STMT (rhs);
3527 if (!vect_is_simple_use (rhs, loop_vinfo, &rhs_def_stmt, &def, &dt))
3530 else if (TREE_CODE (rhs) != INTEGER_CST && TREE_CODE (rhs) != REAL_CST)
3536 /* vectorizable_condition.
3538 Check if STMT is conditional modify expression that can be vectorized.
3539 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
3540 stmt using VEC_COND_EXPR to replace it, put it in VEC_STMT, and insert it
3543 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
/* NOTE(review): non-contiguous listing -- several lines (braces, early
   returns, the "if (!vec_stmt)" analysis/transform split) are missing.  */
3546 vectorizable_condition (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
3548 tree scalar_dest = NULL_TREE;
3549 tree vec_dest = NULL_TREE;
3550 tree op = NULL_TREE;
3551 tree cond_expr, then_clause, else_clause;
3552 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3553 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3554 tree vec_cond_lhs, vec_cond_rhs, vec_then_clause, vec_else_clause;
3555 tree vec_compare, vec_cond_expr;
3557 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3558 enum machine_mode vec_mode;
3560 enum vect_def_type dt;
3561 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
3562 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
3564 gcc_assert (ncopies >= 1);
/* Multiple-types (ncopies > 1) not supported yet for conditions.  */
3566 return false; /* FORNOW */
3568 if (!STMT_VINFO_RELEVANT_P (stmt_info))
3571 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_loop_def);
3573 if (STMT_VINFO_LIVE_P (stmt_info))
3575 /* FORNOW: not yet supported. */
3576 if (vect_print_dump_info (REPORT_DETAILS))
3577 fprintf (vect_dump, "value used after loop.");
3581 if (TREE_CODE (stmt) != MODIFY_EXPR)
3584 op = TREE_OPERAND (stmt, 1);
3586 if (TREE_CODE (op) != COND_EXPR)
3589 cond_expr = TREE_OPERAND (op, 0);
3590 then_clause = TREE_OPERAND (op, 1);
3591 else_clause = TREE_OPERAND (op, 2);
3593 if (!vect_is_simple_cond (cond_expr, loop_vinfo))
3596 /* We do not handle two different vector types for the condition
3598 if (TREE_TYPE (TREE_OPERAND (cond_expr, 0)) != TREE_TYPE (vectype))
/* THEN/ELSE clauses must be simple-use SSA names or literal constants,
   same requirement as the condition operands above.  */
3601 if (TREE_CODE (then_clause) == SSA_NAME)
3603 tree then_def_stmt = SSA_NAME_DEF_STMT (then_clause);
3604 if (!vect_is_simple_use (then_clause, loop_vinfo,
3605 &then_def_stmt, &def, &dt))
3608 else if (TREE_CODE (then_clause) != INTEGER_CST
3609 && TREE_CODE (then_clause) != REAL_CST)
3612 if (TREE_CODE (else_clause) == SSA_NAME)
3614 tree else_def_stmt = SSA_NAME_DEF_STMT (else_clause);
3615 if (!vect_is_simple_use (else_clause, loop_vinfo,
3616 &else_def_stmt, &def, &dt))
3619 else if (TREE_CODE (else_clause) != INTEGER_CST
3620 && TREE_CODE (else_clause) != REAL_CST)
3624 vec_mode = TYPE_MODE (vectype);
/* Analysis phase: record stmt type and ask the target whether it can
   expand a VEC_COND_EXPR in this mode.  */
3628 STMT_VINFO_TYPE (stmt_info) = condition_vec_info_type;
3629 return expand_vec_cond_expr_p (op, vec_mode);
/* Transformation phase.  */
3635 scalar_dest = TREE_OPERAND (stmt, 0);
3636 vec_dest = vect_create_destination_var (scalar_dest, vectype);
3638 /* Handle cond expr. */
3640 vect_get_vec_def_for_operand (TREE_OPERAND (cond_expr, 0), stmt, NULL);
3642 vect_get_vec_def_for_operand (TREE_OPERAND (cond_expr, 1), stmt, NULL);
3643 vec_then_clause = vect_get_vec_def_for_operand (then_clause, stmt, NULL);
3644 vec_else_clause = vect_get_vec_def_for_operand (else_clause, stmt, NULL);
3646 /* Arguments are ready. create the new vector stmt. */
3647 vec_compare = build2 (TREE_CODE (cond_expr), vectype,
3648 vec_cond_lhs, vec_cond_rhs);
3649 vec_cond_expr = build3 (VEC_COND_EXPR, vectype,
3650 vec_compare, vec_then_clause, vec_else_clause);
3652 *vec_stmt = build2 (MODIFY_EXPR, void_type_node, vec_dest, vec_cond_expr);
3653 new_temp = make_ssa_name (vec_dest, *vec_stmt);
3654 TREE_OPERAND (*vec_stmt, 0) = new_temp;
3655 vect_finish_stmt_generation (stmt, *vec_stmt, bsi);
3660 /* Function vect_transform_stmt.
3662 Create a vectorized stmt to replace STMT, and insert it at BSI. */
/* Dispatches on STMT_VINFO_TYPE to the matching vectorizable_* routine
   (first for "relevant" stmts, then for "live" stmts), records the result
   in STMT_VINFO_VEC_STMT, and sets *STRIDED_STORE for interleaved stores.
   NOTE(review): the listing drops lines (break statements, braces, the
   final return) -- verify against the full file.  */
3665 vect_transform_stmt (tree stmt, block_stmt_iterator *bsi, bool *strided_store)
3667 bool is_store = false;
3668 tree vec_stmt = NULL_TREE;
3669 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3670 tree orig_stmt_in_pattern;
3673 if (STMT_VINFO_RELEVANT_P (stmt_info))
3675 switch (STMT_VINFO_TYPE (stmt_info))
3677 case type_demotion_vec_info_type:
3678 done = vectorizable_type_demotion (stmt, bsi, &vec_stmt);
3682 case type_promotion_vec_info_type:
3683 done = vectorizable_type_promotion (stmt, bsi, &vec_stmt);
3687 case op_vec_info_type:
3688 done = vectorizable_operation (stmt, bsi, &vec_stmt);
3692 case assignment_vec_info_type:
3693 done = vectorizable_assignment (stmt, bsi, &vec_stmt);
3697 case load_vec_info_type:
3698 done = vectorizable_load (stmt, bsi, &vec_stmt);
3702 case store_vec_info_type:
3703 done = vectorizable_store (stmt, bsi, &vec_stmt);
3705 if (DR_GROUP_FIRST_DR (stmt_info))
3707 /* In case of interleaving, the whole chain is vectorized when the
3708 last store in the chain is reached. Store stmts before the last
3709 one are skipped, and there vec_stmt_info shoudn't be freed
3711 *strided_store = true;
3712 if (STMT_VINFO_VEC_STMT (stmt_info))
3719 case condition_vec_info_type:
3720 done = vectorizable_condition (stmt, bsi, &vec_stmt);
3725 if (vect_print_dump_info (REPORT_DETAILS))
3726 fprintf (vect_dump, "stmt not supported.");
/* A vector stmt must have been produced, except for the skipped
   members of an interleaved store chain.  */
3730 gcc_assert (vec_stmt || *strided_store);
3733 STMT_VINFO_VEC_STMT (stmt_info) = vec_stmt;
3734 orig_stmt_in_pattern = STMT_VINFO_RELATED_STMT (stmt_info);
3735 if (orig_stmt_in_pattern)
3737 stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt_in_pattern);
3738 if (STMT_VINFO_IN_PATTERN_P (stmt_vinfo))
3740 gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
3742 /* STMT was inserted by the vectorizer to replace a
3743 computation idiom. ORIG_STMT_IN_PATTERN is a stmt in the
3744 original sequence that computed this idiom. We need to
3745 record a pointer to VEC_STMT in the stmt_info of
3746 ORIG_STMT_IN_PATTERN. See more details in the
3747 documentation of vect_pattern_recog. */
3749 STMT_VINFO_VEC_STMT (stmt_vinfo) = vec_stmt;
/* Second dispatch: stmts whose value is live after the loop.  */
3755 if (STMT_VINFO_LIVE_P (stmt_info))
3757 switch (STMT_VINFO_TYPE (stmt_info))
3759 case reduc_vec_info_type:
3760 done = vectorizable_reduction (stmt, bsi, &vec_stmt);
3765 done = vectorizable_live_operation (stmt, bsi, &vec_stmt);
3774 /* This function builds ni_name = number of iterations loop executes
3775 on the loop preheader. */
/* Returns an SSA name (or gimple value) holding the iteration count; any
   computation stmts needed are inserted on the preheader edge.
   NOTE(review): the listing drops lines (function header, braces, return).  */
3778 vect_build_loop_niters (loop_vec_info loop_vinfo)
3780 tree ni_name, stmt, var;
3782 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3783 tree ni = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo));
3785 var = create_tmp_var (TREE_TYPE (ni), "niters");
3786 add_referenced_var (var);
/* Gimplify NI; STMT receives the statements needed to compute it.  */
3787 ni_name = force_gimple_operand (ni, &stmt, false, var);
3789 pe = loop_preheader_edge (loop);
/* Inserting on the preheader edge must not split it into a new block.  */
3792 basic_block new_bb = bsi_insert_on_edge_immediate (pe, stmt);
3793 gcc_assert (!new_bb);
3800 /* This function generates the following statements:
3802 ni_name = number of iterations loop executes
3803 ratio = ni_name / vf
3804 ratio_mult_vf_name = ratio * vf
3806 and places them at the loop preheader edge. */
/* VF is a power of two here, so the division/multiplication are emitted
   as right/left shifts by log2(vf).  Results are returned through the
   three out-parameters.  */
3809 vect_generate_tmps_on_preheader (loop_vec_info loop_vinfo,
3811 tree *ratio_mult_vf_name_ptr,
3812 tree *ratio_name_ptr)
3820 tree ratio_mult_vf_name;
3821 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3822 tree ni = LOOP_VINFO_NITERS (loop_vinfo);
3823 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3826 pe = loop_preheader_edge (loop);
3828 /* Generate temporary variable that contains
3829 number of iterations loop executes. */
3831 ni_name = vect_build_loop_niters (loop_vinfo);
3832 log_vf = build_int_cst (TREE_TYPE (ni), exact_log2 (vf));
3834 /* Create: ratio = ni >> log2(vf) */
3836 ratio_name = fold_build2 (RSHIFT_EXPR, TREE_TYPE (ni_name), ni_name, log_vf);
/* If folding did not yield a gimple value, emit the computation on the
   preheader edge.  */
3837 if (!is_gimple_val (ratio_name))
3839 var = create_tmp_var (TREE_TYPE (ni), "bnd");
3840 add_referenced_var (var);
3842 ratio_name = force_gimple_operand (ratio_name, &stmt, true, var);
3843 pe = loop_preheader_edge (loop);
3844 new_bb = bsi_insert_on_edge_immediate (pe, stmt);
3845 gcc_assert (!new_bb);
3848 /* Create: ratio_mult_vf = ratio << log2 (vf). */
3850 ratio_mult_vf_name = fold_build2 (LSHIFT_EXPR, TREE_TYPE (ratio_name),
3851 ratio_name, log_vf);
3852 if (!is_gimple_val (ratio_mult_vf_name))
3854 var = create_tmp_var (TREE_TYPE (ni), "ratio_mult_vf");
3855 add_referenced_var (var);
3857 ratio_mult_vf_name = force_gimple_operand (ratio_mult_vf_name, &stmt,
3859 pe = loop_preheader_edge (loop);
3860 new_bb = bsi_insert_on_edge_immediate (pe, stmt);
3861 gcc_assert (!new_bb);
/* Return all three computed names to the caller.  */
3864 *ni_name_ptr = ni_name;
3865 *ratio_mult_vf_name_ptr = ratio_mult_vf_name;
3866 *ratio_name_ptr = ratio_name;
3872 /* Function update_vuses_to_preheader.
3875 STMT - a statement with potential VUSEs.
3876 LOOP - the loop whose preheader will contain STMT.
3878 It's possible to vectorize a loop even though an SSA_NAME from a VUSE
3879 appears to be defined in a V_MAY_DEF in another statement in a loop.
3880 One such case is when the VUSE is at the dereference of a __restricted__
3881 pointer in a load and the V_MAY_DEF is at the dereference of a different
3882 __restricted__ pointer in a store. Vectorization may result in
3883 copy_virtual_uses being called to copy the problematic VUSE to a new
3884 statement that is being inserted in the loop preheader. This procedure
3885 is called to change the SSA_NAME in the new statement's VUSE from the
3886 SSA_NAME updated in the loop to the related SSA_NAME available on the
3887 path entering the loop.
3889 When this function is called, we have the following situation:
3894 # name1 = phi < name0 , name2>
3899 # name2 = vdef <name1>
3904 Stmt S1 was created in the loop preheader block as part of misaligned-load
3905 handling. This function fixes the name of the vuse of S1 from 'name1' to
3909 update_vuses_to_preheader (tree stmt, struct loop *loop)
3911 basic_block header_bb = loop->header;
3912 edge preheader_e = loop_preheader_edge (loop);
3914 use_operand_p use_p;
/* Walk every virtual use operand of STMT.  */
3916 FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_VUSE)
3918 tree ssa_name = USE_FROM_PTR (use_p);
3919 tree def_stmt = SSA_NAME_DEF_STMT (ssa_name);
3920 tree name_var = SSA_NAME_VAR (ssa_name);
3921 basic_block bb = bb_for_stmt (def_stmt);
3923 /* For a use before any definitions, def_stmt is a NOP_EXPR. */
3924 if (!IS_EMPTY_STMT (def_stmt)
3925 && flow_bb_inside_loop_p (loop, bb))
3927 /* If the block containing the statement defining the SSA_NAME
3928 is in the loop then it's necessary to find the definition
3929 outside the loop using the PHI nodes of the header. */
3931 bool updated = false;
3933 for (phi = phi_nodes (header_bb); phi; phi = TREE_CHAIN (phi))
/* Match the header phi for the same underlying variable; the phi arg
   on the preheader edge is the value live on loop entry.  */
3935 if (SSA_NAME_VAR (PHI_RESULT (phi)) == name_var)
3937 SET_USE (use_p, PHI_ARG_DEF (phi, preheader_e->dest_idx));
/* Every in-loop virtual def must have a matching header phi.  */
3942 gcc_assert (updated);
3948 /* Function vect_update_ivs_after_vectorizer.
3950 "Advance" the induction variables of LOOP to the value they should take
3951 after the execution of LOOP. This is currently necessary because the
3952 vectorizer does not handle induction variables that are used after the
3953 loop. Such a situation occurs when the last iterations of LOOP are
3955 1. We introduced new uses after LOOP for IVs that were not originally used
3956 after LOOP: the IVs of LOOP are now used by an epilog loop.
3957 2. LOOP is going to be vectorized; this means that it will iterate N/VF
3958 times, whereas the loop IVs should be bumped N times.
3961 - LOOP - a loop that is going to be vectorized. The last few iterations
3962 of LOOP were peeled.
3963 - NITERS - the number of iterations that LOOP executes (before it is
3964 vectorized). i.e, the number of times the ivs should be bumped.
3965 - UPDATE_E - a successor edge of LOOP->exit that is on the (only) path
3966 coming out from LOOP on which there are uses of the LOOP ivs
3967 (this is the path from LOOP->exit to epilog_loop->preheader).
3969 The new definitions of the ivs are placed in LOOP->exit.
3970 The phi args associated with the edge UPDATE_E in the bb
3971 UPDATE_E->dest are updated accordingly.
3973 Assumption 1: Like the rest of the vectorizer, this function assumes
3974 a single loop exit that has a single predecessor.
3976 Assumption 2: The phi nodes in the LOOP header and in update_bb are
3977 organized in the same order.
3979 Assumption 3: The access function of the ivs is simple enough (see
3980 vect_can_advance_ivs_p). This assumption will be relaxed in the future.
3982 Assumption 4: Exactly one of the successors of LOOP exit-bb is on a path
3983 coming out of LOOP on which the ivs of LOOP are used (this is the path
3984 that leads to the epilog loop; other paths skip the epilog loop). This
3985 path starts with the edge UPDATE_E, and its destination (denoted update_bb)
3986 needs to have its phis updated.
3990 vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo, tree niters,
3993 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3994 basic_block exit_bb = single_exit (loop)->dest;
3996 basic_block update_bb = update_e->dest;
3998 /* gcc_assert (vect_can_advance_ivs_p (loop_vinfo)); */
4000 /* Make sure there exists a single-predecessor exit bb: */
4001 gcc_assert (single_pred_p (exit_bb));
/* Walk header phis and update_bb phis in lockstep (Assumption 2).  */
4003 for (phi = phi_nodes (loop->header), phi1 = phi_nodes (update_bb);
4005 phi = PHI_CHAIN (phi), phi1 = PHI_CHAIN (phi1))
4007 tree access_fn = NULL;
4008 tree evolution_part;
4011 tree var, stmt, ni, ni_name;
4012 block_stmt_iterator last_bsi;
4014 if (vect_print_dump_info (REPORT_DETAILS))
4016 fprintf (vect_dump, "vect_update_ivs_after_vectorizer: phi: ");
4017 print_generic_expr (vect_dump, phi, TDF_SLIM);
4020 /* Skip virtual phi's. */
4021 if (!is_gimple_reg (SSA_NAME_VAR (PHI_RESULT (phi))))
4023 if (vect_print_dump_info (REPORT_DETAILS))
4024 fprintf (vect_dump, "virtual phi. skip.");
4028 /* Skip reduction phis. */
4029 if (STMT_VINFO_DEF_TYPE (vinfo_for_stmt (phi)) == vect_reduction_def)
4031 if (vect_print_dump_info (REPORT_DETAILS))
4032 fprintf (vect_dump, "reduc phi. skip.");
/* Get the IV's scalar evolution and split it into initial condition
   and per-iteration step.  */
4036 access_fn = analyze_scalar_evolution (loop, PHI_RESULT (phi));
4037 gcc_assert (access_fn);
4039 unshare_expr (evolution_part_in_loop_num (access_fn, loop->num));
4040 gcc_assert (evolution_part != NULL_TREE);
4042 /* FORNOW: We do not support IVs whose evolution function is a polynomial
4043 of degree >= 2 or exponential. */
4044 gcc_assert (!tree_is_chrec (evolution_part));
4046 step_expr = evolution_part;
4047 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn,
/* ni = init + niters * step  -- the IV value after NITERS iterations.  */
4050 ni = fold_build2 (PLUS_EXPR, TREE_TYPE (init_expr),
4051 fold_build2 (MULT_EXPR, TREE_TYPE (init_expr),
4052 fold_convert (TREE_TYPE (init_expr),
4057 var = create_tmp_var (TREE_TYPE (init_expr), "tmp");
4058 add_referenced_var (var);
4060 ni_name = force_gimple_operand (ni, &stmt, false, var);
4062 /* Insert stmt into exit_bb. */
4063 last_bsi = bsi_last (exit_bb);
4065 bsi_insert_before (&last_bsi, stmt, BSI_SAME_STMT);
4067 /* Fix phi expressions in the successor bb. */
4068 SET_PHI_ARG_DEF (phi1, update_e->dest_idx, ni_name);
4073 /* Function vect_do_peeling_for_loop_bound
4075 Peel the last iterations of the loop represented by LOOP_VINFO.
4076 The peeled iterations form a new epilog loop. Given that the loop now
4077 iterates NITERS times, the new epilog loop iterates
4078 NITERS % VECTORIZATION_FACTOR times.
4080 The original loop will later be made to iterate
4081 NITERS / VECTORIZATION_FACTOR times (this value is placed into RATIO). */
/* NOTE(review): non-contiguous listing -- some declarations/braces between
   the visible lines are not shown.  */
4084 vect_do_peeling_for_loop_bound (loop_vec_info loop_vinfo, tree *ratio)
4086 tree ni_name, ratio_mult_vf_name;
4087 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4088 struct loop *new_loop;
4090 basic_block preheader;
4093 if (vect_print_dump_info (REPORT_DETAILS))
4094 fprintf (vect_dump, "=== vect_do_peeling_for_loop_bound ===");
4096 initialize_original_copy_tables ();
4098 /* Generate the following variables on the preheader of original loop:
4100 ni_name = number of iteration the original loop executes
4101 ratio = ni_name / vf
4102 ratio_mult_vf_name = ratio * vf */
4103 vect_generate_tmps_on_preheader (loop_vinfo, &ni_name,
4104 &ratio_mult_vf_name, ratio);
/* Peeling must not change the loop's index in the loop tree.  */
4106 loop_num = loop->num;
4107 new_loop = slpeel_tree_peel_loop_to_edge (loop, single_exit (loop),
4108 ratio_mult_vf_name, ni_name, false);
4109 gcc_assert (new_loop);
4110 gcc_assert (loop_num == loop->num);
4111 #ifdef ENABLE_CHECKING
4112 slpeel_verify_cfg_after_peeling (loop, new_loop);
4115 /* A guard that controls whether the new_loop is to be executed or skipped
4116 is placed in LOOP->exit. LOOP->exit therefore has two successors - one
4117 is the preheader of NEW_LOOP, where the IVs from LOOP are used. The other
4118 is a bb after NEW_LOOP, where these IVs are not used. Find the edge that
4119 is on the path where the LOOP IVs are used and need to be updated. */
4121 preheader = loop_preheader_edge (new_loop)->src;
4122 if (EDGE_PRED (preheader, 0)->src == single_exit (loop)->dest)
4123 update_e = EDGE_PRED (preheader, 0);
4125 update_e = EDGE_PRED (preheader, 1);
4127 /* Update IVs of original loop as if they were advanced
4128 by ratio_mult_vf_name steps. */
4129 vect_update_ivs_after_vectorizer (loop_vinfo, ratio_mult_vf_name, update_e);
4131 /* After peeling we have to reset scalar evolution analyzer. */
4134 free_original_copy_tables ();
4138 /* Function vect_gen_niters_for_prolog_loop
4140 Set the number of iterations for the loop represented by LOOP_VINFO
4141 to the minimum between LOOP_NITERS (the original iteration count of the loop)
4142 and the misalignment of DR - the data reference recorded in
4143 LOOP_VINFO_UNALIGNED_DR (LOOP_VINFO). As a result, after the execution of
4144 this loop, the data reference DR will refer to an aligned location.
4146 The following computation is generated:
4148 If the misalignment of DR is known at compile time:
4149 addr_mis = int mis = DR_MISALIGNMENT (dr);
4150 Else, compute address misalignment in bytes:
4151 addr_mis = addr & (vectype_size - 1)
4153 prolog_niters = min ( LOOP_NITERS , (VF - addr_mis/elem_size)&(VF-1) )
4155 (elem_size = element type size; an element is the scalar element
4156 whose type is the inner type of the vectype)
4160 prolog_niters = min ( LOOP_NITERS ,
4161 (VF/group_size - addr_mis/elem_size)&(VF/group_size-1) )
4162 where group_size is the size of the interleaved group.
/* NOTE(review): non-contiguous listing -- some declarations and assignment
   left-hand sides between the visible lines are not shown.  */
4166 vect_gen_niters_for_prolog_loop (loop_vec_info loop_vinfo, tree loop_niters)
4168 struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
4169 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
4170 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4172 tree iters, iters_name;
4175 tree dr_stmt = DR_STMT (dr);
4176 stmt_vec_info stmt_info = vinfo_for_stmt (dr_stmt);
4177 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4178 int vectype_align = TYPE_ALIGN (vectype) / BITS_PER_UNIT;
4179 tree niters_type = TREE_TYPE (loop_niters);
4181 int element_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
4183 if (DR_GROUP_FIRST_DR (stmt_info))
4185 /* For interleaved access element size must be multipled by the size of
4186 the interleaved group. */
4187 group_size = DR_GROUP_SIZE (vinfo_for_stmt (
4188 DR_GROUP_FIRST_DR (stmt_info)));
4189 element_size *= group_size;
4192 pe = loop_preheader_edge (loop);
/* Compile-time-known misalignment: compute the prolog count as a constant.  */
4194 if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
4196 int byte_misalign = LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo);
4197 int elem_misalign = byte_misalign / element_size;
4199 if (vect_print_dump_info (REPORT_DETAILS))
4200 fprintf (vect_dump, "known alignment = %d.", byte_misalign);
4201 iters = build_int_cst (niters_type,
4202 (vf - elem_misalign)&(vf/group_size-1));
/* Otherwise: compute the misalignment at runtime from the base address.  */
4206 tree new_stmts = NULL_TREE;
4208 vect_create_addr_base_for_vector_ref (dr_stmt, &new_stmts, NULL_TREE);
4209 tree ptr_type = TREE_TYPE (start_addr);
4210 tree size = TYPE_SIZE (ptr_type);
4211 tree type = lang_hooks.types.type_for_size (tree_low_cst (size, 1), 1);
4212 tree vectype_size_minus_1 = build_int_cst (type, vectype_align - 1);
4213 tree elem_size_log =
4214 build_int_cst (type, exact_log2 (vectype_align/vf));
4215 tree vf_minus_1 = build_int_cst (type, vf - 1);
4216 tree vf_tree = build_int_cst (type, vf);
4220 new_bb = bsi_insert_on_edge_immediate (pe, new_stmts);
4221 gcc_assert (!new_bb);
4223 /* Create: byte_misalign = addr & (vectype_size - 1) */
4225 fold_build2 (BIT_AND_EXPR, type, start_addr, vectype_size_minus_1);
4227 /* Create: elem_misalign = byte_misalign / element_size */
4229 fold_build2 (RSHIFT_EXPR, type, byte_misalign, elem_size_log);
4231 /* Create: (niters_type) (VF - elem_misalign)&(VF - 1) */
4232 iters = fold_build2 (MINUS_EXPR, type, vf_tree, elem_misalign);
4233 iters = fold_build2 (BIT_AND_EXPR, type, iters, vf_minus_1);
4234 iters = fold_convert (niters_type, iters);
4237 /* Create: prolog_loop_niters = min (iters, loop_niters) */
4238 /* If the loop bound is known at compile time we already verified that it is
4239 greater than vf; since the misalignment ('iters') is at most vf, there's
4240 no need to generate the MIN_EXPR in this case. */
4241 if (TREE_CODE (loop_niters) != INTEGER_CST)
4242 iters = fold_build2 (MIN_EXPR, niters_type, iters, loop_niters);
4244 if (vect_print_dump_info (REPORT_DETAILS))
4246 fprintf (vect_dump, "niters for prolog loop: ");
4247 print_generic_expr (vect_dump, iters, TDF_SLIM);
4250 var = create_tmp_var (niters_type, "prolog_loop_niters");
4251 add_referenced_var (var);
4252 iters_name = force_gimple_operand (iters, &stmt, false, var);
4254 /* Insert stmt on loop preheader edge. */
4257 basic_block new_bb = bsi_insert_on_edge_immediate (pe, stmt);
4258 gcc_assert (!new_bb);
4265 /* Function vect_update_init_of_dr
4267 NITERS iterations were peeled from LOOP. DR represents a data reference
4268 in LOOP. This function updates the information recorded in DR to
4269 account for the fact that the first NITERS iterations had already been
4270 executed. Specifically, it updates the OFFSET field of DR. */
4273 vect_update_init_of_dr (struct data_reference *dr, tree niters)
4275 tree offset = DR_OFFSET (dr);
/* Advance the offset by NITERS * STEP bytes (the distance covered by the
   peeled iterations).  */
4277 niters = fold_build2 (MULT_EXPR, TREE_TYPE (niters), niters, DR_STEP (dr));
4278 offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset), offset, niters);
4279 DR_OFFSET (dr) = offset;
4283 /* Function vect_update_inits_of_drs
4285 NITERS iterations were peeled from the loop represented by LOOP_VINFO.
4286 This function updates the information recorded for the data references in
4287 the loop to account for the fact that the first NITERS iterations had
4288 already been executed. Specifically, it updates the initial_condition of the
4289 access_function of all the data_references in the loop. */
4292 vect_update_inits_of_drs (loop_vec_info loop_vinfo, tree niters)
4295 VEC (data_reference_p, heap) *datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
4296 struct data_reference *dr;
4298 if (vect_dump && (dump_flags & TDF_DETAILS))
4299 fprintf (vect_dump, "=== vect_update_inits_of_dr ===");
4301 for (i = 0; VEC_iterate (data_reference_p, datarefs, i, dr); i++)
4302 vect_update_init_of_dr (dr, niters);
4306 /* Function vect_do_peeling_for_alignment
4308 Peel the first 'niters' iterations of the loop represented by LOOP_VINFO.
4309 'niters' is set to the misalignment of one of the data references in the
4310 loop, thereby forcing it to refer to an aligned location at the beginning
4311 of the execution of this loop. The data reference for which we are
4312 peeling is recorded in LOOP_VINFO_UNALIGNED_DR. */
4315 vect_do_peeling_for_alignment (loop_vec_info loop_vinfo)
4317 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4318 tree niters_of_prolog_loop, ni_name;
4320 struct loop *new_loop;
4322 if (vect_print_dump_info (REPORT_DETAILS))
4323 fprintf (vect_dump, "=== vect_do_peeling_for_alignment ===");
4325 initialize_original_copy_tables ();
4327 ni_name = vect_build_loop_niters (loop_vinfo);
4328 niters_of_prolog_loop = vect_gen_niters_for_prolog_loop (loop_vinfo, ni_name);
4330 /* Peel the prolog loop and iterate it niters_of_prolog_loop. */
4332 slpeel_tree_peel_loop_to_edge (loop, loop_preheader_edge (loop),
4333 niters_of_prolog_loop, ni_name, true);
4334 gcc_assert (new_loop);
4335 #ifdef ENABLE_CHECKING
4336 slpeel_verify_cfg_after_peeling (new_loop, loop);
4339 /* Update number of times loop executes. */
4340 n_iters = LOOP_VINFO_NITERS (loop_vinfo);
4341 LOOP_VINFO_NITERS (loop_vinfo) = fold_build2 (MINUS_EXPR,
4342 TREE_TYPE (n_iters), n_iters, niters_of_prolog_loop);
4344 /* Update the init conditions of the access functions of all data refs. */
4345 vect_update_inits_of_drs (loop_vinfo, niters_of_prolog_loop);
4347 /* After peeling we have to reset scalar evolution analyzer. */
4350 free_original_copy_tables ();
4354 /* Function vect_create_cond_for_align_checks.
4356 Create a conditional expression that represents the alignment checks for
4357 all of data references (array element references) whose alignment must be
4361 LOOP_VINFO - two fields of the loop information are used.
4362 LOOP_VINFO_PTR_MASK is the mask used to check the alignment.
4363 LOOP_VINFO_MAY_MISALIGN_STMTS contains the refs to be checked.
4366 COND_EXPR_STMT_LIST - statements needed to construct the conditional
4368 The returned value is the conditional expression to be used in the if
4369 statement that controls which version of the loop gets executed at runtime.
4371 The algorithm makes two assumptions:
4372 1) The number of bytes "n" in a vector is a power of 2.
4373 2) An address "a" is aligned if a%n is zero and that this
4374 test can be done as a&(n-1) == 0. For example, for 16
4375 byte vectors the test is a&0xf == 0. */
4378 vect_create_cond_for_align_checks (loop_vec_info loop_vinfo,
4379 tree *cond_expr_stmt_list)
4381 VEC(tree,heap) *may_misalign_stmts
4382 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
4384 int mask = LOOP_VINFO_PTR_MASK (loop_vinfo);
4388 tree int_ptrsize_type;
4390 tree or_tmp_name = NULL_TREE;
4391 tree and_tmp, and_tmp_name, and_stmt;
4394 /* Check that mask is one less than a power of 2, i.e., mask is
4395 all zeros followed by all ones. */
4396 gcc_assert ((mask != 0) && ((mask & (mask+1)) == 0));
4398 /* CHECKME: what is the best integer or unsigned type to use to hold a
4399 cast from a pointer value? */
4400 psize = TYPE_SIZE (ptr_type_node);
4402 = lang_hooks.types.type_for_size (tree_low_cst (psize, 1), 0);
4404 /* Create expression (mask & (dr_1 || ... || dr_n)) where dr_i is the address
4405 of the first vector of the i'th data reference. */
4407 for (i = 0; VEC_iterate (tree, may_misalign_stmts, i, ref_stmt); i++)
4409 tree new_stmt_list = NULL_TREE;
4411 tree addr_tmp, addr_tmp_name, addr_stmt;
4412 tree or_tmp, new_or_tmp_name, or_stmt;
4414 /* create: addr_tmp = (int)(address_of_first_vector) */
4415 addr_base = vect_create_addr_base_for_vector_ref (ref_stmt,
4419 if (new_stmt_list != NULL_TREE)
4420 append_to_statement_list_force (new_stmt_list, cond_expr_stmt_list);
4422 sprintf (tmp_name, "%s%d", "addr2int", i);
4423 addr_tmp = create_tmp_var (int_ptrsize_type, tmp_name);
4424 add_referenced_var (addr_tmp);
4425 addr_tmp_name = make_ssa_name (addr_tmp, NULL_TREE);
4426 addr_stmt = fold_convert (int_ptrsize_type, addr_base);
4427 addr_stmt = build2 (MODIFY_EXPR, void_type_node,
4428 addr_tmp_name, addr_stmt);
4429 SSA_NAME_DEF_STMT (addr_tmp_name) = addr_stmt;
4430 append_to_statement_list_force (addr_stmt, cond_expr_stmt_list);
4432 /* The addresses are OR together. */
4434 if (or_tmp_name != NULL_TREE)
4436 /* create: or_tmp = or_tmp | addr_tmp */
4437 sprintf (tmp_name, "%s%d", "orptrs", i);
4438 or_tmp = create_tmp_var (int_ptrsize_type, tmp_name);
4439 add_referenced_var (or_tmp);
4440 new_or_tmp_name = make_ssa_name (or_tmp, NULL_TREE);
4441 or_stmt = build2 (MODIFY_EXPR, void_type_node, new_or_tmp_name,
4442 build2 (BIT_IOR_EXPR, int_ptrsize_type,
4445 SSA_NAME_DEF_STMT (new_or_tmp_name) = or_stmt;
4446 append_to_statement_list_force (or_stmt, cond_expr_stmt_list);
4447 or_tmp_name = new_or_tmp_name;
4450 or_tmp_name = addr_tmp_name;
4454 mask_cst = build_int_cst (int_ptrsize_type, mask);
4456 /* create: and_tmp = or_tmp & mask */
4457 and_tmp = create_tmp_var (int_ptrsize_type, "andmask" );
4458 add_referenced_var (and_tmp);
4459 and_tmp_name = make_ssa_name (and_tmp, NULL_TREE);
4461 and_stmt = build2 (MODIFY_EXPR, void_type_node,
4463 build2 (BIT_AND_EXPR, int_ptrsize_type,
4464 or_tmp_name, mask_cst));
4465 SSA_NAME_DEF_STMT (and_tmp_name) = and_stmt;
4466 append_to_statement_list_force (and_stmt, cond_expr_stmt_list);
4468 /* Make and_tmp the left operand of the conditional test against zero.
4469 if and_tmp has a nonzero bit then some address is unaligned. */
4470 ptrsize_zero = build_int_cst (int_ptrsize_type, 0);
4471 return build2 (EQ_EXPR, boolean_type_node,
4472 and_tmp_name, ptrsize_zero);
4476 /* Function vect_transform_loop.
4478 The analysis phase has determined that the loop is vectorizable.
4479 Vectorize the loop - created vectorized stmts to replace the scalar
4480 stmts in the loop, and update the loop exit condition. */
4483 vect_transform_loop (loop_vec_info loop_vinfo)
4485 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4486 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
4487 int nbbs = loop->num_nodes;
4488 block_stmt_iterator si;
4491 int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
4496 if (vect_print_dump_info (REPORT_DETAILS))
4497 fprintf (vect_dump, "=== vec_transform_loop ===");
4499 /* If the loop has data references that may or may not be aligned then
4500 two versions of the loop need to be generated, one which is vectorized
4501 and one which isn't. A test is then generated to control which of the
4502 loops is executed. The test checks for the alignment of all of the
4503 data references that may or may not be aligned. */
4505 if (VEC_length (tree, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo)))
4509 tree cond_expr_stmt_list = NULL_TREE;
4510 basic_block condition_bb;
4511 block_stmt_iterator cond_exp_bsi;
4512 basic_block merge_bb;
4513 basic_block new_exit_bb;
4515 tree orig_phi, new_phi, arg;
4517 cond_expr = vect_create_cond_for_align_checks (loop_vinfo,
4518 &cond_expr_stmt_list);
4519 initialize_original_copy_tables ();
4520 nloop = loop_version (loop, cond_expr, &condition_bb, true);
4521 free_original_copy_tables();
4523 /** Loop versioning violates an assumption we try to maintain during
4524 vectorization - that the loop exit block has a single predecessor.
4525 After versioning, the exit block of both loop versions is the same
4526 basic block (i.e. it has two predecessors). Just in order to simplify
4527 following transformations in the vectorizer, we fix this situation
4528 here by adding a new (empty) block on the exit-edge of the loop,
4529 with the proper loop-exit phis to maintain loop-closed-form. **/
4531 merge_bb = single_exit (loop)->dest;
4532 gcc_assert (EDGE_COUNT (merge_bb->preds) == 2);
4533 new_exit_bb = split_edge (single_exit (loop));
4534 new_exit_e = single_exit (loop);
4535 e = EDGE_SUCC (new_exit_bb, 0);
4537 for (orig_phi = phi_nodes (merge_bb); orig_phi;
4538 orig_phi = PHI_CHAIN (orig_phi))
4540 new_phi = create_phi_node (SSA_NAME_VAR (PHI_RESULT (orig_phi)),
4542 arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, e);
4543 add_phi_arg (new_phi, arg, new_exit_e);
4544 SET_PHI_ARG_DEF (orig_phi, e->dest_idx, PHI_RESULT (new_phi));
4547 /** end loop-exit-fixes after versioning **/
4549 update_ssa (TODO_update_ssa);
4550 cond_exp_bsi = bsi_last (condition_bb);
4551 bsi_insert_before (&cond_exp_bsi, cond_expr_stmt_list, BSI_SAME_STMT);
4554 /* CHECKME: we wouldn't need this if we called update_ssa once
4556 bitmap_zero (vect_vnames_to_rename);
4558 /* Peel the loop if there are data refs with unknown alignment.
4559 Only one data ref with unknown store is allowed. */
4561 if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
4562 vect_do_peeling_for_alignment (loop_vinfo);
4564 /* If the loop has a symbolic number of iterations 'n' (i.e. it's not a
4565 compile time constant), or it is a constant that doesn't divide by the
4566 vectorization factor, then an epilog loop needs to be created.
4567 We therefore duplicate the loop: the original loop will be vectorized,
4568 and will compute the first (n/VF) iterations. The second copy of the loop
4569 will remain scalar and will compute the remaining (n%VF) iterations.
4570 (VF is the vectorization factor). */
4572 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
4573 || (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
4574 && LOOP_VINFO_INT_NITERS (loop_vinfo) % vectorization_factor != 0))
4575 vect_do_peeling_for_loop_bound (loop_vinfo, &ratio);
4577 ratio = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
4578 LOOP_VINFO_INT_NITERS (loop_vinfo) / vectorization_factor);
4580 /* 1) Make sure the loop header has exactly two entries
4581 2) Make sure we have a preheader basic block. */
4583 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
4585 split_edge (loop_preheader_edge (loop));
4587 /* FORNOW: the vectorizer supports only loops which body consist
4588 of one basic block (header + empty latch). When the vectorizer will
4589 support more involved loop forms, the order by which the BBs are
4590 traversed need to be reconsidered. */
4592 for (i = 0; i < nbbs; i++)
4594 basic_block bb = bbs[i];
4596 for (si = bsi_start (bb); !bsi_end_p (si);)
4598 tree stmt = bsi_stmt (si);
4599 stmt_vec_info stmt_info;
4602 if (vect_print_dump_info (REPORT_DETAILS))
4604 fprintf (vect_dump, "------>vectorizing statement: ");
4605 print_generic_expr (vect_dump, stmt, TDF_SLIM);
4607 stmt_info = vinfo_for_stmt (stmt);
4608 gcc_assert (stmt_info);
4609 if (!STMT_VINFO_RELEVANT_P (stmt_info)
4610 && !STMT_VINFO_LIVE_P (stmt_info))
4616 if ((TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
4617 != (unsigned HOST_WIDE_INT) vectorization_factor)
4618 && vect_print_dump_info (REPORT_DETAILS))
4619 fprintf (vect_dump, "multiple-types.");
4621 /* -------- vectorize statement ------------ */
4622 if (vect_print_dump_info (REPORT_DETAILS))
4623 fprintf (vect_dump, "transform statement.");
4625 strided_store = false;
4626 is_store = vect_transform_stmt (stmt, &si, &strided_store);
4630 if (DR_GROUP_FIRST_DR (stmt_info))
4632 /* Interleaving. If IS_STORE is TRUE, the vectorization of the
4633 interleaving chain was completed - free all the stores in
4635 tree next = DR_GROUP_FIRST_DR (stmt_info);
4637 stmt_vec_info next_stmt_info;
4641 next_stmt_info = vinfo_for_stmt (next);
4642 /* Free the attached stmt_vec_info and remove the stmt. */
4643 ann = stmt_ann (next);
4644 tmp = DR_GROUP_NEXT_DR (next_stmt_info);
4645 free (next_stmt_info);
4646 set_stmt_info (ann, NULL);
4649 bsi_remove (&si, true);
4654 /* Free the attached stmt_vec_info and remove the stmt. */
4655 ann = stmt_ann (stmt);
4657 set_stmt_info (ann, NULL);
4658 bsi_remove (&si, true);
4666 /* This is case of skipped interleaved store. We don't free
4667 its stmt_vec_info. */
4668 bsi_remove (&si, true);
4676 slpeel_make_loop_iterate_ntimes (loop, ratio);
4678 EXECUTE_IF_SET_IN_BITMAP (vect_vnames_to_rename, 0, j, bi)
4679 mark_sym_for_renaming (SSA_NAME_VAR (ssa_name (j)));
4681 /* The memory tags and pointers in vectorized statements need to
4682 have their SSA forms updated. FIXME, why can't this be delayed
4683 until all the loops have been transformed? */
4684 update_ssa (TODO_update_ssa);
4686 if (vect_print_dump_info (REPORT_VECTORIZED_LOOPS))
4687 fprintf (vect_dump, "LOOP VECTORIZED.");