1 /* Transformation Utilities for Loop Vectorization.
2 Copyright (C) 2003,2004,2005,2006 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
5 This file is part of GCC.
7 GCC is free software; you can redistribute it and/or modify it under
8 the terms of the GNU General Public License as published by the Free
9 Software Foundation; either version 2, or (at your option) any later
12 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
13 WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING. If not, write to the Free
19 Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA
24 #include "coretypes.h"
30 #include "basic-block.h"
31 #include "diagnostic.h"
32 #include "tree-flow.h"
33 #include "tree-dump.h"
39 #include "tree-data-ref.h"
40 #include "tree-chrec.h"
41 #include "tree-scalar-evolution.h"
42 #include "tree-vectorizer.h"
43 #include "langhooks.h"
44 #include "tree-pass.h"
48 /* Utility functions for the code transformation. */
/* Forward declarations for the file-scope (static) helpers defined later in
   this file; they let the definitions below appear in any order.  */
49 static bool vect_transform_stmt (tree, block_stmt_iterator *, bool *);
50 static tree vect_create_destination_var (tree, tree);
51 static tree vect_create_data_ref_ptr
52 (tree, block_stmt_iterator *, tree, tree *, tree *, bool);
53 static tree vect_create_addr_base_for_vector_ref (tree, tree *, tree);
54 static tree vect_setup_realignment (tree, block_stmt_iterator *, tree *);
55 static tree vect_get_new_vect_var (tree, enum vect_var_kind, const char *);
56 static tree vect_get_vec_def_for_operand (tree, tree, tree *);
57 static tree vect_init_vector (tree, tree);
58 static void vect_finish_stmt_generation
59 (tree stmt, tree vec_stmt, block_stmt_iterator *bsi);
60 static bool vect_is_simple_cond (tree, loop_vec_info);
61 static void update_vuses_to_preheader (tree, struct loop*);
62 static void vect_create_epilog_for_reduction (tree, tree, enum tree_code, tree);
63 static tree get_initial_def_for_reduction (tree, tree, tree *);
65 /* Utility function dealing with loop peeling (not peeling itself). */
66 static void vect_generate_tmps_on_preheader
67 (loop_vec_info, tree *, tree *, tree *);
68 static tree vect_build_loop_niters (loop_vec_info);
69 static void vect_update_ivs_after_vectorizer (loop_vec_info, tree, edge);
70 static tree vect_gen_niters_for_prolog_loop (loop_vec_info, tree);
71 static void vect_update_init_of_dr (struct data_reference *, tree niters);
72 static void vect_update_inits_of_drs (loop_vec_info, tree);
73 static void vect_do_peeling_for_alignment (loop_vec_info, struct loops *);
74 static void vect_do_peeling_for_loop_bound
75 (loop_vec_info, tree *, struct loops *);
76 static int vect_min_worthwhile_factor (enum tree_code);
79 /* Function vect_get_new_vect_var.
81 Returns a name for a new variable. The current naming scheme appends the
82 prefix "vect_" or "vect_p" (depending on the value of VAR_KIND) to
83 the name of vectorizer generated variables, and appends that to NAME if
87 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
/* Select the name prefix according to VAR_KIND; per the header comment,
   pointer variables use "vect_p".  (The switch header and the remaining
   cases are elided in this view.)  */
100 case vect_pointer_var:
/* When the caller supplied a NAME, append it to the chosen prefix ...  */
108 new_vect_var = create_tmp_var (type, concat (prefix, name, NULL));
/* ... otherwise name the temporary with the bare prefix alone.  */
110 new_vect_var = create_tmp_var (type, prefix);
116 /* Function vect_create_addr_base_for_vector_ref.
118 Create an expression that computes the address of the first memory location
119 that will be accessed for a data reference.
122 STMT: The statement containing the data reference.
123 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
124 OFFSET: Optional. If supplied, it is be added to the initial address.
127 1. Return an SSA_NAME whose value is the address of the memory location of
128 the first vector of the data reference.
129 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
130 these statement(s) which define the returned SSA_NAME.
132 FORNOW: We are only handling array accesses with step 1. */
135 vect_create_addr_base_for_vector_ref (tree stmt,
/* Pull the base address, offset and initial displacement of the data
   reference out of the data-reference analysis record.  */
139 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
140 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
141 tree data_ref_base = unshare_expr (DR_BASE_ADDRESS (dr));
142 tree base_name = build_fold_indirect_ref (data_ref_base);
143 tree ref = DR_REF (dr);
144 tree scalar_type = TREE_TYPE (ref);
145 tree scalar_ptr_type = build_pointer_type (scalar_type);
148 tree addr_base, addr_expr;
150 tree base_offset = unshare_expr (DR_OFFSET (dr));
151 tree init = unshare_expr (DR_INIT (dr));
153 /* Create base_offset */
154 base_offset = size_binop (PLUS_EXPR, base_offset, init);
155 dest = create_tmp_var (TREE_TYPE (base_offset), "base_off");
156 add_referenced_var (dest);
157 base_offset = force_gimple_operand (base_offset, &new_stmt, false, dest);
158 append_to_statement_list_force (new_stmt, new_stmt_list);
/* NOTE(review): the block below folds the caller-supplied OFFSET into
   BASE_OFFSET; the guarding 'if (offset)' appears elided in this view —
   confirm against the full file.  */
162 tree tmp = create_tmp_var (TREE_TYPE (base_offset), "offset");
165 /* For interleaved access step we divide STEP by the size of the
166 interleaving group. */
167 if (DR_GROUP_SIZE (stmt_info))
168 step = fold_build2 (TRUNC_DIV_EXPR, TREE_TYPE (offset), DR_STEP (dr),
169 build_int_cst (TREE_TYPE (offset),
170 DR_GROUP_SIZE (stmt_info)));
/* OFFSET is in elements, so scale it by the (possibly divided) STEP
   before adding it to BASE_OFFSET, then gimplify the result.  */
174 add_referenced_var (tmp);
175 offset = fold_build2 (MULT_EXPR, TREE_TYPE (offset), offset, step);
176 base_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (base_offset),
177 base_offset, offset);
178 base_offset = force_gimple_operand (base_offset, &new_stmt, false, tmp);
179 append_to_statement_list_force (new_stmt, new_stmt_list);
182 /* base + base_offset */
183 addr_base = fold_build2 (PLUS_EXPR, TREE_TYPE (data_ref_base), data_ref_base,
186 /* addr_expr = addr_base */
/* Materialize the address in a fresh pointer SSA_NAME and queue the
   defining statement on NEW_STMT_LIST for the caller to insert.  */
187 addr_expr = vect_get_new_vect_var (scalar_ptr_type, vect_pointer_var,
188 get_name (base_name));
189 add_referenced_var (addr_expr);
190 vec_stmt = build2 (MODIFY_EXPR, void_type_node, addr_expr, addr_base);
191 new_temp = make_ssa_name (addr_expr, vec_stmt);
192 TREE_OPERAND (vec_stmt, 0) = new_temp;
193 append_to_statement_list_force (vec_stmt, new_stmt_list);
195 if (vect_print_dump_info (REPORT_DETAILS))
197 fprintf (vect_dump, "created ");
198 print_generic_expr (vect_dump, vec_stmt, TDF_SLIM);
204 /* Function vect_create_data_ref_ptr.
206 Create a new pointer to vector type (vp), that points to the first location
207 accessed in the loop by STMT, along with the def-use update chain to
208 appropriately advance the pointer through the loop iterations. Also set
209 aliasing information for the pointer. This vector pointer is used by the
210 callers to this function to create a memory reference expression for vector
214 1. STMT: a stmt that references memory. Expected to be of the form
215 MODIFY_EXPR <name, data-ref> or MODIFY_EXPR <data-ref, name>.
216 2. BSI: block_stmt_iterator where new stmts can be added.
217 3. OFFSET (optional): an offset to be added to the initial address accessed
218 by the data-ref in STMT.
219 4. ONLY_INIT: indicate if vp is to be updated in the loop, or remain
220 pointing to the initial address.
223 1. Declare a new ptr to vector_type, and have it point to the base of the
224 data reference (initial addressed accessed by the data reference).
225 For example, for vector of type V8HI, the following code is generated:
228 vp = (v8hi *)initial_address;
230 if OFFSET is not supplied:
231 initial_address = &a[init];
232 if OFFSET is supplied:
233 initial_address = &a[init + OFFSET];
235 Return the initial_address in INITIAL_ADDRESS.
237 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
238 update the pointer in each iteration of the loop.
240 Return the increment stmt that updates the pointer in PTR_INCR.
242 3. Return the pointer. */
245 vect_create_data_ref_ptr (tree stmt,
246 block_stmt_iterator *bsi ATTRIBUTE_UNUSED,
247 tree offset, tree *initial_address, tree *ptr_incr,
251 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
252 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
253 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
254 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
260 tree new_stmt_list = NULL_TREE;
261 edge pe = loop_preheader_edge (loop);
264 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
266 base_name = build_fold_indirect_ref (unshare_expr (DR_BASE_ADDRESS (dr)));
/* Dump-only diagnostics: classify the kind of access by the tree code of
   the base of the data reference.  */
268 if (vect_print_dump_info (REPORT_DETAILS))
270 tree data_ref_base = base_name;
271 fprintf (vect_dump, "create vector-pointer variable to type: ");
272 print_generic_expr (vect_dump, vectype, TDF_SLIM);
273 if (TREE_CODE (data_ref_base) == VAR_DECL)
274 fprintf (vect_dump, " vectorizing a one dimensional array ref: ");
275 else if (TREE_CODE (data_ref_base) == ARRAY_REF)
276 fprintf (vect_dump, " vectorizing a multidimensional array ref: ");
277 else if (TREE_CODE (data_ref_base) == COMPONENT_REF)
278 fprintf (vect_dump, " vectorizing a record based array ref: ");
279 else if (TREE_CODE (data_ref_base) == SSA_NAME)
280 fprintf (vect_dump, " vectorizing a pointer ref: ");
281 print_generic_expr (vect_dump, base_name, TDF_SLIM);
284 /** (1) Create the new vector-pointer variable: **/
286 vect_ptr_type = build_pointer_type (vectype);
287 vect_ptr = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var,
288 get_name (base_name));
289 add_referenced_var (vect_ptr);
292 /** (2) Add aliasing information to the new vector-pointer:
293 (The points-to info (DR_PTR_INFO) may be defined later.) **/
295 tag = DR_MEMTAG (dr);
298 /* If tag is a variable (and NOT_A_TAG) than a new symbol memory
299 tag must be created with tag added to its may alias list. */
/* NOTE(review): the conditional choosing between these two branches is
   elided in this view — confirm against the full file.  */
301 new_type_alias (vect_ptr, tag, DR_REF (dr));
303 var_ann (vect_ptr)->symbol_mem_tag = tag;
305 var_ann (vect_ptr)->subvars = DR_SUBVARS (dr);
307 /** (3) Calculate the initial address the vector-pointer, and set
308 the vector-pointer to point to it before the loop: **/
310 /* Create: (&(base[init_val+offset]) in the loop preheader. */
311 new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
/* The address computation is loop-invariant, so it goes on the preheader
   edge; the insertion must not split the edge (no new basic block).  */
313 pe = loop_preheader_edge (loop);
314 new_bb = bsi_insert_on_edge_immediate (pe, new_stmt_list);
315 gcc_assert (!new_bb);
316 *initial_address = new_temp;
318 /* Create: p = (vectype *) initial_base */
319 vec_stmt = fold_convert (vect_ptr_type, new_temp);
320 vec_stmt = build2 (MODIFY_EXPR, void_type_node, vect_ptr, vec_stmt);
321 vect_ptr_init = make_ssa_name (vect_ptr, vec_stmt);
322 TREE_OPERAND (vec_stmt, 0) = vect_ptr_init;
323 new_bb = bsi_insert_on_edge_immediate (pe, vec_stmt);
324 gcc_assert (!new_bb);
327 /** (4) Handle the updating of the vector-pointer inside the loop: **/
329 if (only_init) /* No update in loop is required. */
331 /* Copy the points-to information if it exists. */
332 if (DR_PTR_INFO (dr))
333 duplicate_ssa_name_ptr_info (vect_ptr_init, DR_PTR_INFO (dr));
334 return vect_ptr_init;
/* Otherwise build an induction variable that advances the pointer by the
   vector size each iteration (create_iv returns the before/after-increment
   SSA names).  */
338 block_stmt_iterator incr_bsi;
340 tree indx_before_incr, indx_after_incr;
343 standard_iv_increment_position (loop, &incr_bsi, &insert_after);
344 create_iv (vect_ptr_init,
345 fold_convert (vect_ptr_type, TYPE_SIZE_UNIT (vectype)),
346 NULL_TREE, loop, &incr_bsi, insert_after,
347 &indx_before_incr, &indx_after_incr);
348 incr = bsi_stmt (incr_bsi);
349 set_stmt_info (stmt_ann (incr),
350 new_stmt_vec_info (incr, loop_vinfo));
352 /* Copy the points-to information if it exists. */
353 if (DR_PTR_INFO (dr))
355 duplicate_ssa_name_ptr_info (indx_before_incr, DR_PTR_INFO (dr));
356 duplicate_ssa_name_ptr_info (indx_after_incr, DR_PTR_INFO (dr));
358 merge_alias_info (vect_ptr_init, indx_before_incr);
359 merge_alias_info (vect_ptr_init, indx_after_incr);
363 return indx_before_incr;
368 /* Function bump_vector_ptr
370 Increment a pointer (to a vector type) by vector-size. Connect the new
371 increment stmt to the exising def-use update-chain of the pointer.
373 The pointer def-use update-chain before this function:
374 DATAREF_PTR = phi (p_0, p_2)
376 PTR_INCR: p_2 = DATAREF_PTR + step
378 The pointer def-use update-chain after this function:
379 DATAREF_PTR = phi (p_0, p_2)
381 NEW_DATAREF_PTR = DATAREF_PTR + vector_size
383 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
386 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
388 PTR_INCR - the stmt that updates the pointer in each iteration of the loop.
389 The increment amount across iterations is also expected to be
391 BSI - location where the new update stmt is to be placed.
392 STMT - the original scalar memory-access stmt that is being vectorized.
394 Output: Return NEW_DATAREF_PTR as illustrated above.
399 bump_vector_ptr (tree dataref_ptr, tree ptr_incr, block_stmt_iterator *bsi,
402 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
403 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
404 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
405 tree vptr_type = TREE_TYPE (dataref_ptr);
406 tree ptr_var = SSA_NAME_VAR (dataref_ptr);
407 tree update = fold_convert (vptr_type, TYPE_SIZE_UNIT (vectype));
411 tree new_dataref_ptr;
/* Build and emit at BSI: NEW_DATAREF_PTR = DATAREF_PTR + vector-size.  */
413 incr_stmt = build2 (MODIFY_EXPR, void_type_node, ptr_var,
414 build2 (PLUS_EXPR, vptr_type, dataref_ptr, update));
415 new_dataref_ptr = make_ssa_name (ptr_var, incr_stmt);
416 TREE_OPERAND (incr_stmt, 0) = new_dataref_ptr;
417 vect_finish_stmt_generation (stmt, incr_stmt, bsi);
419 /* Update the vector-pointer's cross-iteration increment. */
420 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
422 tree use = USE_FROM_PTR (use_p);
424 if (use == dataref_ptr)
425 SET_USE (use_p, new_dataref_ptr);
/* NOTE(review): this assert checks the operand that is NOT DATAREF_PTR
   (the constant step); an intervening 'else' appears elided in this
   view — confirm against the full file.  */
427 gcc_assert (tree_int_cst_compare (use, update) == 0);
430 /* Copy the points-to information if it exists. */
431 if (DR_PTR_INFO (dr))
432 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
433 merge_alias_info (new_dataref_ptr, dataref_ptr);
435 return new_dataref_ptr;
439 /* Function vect_create_destination_var.
441 Create a new temporary of type VECTYPE. */
/* SCALAR_DEST: the scalar SSA_NAME being vectorized; its name (if any)
   seeds the new temporary's name.
   VECTYPE: the vector type for the new temporary, or NULL_TREE to create
   a scalar temporary of SCALAR_DEST's own type.  */
444 vect_create_destination_var (tree scalar_dest, tree vectype)
447 const char *new_name;
449 enum vect_var_kind kind;
/* KIND/TYPE depend on whether a vector or a scalar temporary is wanted.  */
451 kind = vectype ? vect_simple_var : vect_scalar_var;
452 type = vectype ? vectype : TREE_TYPE (scalar_dest);
454 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
456 new_name = get_name (scalar_dest);
/* Fix: pass the KIND computed above instead of a hard-coded
   vect_simple_var.  KIND was previously computed but never used, so
   scalar destinations (VECTYPE == NULL_TREE) were created with the
   vector-variable kind and its naming prefix.  */
459 vec_dest = vect_get_new_vect_var (type, kind, new_name);
460 add_referenced_var (vec_dest);
466 /* Function vect_init_vector.
468 Insert a new stmt (INIT_STMT) that initializes a new vector variable with
469 the vector elements of VECTOR_VAR. Return the DEF of INIT_STMT. It will be
470 used in the vectorization of STMT. */
473 vect_init_vector (tree stmt, tree vector_var)
475 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
476 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
477 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
480 tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
/* Create a fresh vector temporary and the statement that assigns
   VECTOR_VAR to it.  */
486 new_var = vect_get_new_vect_var (vectype, vect_simple_var, "cst_");
487 add_referenced_var (new_var);
489 init_stmt = build2 (MODIFY_EXPR, vectype, new_var, vector_var);
490 new_temp = make_ssa_name (new_var, init_stmt);
491 TREE_OPERAND (init_stmt, 0) = new_temp;
/* The initializer is loop-invariant, so insert it on the preheader edge;
   the insertion must not split the edge (no new basic block).  */
493 pe = loop_preheader_edge (loop);
494 new_bb = bsi_insert_on_edge_immediate (pe, init_stmt);
495 gcc_assert (!new_bb);
497 if (vect_print_dump_info (REPORT_DETAILS))
499 fprintf (vect_dump, "created new init_stmt: ");
500 print_generic_expr (vect_dump, init_stmt, TDF_SLIM);
/* Return the SSA def produced by INIT_STMT.  */
503 vec_oprnd = TREE_OPERAND (init_stmt, 0);
508 /* Function vect_get_vec_def_for_operand.
510 OP is an operand in STMT. This function returns a (vector) def that will be
511 used in the vectorized stmt for STMT.
513 In the case that OP is an SSA_NAME which is defined in the loop, then
514 STMT_VINFO_VEC_STMT of the defining stmt holds the relevant def.
516 In case OP is an invariant or constant, a new stmt that creates a vector def
517 needs to be introduced. */
520 vect_get_vec_def_for_operand (tree op, tree stmt, tree *scalar_def)
525 stmt_vec_info def_stmt_info = NULL;
526 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
527 tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
528 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
529 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
530 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
536 enum vect_def_type dt;
539 if (vect_print_dump_info (REPORT_DETAILS))
541 fprintf (vect_dump, "vect_get_vec_def_for_operand: ");
542 print_generic_expr (vect_dump, op, TDF_SLIM);
/* Classify OP (constant / invariant / loop def / reduction / induction);
   vectorizable stmts must have simple-use operands only.  */
545 is_simple_use = vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt);
546 gcc_assert (is_simple_use);
547 if (vect_print_dump_info (REPORT_DETAILS))
551 fprintf (vect_dump, "def = ");
552 print_generic_expr (vect_dump, def, TDF_SLIM);
556 fprintf (vect_dump, " def_stmt = ");
557 print_generic_expr (vect_dump, def_stmt, TDF_SLIM);
/* Dispatch on the def-type DT (the switch header is elided in this view).  */
563 /* Case 1: operand is a constant. */
564 case vect_constant_def:
569 /* Create 'vect_cst_ = {cst,cst,...,cst}' */
570 if (vect_print_dump_info (REPORT_DETAILS))
571 fprintf (vect_dump, "Create vector_cst. nunits = %d", nunits);
573 for (i = nunits - 1; i >= 0; --i)
575 t = tree_cons (NULL_TREE, op, t);
577 vec_cst = build_vector (vectype, t);
578 return vect_init_vector (stmt, vec_cst);
581 /* Case 2: operand is defined outside the loop - loop invariant. */
582 case vect_invariant_def:
587 /* Create 'vec_inv = {inv,inv,..,inv}' */
588 if (vect_print_dump_info (REPORT_DETAILS))
589 fprintf (vect_dump, "Create vector_inv.");
591 for (i = nunits - 1; i >= 0; --i)
593 t = tree_cons (NULL_TREE, def, t);
596 /* FIXME: use build_constructor directly. */
597 vec_inv = build_constructor_from_list (vectype, t);
598 return vect_init_vector (stmt, vec_inv);
601 /* Case 3: operand is defined inside the loop. */
605 *scalar_def = def_stmt;
607 /* Get the def from the vectorized stmt. */
608 def_stmt_info = vinfo_for_stmt (def_stmt);
609 vec_stmt = STMT_VINFO_VEC_STMT (def_stmt_info);
610 gcc_assert (vec_stmt);
611 vec_oprnd = TREE_OPERAND (vec_stmt, 0);
615 /* Case 4: operand is defined by a loop header phi - reduction */
616 case vect_reduction_def:
618 gcc_assert (TREE_CODE (def_stmt) == PHI_NODE);
620 /* Get the def before the loop */
621 op = PHI_ARG_DEF_FROM_EDGE (def_stmt, loop_preheader_edge (loop));
622 return get_initial_def_for_reduction (stmt, op, scalar_def);
625 /* Case 5: operand is defined by loop-header phi - induction. */
626 case vect_induction_def:
628 if (vect_print_dump_info (REPORT_DETAILS))
629 fprintf (vect_dump, "induction - unsupported.");
630 internal_error ("no support for induction"); /* FORNOW */
639 /* Function vect_get_vec_def_for_stmt_copy
641 Return a vector-def for an operand. This function is used when the
642 vectorized stmt to be created (by the caller to this function) is a "copy"
643 created in case the vectorized result cannot fit in one vector, and several
644 copies of the vector-stmt are required. In this case the vector-def is
645 retrieved from the vector stmt recorded in the STMT_VINFO_RELATED_STMT field
646 of the stmt that defines VEC_OPRND.
647 DT is the type of the vector def VEC_OPRND.
650 In case the vectorization factor (VF) is bigger than the number
651 of elements that can fit in a vectype (nunits), we have to generate
652 more than one vector stmt to vectorize the scalar stmt. This situation
653 arises when there are multiple data-types operated upon in the loop; the
654 smallest data-type determines the VF, and as a result, when vectorizing
655 stmts operating on wider types we need to create 'VF/nunits' "copies" of the
656 vector stmt (each computing a vector of 'nunits' results, and together
657 computing 'VF' results in each iteration). This function is called when
658 vectorizing such a stmt (e.g. vectorizing S2 in the illusration below, in
659 which VF=16 and nuniti=4, so the number of copies required is 4):
661 scalar stmt: vectorized into: STMT_VINFO_RELATED_STMT
663 S1: x = load VS1.0: vx.0 = memref0 VS1.1
664 VS1.1: vx.1 = memref1 VS1.2
665 VS1.2: vx.2 = memref2 VS1.3
666 VS1.3: vx.3 = memref3
668 S2: z = x + ... VSnew.0: vz0 = vx.0 + ... VSnew.1
669 VSnew.1: vz1 = vx.1 + ... VSnew.2
670 VSnew.2: vz2 = vx.2 + ... VSnew.3
671 VSnew.3: vz3 = vx.3 + ...
673 The vectorization of S1 is explained in vectorizable_load.
674 The vectorization of S2:
675 To create the first vector-stmt out of the 4 copies - VSnew.0 -
676 the function 'vect_get_vec_def_for_operand' is called to
677 get the relevant vector-def for each operand of S2. For operand x it
678 returns the vector-def 'vx.0'.
680 To create the remaining copies of the vector-stmt (VSnew.j), this
681 function is called to get the relevant vector-def for each operand. It is
682 obtained from the respective VS1.j stmt, which is recorded in the
683 STMT_VINFO_RELATED_STMT field of the stmt that defines VEC_OPRND.
685 For example, to obtain the vector-def 'vx.1' in order to create the
686 vector stmt 'VSnew.1', this function is called with VEC_OPRND='vx.0'.
687 Given 'vx0' we obtain the stmt that defines it ('VS1.0'); from the
688 STMT_VINFO_RELATED_STMT field of 'VS1.0' we obtain the next copy - 'VS1.1',
689 and return its def ('vx.1').
690 Overall, to create the above sequence this function will be called 3 times:
691 vx.1 = vect_get_vec_def_for_stmt_copy (dt, vx.0);
692 vx.2 = vect_get_vec_def_for_stmt_copy (dt, vx.1);
693 vx.3 = vect_get_vec_def_for_stmt_copy (dt, vx.2); */
696 vect_get_vec_def_for_stmt_copy (enum vect_def_type dt, tree vec_oprnd)
698 tree vec_stmt_for_operand;
699 stmt_vec_info def_stmt_info;
/* Invariant/constant vector defs are iteration-independent, so every copy
   of the vector stmt can reuse the same def unchanged.  */
701 if (dt == vect_invariant_def || dt == vect_constant_def)
703 /* Do nothing; can reuse same def. */ ;
/* Otherwise follow STMT_VINFO_RELATED_STMT from VEC_OPRND's defining stmt
   to the next copy in the chain and return that copy's def.  (An 'else'
   appears elided in this view.)  */
707 vec_stmt_for_operand = SSA_NAME_DEF_STMT (vec_oprnd);
708 def_stmt_info = vinfo_for_stmt (vec_stmt_for_operand);
709 gcc_assert (def_stmt_info);
710 vec_stmt_for_operand = STMT_VINFO_RELATED_STMT (def_stmt_info);
711 gcc_assert (vec_stmt_for_operand);
712 vec_oprnd = TREE_OPERAND (vec_stmt_for_operand, 0);
718 /* Function vect_finish_stmt_generation.
720 Insert a new stmt. */
/* Insert VEC_STMT just before STMT (at BSI), attach fresh vectorizer
   stmt-info to it, and propagate STMT's source location.  */
723 vect_finish_stmt_generation (tree stmt, tree vec_stmt,
724 block_stmt_iterator *bsi)
726 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
727 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
729 bsi_insert_before (bsi, vec_stmt, BSI_SAME_STMT);
730 set_stmt_info (get_stmt_ann (vec_stmt),
731 new_stmt_vec_info (vec_stmt, loop_vinfo));
733 if (vect_print_dump_info (REPORT_DETAILS))
735 fprintf (vect_dump, "add new stmt: ");
736 print_generic_expr (vect_dump, vec_stmt, TDF_SLIM);
739 /* Make sure bsi points to the stmt that is being vectorized. */
740 gcc_assert (stmt == bsi_stmt (*bsi));
/* Copy STMT's source location onto VEC_STMT; the two SET_EXPR_* calls are
   the two arms of this #ifdef (the '#else' line is elided in this view).  */
742 #ifdef USE_MAPPED_LOCATION
743 SET_EXPR_LOCATION (vec_stmt, EXPR_LOCATION (stmt));
745 SET_EXPR_LOCUS (vec_stmt, EXPR_LOCUS (stmt));
/* Compile-time switch between the two initialization schemes documented
   below; currently fixed to the "adjust in epilog" scheme.  */
750 #define ADJUST_IN_EPILOG 1
752 /* Function get_initial_def_for_reduction
755 STMT - a stmt that performs a reduction operation in the loop.
756 INIT_VAL - the initial value of the reduction variable
759 SCALAR_DEF - a tree that holds a value to be added to the final result
760 of the reduction (used for "ADJUST_IN_EPILOG" - see below).
761 Return a vector variable, initialized according to the operation that STMT
762 performs. This vector will be used as the initial value of the
763 vector of partial results.
765 Option1 ("ADJUST_IN_EPILOG"): Initialize the vector as follows:
768 min/max: [init_val,init_val,..,init_val,init_val]
769 bit and/or: [init_val,init_val,..,init_val,init_val]
770 and when necessary (e.g. add/mult case) let the caller know
771 that it needs to adjust the result by init_val.
773 Option2: Initialize the vector as follows:
774 add: [0,0,...,0,init_val]
775 mult: [1,1,...,1,init_val]
776 min/max: [init_val,init_val,...,init_val]
777 bit and/or: [init_val,init_val,...,init_val]
778 and no adjustments are needed.
780 For example, for the following code:
786 STMT is 's = s + a[i]', and the reduction variable is 's'.
787 For a vector of 4 units, we want to return either [0,0,0,init_val],
788 or [0,0,0,0] and let the caller know that it needs to adjust
789 the result at the end by 'init_val'.
791 FORNOW: We use the "ADJUST_IN_EPILOG" scheme.
792 TODO: Use some cost-model to estimate which scheme is more profitable.
796 get_initial_def_for_reduction (tree stmt, tree init_val, tree *scalar_def)
798 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
799 tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
800 int nunits = GET_MODE_NUNITS (TYPE_MODE (vectype));
802 enum tree_code code = TREE_CODE (TREE_OPERAND (stmt, 1));
803 tree type = TREE_TYPE (init_val);
805 tree vec, t = NULL_TREE;
806 bool need_epilog_adjust;
809 gcc_assert (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type));
/* Neutral element (zero) of matching kind for the filler lanes; used by
   the add case (the enclosing per-CODE switch is elided in this view).  */
816 if (INTEGRAL_TYPE_P (type))
817 def = build_int_cst (type, 0);
819 def = build_real (type, dconst0);
821 #ifdef ADJUST_IN_EPILOG
822 /* All the 'nunits' elements are set to 0. The final result will be
823 adjusted by 'init_val' at the loop epilog. */
825 need_epilog_adjust = true;
827 /* 'nunits - 1' elements are set to 0; The last element is set to
828 'init_val'. No further adjustments at the epilog are needed. */
829 nelements = nunits - 1;
830 need_epilog_adjust = false;
838 need_epilog_adjust = false;
/* Build the TREE_LIST of lane values back-to-front: NELEMENTS copies of
   DEF, then (in the Option2 case) INIT_VAL as the last lane.  */
845 for (i = nelements - 1; i >= 0; --i)
846 t = tree_cons (NULL_TREE, def, t);
848 if (nelements == nunits - 1)
850 /* Set the last element of the vector. */
851 t = tree_cons (NULL_TREE, init_val, t);
854 gcc_assert (nelements == nunits);
/* A VECTOR_CST needs constant lanes; otherwise fall back to a
   CONSTRUCTOR.  */
856 if (TREE_CODE (init_val) == INTEGER_CST || TREE_CODE (init_val) == REAL_CST)
857 vec = build_vector (vectype, t);
859 vec = build_constructor_from_list (vectype, t);
/* Tell the caller whether (and by what) the epilog must adjust the
   reduced result.  */
861 if (!need_epilog_adjust)
862 *scalar_def = NULL_TREE;
864 *scalar_def = init_val;
866 return vect_init_vector (stmt, vec);
870 /* Function vect_create_epilog_for_reduction
872 Create code at the loop-epilog to finalize the result of a reduction
875 VECT_DEF is a vector of partial results.
876 REDUC_CODE is the tree-code for the epilog reduction.
877 STMT is the scalar reduction stmt that is being vectorized.
878 REDUCTION_PHI is the phi-node that carries the reduction computation.
881 1. Creates the reduction def-use cycle: sets the the arguments for
883 The loop-entry argument is the vectorized initial-value of the reduction.
884 The loop-latch argument is VECT_DEF - the vector of partial sums.
885 2. "Reduces" the vector of partial results VECT_DEF into a single result,
886 by applying the operation specified by REDUC_CODE if available, or by
887 other means (whole-vector shifts or a scalar loop).
888 The function also creates a new phi node at the loop exit to preserve
889 loop-closed form, as illustrated below.
891 The flow at the entry to this function:
894 vec_def = phi <null, null> # REDUCTION_PHI
895 VECT_DEF = vector_stmt # vectorized form of STMT
896 s_loop = scalar_stmt # (scalar) STMT
898 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
902 The above is transformed by this function into:
905 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
906 VECT_DEF = vector_stmt # vectorized form of STMT
907 s_loop = scalar_stmt # (scalar) STMT
909 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
910 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
911 v_out2 = reduce <v_out1>
912 s_out3 = extract_field <v_out2, 0>
913 s_out4 = adjust_result <s_out3>
919 vect_create_epilog_for_reduction (tree vect_def, tree stmt,
920 enum tree_code reduc_code, tree reduction_phi)
922 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
924 enum machine_mode mode;
925 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
926 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
931 block_stmt_iterator exit_bsi;
936 tree new_scalar_dest, exit_phi;
937 tree bitsize, bitpos, bytesize;
938 enum tree_code code = TREE_CODE (TREE_OPERAND (stmt, 1));
939 tree scalar_initial_def;
940 tree vec_initial_def;
942 imm_use_iterator imm_iter;
944 bool extract_scalar_result;
948 tree operation = TREE_OPERAND (stmt, 1);
951 op_type = TREE_CODE_LENGTH (TREE_CODE (operation));
952 reduction_op = TREE_OPERAND (operation, op_type-1);
953 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
954 mode = TYPE_MODE (vectype);
956 /*** 1. Create the reduction def-use cycle ***/
958 /* 1.1 set the loop-entry arg of the reduction-phi: */
959 /* For the case of reduction, vect_get_vec_def_for_operand returns
960 the scalar def before the loop, that defines the initial value
961 of the reduction variable. */
962 vec_initial_def = vect_get_vec_def_for_operand (reduction_op, stmt,
963 &scalar_initial_def);
964 add_phi_arg (reduction_phi, vec_initial_def, loop_preheader_edge (loop));
966 /* 1.2 set the loop-latch arg for the reduction-phi: */
967 add_phi_arg (reduction_phi, vect_def, loop_latch_edge (loop));
969 if (vect_print_dump_info (REPORT_DETAILS))
971 fprintf (vect_dump, "transform reduction: created def-use cycle:");
972 print_generic_expr (vect_dump, reduction_phi, TDF_SLIM);
973 fprintf (vect_dump, "\n");
974 print_generic_expr (vect_dump, SSA_NAME_DEF_STMT (vect_def), TDF_SLIM);
978 /*** 2. Create epilog code
979 The reduction epilog code operates across the elements of the vector
980 of partial results computed by the vectorized loop.
981 The reduction epilog code consists of:
982 step 1: compute the scalar result in a vector (v_out2)
983 step 2: extract the scalar result (s_out3) from the vector (v_out2)
984 step 3: adjust the scalar result (s_out3) if needed.
986 Step 1 can be accomplished using one the following three schemes:
987 (scheme 1) using reduc_code, if available.
988 (scheme 2) using whole-vector shifts, if available.
989 (scheme 3) using a scalar loop. In this case steps 1+2 above are
992 The overall epilog code looks like this:
994 s_out0 = phi <s_loop> # original EXIT_PHI
995 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
996 v_out2 = reduce <v_out1> # step 1
997 s_out3 = extract_field <v_out2, 0> # step 2
998 s_out4 = adjust_result <s_out3> # step 3
1000 (step 3 is optional, and step2 1 and 2 may be combined).
1001 Lastly, the uses of s_out0 are replaced by s_out4.
1005 /* 2.1 Create new loop-exit-phi to preserve loop-closed form:
1006 v_out1 = phi <v_loop> */
1008 exit_bb = single_exit (loop)->dest;
1009 new_phi = create_phi_node (SSA_NAME_VAR (vect_def), exit_bb);
1010 SET_PHI_ARG_DEF (new_phi, single_exit (loop)->dest_idx, vect_def);
1011 exit_bsi = bsi_start (exit_bb);
1013 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
1014 (i.e. when reduc_code is not available) and in the final adjustment code
1015 (if needed). Also get the original scalar reduction variable as
1016 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
1017 represents a reduction pattern), the tree-code and scalar-def are
1018 taken from the original stmt that the pattern-stmt (STMT) replaces.
1019 Otherwise (it is a regular reduction) - the tree-code and scalar-def
1020 are taken from STMT. */
1022 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
1025 /* Regular reduction */
1030 /* Reduction pattern */
1031 stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
1032 gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
1033 gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
1035 code = TREE_CODE (TREE_OPERAND (orig_stmt, 1));
1036 scalar_dest = TREE_OPERAND (orig_stmt, 0);
1037 scalar_type = TREE_TYPE (scalar_dest);
1038 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
1039 bitsize = TYPE_SIZE (scalar_type);
1040 bytesize = TYPE_SIZE_UNIT (scalar_type);
1042 /* 2.3 Create the reduction code, using one of the three schemes described
1045 if (reduc_code < NUM_TREE_CODES)
1047 /*** Case 1: Create:
1048 v_out2 = reduc_expr <v_out1> */
1050 if (vect_print_dump_info (REPORT_DETAILS))
1051 fprintf (vect_dump, "Reduce using direct vector reduction.");
1053 vec_dest = vect_create_destination_var (scalar_dest, vectype);
1054 epilog_stmt = build2 (MODIFY_EXPR, vectype, vec_dest,
1055 build1 (reduc_code, vectype, PHI_RESULT (new_phi)));
1056 new_temp = make_ssa_name (vec_dest, epilog_stmt);
1057 TREE_OPERAND (epilog_stmt, 0) = new_temp;
1058 bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT);
1060 extract_scalar_result = true;
1064 enum tree_code shift_code = 0;
1065 bool have_whole_vector_shift = true;
1067 int element_bitsize = tree_low_cst (bitsize, 1);
1068 int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
1071 if (vec_shr_optab->handlers[mode].insn_code != CODE_FOR_nothing)
1072 shift_code = VEC_RSHIFT_EXPR;
1074 have_whole_vector_shift = false;
1076 /* Regardless of whether we have a whole vector shift, if we're
1077 emulating the operation via tree-vect-generic, we don't want
1078 to use it. Only the first round of the reduction is likely
1079 to still be profitable via emulation. */
1080 /* ??? It might be better to emit a reduction tree code here, so that
1081 tree-vect-generic can expand the first round via bit tricks. */
1082 if (!VECTOR_MODE_P (mode))
1083 have_whole_vector_shift = false;
1086 optab optab = optab_for_tree_code (code, vectype);
1087 if (optab->handlers[mode].insn_code == CODE_FOR_nothing)
1088 have_whole_vector_shift = false;
1091 if (have_whole_vector_shift)
1093 /*** Case 2: Create:
1094 for (offset = VS/2; offset >= element_size; offset/=2)
1096 Create: va' = vec_shift <va, offset>
1097 Create: va = vop <va, va'>
1100 if (vect_print_dump_info (REPORT_DETAILS))
1101 fprintf (vect_dump, "Reduce using vector shifts");
1103 vec_dest = vect_create_destination_var (scalar_dest, vectype);
1104 new_temp = PHI_RESULT (new_phi);
1106 for (bit_offset = vec_size_in_bits/2;
1107 bit_offset >= element_bitsize;
1110 tree bitpos = size_int (bit_offset);
1112 epilog_stmt = build2 (MODIFY_EXPR, vectype, vec_dest,
1113 build2 (shift_code, vectype,
1115 new_name = make_ssa_name (vec_dest, epilog_stmt);
1116 TREE_OPERAND (epilog_stmt, 0) = new_name;
1117 bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT);
1119 epilog_stmt = build2 (MODIFY_EXPR, vectype, vec_dest,
1120 build2 (code, vectype,
1121 new_name, new_temp));
1122 new_temp = make_ssa_name (vec_dest, epilog_stmt);
1123 TREE_OPERAND (epilog_stmt, 0) = new_temp;
1124 bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT);
1127 extract_scalar_result = true;
1133 /*** Case 3: Create:
1134 s = extract_field <v_out2, 0>
1135 for (offset = element_size;
1136 offset < vector_size;
1137 offset += element_size;)
1139 Create: s' = extract_field <v_out2, offset>
1140 Create: s = op <s, s'>
1143 if (vect_print_dump_info (REPORT_DETAILS))
1144 fprintf (vect_dump, "Reduce using scalar code. ");
1146 vec_temp = PHI_RESULT (new_phi);
1147 vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
1148 rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
1150 BIT_FIELD_REF_UNSIGNED (rhs) = TYPE_UNSIGNED (scalar_type);
1151 epilog_stmt = build2 (MODIFY_EXPR, scalar_type, new_scalar_dest, rhs);
1152 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
1153 TREE_OPERAND (epilog_stmt, 0) = new_temp;
1154 bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT);
1156 for (bit_offset = element_bitsize;
1157 bit_offset < vec_size_in_bits;
1158 bit_offset += element_bitsize)
1160 tree bitpos = bitsize_int (bit_offset);
1161 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
1164 BIT_FIELD_REF_UNSIGNED (rhs) = TYPE_UNSIGNED (scalar_type);
1165 epilog_stmt = build2 (MODIFY_EXPR, scalar_type, new_scalar_dest,
1167 new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
1168 TREE_OPERAND (epilog_stmt, 0) = new_name;
1169 bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT);
1171 epilog_stmt = build2 (MODIFY_EXPR, scalar_type, new_scalar_dest,
1172 build2 (code, scalar_type, new_name, new_temp));
1173 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
1174 TREE_OPERAND (epilog_stmt, 0) = new_temp;
1175 bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT);
1178 extract_scalar_result = false;
1182 /* 2.4 Extract the final scalar result. Create:
1183 s_out3 = extract_field <v_out2, bitpos> */
1185 if (extract_scalar_result)
1189 if (vect_print_dump_info (REPORT_DETAILS))
1190 fprintf (vect_dump, "extract scalar result");
1192 if (BYTES_BIG_ENDIAN)
1193 bitpos = size_binop (MULT_EXPR,
1194 bitsize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1),
1195 TYPE_SIZE (scalar_type));
1197 bitpos = bitsize_zero_node;
1199 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp, bitsize, bitpos);
1200 BIT_FIELD_REF_UNSIGNED (rhs) = TYPE_UNSIGNED (scalar_type);
1201 epilog_stmt = build2 (MODIFY_EXPR, scalar_type, new_scalar_dest, rhs);
1202 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
1203 TREE_OPERAND (epilog_stmt, 0) = new_temp;
1204 bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT);
1207 /* 2.4 Adjust the final result by the initial value of the reduction
1208 variable. (When such adjustment is not needed, then
1209 'scalar_initial_def' is zero).
1212 s_out4 = scalar_expr <s_out3, scalar_initial_def> */
1214 if (scalar_initial_def)
1216 epilog_stmt = build2 (MODIFY_EXPR, scalar_type, new_scalar_dest,
1217 build2 (code, scalar_type, new_temp, scalar_initial_def));
1218 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
1219 TREE_OPERAND (epilog_stmt, 0) = new_temp;
1220 bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT);
1223 /* 2.6 Replace uses of s_out0 with uses of s_out3 */
1225 /* Find the loop-closed-use at the loop exit of the original scalar result.
1226 (The reduction result is expected to have two immediate uses - one at the
1227 latch block, and one at the loop exit). */
1229 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
1231 if (!flow_bb_inside_loop_p (loop, bb_for_stmt (USE_STMT (use_p))))
1233 exit_phi = USE_STMT (use_p);
1237 /* We expect to have found an exit_phi because of loop-closed-ssa form. */
1238 gcc_assert (exit_phi);
1239 /* Replace the uses: */
1240 orig_name = PHI_RESULT (exit_phi);
1241 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
1242 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
1243 SET_USE (use_p, new_temp);
1247 /* Function vectorizable_reduction.
1249 Check if STMT performs a reduction operation that can be vectorized.
1250 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
1251 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
1252 Return FALSE if not a vectorizable STMT, TRUE otherwise.
1254 This function also handles reduction idioms (patterns) that have been
1255 recognized in advance during vect_pattern_recog. In this case, STMT may be
1257 X = pattern_expr (arg0, arg1, ..., X)
1258 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
1259 sequence that had been detected and replaced by the pattern-stmt (STMT).
1261 In some cases of reduction patterns, the type of the reduction variable X is
1262 different than the type of the other arguments of STMT.
1263 In such cases, the vectype that is used when transforming STMT into a vector
1264 stmt is different than the vectype that is used to determine the
1265 vectorization factor, because it consists of a different number of elements
1266 than the actual number of elements that are being operated upon in parallel.
1268 For example, consider an accumulation of shorts into an int accumulator.
1269 On some targets it's possible to vectorize this pattern operating on 8
1270 shorts at a time (hence, the vectype for purposes of determining the
1271 vectorization factor should be V8HI); on the other hand, the vectype that
1272 is used to create the vector form is actually V4SI (the type of the result).
1274 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
1275 indicates what is the actual level of parallelism (V8HI in the example), so
1276 that the right vectorization factor would be derived. This vectype
1277 corresponds to the type of arguments to the reduction stmt, and should *NOT*
1278 be used to create the vectorized stmt. The right vectype for the vectorized
1279 stmt is obtained from the type of the result X:
1280 get_vectype_for_scalar_type (TREE_TYPE (X))
1282 This means that, contrary to "regular" reductions (or "regular" stmts in
1283 general), the following equation:
1284 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
1285 does *NOT* necessarily hold for reduction patterns. */
1288 vectorizable_reduction (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
1293 tree loop_vec_def0 = NULL_TREE, loop_vec_def1 = NULL_TREE;
1294 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1295 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1296 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1297 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1299 enum tree_code code, orig_code, epilog_reduc_code = 0;
1300 enum machine_mode vec_mode;
1302 optab optab, reduc_optab;
1303 tree new_temp = NULL_TREE;
1305 enum vect_def_type dt;
1310 stmt_vec_info orig_stmt_info;
1311 tree expr = NULL_TREE;
/* ncopies: how many vector stmts are needed to cover one vectorization
   factor's worth of scalar iterations (VF / nunits).  */
1313 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
1314 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
1315 stmt_vec_info prev_stmt_info;
1317 tree new_stmt = NULL_TREE;
1320 gcc_assert (ncopies >= 1);
1322 /* 1. Is vectorizable reduction? */
1324 /* Not supportable if the reduction variable is used in the loop. */
1325 if (STMT_VINFO_RELEVANT_P (stmt_info))
1328 if (!STMT_VINFO_LIVE_P (stmt_info))
1331 /* Make sure it was already recognized as a reduction computation. */
1332 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def)
1335 /* 2. Has this been recognized as a reduction pattern?
1337 Check if STMT represents a pattern that has been recognized
1338 in earlier analysis stages. For stmts that represent a pattern,
1339 the STMT_VINFO_RELATED_STMT field records the last stmt in
1340 the original sequence that constitutes the pattern. */
1342 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
/* Sanity checks for the pattern case: the pattern-stmt and its original
   stmt must point at each other, and only the original is marked as
   being "in pattern".  */
1345 orig_stmt_info = vinfo_for_stmt (orig_stmt);
1346 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt);
1347 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
1348 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
1351 /* 3. Check the operands of the operation. The first operands are defined
1352 inside the loop body. The last operand is the reduction variable,
1353 which is defined by the loop-header-phi. */
1355 gcc_assert (TREE_CODE (stmt) == MODIFY_EXPR);
1357 operation = TREE_OPERAND (stmt, 1);
1358 code = TREE_CODE (operation);
1359 op_type = TREE_CODE_LENGTH (code);
1360 if (op_type != binary_op && op_type != ternary_op)
1362 scalar_dest = TREE_OPERAND (stmt, 0);
1363 scalar_type = TREE_TYPE (scalar_dest);
1365 /* All uses but the last are expected to be defined in the loop.
1366 The last use is the reduction variable. */
1367 for (i = 0; i < op_type-1; i++)
1369 op = TREE_OPERAND (operation, i);
1370 is_simple_use = vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt);
1371 gcc_assert (is_simple_use);
1372 gcc_assert (dt == vect_loop_def || dt == vect_invariant_def ||
1373 dt == vect_constant_def);
/* The last operand: must be the reduction variable, defined by the
   loop-header phi.  */
1376 op = TREE_OPERAND (operation, i);
1377 is_simple_use = vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt);
1378 gcc_assert (is_simple_use);
1379 gcc_assert (dt == vect_reduction_def);
1380 gcc_assert (TREE_CODE (def_stmt) == PHI_NODE);
/* NOTE(review): exactly one of the two asserts below applies, presumably
   selected by whether STMT is a pattern stmt (orig_stmt set) or a regular
   reduction — confirm against the full source.  */
1382 gcc_assert (orig_stmt == vect_is_simple_reduction (loop, def_stmt));
1384 gcc_assert (stmt == vect_is_simple_reduction (loop, def_stmt));
1386 if (STMT_VINFO_LIVE_P (vinfo_for_stmt (def_stmt)))
1389 /* 4. Supportable by target? */
1391 /* 4.1. check support for the operation in the loop */
1392 optab = optab_for_tree_code (code, vectype);
1395 if (vect_print_dump_info (REPORT_DETAILS))
1396 fprintf (vect_dump, "no optab.");
1399 vec_mode = TYPE_MODE (vectype);
1400 if (optab->handlers[(int) vec_mode].insn_code == CODE_FOR_nothing)
1402 if (vect_print_dump_info (REPORT_DETAILS))
1403 fprintf (vect_dump, "op not supported by target.");
/* No native vector support: allow word-mode emulation only when the
   vector fits in a machine word and the vectorization factor makes
   the emulation worthwhile.  */
1404 if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
1405 || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
1406 < vect_min_worthwhile_factor (code))
1408 if (vect_print_dump_info (REPORT_DETAILS))
1409 fprintf (vect_dump, "proceeding using word mode.");
1412 /* Worthwhile without SIMD support? */
1413 if (!VECTOR_MODE_P (TYPE_MODE (vectype))
1414 && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
1415 < vect_min_worthwhile_factor (code))
1417 if (vect_print_dump_info (REPORT_DETAILS))
1418 fprintf (vect_dump, "not worthwhile without SIMD support.");
1422 /* 4.2. Check support for the epilog operation.
1424 If STMT represents a reduction pattern, then the type of the
1425 reduction variable may be different than the type of the rest
1426 of the arguments. For example, consider the case of accumulation
1427 of shorts into an int accumulator; The original code:
1428 S1: int_a = (int) short_a;
1429 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
1432 STMT: int_acc = widen_sum <short_a, int_acc>
1435 1. The tree-code that is used to create the vector operation in the
1436 epilog code (that reduces the partial results) is not the
1437 tree-code of STMT, but is rather the tree-code of the original
1438 stmt from the pattern that STMT is replacing. I.e, in the example
1439 above we want to use 'widen_sum' in the loop, but 'plus' in the
1441 2. The type (mode) we use to check available target support
1442 for the vector operation to be created in the *epilog*, is
1443 determined by the type of the reduction variable (in the example
1444 above we'd check this: plus_optab[vect_int_mode]).
1445 However the type (mode) we use to check available target support
1446 for the vector operation to be created *inside the loop*, is
1447 determined by the type of the other arguments to STMT (in the
1448 example we'd check this: widen_sum_optab[vect_short_mode]).
1450 This is contrary to "regular" reductions, in which the types of all
1451 the arguments are the same as the type of the reduction variable.
1452 For "regular" reductions we can therefore use the same vector type
1453 (and also the same tree-code) when generating the epilog code and
1454 when generating the code inside the loop. */
1458 /* This is a reduction pattern: get the vectype from the type of the
1459 reduction variable, and get the tree-code from orig_stmt. */
1460 orig_code = TREE_CODE (TREE_OPERAND (orig_stmt, 1));
1461 vectype = get_vectype_for_scalar_type (TREE_TYPE (def));
1462 vec_mode = TYPE_MODE (vectype);
1466 /* Regular reduction: use the same vectype and tree-code as used for
1467 the vector code inside the loop can be used for the epilog code. */
1471 if (!reduction_code_for_scalar_code (orig_code, &epilog_reduc_code))
1473 reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype);
1476 if (vect_print_dump_info (REPORT_DETAILS))
1477 fprintf (vect_dump, "no optab for reduction.");
/* NUM_TREE_CODES is used as a sentinel meaning "no direct vector
   reduction available": vect_create_epilog_for_reduction then falls
   back to reducing via vector shifts or scalar extraction.  */
1478 epilog_reduc_code = NUM_TREE_CODES;
1480 if (reduc_optab->handlers[(int) vec_mode].insn_code == CODE_FOR_nothing)
1482 if (vect_print_dump_info (REPORT_DETAILS))
1483 fprintf (vect_dump, "reduc op not supported by target.");
1484 epilog_reduc_code = NUM_TREE_CODES;
/* Analysis phase only: record the stmt kind and report success without
   generating code.  */
1487 if (!vec_stmt) /* transformation not required. */
1489 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
1495 if (vect_print_dump_info (REPORT_DETAILS))
1496 fprintf (vect_dump, "transform reduction.");
1498 /* Create the destination vector */
1499 vec_dest = vect_create_destination_var (scalar_dest, vectype);
1501 /* Create the reduction-phi that defines the reduction-operand. */
1502 new_phi = create_phi_node (vec_dest, loop->header);
1504 /* In case the vectorization factor (VF) is bigger than the number
1505 of elements that we can fit in a vectype (nunits), we have to generate
1506 more than one vector stmt - i.e - we need to "unroll" the
1507 vector stmt by a factor VF/nunits. For more details see documentation
1508 in vectorizable_operation. */
1510 prev_stmt_info = NULL;
1511 for (j = 0; j < ncopies; j++)
/* First copy (j == 0): fetch the vector defs of the loop operands.  */
1516 op = TREE_OPERAND (operation, 0);
1517 loop_vec_def0 = vect_get_vec_def_for_operand (op, stmt, NULL);
1518 if (op_type == ternary_op)
1520 op = TREE_OPERAND (operation, 1);
1521 loop_vec_def1 = vect_get_vec_def_for_operand (op, stmt, NULL);
1524 /* Get the vector def for the reduction variable from the phi node */
1525 reduc_def = PHI_RESULT (new_phi);
/* Subsequent copies (j > 0): advance the operand defs via the
   stmt-copy chain recorded during the previous iteration.  */
1529 enum vect_def_type dt = vect_unknown_def_type; /* Dummy */
1530 loop_vec_def0 = vect_get_vec_def_for_stmt_copy (dt, loop_vec_def0);
1531 if (op_type == ternary_op)
1532 loop_vec_def1 = vect_get_vec_def_for_stmt_copy (dt, loop_vec_def1);
1534 /* Get the vector def for the reduction variable from the vectorized
1535 reduction operation generated in the previous iteration (j-1) */
1536 reduc_def = TREE_OPERAND (new_stmt ,0);
1539 /* Arguments are ready. create the new vector stmt. */
1541 if (op_type == binary_op)
1542 expr = build2 (code, vectype, loop_vec_def0, reduc_def);
1544 expr = build3 (code, vectype, loop_vec_def0, loop_vec_def1,
1546 new_stmt = build2 (MODIFY_EXPR, void_type_node, vec_dest, expr);
1547 new_temp = make_ssa_name (vec_dest, new_stmt);
1548 TREE_OPERAND (new_stmt, 0) = new_temp;
1549 vect_finish_stmt_generation (stmt, new_stmt, bsi);
/* Chain the copies together through STMT_VINFO_RELATED_STMT so later
   stmts can locate the def produced by each copy.  */
1552 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
1554 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
1555 prev_stmt_info = vinfo_for_stmt (new_stmt);
1558 /* Finalize the reduction-phi (set its arguments) and create the
1559 epilog reduction code. */
1560 vect_create_epilog_for_reduction (new_temp, stmt, epilog_reduc_code, new_phi);
1565 /* Function vectorizable_assignment.
1567 Check if STMT performs an assignment (copy) that can be vectorized.
1568 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
1569 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
1570 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
1573 vectorizable_assignment (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
1579 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1580 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1581 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1584 enum vect_def_type dt;
1585 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
1586 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
1588 gcc_assert (ncopies >= 1);
/* FORNOW: vectorizing assignments is disabled; this early return makes
   everything below currently unreachable.  */
1590 return false; /* FORNOW */
1592 /* Is vectorizable assignment? */
1593 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1596 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_loop_def);
1598 if (TREE_CODE (stmt) != MODIFY_EXPR)
1601 scalar_dest = TREE_OPERAND (stmt, 0);
1602 if (TREE_CODE (scalar_dest) != SSA_NAME)
/* The copied operand must have a def the vectorizer understands
   (loop def, invariant, or constant).  */
1605 op = TREE_OPERAND (stmt, 1);
1606 if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt))
1608 if (vect_print_dump_info (REPORT_DETAILS))
1609 fprintf (vect_dump, "use not simple.");
/* Analysis phase only: record the stmt kind and succeed.  */
1613 if (!vec_stmt) /* transformation not required. */
1615 STMT_VINFO_TYPE (stmt_info) = assignment_vec_info_type;
1620 if (vect_print_dump_info (REPORT_DETAILS))
1621 fprintf (vect_dump, "transform assignment.")
1624 vec_dest = vect_create_destination_var (scalar_dest, vectype);
1627 op = TREE_OPERAND (stmt, 1);
1628 vec_oprnd = vect_get_vec_def_for_operand (op, stmt, NULL);
1630 /* Arguments are ready. create the new vector stmt. */
1631 *vec_stmt = build2 (MODIFY_EXPR, vectype, vec_dest, vec_oprnd);
1632 new_temp = make_ssa_name (vec_dest, *vec_stmt);
1633 TREE_OPERAND (*vec_stmt, 0) = new_temp;
1634 vect_finish_stmt_generation (stmt, *vec_stmt, bsi);
1640 /* Function vect_min_worthwhile_factor.
1642 For a loop where we could vectorize the operation indicated by CODE,
1643 return the minimum vectorization factor that makes it worthwhile
1644 to use generic vectors. */
/* Callers (vectorizable_reduction, vectorizable_operation) compare the
   loop's vectorization factor against this threshold before falling back
   to word-mode emulation of an unsupported vector operation.  */
1646 vect_min_worthwhile_factor (enum tree_code code)
1667 /* Function vectorizable_operation.
1669 Check if STMT performs a binary or unary operation that can be vectorized.
1670 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
1671 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
1672 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
1675 vectorizable_operation (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
1680 tree op0, op1 = NULL;
1681 tree vec_oprnd0 = NULL_TREE, vec_oprnd1 = NULL_TREE;
1682 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1683 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1684 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1685 enum tree_code code;
1686 enum machine_mode vec_mode;
1691 enum machine_mode optab_op2_mode;
/* dt0/dt1: def kinds of the two operands, recorded here during analysis
   and later used when generating the j > 0 copies.  */
1693 enum vect_def_type dt0, dt1;
1695 stmt_vec_info prev_stmt_info;
1696 int nunits_in = TYPE_VECTOR_SUBPARTS (vectype);
1699 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in;
1702 gcc_assert (ncopies >= 1);
1704 /* Is STMT a vectorizable binary/unary operation? */
1705 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1708 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_loop_def);
1710 if (STMT_VINFO_LIVE_P (stmt_info))
1712 /* FORNOW: not yet supported. */
1713 if (vect_print_dump_info (REPORT_DETAILS))
1714 fprintf (vect_dump, "value used after loop.");
1718 if (TREE_CODE (stmt) != MODIFY_EXPR)
1721 if (TREE_CODE (TREE_OPERAND (stmt, 0)) != SSA_NAME)
/* Require the same number of vector elements on input and output —
   operations that change the element count are handled by the type
   promotion/demotion routines instead.  */
1724 scalar_dest = TREE_OPERAND (stmt, 0);
1725 vectype_out = get_vectype_for_scalar_type (TREE_TYPE (scalar_dest));
1726 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
1727 if (nunits_out != nunits_in)
1730 operation = TREE_OPERAND (stmt, 1);
1731 code = TREE_CODE (operation);
1732 optab = optab_for_tree_code (code, vectype);
1734 /* Support only unary or binary operations. */
1735 op_type = TREE_CODE_LENGTH (code);
1736 if (op_type != unary_op && op_type != binary_op)
1738 if (vect_print_dump_info (REPORT_DETAILS))
1739 fprintf (vect_dump, "num. args = %d (not unary/binary op).", op_type);
1743 op0 = TREE_OPERAND (operation, 0);
1744 if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt0))
1746 if (vect_print_dump_info (REPORT_DETAILS))
1747 fprintf (vect_dump, "use not simple.");
1751 if (op_type == binary_op)
1753 op1 = TREE_OPERAND (operation, 1);
1754 if (!vect_is_simple_use (op1, loop_vinfo, &def_stmt, &def, &dt1))
1756 if (vect_print_dump_info (REPORT_DETAILS))
1757 fprintf (vect_dump, "use not simple.");
1762 /* Supportable by target? */
1765 if (vect_print_dump_info (REPORT_DETAILS))
1766 fprintf (vect_dump, "no optab.");
1769 vec_mode = TYPE_MODE (vectype);
1770 icode = (int) optab->handlers[(int) vec_mode].insn_code;
1771 if (icode == CODE_FOR_nothing)
1773 if (vect_print_dump_info (REPORT_DETAILS))
1774 fprintf (vect_dump, "op not supported by target.");
/* No native support: allow word-mode emulation only if the vector fits
   in a machine word and the VF makes the emulation worthwhile.  */
1775 if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
1776 || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
1777 < vect_min_worthwhile_factor (code))
1779 if (vect_print_dump_info (REPORT_DETAILS))
1780 fprintf (vect_dump, "proceeding using word mode.");
1783 /* Worthwhile without SIMD support? */
1784 if (!VECTOR_MODE_P (TYPE_MODE (vectype))
1785 && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
1786 < vect_min_worthwhile_factor (code))
1788 if (vect_print_dump_info (REPORT_DETAILS))
1789 fprintf (vect_dump, "not worthwhile without SIMD support.");
1793 if (code == LSHIFT_EXPR || code == RSHIFT_EXPR)
1795 /* FORNOW: not yet supported. */
1796 if (!VECTOR_MODE_P (vec_mode))
1799 /* Invariant argument is needed for a vector shift
1800 by a scalar shift operand. */
1801 optab_op2_mode = insn_data[icode].operand[2].mode;
1802 if (! (VECTOR_MODE_P (optab_op2_mode)
1803 || dt1 == vect_constant_def
1804 || dt1 == vect_invariant_def))
1806 if (vect_print_dump_info (REPORT_DETAILS))
1807 fprintf (vect_dump, "operand mode requires invariant argument.");
/* Analysis phase only: record the stmt kind and succeed.  */
1812 if (!vec_stmt) /* transformation not required. */
1814 STMT_VINFO_TYPE (stmt_info) = op_vec_info_type;
1820 if (vect_print_dump_info (REPORT_DETAILS))
1821 fprintf (vect_dump, "transform binary/unary operation.");
1824 vec_dest = vect_create_destination_var (scalar_dest, vectype);
1826 /* In case the vectorization factor (VF) is bigger than the number
1827 of elements that we can fit in a vectype (nunits), we have to generate
1828 more than one vector stmt - i.e - we need to "unroll" the
1829 vector stmt by a factor VF/nunits. In doing so, we record a pointer
1830 from one copy of the vector stmt to the next, in the field
1831 STMT_VINFO_RELATED_STMT. This is necessary in order to allow following
1832 stages to find the correct vector defs to be used when vectorizing
1833 stmts that use the defs of the current stmt. The example below illustrates
1834 the vectorization process when VF=16 and nunits=4 (i.e - we need to create
1835 4 vectorized stmts):
1837 before vectorization:
1838 RELATED_STMT VEC_STMT
1842 step 1: vectorize stmt S1 (done in vectorizable_load. See more details
1844 RELATED_STMT VEC_STMT
1845 VS1_0: vx0 = memref0 VS1_1 -
1846 VS1_1: vx1 = memref1 VS1_2 -
1847 VS1_2: vx2 = memref2 VS1_3 -
1848 VS1_3: vx3 = memref3 - -
1849 S1: x = load - VS1_0
1852 step2: vectorize stmt S2 (done here):
1853 To vectorize stmt S2 we first need to find the relevant vector
1854 def for the first operand 'x'. This is, as usual, obtained from
1855 the vector stmt recorded in the STMT_VINFO_VEC_STMT of the stmt
1856 that defines 'x' (S1). This way we find the stmt VS1_0, and the
1857 relevant vector def 'vx0'. Having found 'vx0' we can generate
1858 the vector stmt VS2_0, and as usual, record it in the
1859 STMT_VINFO_VEC_STMT of stmt S2.
1860 When creating the second copy (VS2_1), we obtain the relevant vector
1861 def from the vector stmt recorded in the STMT_VINFO_RELATED_STMT of
1862 stmt VS1_0. This way we find the stmt VS1_1 and the relevant
1863 vector def 'vx1'. Using 'vx1' we create stmt VS2_1 and record a
1864 pointer to it in the STMT_VINFO_RELATED_STMT of the vector stmt VS2_0.
1865 Similarly when creating stmts VS2_2 and VS2_3. This is the resulting
1866 chain of stmts and pointers:
1867 RELATED_STMT VEC_STMT
1868 VS1_0: vx0 = memref0 VS1_1 -
1869 VS1_1: vx1 = memref1 VS1_2 -
1870 VS1_2: vx2 = memref2 VS1_3 -
1871 VS1_3: vx3 = memref3 - -
1872 S1: x = load - VS1_0
1873 VS2_0: vz0 = vx0 + v1 VS2_1 -
1874 VS2_1: vz1 = vx1 + v1 VS2_2 -
1875 VS2_2: vz2 = vx2 + v1 VS2_3 -
1876 VS2_3: vz3 = vx3 + v1 - -
1877 S2: z = x + 1 - VS2_0 */
1879 prev_stmt_info = NULL;
1880 for (j = 0; j < ncopies; j++)
/* First copy (j == 0): obtain the vector defs of the operands.  */
1885 vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
1886 if (op_type == binary_op)
1888 if (code == LSHIFT_EXPR || code == RSHIFT_EXPR)
1890 /* Vector shl and shr insn patterns can be defined with
1891 scalar operand 2 (shift operand). In this case, use
1892 constant or loop invariant op1 directly, without
1893 extending it to vector mode first. */
1894 optab_op2_mode = insn_data[icode].operand[2].mode;
1895 if (!VECTOR_MODE_P (optab_op2_mode))
1897 if (vect_print_dump_info (REPORT_DETAILS))
1898 fprintf (vect_dump, "operand 1 using scalar mode.");
1903 vec_oprnd1 = vect_get_vec_def_for_operand (op1, stmt, NULL);
/* Subsequent copies (j > 0): advance each operand def via the
   stmt-copy chain, using the def kinds recorded during analysis.  */
1908 vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt0, vec_oprnd0);
1909 if (op_type == binary_op)
1910 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt1, vec_oprnd1);
1913 /* Arguments are ready. create the new vector stmt. */
1915 if (op_type == binary_op)
1916 new_stmt = build2 (MODIFY_EXPR, void_type_node, vec_dest,
1917 build2 (code, vectype, vec_oprnd0, vec_oprnd1));
1919 new_stmt = build2 (MODIFY_EXPR, void_type_node, vec_dest,
1920 build1 (code, vectype, vec_oprnd0));
1921 new_temp = make_ssa_name (vec_dest, new_stmt);
1922 TREE_OPERAND (new_stmt, 0) = new_temp;
1923 vect_finish_stmt_generation (stmt, new_stmt, bsi);
/* Chain the copies through STMT_VINFO_RELATED_STMT (see the comment
   block above).  */
1926 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
1928 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
1929 prev_stmt_info = vinfo_for_stmt (new_stmt);
1936 /* Function vectorizable_type_demotion
1938 Check if STMT performs a binary or unary operation that involves
1939 type demotion, and if it can be vectorized.
1940 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
1941 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
1942 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
1945 vectorizable_type_demotion (tree stmt, block_stmt_iterator *bsi,
1952 tree vec_oprnd0=NULL, vec_oprnd1=NULL;
1953 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1954 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1955 enum tree_code code;
1958 enum vect_def_type dt0;
1960 stmt_vec_info prev_stmt_info;
1970 enum machine_mode vec_mode;
1972 /* Is STMT a vectorizable type-demotion operation? */
1974 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1977 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_loop_def);
1979 if (STMT_VINFO_LIVE_P (stmt_info))
1981 /* FORNOW: not yet supported. */
1982 if (vect_print_dump_info (REPORT_DETAILS))
1983 fprintf (vect_dump, "value used after loop.");
1987 if (TREE_CODE (stmt) != MODIFY_EXPR)
1990 if (TREE_CODE (TREE_OPERAND (stmt, 0)) != SSA_NAME)
/* Only conversions (casts) are treated as demotions here.  */
1993 operation = TREE_OPERAND (stmt, 1);
1994 code = TREE_CODE (operation);
1995 if (code != NOP_EXPR && code != CONVERT_EXPR)
1998 op0 = TREE_OPERAND (operation, 0);
1999 vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op0));
2000 nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
/* FORNOW: support only a 2:1 demotion — the output vector holds twice
   as many (narrower) elements as the input vector.  */
2002 scalar_dest = TREE_OPERAND (stmt, 0);
2003 scalar_type = TREE_TYPE (scalar_dest);
2004 vectype_out = get_vectype_for_scalar_type (scalar_type);
2005 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
2006 if (nunits_in != nunits_out / 2) /* FORNOW */
2009 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_out;
2010 gcc_assert (ncopies >= 1);
2012 /* Check the operands of the operation. */
2013 if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt0))
2015 if (vect_print_dump_info (REPORT_DETAILS))
2016 fprintf (vect_dump, "use not simple.");
2020 /* Supportable by target? */
2021 code = VEC_PACK_MOD_EXPR;
2022 optab = optab_for_tree_code (VEC_PACK_MOD_EXPR, vectype_in);
2026 vec_mode = TYPE_MODE (vectype_in);
2027 if (optab->handlers[(int) vec_mode].insn_code == CODE_FOR_nothing)
/* Record the input vectype: the vectype relevant for determining the
   level of parallelism of this stmt (see vectorizable_reduction's
   header comment on pattern vectypes).  */
2030 STMT_VINFO_VECTYPE (stmt_info) = vectype_in;
2032 if (!vec_stmt) /* transformation not required. */
2034 STMT_VINFO_TYPE (stmt_info) = type_demotion_vec_info_type;
2040 if (vect_print_dump_info (REPORT_DETAILS))
2041 fprintf (vect_dump, "transform type demotion operation. ncopies = %d.",
2045 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
2047 /* In case the vectorization factor (VF) is bigger than the number
2048 of elements that we can fit in a vectype (nunits), we have to generate
2049 more than one vector stmt - i.e - we need to "unroll" the
2050 vector stmt by a factor VF/nunits. */
2051 prev_stmt_info = NULL;
2052 for (j = 0; j < ncopies; j++)
/* Each output vector packs the elements of TWO input vectors, so two
   input defs are consumed per copy.  On the first copy the second def
   is obtained as the stmt-copy of the first; afterwards both advance
   along the stmt-copy chain.  */
2057 enum vect_def_type dt = vect_unknown_def_type; /* Dummy */
2058 vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
2059 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt, vec_oprnd0);
2063 vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt0, vec_oprnd1);
2064 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt0, vec_oprnd0);
2067 /* Arguments are ready. Create the new vector stmt. */
2068 expr = build2 (code, vectype_out, vec_oprnd0, vec_oprnd1);
2069 new_stmt = build2 (MODIFY_EXPR, void_type_node, vec_dest, expr);
2070 new_temp = make_ssa_name (vec_dest, new_stmt);
2071 TREE_OPERAND (new_stmt, 0) = new_temp;
2072 vect_finish_stmt_generation (stmt, new_stmt, bsi);
2075 STMT_VINFO_VEC_STMT (stmt_info) = new_stmt;
2077 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
2079 prev_stmt_info = vinfo_for_stmt (new_stmt);
2082 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
2087 /* Function vect_gen_widened_results_half
2089 Create a vector stmt whose code, type, number of arguments, and result
2090 variable are CODE, VECTYPE, OP_TYPE, and VEC_DEST, and its arguments are
2091 VEC_OPRND0 and VEC_OPRND1. The new vector stmt is to be inserted at BSI.
2092 In the case that CODE is a CALL_EXPR, this means that a call to DECL
2093 needs to be created (DECL is a function-decl of a target-builtin).
2094 STMT is the original scalar stmt that we are vectorizing. */
2097 vect_gen_widened_results_half (enum tree_code code, tree vectype, tree decl,
2098 tree vec_oprnd0, tree vec_oprnd1, int op_type,
2099 tree vec_dest, block_stmt_iterator *bsi,
2109 /* Generate half of the widened result: */
2110 if (code == CALL_EXPR)
2112 /* Target specific support */
2113 vec_params = build_tree_list (NULL_TREE, vec_oprnd0);
2114 if (op_type == binary_op)
2115 vec_params = tree_cons (NULL_TREE, vec_oprnd1, vec_params);
2116 expr = build_function_call_expr (decl, vec_params);
2120 /* Generic support */
2121 gcc_assert (op_type == TREE_CODE_LENGTH (code));
2122 if (op_type == binary_op)
2123 expr = build2 (code, vectype, vec_oprnd0, vec_oprnd1);
2125 expr = build1 (code, vectype, vec_oprnd0);
2127 new_stmt = build2 (MODIFY_EXPR, void_type_node, vec_dest, expr);
2128 new_temp = make_ssa_name (vec_dest, new_stmt);
2129 TREE_OPERAND (new_stmt, 0) = new_temp;
2130 vect_finish_stmt_generation (stmt, new_stmt, bsi);
/* A generated call may add virtual operands that did not exist before;
   mark their symbols so the SSA updater renames them.  */
2132 if (code == CALL_EXPR)
2134 FOR_EACH_SSA_TREE_OPERAND (sym, new_stmt, iter, SSA_OP_ALL_VIRTUALS)
2136 if (TREE_CODE (sym) == SSA_NAME)
2137 sym = SSA_NAME_VAR (sym);
2138 mark_sym_for_renaming (sym);
2146 /* Function vectorizable_type_promotion
2148 Check if STMT performs a binary or unary operation that involves
2149 type promotion, and if it can be vectorized.
2150 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
2151 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
2152 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
/* NOTE(review): this listing elides some original lines (braces, guards,
   declarations); the code below is kept byte-identical.  */
2155 vectorizable_type_promotion (tree stmt, block_stmt_iterator *bsi,
2161 tree op0, op1 = NULL;
2162 tree vec_oprnd0=NULL, vec_oprnd1=NULL;
2163 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2164 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
/* NOTE(review): CODE_FOR_nothing belongs to enum insn_code, not enum
   tree_code; it is used here only as an "uninitialized" sentinel for the
   codes filled in by supportable_widening_operation — confirm.  */
2165 enum tree_code code, code1 = CODE_FOR_nothing, code2 = CODE_FOR_nothing;
2166 tree decl1 = NULL_TREE, decl2 = NULL_TREE;
2169 enum vect_def_type dt0, dt1;
2171 stmt_vec_info prev_stmt_info;
2179 /* Is STMT a vectorizable type-promotion operation? */
2181 if (!STMT_VINFO_RELEVANT_P (stmt_info))
2184 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_loop_def);
2186 if (STMT_VINFO_LIVE_P (stmt_info))
2188 /* FORNOW: not yet supported. */
2189 if (vect_print_dump_info (REPORT_DETAILS))
2190 fprintf (vect_dump, "value used after loop.");
2194 if (TREE_CODE (stmt) != MODIFY_EXPR)
2197 if (TREE_CODE (TREE_OPERAND (stmt, 0)) != SSA_NAME)
2200 operation = TREE_OPERAND (stmt, 1);
2201 code = TREE_CODE (operation);
2202 if (code != NOP_EXPR && code != WIDEN_MULT_EXPR)
2205 op0 = TREE_OPERAND (operation, 0);
2206 vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op0));
2207 nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
2208 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in;
2209 gcc_assert (ncopies >= 1);
2211 scalar_dest = TREE_OPERAND (stmt, 0);
2212 vectype_out = get_vectype_for_scalar_type (TREE_TYPE (scalar_dest));
2213 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
/* Promotion doubles the element width, so the output vector holds half
   as many elements as the input vector.  */
2214 if (nunits_out != nunits_in / 2) /* FORNOW */
2217 /* Check the operands of the operation. */
2218 if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt0))
2220 if (vect_print_dump_info (REPORT_DETAILS))
2221 fprintf (vect_dump, "use not simple.");
2225 op_type = TREE_CODE_LENGTH (code);
2226 if (op_type == binary_op)
2228 op1 = TREE_OPERAND (operation, 1);
2229 if (!vect_is_simple_use (op1, loop_vinfo, &def_stmt, &def, &dt1))
2231 if (vect_print_dump_info (REPORT_DETAILS))
2232 fprintf (vect_dump, "use not simple.");
2237 /* Supportable by target? */
2238 if (!supportable_widening_operation (code, stmt, vectype_in,
2239 &decl1, &decl2, &code1, &code2))
2242 STMT_VINFO_VECTYPE (stmt_info) = vectype_in;
2244 if (!vec_stmt) /* transformation not required. */
2246 STMT_VINFO_TYPE (stmt_info) = type_promotion_vec_info_type;
2252 if (vect_print_dump_info (REPORT_DETAILS))
2253 fprintf (vect_dump, "transform type promotion operation. ncopies = %d.",
2257 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
2259 /* In case the vectorization factor (VF) is bigger than the number
2260 of elements that we can fit in a vectype (nunits), we have to generate
2261 more than one vector stmt - i.e - we need to "unroll" the
2262 vector stmt by a factor VF/nunits. */
2264 prev_stmt_info = NULL;
2265 for (j = 0; j < ncopies; j++)
2270 vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
2271 if (op_type == binary_op)
2272 vec_oprnd1 = vect_get_vec_def_for_operand (op1, stmt, NULL);
2276 vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt0, vec_oprnd0);
2277 if (op_type == binary_op)
2278 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt1, vec_oprnd1);
2281 /* Arguments are ready. Create the new vector stmt. We are creating
2282 two vector defs because the widened result does not fit in one vector.
2283 The vectorized stmt can be expressed as a call to a target builtin,
2284 or by using a tree-code. */
2285 /* Generate first half of the widened result: */
2286 new_stmt = vect_gen_widened_results_half (code1, vectype_out, decl1,
2287 vec_oprnd0, vec_oprnd1, op_type, vec_dest, bsi, stmt);
2289 STMT_VINFO_VEC_STMT (stmt_info) = new_stmt;
2291 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
2292 prev_stmt_info = vinfo_for_stmt (new_stmt);
2294 /* Generate second half of the widened result: */
2295 new_stmt = vect_gen_widened_results_half (code2, vectype_out, decl2,
2296 vec_oprnd0, vec_oprnd1, op_type, vec_dest, bsi, stmt);
2297 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
2298 prev_stmt_info = vinfo_for_stmt (new_stmt);
2302 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
2307 /* Function vect_strided_store_supported.
2309 Returns TRUE if INTERLEAVE_HIGH and INTERLEAVE_LOW operations are supported,
2310 and FALSE otherwise. */
/* NOTE(review): this listing elides some original lines (braces, returns);
   the code below is kept byte-identical.  */
2313 vect_strided_store_supported (tree vectype)
2315 optab interleave_high_optab, interleave_low_optab;
2318 mode = (int) TYPE_MODE (vectype);
2320 /* Check that the operation is supported. */
2321 interleave_high_optab = optab_for_tree_code (VEC_INTERLEAVE_HIGH_EXPR,
2323 interleave_low_optab = optab_for_tree_code (VEC_INTERLEAVE_LOW_EXPR,
2325 if (!interleave_high_optab || !interleave_low_optab)
2327 if (vect_print_dump_info (REPORT_DETAILS))
2328 fprintf (vect_dump, "no optab for interleave.");
/* NOTE(review): the "== CODE_FOR_nothing" comparison for the high-optab
   test appears to sit on an elided line (original line 2333); as shown the
   first condition looks truncated — verify against the full source.  */
2332 if (interleave_high_optab->handlers[(int) mode].insn_code
2334 || interleave_low_optab->handlers[(int) mode].insn_code
2335 == CODE_FOR_nothing)
2337 if (vect_print_dump_info (REPORT_DETAILS))
2338 fprintf (vect_dump, "interleave op not supported by target.");
2345 /* Function vect_permute_store_chain.
2347 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
2348 a power of 2, generate interleave_high/low stmts to reorder the data
2349 correctly for the stores. Return the final references for stores in
2352 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
2353 The input is 4 vectors each containing 8 elements. We assign a number to each
2354 element, the input sequence is:
2356 1st vec: 0 1 2 3 4 5 6 7
2357 2nd vec: 8 9 10 11 12 13 14 15
2358 3rd vec: 16 17 18 19 20 21 22 23
2359 4th vec: 24 25 26 27 28 29 30 31
2361 The output sequence should be:
2363 1st vec: 0 8 16 24 1 9 17 25
2364 2nd vec: 2 10 18 26 3 11 19 27
2365 3rd vec: 4 12 20 28 5 13 21 29
2366 4th vec: 6 14 22 30 7 15 23 31
2368 i.e., we interleave the contents of the four vectors in their order.
2370 We use interleave_high/low instructions to create such output. The input of
2371 each interleave_high/low operation is two vectors:
2374 the even elements of the result vector are obtained left-to-right from the
2375 high/low elements of the first vector. The odd elements of the result are
2376 obtained left-to-right from the high/low elements of the second vector.
2377 The output of interleave_high will be: 0 4 1 5
2378 and of interleave_low: 2 6 3 7
2381 The permutation is done in log LENGTH stages. In each stage interleave_high
2382 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
2383 where the first argument is taken from the first half of DR_CHAIN and the
2384 second argument from its second half.
2387 I1: interleave_high (1st vec, 3rd vec)
2388 I2: interleave_low (1st vec, 3rd vec)
2389 I3: interleave_high (2nd vec, 4th vec)
2390 I4: interleave_low (2nd vec, 4th vec)
2392 The output for the first stage is:
2394 I1: 0 16 1 17 2 18 3 19
2395 I2: 4 20 5 21 6 22 7 23
2396 I3: 8 24 9 25 10 26 11 27
2397 I4: 12 28 13 29 14 30 15 31
2399 The output of the second stage, i.e. the final result is:
2401 I1: 0 8 16 24 1 9 17 25
2402 I2: 2 10 18 26 3 11 19 27
2403 I3: 4 12 20 28 5 13 21 29
2404 I4: 6 14 22 30 7 15 23 31. */
/* NOTE(review): this listing elides some original lines (braces, second
   operand of each build2 continuation); code kept byte-identical.  */
2407 vect_permute_store_chain (VEC(tree,heap) *dr_chain,
2408 unsigned int length,
2410 block_stmt_iterator *bsi,
2411 VEC(tree,heap) **result_chain)
2413 tree perm_dest, perm_stmt, vect1, vect2, high, low;
2414 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
2418 VEC(tree,heap) *first, *second;
2420 scalar_dest = TREE_OPERAND (stmt, 0);
2421 first = VEC_alloc (tree, heap, length/2);
2422 second = VEC_alloc (tree, heap, length/2);
2424 /* Check that the operation is supported. */
2425 if (!vect_strided_store_supported (vectype))
2428 *result_chain = VEC_copy (tree, heap, dr_chain);
/* log2(LENGTH) stages; each stage interleaves pairs taken from the two
   halves of the current chain.  */
2430 for (i = 0; i < exact_log2 (length); i++)
2432 for (j = 0; j < length/2; j++)
2434 vect1 = VEC_index (tree, dr_chain, j);
2435 vect2 = VEC_index (tree, dr_chain, j+length/2);
2437 /* high = interleave_high (vect1, vect2); */
2438 perm_dest = create_tmp_var (vectype, "vect_inter_high");
2439 add_referenced_var (perm_dest);
2440 perm_stmt = build2 (MODIFY_EXPR, vectype, perm_dest,
2441 build2 (VEC_INTERLEAVE_HIGH_EXPR, vectype, vect1,
2443 high = make_ssa_name (perm_dest, perm_stmt);
2444 TREE_OPERAND (perm_stmt, 0) = high;
2445 vect_finish_stmt_generation (stmt, perm_stmt, bsi);
2446 VEC_replace (tree, *result_chain, 2*j, high);
2448 /* low = interleave_low (vect1, vect2); */
2449 perm_dest = create_tmp_var (vectype, "vect_inter_low");
2450 add_referenced_var (perm_dest);
2451 perm_stmt = build2 (MODIFY_EXPR, vectype, perm_dest,
2452 build2 (VEC_INTERLEAVE_LOW_EXPR, vectype, vect1,
2454 low = make_ssa_name (perm_dest, perm_stmt);
2455 TREE_OPERAND (perm_stmt, 0) = low;
2456 vect_finish_stmt_generation (stmt, perm_stmt, bsi);
2457 VEC_replace (tree, *result_chain, 2*j+1, low);
/* Feed this stage's output into the next stage.  */
2459 dr_chain = VEC_copy (tree, heap, *result_chain);
2465 /* Function vectorizable_store.
2467 Check if STMT defines a non-scalar data-ref (array/pointer/structure) that
2469 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
2470 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
2471 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
/* NOTE(review): this listing elides some original lines (braces, guards,
   early returns, several declarations); code kept byte-identical.  */
2474 vectorizable_store (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
2479 tree vec_oprnd = NULL_TREE;
2480 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2481 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info), *first_dr = NULL;
2482 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2483 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2484 enum machine_mode vec_mode;
2486 enum dr_alignment_support alignment_support_cheme;
2488 def_operand_p def_p;
2490 enum vect_def_type dt;
2491 stmt_vec_info prev_stmt_info = NULL;
2492 tree dataref_ptr = NULL_TREE;
2493 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
2494 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
2496 tree next_stmt, first_stmt;
2497 bool strided_store = false;
2498 unsigned int group_size, i;
2499 VEC(tree,heap) *dr_chain = NULL, *oprnds = NULL, *result_chain = NULL;
2500 gcc_assert (ncopies >= 1);
2502 /* Is vectorizable store? */
2504 if (TREE_CODE (stmt) != MODIFY_EXPR)
2507 scalar_dest = TREE_OPERAND (stmt, 0);
2508 if (TREE_CODE (scalar_dest) != ARRAY_REF
2509 && TREE_CODE (scalar_dest) != INDIRECT_REF
2510 && !DR_GROUP_FIRST_DR (stmt_info))
2513 op = TREE_OPERAND (stmt, 1);
2514 if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt))
2516 if (vect_print_dump_info (REPORT_DETAILS))
2517 fprintf (vect_dump, "use not simple.");
2521 vec_mode = TYPE_MODE (vectype);
2522 /* FORNOW. In some cases can vectorize even if data-type not supported
2523 (e.g. - array initialization with 0). */
2524 if (mov_optab->handlers[(int)vec_mode].insn_code == CODE_FOR_nothing)
2527 if (!STMT_VINFO_DATA_REF (stmt_info))
2530 if (DR_GROUP_FIRST_DR (stmt_info))
2532 strided_store = true;
2533 if (!vect_strided_store_supported (vectype))
2537 if (!vec_stmt) /* transformation not required. */
2539 STMT_VINFO_TYPE (stmt_info) = store_vec_info_type;
2545 if (vect_print_dump_info (REPORT_DETAILS))
2546 fprintf (vect_dump, "transform store. ncopies = %d",ncopies);
2550 first_stmt = DR_GROUP_FIRST_DR (stmt_info);
2551 first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2552 group_size = DR_GROUP_SIZE (vinfo_for_stmt (first_stmt));
2554 DR_GROUP_STORE_COUNT (vinfo_for_stmt (first_stmt))++;
2556 /* We vectorize all the stmts of the interleaving group when we
2557 reach the last stmt in the group. */
2558 if (DR_GROUP_STORE_COUNT (vinfo_for_stmt (first_stmt))
2559 < DR_GROUP_SIZE (vinfo_for_stmt (first_stmt)))
2561 *vec_stmt = NULL_TREE;
2572 dr_chain = VEC_alloc (tree, heap, group_size);
2573 oprnds = VEC_alloc (tree, heap, group_size);
2575 alignment_support_cheme = vect_supportable_dr_alignment (first_dr);
2576 gcc_assert (alignment_support_cheme);
2577 gcc_assert (alignment_support_cheme == dr_aligned); /* FORNOW */
2579 /* In case the vectorization factor (VF) is bigger than the number
2580 of elements that we can fit in a vectype (nunits), we have to generate
2581 more than one vector stmt - i.e - we need to "unroll" the
2582 vector stmt by a factor VF/nunits. For more details see documentation in
2583 vect_get_vec_def_for_copy_stmt. */
2585 /* In case of interleaving (non-unit strided access):
2592 We create vectorized stores starting from base address (the access of the
2593 first stmt in the chain (S2 in the above example), when the last store stmt
2594 of the chain (S4) is reached:
2597 VS2: &base + vec_size*1 = vx0
2598 VS3: &base + vec_size*2 = vx1
2599 VS4: &base + vec_size*3 = vx3
2601 Then permutation statements are generated:
2603 VS5: vx5 = VEC_INTERLEAVE_HIGH_EXPR < vx0, vx3 >
2604 VS6: vx6 = VEC_INTERLEAVE_LOW_EXPR < vx0, vx3 >
2607 And they are put in STMT_VINFO_VEC_STMT of the corresponding scalar stmts
2608 (the order of the data-refs in the output of vect_permute_store_chain
2609 corresponds to the order of scalar stmts in the interleaving chain - see
2610 the documentation of vect_permute_store_chain()).
2612 In case of both multiple types and interleaving, above vector stores and
2613 permutation stmts are created for every copy. The result vector stmts are
2614 put in STMT_VINFO_VEC_STMT for the first copy and in the corresponding
2615 STMT_VINFO_RELATED_STMT for the next copies.
2618 prev_stmt_info = NULL;
2619 for (j = 0; j < ncopies; j++)
2626 /* For interleaved stores we collect vectorized defs for all the
2627 stores in the group in DR_CHAIN and OPRNDS. DR_CHAIN is then used
2628 as an input to vect_permute_store_chain(), and OPRNDS as an input
2629 to vect_get_vec_def_for_stmt_copy() for the next copy.
2630 If the store is not strided, GROUP_SIZE is 1, and DR_CHAIN and
2631 OPRNDS are of size 1.
2633 next_stmt = first_stmt;
2634 for (i = 0; i < group_size; i++)
2636 /* Since gaps are not supported for interleaved stores, GROUP_SIZE
2637 is the exact number of stmts in the chain. Therefore, NEXT_STMT
2638 can't be NULL_TREE. In case that there is no interleaving,
2639 GROUP_SIZE is 1, and only one iteration of the loop will be
2642 gcc_assert (next_stmt);
2643 op = TREE_OPERAND (next_stmt, 1);
2644 vec_oprnd = vect_get_vec_def_for_operand (op, next_stmt, NULL);
2645 VEC_quick_push(tree, dr_chain, vec_oprnd);
2646 VEC_quick_push(tree, oprnds, vec_oprnd);
2647 next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt));
2649 dataref_ptr = vect_create_data_ref_ptr (first_stmt, bsi, NULL_TREE,
2650 &dummy, &ptr_incr, false);
2654 /* For interleaved stores we created vectorized defs for all the
2655 defs stored in OPRNDS in the previous iteration (previous copy).
2656 DR_CHAIN is then used as an input to vect_permute_store_chain(),
2657 and OPRNDS as an input to vect_get_vec_def_for_stmt_copy() for the
2659 If the store is not strided, GROUP_SIZE is 1, and DR_CHAIN and
2660 OPRNDS are of size 1.
/* NOTE(review): the same DT (from the single vect_is_simple_use check
   above) is reused for every group member here — confirm this is
   intentional in the full source.  */
2662 for (i = 0; i < group_size; i++)
2664 vec_oprnd = vect_get_vec_def_for_stmt_copy (dt,
2665 VEC_index (tree, oprnds, i));
2666 VEC_replace(tree, dr_chain, i, vec_oprnd);
2667 VEC_replace(tree, oprnds, i, vec_oprnd);
2669 dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, bsi, stmt);
2674 result_chain = VEC_alloc (tree, heap, group_size);
2676 if (!vect_permute_store_chain (dr_chain, group_size, stmt, bsi,
2681 next_stmt = first_stmt;
2682 for (i = 0; i < group_size; i++)
2684 /* For strided stores vectorized defs are interleaved in
2685 vect_permute_store_chain(). */
2687 vec_oprnd = VEC_index(tree, result_chain, i);
2689 data_ref = build_fold_indirect_ref (dataref_ptr);
2690 /* Arguments are ready. Create the new vector stmt. */
2691 new_stmt = build2 (MODIFY_EXPR, vectype, data_ref, vec_oprnd);
2692 vect_finish_stmt_generation (stmt, new_stmt, bsi);
2694 /* Set the V_MAY_DEFS for the vector pointer. If this virtual def has a
2695 use outside the loop and a loop peel is performed then the def may be
2696 renamed by the peel. Mark it for renaming so the later use will also
2698 copy_virtual_operands (new_stmt, next_stmt);
2701 /* The original store is deleted so the same SSA_NAMEs can be used.
2703 FOR_EACH_SSA_TREE_OPERAND (def, next_stmt, iter, SSA_OP_VMAYDEF)
2705 SSA_NAME_DEF_STMT (def) = new_stmt;
2706 mark_sym_for_renaming (SSA_NAME_VAR (def));
2709 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
2713 /* Create new names for all the definitions created by COPY and
2714 add replacement mappings for each new name. */
2715 FOR_EACH_SSA_DEF_OPERAND (def_p, new_stmt, iter, SSA_OP_VMAYDEF)
2717 create_new_def_for (DEF_FROM_PTR (def_p), new_stmt, def_p);
2718 mark_sym_for_renaming (SSA_NAME_VAR (DEF_FROM_PTR (def_p)));
2721 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
2724 prev_stmt_info = vinfo_for_stmt (new_stmt);
2725 next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt));
2728 /* Bump the vector pointer. */
2729 dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, bsi, stmt);
2737 /* Function vect_setup_realignment
2739 This function is called when vectorizing an unaligned load using
2740 the dr_unaligned_software_pipeline scheme.
2741 This function generates the following code at the loop prolog:
2744 msq_init = *(floor(p)); # prolog load
2745 realignment_token = call target_builtin;
2747 msq = phi (msq_init, ---)
2749 The code above sets up a new (vector) pointer, pointing to the first
2750 location accessed by STMT, and a "floor-aligned" load using that pointer.
2751 It also generates code to compute the "realignment-token" (if the relevant
2752 target hook was defined), and creates a phi-node at the loop-header bb
2753 whose arguments are the result of the prolog-load (created by this
2754 function) and the result of a load that takes place in the loop (to be
2755 created by the caller to this function).
2756 The caller to this function uses the phi-result (msq) to create the
2757 realignment code inside the loop, and sets up the missing phi argument,
2761 msq = phi (msq_init, lsq)
2762 lsq = *(floor(p')); # load in loop
2763 result = realign_load (msq, lsq, realignment_token);
2766 STMT - (scalar) load stmt to be vectorized. This load accesses
2767 a memory location that may be unaligned.
2768 BSI - place where new code is to be inserted.
2771 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
2772 target hook, if defined.
2773 Return value - the result of the loop-header phi node. */
/* NOTE(review): this listing elides some original lines (return type,
   braces, several local declarations); code kept byte-identical.  */
2776 vect_setup_realignment (tree stmt, block_stmt_iterator *bsi,
2777 tree *realignment_token)
2779 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2780 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2781 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2782 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2783 edge pe = loop_preheader_edge (loop);
2784 tree scalar_dest = TREE_OPERAND (stmt, 0);
2797 /* 1. Create msq_init = *(floor(p1)) in the loop preheader */
2798 vec_dest = vect_create_destination_var (scalar_dest, vectype);
2799 ptr = vect_create_data_ref_ptr (stmt, bsi, NULL_TREE, &init_addr, &inc, true);
/* ALIGN_INDIRECT_REF rounds the address down to the nearest aligned
   boundary, giving the "floor-aligned" prolog load.  */
2800 data_ref = build1 (ALIGN_INDIRECT_REF, vectype, ptr);
2801 new_stmt = build2 (MODIFY_EXPR, vectype, vec_dest, data_ref);
2802 new_temp = make_ssa_name (vec_dest, new_stmt);
2803 TREE_OPERAND (new_stmt, 0) = new_temp;
/* Insertion on the preheader edge must not split it into a new block.  */
2804 new_bb = bsi_insert_on_edge_immediate (pe, new_stmt);
2805 gcc_assert (!new_bb);
2806 msq_init = TREE_OPERAND (new_stmt, 0);
2807 copy_virtual_operands (new_stmt, stmt);
2808 update_vuses_to_preheader (new_stmt, loop);
2810 /* 2. Create permutation mask, if required, in loop preheader. */
2811 if (targetm.vectorize.builtin_mask_for_load)
2814 tree params = build_tree_list (NULL_TREE, init_addr);
2816 vec_dest = vect_create_destination_var (scalar_dest,
2817 TREE_TYPE (new_stmt));
2818 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
2819 new_stmt = build_function_call_expr (builtin_decl, params);
2820 new_stmt = build2 (MODIFY_EXPR, void_type_node, vec_dest, new_stmt);
2821 new_temp = make_ssa_name (vec_dest, new_stmt);
2822 TREE_OPERAND (new_stmt, 0) = new_temp;
2823 new_bb = bsi_insert_on_edge_immediate (pe, new_stmt);
2824 gcc_assert (!new_bb);
2825 *realignment_token = TREE_OPERAND (new_stmt, 0);
2827 /* The result of the CALL_EXPR to this builtin is determined from
2828 the value of the parameter and no global variables are touched
2829 which makes the builtin a "const" function. Requiring the
2830 builtin to have the "const" attribute makes it unnecessary
2831 to call mark_call_clobbered. */
2832 gcc_assert (TREE_READONLY (builtin_decl));
2835 /* 3. Create msq = phi <msq_init, lsq> in loop */
/* Only the preheader argument is set here; the latch argument (lsq) is
   filled in later by the caller.  */
2836 vec_dest = vect_create_destination_var (scalar_dest, vectype);
2837 msq = make_ssa_name (vec_dest, NULL_TREE);
2838 phi_stmt = create_phi_node (msq, loop->header);
2839 SSA_NAME_DEF_STMT (msq) = phi_stmt;
2840 add_phi_arg (phi_stmt, msq_init, loop_preheader_edge (loop));
2846 /* Function vect_strided_load_supported.
2848 Returns TRUE if EXTRACT_EVEN and EXTRACT_ODD operations are supported,
2849 and FALSE otherwise. */
/* NOTE(review): this listing elides some original lines (braces, returns);
   the code below is kept byte-identical.  */
2852 vect_strided_load_supported (tree vectype)
2854 optab perm_even_optab, perm_odd_optab;
2857 mode = (int) TYPE_MODE (vectype);
2859 perm_even_optab = optab_for_tree_code (VEC_EXTRACT_EVEN_EXPR, vectype);
2860 if (!perm_even_optab)
2862 if (vect_print_dump_info (REPORT_DETAILS))
2863 fprintf (vect_dump, "no optab for perm_even.");
2867 if (perm_even_optab->handlers[mode].insn_code == CODE_FOR_nothing)
2869 if (vect_print_dump_info (REPORT_DETAILS))
2870 fprintf (vect_dump, "perm_even op not supported by target.");
2874 perm_odd_optab = optab_for_tree_code (VEC_EXTRACT_ODD_EXPR, vectype);
2875 if (!perm_odd_optab)
2877 if (vect_print_dump_info (REPORT_DETAILS))
2878 fprintf (vect_dump, "no optab for perm_odd.");
2882 if (perm_odd_optab->handlers[mode].insn_code == CODE_FOR_nothing)
2884 if (vect_print_dump_info (REPORT_DETAILS))
2885 fprintf (vect_dump, "perm_odd op not supported by target.");
2892 /* Function vect_permute_load_chain.
2894 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
2895 a power of 2, generate extract_even/odd stmts to reorder the input data
2896 correctly. Return the final references for loads in RESULT_CHAIN.
2898 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
2899 The input is 4 vectors each containing 8 elements. We assign a number to each
2900 element, the input sequence is:
2902 1st vec: 0 1 2 3 4 5 6 7
2903 2nd vec: 8 9 10 11 12 13 14 15
2904 3rd vec: 16 17 18 19 20 21 22 23
2905 4th vec: 24 25 26 27 28 29 30 31
2907 The output sequence should be:
2909 1st vec: 0 4 8 12 16 20 24 28
2910 2nd vec: 1 5 9 13 17 21 25 29
2911 3rd vec: 2 6 10 14 18 22 26 30
2912 4th vec: 3 7 11 15 19 23 27 31
2914 i.e., the first output vector should contain the first elements of each
2915 interleaving group, etc.
2917 We use extract_even/odd instructions to create such output. The input of each
2918 extract_even/odd operation is two vectors
2922 and the output is the vector of extracted even/odd elements. The output of
2923 extract_even will be: 0 2 4 6
2924 and of extract_odd: 1 3 5 7
2927 The permutation is done in log LENGTH stages. In each stage extract_even and
2928 extract_odd stmts are created for each pair of vectors in DR_CHAIN in their
2929 order. In our example,
2931 E1: extract_even (1st vec, 2nd vec)
2932 E2: extract_odd (1st vec, 2nd vec)
2933 E3: extract_even (3rd vec, 4th vec)
2934 E4: extract_odd (3rd vec, 4th vec)
2936 The output for the first stage will be:
2938 E1: 0 2 4 6 8 10 12 14
2939 E2: 1 3 5 7 9 11 13 15
2940 E3: 16 18 20 22 24 26 28 30
2941 E4: 17 19 21 23 25 27 29 31
2943 In order to proceed and create the correct sequence for the next stage (or
2944 for the correct output, if the second stage is the last one, as in our
2945 example), we first put the output of extract_even operation and then the
2946 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
2947 The input for the second stage is:
2949 1st vec (E1): 0 2 4 6 8 10 12 14
2950 2nd vec (E3): 16 18 20 22 24 26 28 30
2951 3rd vec (E2): 1 3 5 7 9 11 13 15
2952 4th vec (E4): 17 19 21 23 25 27 29 31
2954 The output of the second stage:
2956 E1: 0 4 8 12 16 20 24 28
2957 E2: 2 6 10 14 18 22 26 30
2958 E3: 1 5 9 13 17 21 25 29
2959 E4: 3 7 11 15 19 23 27 31
2961 And RESULT_CHAIN after reordering:
2963 1st vec (E1): 0 4 8 12 16 20 24 28
2964 2nd vec (E3): 1 5 9 13 17 21 25 29
2965 3rd vec (E2): 2 6 10 14 18 22 26 30
2966 4th vec (E4): 3 7 11 15 19 23 27 31. */
/* NOTE(review): this listing elides some original lines (braces, returns,
   declarations); code kept byte-identical.  */
2969 vect_permute_load_chain (VEC(tree,heap) *dr_chain,
2970 unsigned int length,
2972 block_stmt_iterator *bsi,
2973 VEC(tree,heap) **result_chain)
2975 tree perm_dest, perm_stmt, data_ref, first_vect, second_vect;
2976 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
2980 /* Check that the operation is supported. */
2981 if (!vect_strided_load_supported (vectype))
2984 *result_chain = VEC_copy (tree, heap, dr_chain);
/* log2(LENGTH) stages; each stage processes adjacent pairs of vectors.  */
2985 for (i = 0; i < exact_log2 (length); i++)
2987 for (j = 0; j < length; j +=2)
2989 first_vect = VEC_index (tree, dr_chain, j);
2990 second_vect = VEC_index (tree, dr_chain, j+1);
2992 /* data_ref = permute_even (first_data_ref, second_data_ref); */
2993 perm_dest = create_tmp_var (vectype, "vect_perm_even");
2994 add_referenced_var (perm_dest);
2996 perm_stmt = build2 (MODIFY_EXPR, vectype, perm_dest,
2997 build2 (VEC_EXTRACT_EVEN_EXPR, vectype,
2998 first_vect, second_vect));
3000 data_ref = make_ssa_name (perm_dest, perm_stmt);
3001 TREE_OPERAND (perm_stmt, 0) = data_ref;
3002 vect_finish_stmt_generation (stmt, perm_stmt, bsi);
3003 mark_new_vars_to_rename (perm_stmt);
/* Even results go in the first half of RESULT_CHAIN...  */
3005 VEC_replace (tree, *result_chain, j/2, data_ref);
3007 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
3008 perm_dest = create_tmp_var (vectype, "vect_perm_odd");
3009 add_referenced_var (perm_dest);
3011 perm_stmt = build2 (MODIFY_EXPR, vectype, perm_dest,
3012 build2 (VEC_EXTRACT_ODD_EXPR, vectype,
3013 first_vect, second_vect));
3014 data_ref = make_ssa_name (perm_dest, perm_stmt);
3015 TREE_OPERAND (perm_stmt, 0) = data_ref;
3016 vect_finish_stmt_generation (stmt, perm_stmt, bsi);
3017 mark_new_vars_to_rename (perm_stmt);
/* ...and odd results in the second half (see function comment).  */
3019 VEC_replace (tree, *result_chain, j/2+length/2, data_ref);
/* Feed this stage's output into the next stage.  */
3021 dr_chain = VEC_copy (tree, heap, *result_chain);
3027 /* Function vect_transform_strided_load.
3029 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
3030 to perform their permutation and ascribe the result vectorized statements to
3031 the scalar statements.
/* NOTE(review): this listing elides some original lines (braces, the
   gap_count initialization/update, loop tails); code kept byte-identical.  */
3035 vect_transform_strided_load (tree stmt, VEC(tree,heap) *dr_chain, int size,
3036 block_stmt_iterator *bsi)
3038 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3039 tree first_stmt = DR_GROUP_FIRST_DR (stmt_info);
3040 tree next_stmt, new_stmt;
3041 VEC(tree,heap) *result_chain = NULL;
3042 unsigned int i, gap_count;
3045 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
3046 RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
3047 vectors, that are ready for vector computation. */
3048 result_chain = VEC_alloc (tree, heap, size);
3050 if (!vect_permute_load_chain (dr_chain, size, stmt, bsi, &result_chain))
3053 /* Put a permuted data-ref in the VECTORIZED_STMT field.
3054 Since we scan the chain starting from its first node, their order
3055 corresponds to the order of data-refs in RESULT_CHAIN. */
3056 next_stmt = first_stmt;
3058 for (i = 0; VEC_iterate(tree, result_chain, i, tmp_data_ref); i++)
3063 /* Skip the gaps. Loads created for the gaps will be removed by dead
3064 code elimination pass later.
3065 DR_GROUP_GAP is the number of steps in elements from the previous
3066 access (if there is no gap DR_GROUP_GAP is 1). We skip loads that
3067 correspond to the gaps.
3069 if (gap_count < DR_GROUP_GAP (vinfo_for_stmt (next_stmt)))
3077 new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
3078 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
3079 copies, and we put the new vector statement in the first available
3081 if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
3082 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
/* Otherwise walk the RELATED_STMT chain to its end and append there.  */
3085 tree prev_stmt = STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
3086 tree rel_stmt = STMT_VINFO_RELATED_STMT (
3087 vinfo_for_stmt (prev_stmt));
3090 prev_stmt = rel_stmt;
3091 rel_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
3093 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) = new_stmt;
3095 next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt));
3097 /* If NEXT_STMT accesses the same DR as the previous statement,
3098 put the same TMP_DATA_REF as its vectorized statement; otherwise
3099 get the next data-ref from RESULT_CHAIN. */
3100 if (!next_stmt || !DR_GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
/* NOTE(review): this excerpt is non-contiguous (interior lines elided, see the
   jumps in the embedded original line numbers).  Code below is kept
   byte-for-byte; only comments were added.  */
3108 /* vectorizable_load.
3110 Check if STMT reads a non scalar data-ref (array/pointer/structure) that
3112 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
3113 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
3114 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
3117 vectorizable_load (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
3120 tree vec_dest = NULL;
3121 tree data_ref = NULL;
3123 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3124 stmt_vec_info prev_stmt_info;
3125 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3126 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3127 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info), *first_dr;
3128 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3131 tree new_stmt = NULL_TREE;
3133 enum dr_alignment_support alignment_support_cheme;
3134 tree dataref_ptr = NULL_TREE;
/* NCOPIES = how many vector stmts are needed to cover one iteration of the
   scalar loop: vectorization factor divided by vector width in elements.  */
3136 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
3137 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
3138 int i, j, group_size;
3139 tree msq = NULL_TREE, lsq;
3140 tree offset = NULL_TREE;
3141 tree realignment_token = NULL_TREE;
3142 tree phi_stmt = NULL_TREE;
3143 VEC(tree,heap) *dr_chain = NULL;
3144 bool strided_load = false;
3147 /* Is vectorizable load? */
3148 if (!STMT_VINFO_RELEVANT_P (stmt_info))
3151 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_loop_def);
3153 if (STMT_VINFO_LIVE_P (stmt_info))
3155 /* FORNOW: not yet supported. */
3156 if (vect_print_dump_info (REPORT_DETAILS))
3157 fprintf (vect_dump, "value used after loop.");
/* Only plain scalar assignments reading through an array/pointer ref (or a
   member of an interleaving chain) are handled.  */
3161 if (TREE_CODE (stmt) != MODIFY_EXPR)
3164 scalar_dest = TREE_OPERAND (stmt, 0);
3165 if (TREE_CODE (scalar_dest) != SSA_NAME)
3168 op = TREE_OPERAND (stmt, 1);
3169 if (TREE_CODE (op) != ARRAY_REF
3170 && TREE_CODE (op) != INDIRECT_REF
3171 && !DR_GROUP_FIRST_DR (stmt_info))
3174 if (!STMT_VINFO_DATA_REF (stmt_info))
3177 mode = (int) TYPE_MODE (vectype);
3179 /* FORNOW. In some cases can vectorize even if data-type not supported
3180 (e.g. - data copies). */
3181 if (mov_optab->handlers[mode].insn_code == CODE_FOR_nothing)
3183 if (vect_print_dump_info (REPORT_DETAILS))
3184 fprintf (vect_dump, "Aligned load, but unsupported type.");
3188 /* Check if the load is a part of an interleaving chain. */
3189 if (DR_GROUP_FIRST_DR (stmt_info))
3191 strided_load = true;
3193 /* Check if interleaving is supported. */
3194 if (!vect_strided_load_supported (vectype))
3198 if (!vec_stmt) /* transformation not required. */
3200 STMT_VINFO_TYPE (stmt_info) = load_vec_info_type;
/* ----- Transformation phase starts here. ----- */
3206 if (vect_print_dump_info (REPORT_DETAILS))
3207 fprintf (vect_dump, "transform load.");
3211 first_stmt = DR_GROUP_FIRST_DR (stmt_info);
3212 /* Check if the chain of loads is already vectorized. */
3213 if (STMT_VINFO_VEC_STMT (vinfo_for_stmt (first_stmt)))
3215 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
3218 first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
3219 group_size = DR_GROUP_SIZE (vinfo_for_stmt (first_stmt));
3220 dr_chain = VEC_alloc (tree, heap, group_size);
3229 alignment_support_cheme = vect_supportable_dr_alignment (first_dr);
3230 gcc_assert (alignment_support_cheme);
3233 /* In case the vectorization factor (VF) is bigger than the number
3234 of elements that we can fit in a vectype (nunits), we have to generate
3235 more than one vector stmt - i.e - we need to "unroll" the
3236 vector stmt by a factor VF/nunits. In doing so, we record a pointer
3237 from one copy of the vector stmt to the next, in the field
3238 STMT_VINFO_RELATED_STMT. This is necessary in order to allow following
3239 stages to find the correct vector defs to be used when vectorizing
3240 stmts that use the defs of the current stmt. The example below illustrates
3241 the vectorization process when VF=16 and nunits=4 (i.e - we need to create
3242 4 vectorized stmts):
3244 before vectorization:
3245 RELATED_STMT VEC_STMT
3249 step 1: vectorize stmt S1:
3250 We first create the vector stmt VS1_0, and, as usual, record a
3251 pointer to it in the STMT_VINFO_VEC_STMT of the scalar stmt S1.
3252 Next, we create the vector stmt VS1_1, and record a pointer to
3253 it in the STMT_VINFO_RELATED_STMT of the vector stmt VS1_0.
3254 Similarly, for VS1_2 and VS1_3. This is the resulting chain of
3256 RELATED_STMT VEC_STMT
3257 VS1_0: vx0 = memref0 VS1_1 -
3258 VS1_1: vx1 = memref1 VS1_2 -
3259 VS1_2: vx2 = memref2 VS1_3 -
3260 VS1_3: vx3 = memref3 - -
3261 S1: x = load - VS1_0
3264 See in documentation in vect_get_vec_def_for_stmt_copy for how the
3265 information we recorded in RELATED_STMT field is used to vectorize
3268 /* In case of interleaving (non-unit strided access):
3275 Vectorized loads are created in the order of memory accesses
3276 starting from the access of the first stmt of the chain:
3279 VS2: vx1 = &base + vec_size*1
3280 VS3: vx3 = &base + vec_size*2
3281 VS4: vx4 = &base + vec_size*3
3283 Then permutation statements are generated:
3285 VS5: vx5 = VEC_EXTRACT_EVEN_EXPR < vx0, vx1 >
3286 VS6: vx6 = VEC_EXTRACT_ODD_EXPR < vx0, vx1 >
3289 And they are put in STMT_VINFO_VEC_STMT of the corresponding scalar stmts
3290 (the order of the data-refs in the output of vect_permute_load_chain
3291 corresponds to the order of scalar stmts in the interleaving chain - see
3292 the documentaion of vect_permute_load_chain()).
3293 The generation of permutation stmts and recording them in
3294 STMT_VINFO_VEC_STMT is done in vect_transform_strided_load().
3296 In case of both multiple types and interleaving, the vector loads and
3297 permutation stmts above are created for every copy. The result vector stmts
3298 are put in STMT_VINFO_VEC_STMT for the first copy and in the corresponding
3299 STMT_VINFO_RELATED_STMT for the next copies. */
3301 /* If the data reference is aligned (dr_aligned) or potentially unaligned
3302 on a target that supports unaligned accesses (dr_unaligned_supported)
3303 we generate the following code:
3307 p = p + indx * vectype_size;
3312 Otherwise, the data reference is potentially unaligned on a target that
3313 does not support unaligned accesses (dr_unaligned_software_pipeline) -
3314 then generate the following code, in which the data in each iteration is
3315 obtained by two vector loads, one from the previous iteration, and one
3316 from the current iteration:
3318 msq_init = *(floor(p1))
3319 p2 = initial_addr + VS - 1;
3320 realignment_token = call target_builtin;
3323 p2 = p2 + indx * vectype_size
3325 vec_dest = realign_load (msq, lsq, realignment_token)
/* Software-pipelined realignment: set up the loop-header phi for MSQ (the
   "most significant quadword" carried from the previous iteration).  */
3330 if (alignment_support_cheme == dr_unaligned_software_pipeline)
3332 msq = vect_setup_realignment (first_stmt, bsi, &realignment_token);
3333 phi_stmt = SSA_NAME_DEF_STMT (msq);
3334 offset = size_int (TYPE_VECTOR_SUBPARTS (vectype) - 1);
/* Generate NCOPIES copies; for interleaving, GROUP_SIZE loads per copy.  */
3337 prev_stmt_info = NULL;
3338 for (j = 0; j < ncopies; j++)
3340 /* 1. Create the vector pointer update chain. */
3342 dataref_ptr = vect_create_data_ref_ptr (first_stmt, bsi, offset,
3343 &dummy, &ptr_incr, false);
3345 dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, bsi, stmt);
3347 for (i = 0; i < group_size; i++)
3349 /* 2. Create the vector-load in the loop. */
3350 switch (alignment_support_cheme)
3353 gcc_assert (aligned_access_p (first_dr));
3354 data_ref = build_fold_indirect_ref (dataref_ptr);
3356 case dr_unaligned_supported:
3358 int mis = DR_MISALIGNMENT (first_dr);
3359 tree tmis = (mis == -1 ? size_zero_node : size_int (mis));
3361 gcc_assert (!aligned_access_p (first_dr));
/* Misalignment is tracked in bytes; MISALIGNED_INDIRECT_REF wants bits.  */
3362 tmis = size_binop (MULT_EXPR, tmis, size_int(BITS_PER_UNIT));
3364 build2 (MISALIGNED_INDIRECT_REF, vectype, dataref_ptr, tmis);
3367 case dr_unaligned_software_pipeline:
3368 gcc_assert (!aligned_access_p (first_dr));
3369 data_ref = build1 (ALIGN_INDIRECT_REF, vectype, dataref_ptr);
3374 vec_dest = vect_create_destination_var (scalar_dest, vectype);
3375 new_stmt = build2 (MODIFY_EXPR, vectype, vec_dest, data_ref);
3376 new_temp = make_ssa_name (vec_dest, new_stmt);
3377 TREE_OPERAND (new_stmt, 0) = new_temp;
3378 vect_finish_stmt_generation (stmt, new_stmt, bsi);
3379 copy_virtual_operands (new_stmt, stmt);
3380 mark_new_vars_to_rename (new_stmt);
3382 /* 3. Handle explicit realignment if necessary/supported. */
3383 if (alignment_support_cheme == dr_unaligned_software_pipeline)
3386 <vec_dest = realign_load (msq, lsq, realignment_token)> */
3387 lsq = TREE_OPERAND (new_stmt, 0);
3388 if (!realignment_token)
3389 realignment_token = dataref_ptr;
3390 vec_dest = vect_create_destination_var (scalar_dest, vectype);
3392 build3 (REALIGN_LOAD_EXPR, vectype, msq, lsq, realignment_token);
3393 new_stmt = build2 (MODIFY_EXPR, vectype, vec_dest, new_stmt);
3394 new_temp = make_ssa_name (vec_dest, new_stmt);
3395 TREE_OPERAND (new_stmt, 0) = new_temp;
3396 vect_finish_stmt_generation (stmt, new_stmt, bsi);
/* On the very last generated load, feed LSQ back into the realignment
   phi via the loop-latch edge.  */
3397 if (i == group_size - 1 && j == ncopies - 1)
3398 add_phi_arg (phi_stmt, lsq, loop_latch_edge (loop));
3402 VEC_quick_push (tree, dr_chain, new_temp);
3403 if (i < group_size - 1)
3404 dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, bsi, stmt);
/* For interleaving, emit the permutation stmts that deinterleave DR_CHAIN
   and record the results in the scalar stmts' STMT_VINFO_VEC_STMT.  */
3409 if (!vect_transform_strided_load (stmt, dr_chain, group_size, bsi))
3411 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
3412 dr_chain = VEC_alloc (tree, heap, group_size);
/* Chain the copies together through STMT_VINFO_RELATED_STMT (see the big
   comment above).  */
3417 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
3419 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3420 prev_stmt_info = vinfo_for_stmt (new_stmt);
3428 /* Function vectorizable_live_operation.
3430 STMT computes a value that is used outside the loop. Check if
3431 it can be supported. */
3434 vectorizable_live_operation (tree stmt,
3435 block_stmt_iterator *bsi ATTRIBUTE_UNUSED,
3436 tree *vec_stmt ATTRIBUTE_UNUSED)
3439 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3440 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3442 enum tree_code code;
3446 enum vect_def_type dt;
/* Only stmts marked live (value used after the loop) are handled here,
   and only plain SSA-defining MODIFY_EXPRs.  */
3448 if (!STMT_VINFO_LIVE_P (stmt_info))
3451 if (TREE_CODE (stmt) != MODIFY_EXPR)
3454 if (TREE_CODE (TREE_OPERAND (stmt, 0)) != SSA_NAME)
3457 operation = TREE_OPERAND (stmt, 1);
3458 code = TREE_CODE (operation);
/* Number of operands of the RHS operation.  */
3460 op_type = TREE_CODE_LENGTH (code);
3462 /* FORNOW: support only if all uses are invariant. This means
3463 that the scalar operations can remain in place, unvectorized.
3464 The original last scalar value that they compute will be used. */
3466 for (i = 0; i < op_type; i++)
3468 op = TREE_OPERAND (operation, i);
3469 if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt))
3471 if (vect_print_dump_info (REPORT_DETAILS))
3472 fprintf (vect_dump, "use not simple.");
3476 if (dt != vect_invariant_def && dt != vect_constant_def)
3480 /* No transformation is required for the cases we currently support. */
3485 /* Function vect_is_simple_cond.
3488 LOOP - the loop that is being vectorized.
3489 COND - Condition that is checked for simple use.
3491 Returns whether a COND can be vectorized. Checks whether
3492 condition operands are supportable using vec_is_simple_use. */
3495 vect_is_simple_cond (tree cond, loop_vec_info loop_vinfo)
3499 enum vect_def_type dt;
/* COND must be a comparison; each operand must be either an SSA name with
   a vectorizer-recognized def, or an integer/real constant.  */
3501 if (!COMPARISON_CLASS_P (cond))
3504 lhs = TREE_OPERAND (cond, 0);
3505 rhs = TREE_OPERAND (cond, 1);
3507 if (TREE_CODE (lhs) == SSA_NAME)
3509 tree lhs_def_stmt = SSA_NAME_DEF_STMT (lhs);
3510 if (!vect_is_simple_use (lhs, loop_vinfo, &lhs_def_stmt, &def, &dt))
3513 else if (TREE_CODE (lhs) != INTEGER_CST && TREE_CODE (lhs) != REAL_CST)
3516 if (TREE_CODE (rhs) == SSA_NAME)
3518 tree rhs_def_stmt = SSA_NAME_DEF_STMT (rhs);
3519 if (!vect_is_simple_use (rhs, loop_vinfo, &rhs_def_stmt, &def, &dt))
3522 else if (TREE_CODE (rhs) != INTEGER_CST && TREE_CODE (rhs) != REAL_CST)
3528 /* vectorizable_condition.
3530 Check if STMT is conditional modify expression that can be vectorized.
3531 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
3532 stmt using VEC_COND_EXPR to replace it, put it in VEC_STMT, and insert it
3535 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
3538 vectorizable_condition (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
3540 tree scalar_dest = NULL_TREE;
3541 tree vec_dest = NULL_TREE;
3542 tree op = NULL_TREE;
3543 tree cond_expr, then_clause, else_clause;
3544 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3545 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3546 tree vec_cond_lhs, vec_cond_rhs, vec_then_clause, vec_else_clause;
3547 tree vec_compare, vec_cond_expr;
3549 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3550 enum machine_mode vec_mode;
3552 enum vect_def_type dt;
3553 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
3554 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
3556 gcc_assert (ncopies >= 1);
/* NOTE(review): the line guarding this return is elided in this excerpt;
   presumably `if (ncopies > 1)` (multiple types not yet supported) -- confirm
   against the full file before editing.  */
3558 return false; /* FORNOW */
3560 if (!STMT_VINFO_RELEVANT_P (stmt_info))
3563 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_loop_def);
3565 if (STMT_VINFO_LIVE_P (stmt_info))
3567 /* FORNOW: not yet supported. */
3568 if (vect_print_dump_info (REPORT_DETAILS))
3569 fprintf (vect_dump, "value used after loop.");
3573 if (TREE_CODE (stmt) != MODIFY_EXPR)
3576 op = TREE_OPERAND (stmt, 1);
3578 if (TREE_CODE (op) != COND_EXPR)
3581 cond_expr = TREE_OPERAND (op, 0);
3582 then_clause = TREE_OPERAND (op, 1);
3583 else_clause = TREE_OPERAND (op, 2);
3585 if (!vect_is_simple_cond (cond_expr, loop_vinfo))
3588 /* We do not handle two different vector types for the condition
3590 if (TREE_TYPE (TREE_OPERAND (cond_expr, 0)) != TREE_TYPE (vectype))
/* THEN/ELSE clauses: SSA names with simple defs or int/real constants.  */
3593 if (TREE_CODE (then_clause) == SSA_NAME)
3595 tree then_def_stmt = SSA_NAME_DEF_STMT (then_clause);
3596 if (!vect_is_simple_use (then_clause, loop_vinfo,
3597 &then_def_stmt, &def, &dt))
3600 else if (TREE_CODE (then_clause) != INTEGER_CST
3601 && TREE_CODE (then_clause) != REAL_CST)
3604 if (TREE_CODE (else_clause) == SSA_NAME)
3606 tree else_def_stmt = SSA_NAME_DEF_STMT (else_clause);
3607 if (!vect_is_simple_use (else_clause, loop_vinfo,
3608 &else_def_stmt, &def, &dt))
3611 else if (TREE_CODE (else_clause) != INTEGER_CST
3612 && TREE_CODE (else_clause) != REAL_CST)
3616 vec_mode = TYPE_MODE (vectype);
/* Analysis phase: record the stmt type and ask the target whether it can
   expand a VEC_COND_EXPR in this mode.  */
3620 STMT_VINFO_TYPE (stmt_info) = condition_vec_info_type;
3621 return expand_vec_cond_expr_p (op, vec_mode);
/* ----- Transformation phase. ----- */
3627 scalar_dest = TREE_OPERAND (stmt, 0);
3628 vec_dest = vect_create_destination_var (scalar_dest, vectype);
3630 /* Handle cond expr. */
3632 vect_get_vec_def_for_operand (TREE_OPERAND (cond_expr, 0), stmt, NULL);
3634 vect_get_vec_def_for_operand (TREE_OPERAND (cond_expr, 1), stmt, NULL);
3635 vec_then_clause = vect_get_vec_def_for_operand (then_clause, stmt, NULL);
3636 vec_else_clause = vect_get_vec_def_for_operand (else_clause, stmt, NULL);
3638 /* Arguments are ready. create the new vector stmt. */
3639 vec_compare = build2 (TREE_CODE (cond_expr), vectype,
3640 vec_cond_lhs, vec_cond_rhs);
3641 vec_cond_expr = build3 (VEC_COND_EXPR, vectype,
3642 vec_compare, vec_then_clause, vec_else_clause);
3644 *vec_stmt = build2 (MODIFY_EXPR, vectype, vec_dest, vec_cond_expr);
3645 new_temp = make_ssa_name (vec_dest, *vec_stmt);
3646 TREE_OPERAND (*vec_stmt, 0) = new_temp;
3647 vect_finish_stmt_generation (stmt, *vec_stmt, bsi);
3652 /* Function vect_transform_stmt.
3654 Create a vectorized stmt to replace STMT, and insert it at BSI. */
3657 vect_transform_stmt (tree stmt, block_stmt_iterator *bsi, bool *strided_store)
3659 bool is_store = false;
3660 tree vec_stmt = NULL_TREE;
3661 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3662 tree orig_stmt_in_pattern;
/* Dispatch on the stmt kind recorded during analysis; each vectorizable_*
   routine emits the vector stmt(s) at BSI.  */
3665 if (STMT_VINFO_RELEVANT_P (stmt_info))
3667 switch (STMT_VINFO_TYPE (stmt_info))
3669 case type_demotion_vec_info_type:
3670 done = vectorizable_type_demotion (stmt, bsi, &vec_stmt);
3674 case type_promotion_vec_info_type:
3675 done = vectorizable_type_promotion (stmt, bsi, &vec_stmt);
3679 case op_vec_info_type:
3680 done = vectorizable_operation (stmt, bsi, &vec_stmt);
3684 case assignment_vec_info_type:
3685 done = vectorizable_assignment (stmt, bsi, &vec_stmt);
3689 case load_vec_info_type:
3690 done = vectorizable_load (stmt, bsi, &vec_stmt);
3694 case store_vec_info_type:
3695 done = vectorizable_store (stmt, bsi, &vec_stmt);
3697 if (DR_GROUP_FIRST_DR (stmt_info))
3699 /* In case of interleaving, the whole chain is vectorized when the
3700 last store in the chain is reached. Store stmts before the last
3701 one are skipped, and there vec_stmt_info shoudn't be freed
3703 *strided_store = true;
3704 if (STMT_VINFO_VEC_STMT (stmt_info))
3711 case condition_vec_info_type:
3712 done = vectorizable_condition (stmt, bsi, &vec_stmt);
3717 if (vect_print_dump_info (REPORT_DETAILS))
3718 fprintf (vect_dump, "stmt not supported.");
/* By now we must either have produced a vector stmt or deferred to the
   end of a strided-store chain.  */
3722 gcc_assert (vec_stmt || *strided_store);
3725 STMT_VINFO_VEC_STMT (stmt_info) = vec_stmt;
3726 orig_stmt_in_pattern = STMT_VINFO_RELATED_STMT (stmt_info);
3727 if (orig_stmt_in_pattern)
3729 stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt_in_pattern);
3730 if (STMT_VINFO_IN_PATTERN_P (stmt_vinfo))
3732 gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
3734 /* STMT was inserted by the vectorizer to replace a
3735 computation idiom. ORIG_STMT_IN_PATTERN is a stmt in the
3736 original sequence that computed this idiom. We need to
3737 record a pointer to VEC_STMT in the stmt_info of
3738 ORIG_STMT_IN_PATTERN. See more details in the
3739 documentation of vect_pattern_recog. */
3741 STMT_VINFO_VEC_STMT (stmt_vinfo) = vec_stmt;
/* Stmts whose value is live after the loop get a second dispatch.  */
3747 if (STMT_VINFO_LIVE_P (stmt_info))
3749 switch (STMT_VINFO_TYPE (stmt_info))
3751 case reduc_vec_info_type:
3752 done = vectorizable_reduction (stmt, bsi, &vec_stmt);
3757 done = vectorizable_live_operation (stmt, bsi, &vec_stmt);
3766 /* This function builds ni_name = number of iterations loop executes
3767 on the loop preheader. */
3770 vect_build_loop_niters (loop_vec_info loop_vinfo)
3772 tree ni_name, stmt, var;
3774 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3775 tree ni = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo));
/* Gimplify the (possibly complex) iteration-count expression into a temp,
   then emit any generated stmts on the preheader edge.  */
3777 var = create_tmp_var (TREE_TYPE (ni), "niters");
3778 add_referenced_var (var);
3779 ni_name = force_gimple_operand (ni, &stmt, false, var);
3781 pe = loop_preheader_edge (loop);
/* Inserting on the preheader edge must not split it into a new bb.  */
3784 basic_block new_bb = bsi_insert_on_edge_immediate (pe, stmt);
3785 gcc_assert (!new_bb);
3792 /* This function generates the following statements:
3794 ni_name = number of iterations loop executes
3795 ratio = ni_name / vf
3796 ratio_mult_vf_name = ratio * vf
3798 and places them at the loop preheader edge. */
3801 vect_generate_tmps_on_preheader (loop_vec_info loop_vinfo,
3803 tree *ratio_mult_vf_name_ptr,
3804 tree *ratio_name_ptr)
3812 tree ratio_mult_vf_name;
3813 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3814 tree ni = LOOP_VINFO_NITERS (loop_vinfo);
3815 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3818 pe = loop_preheader_edge (loop);
3820 /* Generate temporary variable that contains
3821 number of iterations loop executes. */
3823 ni_name = vect_build_loop_niters (loop_vinfo);
/* VF is a power of two, so / and * by VF become shifts by log2(vf).  */
3824 log_vf = build_int_cst (TREE_TYPE (ni), exact_log2 (vf));
3826 /* Create: ratio = ni >> log2(vf) */
3828 ratio_name = fold_build2 (RSHIFT_EXPR, TREE_TYPE (ni_name), ni_name, log_vf);
3829 if (!is_gimple_val (ratio_name))
3831 var = create_tmp_var (TREE_TYPE (ni), "bnd");
3832 add_referenced_var (var);
3834 ratio_name = force_gimple_operand (ratio_name, &stmt, true, var);
3835 pe = loop_preheader_edge (loop);
3836 new_bb = bsi_insert_on_edge_immediate (pe, stmt);
3837 gcc_assert (!new_bb);
3840 /* Create: ratio_mult_vf = ratio << log2 (vf). */
3842 ratio_mult_vf_name = fold_build2 (LSHIFT_EXPR, TREE_TYPE (ratio_name),
3843 ratio_name, log_vf);
3844 if (!is_gimple_val (ratio_mult_vf_name))
3846 var = create_tmp_var (TREE_TYPE (ni), "ratio_mult_vf");
3847 add_referenced_var (var);
3849 ratio_mult_vf_name = force_gimple_operand (ratio_mult_vf_name, &stmt,
3851 pe = loop_preheader_edge (loop);
3852 new_bb = bsi_insert_on_edge_immediate (pe, stmt);
3853 gcc_assert (!new_bb);
/* Return all three names to the caller.  */
3856 *ni_name_ptr = ni_name;
3857 *ratio_mult_vf_name_ptr = ratio_mult_vf_name;
3858 *ratio_name_ptr = ratio_name;
3864 /* Function update_vuses_to_preheader.
3867 STMT - a statement with potential VUSEs.
3868 LOOP - the loop whose preheader will contain STMT.
3870 It's possible to vectorize a loop even though an SSA_NAME from a VUSE
3871 appears to be defined in a V_MAY_DEF in another statement in a loop.
3872 One such case is when the VUSE is at the dereference of a __restricted__
3873 pointer in a load and the V_MAY_DEF is at the dereference of a different
3874 __restricted__ pointer in a store. Vectorization may result in
3875 copy_virtual_uses being called to copy the problematic VUSE to a new
3876 statement that is being inserted in the loop preheader. This procedure
3877 is called to change the SSA_NAME in the new statement's VUSE from the
3878 SSA_NAME updated in the loop to the related SSA_NAME available on the
3879 path entering the loop.
3881 When this function is called, we have the following situation:
3886 # name1 = phi < name0 , name2>
3891 # name2 = vdef <name1>
3896 Stmt S1 was created in the loop preheader block as part of misaligned-load
3897 handling. This function fixes the name of the vuse of S1 from 'name1' to
3901 update_vuses_to_preheader (tree stmt, struct loop *loop)
3903 basic_block header_bb = loop->header;
3904 edge preheader_e = loop_preheader_edge (loop);
3906 use_operand_p use_p;
/* Walk every virtual use of STMT; any whose def is inside LOOP must be
   remapped to the value live on the preheader edge.  */
3908 FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_VUSE)
3910 tree ssa_name = USE_FROM_PTR (use_p);
3911 tree def_stmt = SSA_NAME_DEF_STMT (ssa_name);
3912 tree name_var = SSA_NAME_VAR (ssa_name);
3913 basic_block bb = bb_for_stmt (def_stmt);
3915 /* For a use before any definitions, def_stmt is a NOP_EXPR. */
3916 if (!IS_EMPTY_STMT (def_stmt)
3917 && flow_bb_inside_loop_p (loop, bb))
3919 /* If the block containing the statement defining the SSA_NAME
3920 is in the loop then it's necessary to find the definition
3921 outside the loop using the PHI nodes of the header. */
3923 bool updated = false;
3925 for (phi = phi_nodes (header_bb); phi; phi = TREE_CHAIN (phi))
/* Match the header phi for the same base variable and take its
   preheader-edge argument.  */
3927 if (SSA_NAME_VAR (PHI_RESULT (phi)) == name_var)
3929 SET_USE (use_p, PHI_ARG_DEF (phi, preheader_e->dest_idx));
/* A matching header phi must always exist for an in-loop virtual def.  */
3934 gcc_assert (updated);
3940 /* Function vect_update_ivs_after_vectorizer.
3942 "Advance" the induction variables of LOOP to the value they should take
3943 after the execution of LOOP. This is currently necessary because the
3944 vectorizer does not handle induction variables that are used after the
3945 loop. Such a situation occurs when the last iterations of LOOP are
3947 1. We introduced new uses after LOOP for IVs that were not originally used
3948 after LOOP: the IVs of LOOP are now used by an epilog loop.
3949 2. LOOP is going to be vectorized; this means that it will iterate N/VF
3950 times, whereas the loop IVs should be bumped N times.
3953 - LOOP - a loop that is going to be vectorized. The last few iterations
3954 of LOOP were peeled.
3955 - NITERS - the number of iterations that LOOP executes (before it is
3956 vectorized). i.e, the number of times the ivs should be bumped.
3957 - UPDATE_E - a successor edge of LOOP->exit that is on the (only) path
3958 coming out from LOOP on which there are uses of the LOOP ivs
3959 (this is the path from LOOP->exit to epilog_loop->preheader).
3961 The new definitions of the ivs are placed in LOOP->exit.
3962 The phi args associated with the edge UPDATE_E in the bb
3963 UPDATE_E->dest are updated accordingly.
3965 Assumption 1: Like the rest of the vectorizer, this function assumes
3966 a single loop exit that has a single predecessor.
3968 Assumption 2: The phi nodes in the LOOP header and in update_bb are
3969 organized in the same order.
3971 Assumption 3: The access function of the ivs is simple enough (see
3972 vect_can_advance_ivs_p). This assumption will be relaxed in the future.
3974 Assumption 4: Exactly one of the successors of LOOP exit-bb is on a path
3975 coming out of LOOP on which the ivs of LOOP are used (this is the path
3976 that leads to the epilog loop; other paths skip the epilog loop). This
3977 path starts with the edge UPDATE_E, and its destination (denoted update_bb)
3978 needs to have its phis updated.
3982 vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo, tree niters,
3985 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3986 basic_block exit_bb = single_exit (loop)->dest;
3988 basic_block update_bb = update_e->dest;
3990 /* gcc_assert (vect_can_advance_ivs_p (loop_vinfo)); */
3992 /* Make sure there exists a single-predecessor exit bb: */
3993 gcc_assert (single_pred_p (exit_bb));
/* Walk the header phis and update_bb phis in lockstep (Assumption 2).  */
3995 for (phi = phi_nodes (loop->header), phi1 = phi_nodes (update_bb);
3997 phi = PHI_CHAIN (phi), phi1 = PHI_CHAIN (phi1))
3999 tree access_fn = NULL;
4000 tree evolution_part;
4003 tree var, stmt, ni, ni_name;
4004 block_stmt_iterator last_bsi;
4006 if (vect_print_dump_info (REPORT_DETAILS))
4008 fprintf (vect_dump, "vect_update_ivs_after_vectorizer: phi: ");
4009 print_generic_expr (vect_dump, phi, TDF_SLIM);
4012 /* Skip virtual phi's. */
4013 if (!is_gimple_reg (SSA_NAME_VAR (PHI_RESULT (phi))))
4015 if (vect_print_dump_info (REPORT_DETAILS))
4016 fprintf (vect_dump, "virtual phi. skip.")
4020 /* Skip reduction phis. */
4021 if (STMT_VINFO_DEF_TYPE (vinfo_for_stmt (phi)) == vect_reduction_def)
4023 if (vect_print_dump_info (REPORT_DETAILS))
4024 fprintf (vect_dump, "reduc phi. skip.");
4028 access_fn = analyze_scalar_evolution (loop, PHI_RESULT (phi));
4029 gcc_assert (access_fn);
4031 unshare_expr (evolution_part_in_loop_num (access_fn, loop->num));
4032 gcc_assert (evolution_part != NULL_TREE);
4034 /* FORNOW: We do not support IVs whose evolution function is a polynomial
4035 of degree >= 2 or exponential. */
4036 gcc_assert (!tree_is_chrec (evolution_part));
4038 step_expr = evolution_part;
4039 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn,
/* Advanced IV value: init + niters * step.  */
4042 ni = fold_build2 (PLUS_EXPR, TREE_TYPE (init_expr),
4043 fold_build2 (MULT_EXPR, TREE_TYPE (niters),
4044 niters, step_expr), init_expr);
4046 var = create_tmp_var (TREE_TYPE (init_expr), "tmp");
4047 add_referenced_var (var);
4049 ni_name = force_gimple_operand (ni, &stmt, false, var);
4051 /* Insert stmt into exit_bb. */
4052 last_bsi = bsi_last (exit_bb);
4054 bsi_insert_before (&last_bsi, stmt, BSI_SAME_STMT);
4056 /* Fix phi expressions in the successor bb. */
4057 SET_PHI_ARG_DEF (phi1, update_e->dest_idx, ni_name);
4062 /* Function vect_do_peeling_for_loop_bound
4064 Peel the last iterations of the loop represented by LOOP_VINFO.
4065 The peeled iterations form a new epilog loop. Given that the loop now
4066 iterates NITERS times, the new epilog loop iterates
4067 NITERS % VECTORIZATION_FACTOR times.
4069 The original loop will later be made to iterate
4070 NITERS / VECTORIZATION_FACTOR times (this value is placed into RATIO). */
4073 vect_do_peeling_for_loop_bound (loop_vec_info loop_vinfo, tree *ratio,
4074 struct loops *loops)
4076 tree ni_name, ratio_mult_vf_name;
4077 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4078 struct loop *new_loop;
4080 basic_block preheader;
4083 if (vect_print_dump_info (REPORT_DETAILS))
4084 fprintf (vect_dump, "=== vect_do_peeling_for_loop_bound ===");
4086 initialize_original_copy_tables ();
4088 /* Generate the following variables on the preheader of original loop:
4090 ni_name = number of iteration the original loop executes
4091 ratio = ni_name / vf
4092 ratio_mult_vf_name = ratio * vf */
4093 vect_generate_tmps_on_preheader (loop_vinfo, &ni_name,
4094 &ratio_mult_vf_name, ratio);
/* Peeling must not renumber the original loop.  */
4096 loop_num = loop->num;
4097 new_loop = slpeel_tree_peel_loop_to_edge (loop, loops, single_exit (loop),
4098 ratio_mult_vf_name, ni_name, false);
4099 gcc_assert (new_loop);
4100 gcc_assert (loop_num == loop->num);
4101 #ifdef ENABLE_CHECKING
4102 slpeel_verify_cfg_after_peeling (loop, new_loop);
4105 /* A guard that controls whether the new_loop is to be executed or skipped
4106 is placed in LOOP->exit. LOOP->exit therefore has two successors - one
4107 is the preheader of NEW_LOOP, where the IVs from LOOP are used. The other
4108 is a bb after NEW_LOOP, where these IVs are not used. Find the edge that
4109 is on the path where the LOOP IVs are used and need to be updated. */
4111 preheader = loop_preheader_edge (new_loop)->src;
4112 if (EDGE_PRED (preheader, 0)->src == single_exit (loop)->dest)
4113 update_e = EDGE_PRED (preheader, 0);
4115 update_e = EDGE_PRED (preheader, 1);
4117 /* Update IVs of original loop as if they were advanced
4118 by ratio_mult_vf_name steps. */
4119 vect_update_ivs_after_vectorizer (loop_vinfo, ratio_mult_vf_name, update_e);
4121 /* After peeling we have to reset scalar evolution analyzer. */
4124 free_original_copy_tables ();
4128 /* Function vect_gen_niters_for_prolog_loop
4130 Set the number of iterations for the loop represented by LOOP_VINFO
4131 to the minimum between LOOP_NITERS (the original iteration count of the loop)
4132 and the misalignment of DR - the data reference recorded in
4133 LOOP_VINFO_UNALIGNED_DR (LOOP_VINFO). As a result, after the execution of
4134 this loop, the data reference DR will refer to an aligned location.
4136 The following computation is generated:
4138 If the misalignment of DR is known at compile time:
4139 addr_mis = int mis = DR_MISALIGNMENT (dr);
4140 Else, compute address misalignment in bytes:
4141 addr_mis = addr & (vectype_size - 1)
4143 prolog_niters = min ( LOOP_NITERS , (VF - addr_mis/elem_size)&(VF-1) )
4145 (elem_size = element type size; an element is the scalar element
4146 whose type is the inner type of the vectype)
4150 prolog_niters = min ( LOOP_NITERS ,
4151 (VF/group_size - addr_mis/elem_size)&(VF/group_size-1) )
4152 where group_size is the size of the interleaved group.
4156 vect_gen_niters_for_prolog_loop (loop_vec_info loop_vinfo, tree loop_niters)
4158 struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
4159 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
4160 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4162 tree iters, iters_name;
4165 tree dr_stmt = DR_STMT (dr);
4166 stmt_vec_info stmt_info = vinfo_for_stmt (dr_stmt);
4167 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4168 int vectype_align = TYPE_ALIGN (vectype) / BITS_PER_UNIT;
4169 tree niters_type = TREE_TYPE (loop_niters);
4171 int element_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
4173 if (DR_GROUP_FIRST_DR (stmt_info))
4175 /* For interleaved access element size must be multipled by the size of
4176 the interleaved group. */
4177 group_size = DR_GROUP_SIZE (vinfo_for_stmt (
4178 DR_GROUP_FIRST_DR (stmt_info)));
4179 element_size *= group_size;
4182 pe = loop_preheader_edge (loop);
/* Case 1: misalignment known at compile time.  */
4184 if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
4186 int byte_misalign = LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo);
4187 int elem_misalign = byte_misalign / element_size;
4189 if (vect_print_dump_info (REPORT_DETAILS))
4190 fprintf (vect_dump, "known alignment = %d.", byte_misalign);
/* NOTE(review): the header comment above computes
   (VF/group_size - addr_mis/elem_size) & (VF/group_size - 1), but the code
   uses (vf - elem_misalign) in the subtraction -- verify against upstream
   history whether comment or code is the intended formula.  */
4191 iters = build_int_cst (niters_type,
4192 (vf - elem_misalign)&(vf/group_size-1));
/* Case 2: misalignment unknown; compute it at runtime from the address.  */
4196 tree new_stmts = NULL_TREE;
4198 vect_create_addr_base_for_vector_ref (dr_stmt, &new_stmts, NULL_TREE);
4199 tree ptr_type = TREE_TYPE (start_addr);
4200 tree size = TYPE_SIZE (ptr_type);
4201 tree type = lang_hooks.types.type_for_size (tree_low_cst (size, 1), 1);
4202 tree vectype_size_minus_1 = build_int_cst (type, vectype_align - 1);
4203 tree elem_size_log =
4204 build_int_cst (type, exact_log2 (vectype_align/vf));
4205 tree vf_minus_1 = build_int_cst (type, vf - 1);
4206 tree vf_tree = build_int_cst (type, vf);
4210 new_bb = bsi_insert_on_edge_immediate (pe, new_stmts);
4211 gcc_assert (!new_bb);
4213 /* Create: byte_misalign = addr & (vectype_size - 1) */
4215 fold_build2 (BIT_AND_EXPR, type, start_addr, vectype_size_minus_1);
4217 /* Create: elem_misalign = byte_misalign / element_size */
4219 fold_build2 (RSHIFT_EXPR, type, byte_misalign, elem_size_log);
4221 /* Create: (niters_type) (VF - elem_misalign)&(VF - 1) */
4222 iters = fold_build2 (MINUS_EXPR, type, vf_tree, elem_misalign);
4223 iters = fold_build2 (BIT_AND_EXPR, type, iters, vf_minus_1);
4224 iters = fold_convert (niters_type, iters);
4227 /* Create: prolog_loop_niters = min (iters, loop_niters) */
4228 /* If the loop bound is known at compile time we already verified that it is
4229 greater than vf; since the misalignment ('iters') is at most vf, there's
4230 no need to generate the MIN_EXPR in this case. */
4231 if (TREE_CODE (loop_niters) != INTEGER_CST)
4232 iters = fold_build2 (MIN_EXPR, niters_type, iters, loop_niters);
4234 if (vect_print_dump_info (REPORT_DETAILS))
4236 fprintf (vect_dump, "niters for prolog loop: ");
4237 print_generic_expr (vect_dump, iters, TDF_SLIM);
4240 var = create_tmp_var (niters_type, "prolog_loop_niters");
4241 add_referenced_var (var);
4242 iters_name = force_gimple_operand (iters, &stmt, false, var);
4244 /* Insert stmt on loop preheader edge. */
4247 basic_block new_bb = bsi_insert_on_edge_immediate (pe, stmt);
4248 gcc_assert (!new_bb);
4255 /* Function vect_update_init_of_dr
4257 NITERS iterations were peeled from LOOP. DR represents a data reference
4258 in LOOP. This function updates the information recorded in DR to
4259 account for the fact that the first NITERS iterations had already been
4260 executed. Specifically, it updates the OFFSET field of DR. */
4263 vect_update_init_of_dr (struct data_reference *dr, tree niters)
4265 tree offset = DR_OFFSET (dr);
/* New offset = old offset + niters * step (bytes advanced per iteration).  */
4267 niters = fold_build2 (MULT_EXPR, TREE_TYPE (niters), niters, DR_STEP (dr));
4268 offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset), offset, niters);
4269 DR_OFFSET (dr) = offset;
4273 /* Function vect_update_inits_of_drs
4275 NITERS iterations were peeled from the loop represented by LOOP_VINFO.
4276 This function updates the information recorded for the data references in
4277 the loop to account for the fact that the first NITERS iterations had
4278 already been executed. Specifically, it updates the initial_condition of the
4279 access_function of all the data_references in the loop. */
4282 vect_update_inits_of_drs (loop_vec_info loop_vinfo, tree niters)
4285 VEC (data_reference_p, heap) *datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
4286 struct data_reference *dr;
/* NOTE(review): this dump guard tests vect_dump/dump_flags directly, unlike
   the vect_print_dump_info (REPORT_DETAILS) idiom used elsewhere in this
   file -- possibly intentional, but worth confirming.  */
4288 if (vect_dump && (dump_flags & TDF_DETAILS))
4289 fprintf (vect_dump, "=== vect_update_inits_of_dr ===");
/* Apply the per-DR offset update to every data reference in the loop.  */
4291 for (i = 0; VEC_iterate (data_reference_p, datarefs, i, dr); i++)
4292 vect_update_init_of_dr (dr, niters);
4296 /* Function vect_do_peeling_for_alignment
4298 Peel the first 'niters' iterations of the loop represented by LOOP_VINFO.
4299 'niters' is set to the misalignment of one of the data references in the
4300 loop, thereby forcing it to refer to an aligned location at the beginning
4301 of the execution of this loop. The data reference for which we are
4302 peeling is recorded in LOOP_VINFO_UNALIGNED_DR. */
4305 vect_do_peeling_for_alignment (loop_vec_info loop_vinfo, struct loops *loops)
4307 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4308 tree niters_of_prolog_loop, ni_name;
4310 struct loop *new_loop;
4312 if (vect_print_dump_info (REPORT_DETAILS))
4313 fprintf (vect_dump, "=== vect_do_peeling_for_alignment ===");
4315 initialize_original_copy_tables ();
4317 ni_name = vect_build_loop_niters (loop_vinfo);
4318 niters_of_prolog_loop = vect_gen_niters_for_prolog_loop (loop_vinfo, ni_name);
4320 /* Peel the prolog loop and iterate it niters_of_prolog_loop. */
4322 slpeel_tree_peel_loop_to_edge (loop, loops, loop_preheader_edge (loop),
4323 niters_of_prolog_loop, ni_name, true);
4324 gcc_assert (new_loop);
4325 #ifdef ENABLE_CHECKING
4326 slpeel_verify_cfg_after_peeling (new_loop, loop);
4329 /* Update number of times loop executes. */
4330 n_iters = LOOP_VINFO_NITERS (loop_vinfo);
4331 LOOP_VINFO_NITERS (loop_vinfo) = fold_build2 (MINUS_EXPR,
4332 TREE_TYPE (n_iters), n_iters, niters_of_prolog_loop);
4334 /* Update the init conditions of the access functions of all data refs. */
4335 vect_update_inits_of_drs (loop_vinfo, niters_of_prolog_loop);
4337 /* After peeling we have to reset scalar evolution analyzer. */
4340 free_original_copy_tables ();
4344 /* Function vect_create_cond_for_align_checks.
4346 Create a conditional expression that represents the alignment checks for
4347 all of data references (array element references) whose alignment must be
4351 LOOP_VINFO - two fields of the loop information are used.
4352 LOOP_VINFO_PTR_MASK is the mask used to check the alignment.
4353 LOOP_VINFO_MAY_MISALIGN_STMTS contains the refs to be checked.
4356 COND_EXPR_STMT_LIST - statements needed to construct the conditional
4358 The returned value is the conditional expression to be used in the if
4359 statement that controls which version of the loop gets executed at runtime.
4361 The algorithm makes two assumptions:
4362 1) The number of bytes "n" in a vector is a power of 2.
4363 2) An address "a" is aligned if a%n is zero and that this
4364 test can be done as a&(n-1) == 0. For example, for 16
4365 byte vectors the test is a&0xf == 0. */
4368 vect_create_cond_for_align_checks (loop_vec_info loop_vinfo,
4369 tree *cond_expr_stmt_list)
4371 VEC(tree,heap) *may_misalign_stmts
4372 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
4374 int mask = LOOP_VINFO_PTR_MASK (loop_vinfo);
4378 tree int_ptrsize_type;
4380 tree or_tmp_name = NULL_TREE;
4381 tree and_tmp, and_tmp_name, and_stmt;
4384 /* Check that mask is one less than a power of 2, i.e., mask is
4385 all zeros followed by all ones. */
4386 gcc_assert ((mask != 0) && ((mask & (mask+1)) == 0));
4388 /* CHECKME: what is the best integer or unsigned type to use to hold a
4389 cast from a pointer value? */
4390 psize = TYPE_SIZE (ptr_type_node);
4392 = lang_hooks.types.type_for_size (tree_low_cst (psize, 1), 0);
4394 /* Create expression (mask & (dr_1 || ... || dr_n)) where dr_i is the address
4395 of the first vector of the i'th data reference. */
4397 for (i = 0; VEC_iterate (tree, may_misalign_stmts, i, ref_stmt); i++)
4399 tree new_stmt_list = NULL_TREE;
4401 tree addr_tmp, addr_tmp_name, addr_stmt;
4402 tree or_tmp, new_or_tmp_name, or_stmt;
4404 /* create: addr_tmp = (int)(address_of_first_vector) */
4405 addr_base = vect_create_addr_base_for_vector_ref (ref_stmt,
4409 if (new_stmt_list != NULL_TREE)
4410 append_to_statement_list_force (new_stmt_list, cond_expr_stmt_list);
4412 sprintf (tmp_name, "%s%d", "addr2int", i);
4413 addr_tmp = create_tmp_var (int_ptrsize_type, tmp_name);
4414 add_referenced_var (addr_tmp);
4415 addr_tmp_name = make_ssa_name (addr_tmp, NULL_TREE);
4416 addr_stmt = fold_convert (int_ptrsize_type, addr_base);
4417 addr_stmt = build2 (MODIFY_EXPR, void_type_node,
4418 addr_tmp_name, addr_stmt);
4419 SSA_NAME_DEF_STMT (addr_tmp_name) = addr_stmt;
4420 append_to_statement_list_force (addr_stmt, cond_expr_stmt_list);
4422 /* The addresses are OR together. */
4424 if (or_tmp_name != NULL_TREE)
4426 /* create: or_tmp = or_tmp | addr_tmp */
4427 sprintf (tmp_name, "%s%d", "orptrs", i);
4428 or_tmp = create_tmp_var (int_ptrsize_type, tmp_name);
4429 add_referenced_var (or_tmp);
4430 new_or_tmp_name = make_ssa_name (or_tmp, NULL_TREE);
4431 or_stmt = build2 (MODIFY_EXPR, void_type_node, new_or_tmp_name,
4432 build2 (BIT_IOR_EXPR, int_ptrsize_type,
4435 SSA_NAME_DEF_STMT (new_or_tmp_name) = or_stmt;
4436 append_to_statement_list_force (or_stmt, cond_expr_stmt_list);
4437 or_tmp_name = new_or_tmp_name;
4440 or_tmp_name = addr_tmp_name;
4444 mask_cst = build_int_cst (int_ptrsize_type, mask);
4446 /* create: and_tmp = or_tmp & mask */
4447 and_tmp = create_tmp_var (int_ptrsize_type, "andmask" );
4448 add_referenced_var (and_tmp);
4449 and_tmp_name = make_ssa_name (and_tmp, NULL_TREE);
4451 and_stmt = build2 (MODIFY_EXPR, void_type_node,
4453 build2 (BIT_AND_EXPR, int_ptrsize_type,
4454 or_tmp_name, mask_cst));
4455 SSA_NAME_DEF_STMT (and_tmp_name) = and_stmt;
4456 append_to_statement_list_force (and_stmt, cond_expr_stmt_list);
4458 /* Make and_tmp the left operand of the conditional test against zero.
4459 if and_tmp has a nonzero bit then some address is unaligned. */
4460 ptrsize_zero = build_int_cst (int_ptrsize_type, 0);
4461 return build2 (EQ_EXPR, boolean_type_node,
4462 and_tmp_name, ptrsize_zero);
4466 /* Function vect_transform_loop.
4468 The analysis phase has determined that the loop is vectorizable.
4469 Vectorize the loop - created vectorized stmts to replace the scalar
4470 stmts in the loop, and update the loop exit condition. */
4473 vect_transform_loop (loop_vec_info loop_vinfo,
4474 struct loops *loops ATTRIBUTE_UNUSED)
4476 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4477 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
4478 int nbbs = loop->num_nodes;
4479 block_stmt_iterator si;
4482 int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
4487 if (vect_print_dump_info (REPORT_DETAILS))
4488 fprintf (vect_dump, "=== vec_transform_loop ===");
4490 /* If the loop has data references that may or may not be aligned then
4491 two versions of the loop need to be generated, one which is vectorized
4492 and one which isn't. A test is then generated to control which of the
4493 loops is executed. The test checks for the alignment of all of the
4494 data references that may or may not be aligned. */
4496 if (VEC_length (tree, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo)))
4500 tree cond_expr_stmt_list = NULL_TREE;
4501 basic_block condition_bb;
4502 block_stmt_iterator cond_exp_bsi;
4503 basic_block merge_bb;
4504 basic_block new_exit_bb;
4506 tree orig_phi, new_phi, arg;
4508 cond_expr = vect_create_cond_for_align_checks (loop_vinfo,
4509 &cond_expr_stmt_list);
4510 initialize_original_copy_tables ();
4511 nloop = loop_version (loops, loop, cond_expr, &condition_bb, true);
4512 free_original_copy_tables();
4514 /** Loop versioning violates an assumption we try to maintain during
4515 vectorization - that the loop exit block has a single predecessor.
4516 After versioning, the exit block of both loop versions is the same
4517 basic block (i.e. it has two predecessors). Just in order to simplify
4518 following transformations in the vectorizer, we fix this situation
4519 here by adding a new (empty) block on the exit-edge of the loop,
4520 with the proper loop-exit phis to maintain loop-closed-form. **/
4522 merge_bb = single_exit (loop)->dest;
4523 gcc_assert (EDGE_COUNT (merge_bb->preds) == 2);
4524 new_exit_bb = split_edge (single_exit (loop));
4525 new_exit_e = single_exit (loop);
4526 e = EDGE_SUCC (new_exit_bb, 0);
4528 for (orig_phi = phi_nodes (merge_bb); orig_phi;
4529 orig_phi = PHI_CHAIN (orig_phi))
4531 new_phi = create_phi_node (SSA_NAME_VAR (PHI_RESULT (orig_phi)),
4533 arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, e);
4534 add_phi_arg (new_phi, arg, new_exit_e);
4535 SET_PHI_ARG_DEF (orig_phi, e->dest_idx, PHI_RESULT (new_phi));
4538 /** end loop-exit-fixes after versioning **/
4540 update_ssa (TODO_update_ssa);
4541 cond_exp_bsi = bsi_last (condition_bb);
4542 bsi_insert_before (&cond_exp_bsi, cond_expr_stmt_list, BSI_SAME_STMT);
4545 /* CHECKME: we wouldn't need this if we called update_ssa once
4547 bitmap_zero (vect_vnames_to_rename);
4549 /* Peel the loop if there are data refs with unknown alignment.
4550 Only one data ref with unknown store is allowed. */
4552 if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
4553 vect_do_peeling_for_alignment (loop_vinfo, loops);
4555 /* If the loop has a symbolic number of iterations 'n' (i.e. it's not a
4556 compile time constant), or it is a constant that doesn't divide by the
4557 vectorization factor, then an epilog loop needs to be created.
4558 We therefore duplicate the loop: the original loop will be vectorized,
4559 and will compute the first (n/VF) iterations. The second copy of the loop
4560 will remain scalar and will compute the remaining (n%VF) iterations.
4561 (VF is the vectorization factor). */
4563 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
4564 || (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
4565 && LOOP_VINFO_INT_NITERS (loop_vinfo) % vectorization_factor != 0))
4566 vect_do_peeling_for_loop_bound (loop_vinfo, &ratio, loops);
4568 ratio = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
4569 LOOP_VINFO_INT_NITERS (loop_vinfo) / vectorization_factor);
4571 /* 1) Make sure the loop header has exactly two entries
4572 2) Make sure we have a preheader basic block. */
4574 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
4576 split_edge (loop_preheader_edge (loop));
4578 /* FORNOW: the vectorizer supports only loops which body consist
4579 of one basic block (header + empty latch). When the vectorizer will
4580 support more involved loop forms, the order by which the BBs are
4581 traversed need to be reconsidered. */
4583 for (i = 0; i < nbbs; i++)
4585 basic_block bb = bbs[i];
4587 for (si = bsi_start (bb); !bsi_end_p (si);)
4589 tree stmt = bsi_stmt (si);
4590 stmt_vec_info stmt_info;
4593 if (vect_print_dump_info (REPORT_DETAILS))
4595 fprintf (vect_dump, "------>vectorizing statement: ");
4596 print_generic_expr (vect_dump, stmt, TDF_SLIM);
4598 stmt_info = vinfo_for_stmt (stmt);
4599 gcc_assert (stmt_info);
4600 if (!STMT_VINFO_RELEVANT_P (stmt_info)
4601 && !STMT_VINFO_LIVE_P (stmt_info))
4607 if ((TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
4608 != (unsigned HOST_WIDE_INT) vectorization_factor)
4609 && vect_print_dump_info (REPORT_DETAILS))
4610 fprintf (vect_dump, "multiple-types.");
4612 /* -------- vectorize statement ------------ */
4613 if (vect_print_dump_info (REPORT_DETAILS))
4614 fprintf (vect_dump, "transform statement.");
4616 strided_store = false;
4617 is_store = vect_transform_stmt (stmt, &si, &strided_store);
4621 if (DR_GROUP_FIRST_DR (stmt_info))
4623 /* Interleaving. If IS_STORE is TRUE, the vectorization of the
4624 interleaving chain was completed - free all the stores in
4626 tree next = DR_GROUP_FIRST_DR (stmt_info);
4628 stmt_vec_info next_stmt_info;
4632 next_stmt_info = vinfo_for_stmt (next);
4633 /* Free the attached stmt_vec_info and remove the stmt. */
4634 ann = stmt_ann (next);
4635 tmp = DR_GROUP_NEXT_DR (next_stmt_info);
4636 free (next_stmt_info);
4637 set_stmt_info (ann, NULL);
4640 bsi_remove (&si, true);
4645 /* Free the attached stmt_vec_info and remove the stmt. */
4646 ann = stmt_ann (stmt);
4648 set_stmt_info (ann, NULL);
4649 bsi_remove (&si, true);
4657 /* This is case of skipped interleaved store. We don't free
4658 its stmt_vec_info. */
4659 bsi_remove (&si, true);
4667 slpeel_make_loop_iterate_ntimes (loop, ratio);
4669 EXECUTE_IF_SET_IN_BITMAP (vect_vnames_to_rename, 0, j, bi)
4670 mark_sym_for_renaming (SSA_NAME_VAR (ssa_name (j)));
4672 /* The memory tags and pointers in vectorized statements need to
4673 have their SSA forms updated. FIXME, why can't this be delayed
4674 until all the loops have been transformed? */
4675 update_ssa (TODO_update_ssa);
4677 if (vect_print_dump_info (REPORT_VECTORIZED_LOOPS))
4678 fprintf (vect_dump, "LOOP VECTORIZED.");