/* Transformation Utilities for Loop Vectorization.
   Copyright (C) 2003, 2004, 2005, 2006, 2007 Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 2, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING.  If not, write to the Free
Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA
02110-1301, USA.  */
#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "tm.h"
#include "basic-block.h"
#include "diagnostic.h"
#include "tree-flow.h"
#include "tree-dump.h"
#include "cfgloop.h"
#include "expr.h"
#include "optabs.h"
#include "tree-data-ref.h"
#include "tree-chrec.h"
#include "tree-scalar-evolution.h"
#include "tree-vectorizer.h"
#include "langhooks.h"
#include "tree-pass.h"
/* Utility functions for the code transformation.  */
static bool vect_transform_stmt (tree, block_stmt_iterator *, bool *);
static tree vect_create_destination_var (tree, tree);
static tree vect_create_data_ref_ptr
  (tree, block_stmt_iterator *, tree, tree *, tree *, bool, tree);
static tree vect_create_addr_base_for_vector_ref (tree, tree *, tree);
static tree vect_setup_realignment (tree, block_stmt_iterator *, tree *);
static tree vect_get_new_vect_var (tree, enum vect_var_kind, const char *);
static tree vect_get_vec_def_for_operand (tree, tree, tree *);
static tree vect_init_vector (tree, tree, tree);
static void vect_finish_stmt_generation
  (tree stmt, tree vec_stmt, block_stmt_iterator *bsi);
static bool vect_is_simple_cond (tree, loop_vec_info);
static void update_vuses_to_preheader (tree, struct loop*);
static void vect_create_epilog_for_reduction (tree, tree, enum tree_code, tree);
static tree get_initial_def_for_reduction (tree, tree, tree *);
/* Utility functions dealing with loop peeling (not the peeling itself).  */
static void vect_generate_tmps_on_preheader
  (loop_vec_info, tree *, tree *, tree *);
static tree vect_build_loop_niters (loop_vec_info);
static void vect_update_ivs_after_vectorizer (loop_vec_info, tree, edge);
static tree vect_gen_niters_for_prolog_loop (loop_vec_info, tree);
static void vect_update_init_of_dr (struct data_reference *, tree niters);
static void vect_update_inits_of_drs (loop_vec_info, tree);
static int vect_min_worthwhile_factor (enum tree_code);
/* Function vect_get_new_vect_var.

   Returns a name for a new variable.  The current naming scheme appends the
   prefix "vect_" or "vect_p" (depending on the value of VAR_KIND) to
   the name of vectorizer generated variables, and appends that to NAME if
   provided.  */

static tree
vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
{
  const char *prefix;
  tree new_vect_var;

  switch (var_kind)
    {
    case vect_simple_var: prefix = "vect_"; break;
    case vect_pointer_var: prefix = "vect_p"; break;
    default: gcc_unreachable ();
    }

  if (name)
    new_vect_var = create_tmp_var (type, concat (prefix, name, NULL));
  else
    new_vect_var = create_tmp_var (type, prefix);

  /* Mark vector typed variable as a gimple register variable.  */
  if (TREE_CODE (type) == VECTOR_TYPE)
    DECL_GIMPLE_REG_P (new_vect_var) = true;

  return new_vect_var;
}
/* Function vect_create_addr_base_for_vector_ref.

   Create an expression that computes the address of the first memory location
   that will be accessed for a data reference.

   Input:
   STMT: The statement containing the data reference.
   NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
   OFFSET: Optional.  If supplied, it is added to the initial address.

   Output:
   1. Return an SSA_NAME whose value is the address of the memory location of
      the first vector of the data reference.
   2. If new_stmt_list is not NULL_TREE after return then the caller must insert
      these statement(s) which define the returned SSA_NAME.

   FORNOW: We are only handling array accesses with step 1.  */
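
/* Illustrative sketch, not part of the original sources: for a data-ref
   'a[i]' with DR_BASE_ADDRESS = &a, DR_OFFSET = 0 and DR_INIT = 0, the
   stmts appended to NEW_STMT_LIST amount to

     base_off.0 = 0;                           (base_offset + init)
     vect_pa.1 = (v4si *) (&a + base_off.0);

   and vect_pa.1 is the returned SSA_NAME (all names hypothetical).  */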
static tree
vect_create_addr_base_for_vector_ref (tree stmt,
                                      tree *new_stmt_list,
                                      tree offset)
{
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
  tree data_ref_base = unshare_expr (DR_BASE_ADDRESS (dr));
  tree base_name = build_fold_indirect_ref (data_ref_base);
  tree dest, new_stmt, vec_stmt;
  tree addr_base, addr_expr;
  tree base_offset = unshare_expr (DR_OFFSET (dr));
  tree init = unshare_expr (DR_INIT (dr));
  tree vect_ptr_type, addr_expr2;

  /* Create base_offset.  */
  base_offset = size_binop (PLUS_EXPR, base_offset, init);
  dest = create_tmp_var (TREE_TYPE (base_offset), "base_off");
  add_referenced_var (dest);
  base_offset = force_gimple_operand (base_offset, &new_stmt, false, dest);
  append_to_statement_list_force (new_stmt, new_stmt_list);

  if (offset)
    {
      tree tmp = create_tmp_var (TREE_TYPE (base_offset), "offset");
      tree step;

      /* For interleaved access we divide STEP by the size of the
         interleaving group.  */
      if (DR_GROUP_SIZE (stmt_info))
        step = fold_build2 (TRUNC_DIV_EXPR, TREE_TYPE (offset), DR_STEP (dr),
                            build_int_cst (TREE_TYPE (offset),
                                           DR_GROUP_SIZE (stmt_info)));
      else
        step = DR_STEP (dr);

      add_referenced_var (tmp);
      offset = fold_build2 (MULT_EXPR, TREE_TYPE (offset), offset, step);
      base_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (base_offset),
                                 base_offset, offset);
      base_offset = force_gimple_operand (base_offset, &new_stmt, false, tmp);
      append_to_statement_list_force (new_stmt, new_stmt_list);
    }

  /* base + base_offset  */
  addr_base = fold_build2 (PLUS_EXPR, TREE_TYPE (data_ref_base), data_ref_base,
                           base_offset);

  vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));

  /* addr_expr = addr_base  */
  addr_expr = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var,
                                     get_name (base_name));
  add_referenced_var (addr_expr);
  vec_stmt = fold_convert (vect_ptr_type, addr_base);
  addr_expr2 = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var,
                                      get_name (base_name));
  add_referenced_var (addr_expr2);
  vec_stmt = force_gimple_operand (vec_stmt, &new_stmt, false, addr_expr2);
  append_to_statement_list_force (new_stmt, new_stmt_list);

  if (vect_print_dump_info (REPORT_DETAILS))
    {
      fprintf (vect_dump, "created ");
      print_generic_expr (vect_dump, vec_stmt, TDF_SLIM);
    }
  return vec_stmt;
}
/* Function vect_create_data_ref_ptr.

   Create a new pointer to vector type (vp), that points to the first location
   accessed in the loop by STMT, along with the def-use update chain to
   appropriately advance the pointer through the loop iterations.  Also set
   aliasing information for the pointer.  This vector pointer is used by the
   callers to this function to create a memory reference expression for vector
   load/store access.

   Input:
   1. STMT: a stmt that references memory.  Expected to be of the form
         GIMPLE_MODIFY_STMT <name, data-ref> or
         GIMPLE_MODIFY_STMT <data-ref, name>.
   2. BSI: block_stmt_iterator where new stmts can be added.
   3. OFFSET (optional): an offset to be added to the initial address accessed
        by the data-ref in STMT.
   4. ONLY_INIT: indicate if vp is to be updated in the loop, or remain
        pointing to the initial address.
   5. TYPE: if not NULL indicates the required type of the data-ref.

   Output:
   1. Declare a new ptr to vector_type, and have it point to the base of the
      data reference (initial address accessed by the data reference).
      For example, for vector of type V8HI, the following code is generated:

      v8hi *vp;
      vp = (v8hi *)initial_address;

      if OFFSET is not supplied:
         initial_address = &a[init];
      if OFFSET is supplied:
         initial_address = &a[init + OFFSET];

      Return the initial_address in INITIAL_ADDRESS.

   2. If ONLY_INIT is true, just return the initial pointer.  Otherwise, also
      update the pointer in each iteration of the loop.

      Return the increment stmt that updates the pointer in PTR_INCR.

   3. Return the pointer.  */
static tree
vect_create_data_ref_ptr (tree stmt,
                          block_stmt_iterator *bsi ATTRIBUTE_UNUSED,
                          tree offset, tree *initial_address, tree *ptr_incr,
                          bool only_init, tree type)
{
  tree base_name;
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
  tree vect_ptr_type, vect_ptr, tag;
  tree new_temp, vec_stmt, vect_ptr_init;
  tree new_stmt_list = NULL_TREE;
  edge pe = loop_preheader_edge (loop);
  basic_block new_bb;
  struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);

  base_name = build_fold_indirect_ref (unshare_expr (DR_BASE_ADDRESS (dr)));

  if (vect_print_dump_info (REPORT_DETAILS))
    {
      tree data_ref_base = base_name;
      fprintf (vect_dump, "create vector-pointer variable to type: ");
      print_generic_expr (vect_dump, vectype, TDF_SLIM);
      if (TREE_CODE (data_ref_base) == VAR_DECL)
        fprintf (vect_dump, "  vectorizing a one dimensional array ref: ");
      else if (TREE_CODE (data_ref_base) == ARRAY_REF)
        fprintf (vect_dump, "  vectorizing a multidimensional array ref: ");
      else if (TREE_CODE (data_ref_base) == COMPONENT_REF)
        fprintf (vect_dump, "  vectorizing a record based array ref: ");
      else if (TREE_CODE (data_ref_base) == SSA_NAME)
        fprintf (vect_dump, "  vectorizing a pointer ref: ");
      print_generic_expr (vect_dump, base_name, TDF_SLIM);
    }
  /** (1) Create the new vector-pointer variable:  **/

  if (type)
    vect_ptr_type = build_pointer_type (type);
  else
    vect_ptr_type = build_pointer_type (vectype);
  vect_ptr = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var,
                                    get_name (base_name));
  add_referenced_var (vect_ptr);

  /** (2) Add aliasing information to the new vector-pointer:
          (The points-to info (DR_PTR_INFO) may be defined later.)  **/

  tag = DR_MEMTAG (dr);
  gcc_assert (tag);

  /* If tag is a variable (and NOT_A_TAG) then a new symbol memory
     tag must be created with tag added to its may alias list.  */
  if (!MTAG_P (tag))
    new_type_alias (vect_ptr, tag, DR_REF (dr));
  else
    set_symbol_mem_tag (vect_ptr, tag);

  var_ann (vect_ptr)->subvars = DR_SUBVARS (dr);

  /** (3) Calculate the initial address of the vector-pointer, and set
          the vector-pointer to point to it before the loop:  **/

  /* Create: &(base[init_val+offset]) in the loop preheader.  */
  new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
                                                   offset);
  pe = loop_preheader_edge (loop);
  new_bb = bsi_insert_on_edge_immediate (pe, new_stmt_list);
  gcc_assert (!new_bb);
  *initial_address = new_temp;

  /* Create: p = (vectype *) initial_base  */
  vec_stmt = fold_convert (vect_ptr_type, new_temp);
  vec_stmt = build_gimple_modify_stmt (vect_ptr, vec_stmt);
  vect_ptr_init = make_ssa_name (vect_ptr, vec_stmt);
  GIMPLE_STMT_OPERAND (vec_stmt, 0) = vect_ptr_init;
  new_bb = bsi_insert_on_edge_immediate (pe, vec_stmt);
  gcc_assert (!new_bb);

  /** (4) Handle the updating of the vector-pointer inside the loop:  **/

  if (only_init) /* No update in loop is required.  */
    {
      /* Copy the points-to information if it exists.  */
      if (DR_PTR_INFO (dr))
        duplicate_ssa_name_ptr_info (vect_ptr_init, DR_PTR_INFO (dr));
      return vect_ptr_init;
    }
  else
    {
      block_stmt_iterator incr_bsi;
      bool insert_after;
      tree indx_before_incr, indx_after_incr;
      tree incr;

      standard_iv_increment_position (loop, &incr_bsi, &insert_after);
      create_iv (vect_ptr_init,
                 fold_convert (vect_ptr_type, TYPE_SIZE_UNIT (vectype)),
                 NULL_TREE, loop, &incr_bsi, insert_after,
                 &indx_before_incr, &indx_after_incr);
      incr = bsi_stmt (incr_bsi);
      set_stmt_info (stmt_ann (incr),
                     new_stmt_vec_info (incr, loop_vinfo));

      /* Copy the points-to information if it exists.  */
      if (DR_PTR_INFO (dr))
        {
          duplicate_ssa_name_ptr_info (indx_before_incr, DR_PTR_INFO (dr));
          duplicate_ssa_name_ptr_info (indx_after_incr, DR_PTR_INFO (dr));
        }
      merge_alias_info (vect_ptr_init, indx_before_incr);
      merge_alias_info (vect_ptr_init, indx_after_incr);
      if (ptr_incr)
        *ptr_incr = incr;

      return indx_before_incr;
    }
}
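
/* Illustrative sketch (hypothetical names, not from the original sources):
   after this function, a load from 'a' vectorized with V4SI leaves IR of
   roughly this shape:

     preheader:
       vect_pa.0 = (v4si *) &a;
     loop:
       vect_pa.1 = PHI <vect_pa.0 (preheader), vect_pa.2 (latch)>
       ...
       vect_pa.2 = vect_pa.1 + 16;     <-- increment created by create_iv

   where 16 is TYPE_SIZE_UNIT (V4SI), and vect_pa.1 is what is returned
   when ONLY_INIT is false.  */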
/* Function bump_vector_ptr

   Increment a pointer (to a vector type) by vector-size.  Connect the new
   increment stmt to the existing def-use update-chain of the pointer.

   The pointer def-use update-chain before this function:
                        DATAREF_PTR = phi (p_0, p_2)
                        ....
        PTR_INCR:       p_2 = DATAREF_PTR + step

   The pointer def-use update-chain after this function:
                        DATAREF_PTR = phi (p_0, p_2)
                        ....
                        NEW_DATAREF_PTR = DATAREF_PTR + vector_size
                        ....
        PTR_INCR:       p_2 = NEW_DATAREF_PTR + step

   Input:
   DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
                 in the loop.
   PTR_INCR - the stmt that updates the pointer in each iteration of the loop.
              The increment amount across iterations is also expected to be
              vector_size.
   BSI - location where the new update stmt is to be placed.
   STMT - the original scalar memory-access stmt that is being vectorized.

   Output: Return NEW_DATAREF_PTR as illustrated above.  */
static tree
bump_vector_ptr (tree dataref_ptr, tree ptr_incr, block_stmt_iterator *bsi,
                 tree stmt)
{
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
  tree vptr_type = TREE_TYPE (dataref_ptr);
  tree ptr_var = SSA_NAME_VAR (dataref_ptr);
  tree update = fold_convert (vptr_type, TYPE_SIZE_UNIT (vectype));
  tree incr_stmt;
  ssa_op_iter iter;
  use_operand_p use_p;
  tree new_dataref_ptr;

  incr_stmt = build_gimple_modify_stmt (ptr_var,
                                        build2 (PLUS_EXPR, vptr_type,
                                                dataref_ptr, update));
  new_dataref_ptr = make_ssa_name (ptr_var, incr_stmt);
  GIMPLE_STMT_OPERAND (incr_stmt, 0) = new_dataref_ptr;
  vect_finish_stmt_generation (stmt, incr_stmt, bsi);

  /* Update the vector-pointer's cross-iteration increment.  */
  FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
    {
      tree use = USE_FROM_PTR (use_p);

      if (use == dataref_ptr)
        SET_USE (use_p, new_dataref_ptr);
      else
        gcc_assert (tree_int_cst_compare (use, update) == 0);
    }

  /* Copy the points-to information if it exists.  */
  if (DR_PTR_INFO (dr))
    duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
  merge_alias_info (new_dataref_ptr, dataref_ptr);

  return new_dataref_ptr;
}
/* Function vect_create_destination_var.

   Create a new temporary of type VECTYPE.  */

static tree
vect_create_destination_var (tree scalar_dest, tree vectype)
{
  tree vec_dest;
  const char *new_name;
  tree type;
  enum vect_var_kind kind;

  kind = vectype ? vect_simple_var : vect_scalar_var;
  type = vectype ? vectype : TREE_TYPE (scalar_dest);

  gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);

  new_name = get_name (scalar_dest);
  if (!new_name)
    new_name = "var_";
  vec_dest = vect_get_new_vect_var (type, kind, new_name);
  add_referenced_var (vec_dest);

  return vec_dest;
}
/* Function vect_init_vector.

   Insert a new stmt (INIT_STMT) that initializes a new vector variable with
   the vector elements of VECTOR_VAR.  Return the DEF of INIT_STMT.  It will be
   used in the vectorization of STMT.  */

static tree
vect_init_vector (tree stmt, tree vector_var, tree vector_type)
{
  stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  tree new_var;
  tree init_stmt;
  tree vec_oprnd;
  edge pe;
  tree new_temp;
  basic_block new_bb;

  new_var = vect_get_new_vect_var (vector_type, vect_simple_var, "cst_");
  add_referenced_var (new_var);

  init_stmt = build_gimple_modify_stmt (new_var, vector_var);
  new_temp = make_ssa_name (new_var, init_stmt);
  GIMPLE_STMT_OPERAND (init_stmt, 0) = new_temp;

  pe = loop_preheader_edge (loop);
  new_bb = bsi_insert_on_edge_immediate (pe, init_stmt);
  gcc_assert (!new_bb);

  if (vect_print_dump_info (REPORT_DETAILS))
    {
      fprintf (vect_dump, "created new init_stmt: ");
      print_generic_expr (vect_dump, init_stmt, TDF_SLIM);
    }

  vec_oprnd = GIMPLE_STMT_OPERAND (init_stmt, 0);
  return vec_oprnd;
}
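
/* Usage sketch (hypothetical names, not from the original sources): a caller
   builds a constant vector and emits its initialization on the preheader
   edge via this function, e.g.

     vec = build_vector (v4si_type, t);                {c,c,c,c}
     vec_def = vect_init_vector (stmt, vec, v4si_type);

   after which vec_def can be used as an operand of the vectorized stmt.  */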
/* Function get_initial_def_for_induction

   Input:
   STMT - a stmt that performs an induction operation in the loop.
   IV_PHI - the initial value of the induction variable.

   Output:
   Return a vector variable, initialized with the first VF values of
   the induction variable.  E.g., for an iv with IV_PHI='X' and
   evolution S, for a vector of 4 units, we want to return:
   [X, X + S, X + 2*S, X + 3*S].  */
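
/* Illustrative numbers (not from the original sources): for X = 0, S = 3
   and VF = nunits = 4, the returned vector is [0, 3, 6, 9], and the step
   vector created below is [12, 12, 12, 12], i.e. [VF*S, VF*S, VF*S, VF*S].  */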
static tree
get_initial_def_for_induction (tree stmt, tree iv_phi)
{
  stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  tree scalar_type = TREE_TYPE (iv_phi);
  tree vectype = get_vectype_for_scalar_type (scalar_type);
  int nunits = TYPE_VECTOR_SUBPARTS (vectype);
  edge pe = loop_preheader_edge (loop);
  basic_block new_bb;
  block_stmt_iterator bsi;
  tree vec, vec_init, vec_step, t;
  tree access_fn;
  tree new_var, new_name;
  tree init_stmt, expr, stmts;
  tree induction_phi, induc_def, new_stmt, vec_def, vec_dest;
  tree init_expr, step_expr;
  int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
  int i;
  bool ok;
  int ncopies = vf / nunits;
  stmt_vec_info phi_info = vinfo_for_stmt (iv_phi);

  gcc_assert (phi_info);

  if (STMT_VINFO_VEC_STMT (phi_info))
    {
      induction_phi = STMT_VINFO_VEC_STMT (phi_info);
      gcc_assert (TREE_CODE (induction_phi) == PHI_NODE);

      if (vect_print_dump_info (REPORT_DETAILS))
        {
          fprintf (vect_dump, "induction already vectorized:");
          print_generic_expr (vect_dump, iv_phi, TDF_SLIM);
          fprintf (vect_dump, "\n");
          print_generic_expr (vect_dump, induction_phi, TDF_SLIM);
        }

      return PHI_RESULT (induction_phi);
    }

  gcc_assert (ncopies >= 1);

  access_fn = analyze_scalar_evolution (loop, PHI_RESULT (iv_phi));
  gcc_assert (access_fn);
  ok = vect_is_simple_iv_evolution (loop->num, access_fn, &init_expr, &step_expr);
  gcc_assert (ok);

  /* Create the vector that holds the initial_value of the induction.  */
  new_var = vect_get_new_vect_var (scalar_type, vect_scalar_var, "var_");
  add_referenced_var (new_var);

  new_name = force_gimple_operand (init_expr, &stmts, false, new_var);
  if (stmts)
    {
      new_bb = bsi_insert_on_edge_immediate (pe, stmts);
      gcc_assert (!new_bb);
    }

  t = NULL_TREE;
  t = tree_cons (NULL_TREE, new_name, t);
  for (i = 1; i < nunits; i++)
    {
      tree tmp;

      /* Create: new_name = new_name + step_expr  */
      tmp = fold_build2 (PLUS_EXPR, scalar_type, new_name, step_expr);
      init_stmt = build_gimple_modify_stmt (new_var, tmp);
      new_name = make_ssa_name (new_var, init_stmt);
      GIMPLE_STMT_OPERAND (init_stmt, 0) = new_name;

      new_bb = bsi_insert_on_edge_immediate (pe, init_stmt);
      gcc_assert (!new_bb);

      if (vect_print_dump_info (REPORT_DETAILS))
        {
          fprintf (vect_dump, "created new init_stmt: ");
          print_generic_expr (vect_dump, init_stmt, TDF_SLIM);
        }
      t = tree_cons (NULL_TREE, new_name, t);
    }
  vec = build_constructor_from_list (vectype, nreverse (t));
  vec_init = vect_init_vector (stmt, vec, vectype);

  /* Create the vector that holds the step of the induction.  */
  expr = build_int_cst (scalar_type, vf);
  new_name = fold_build2 (MULT_EXPR, scalar_type, expr, step_expr);
  t = NULL_TREE;
  for (i = 0; i < nunits; i++)
    t = tree_cons (NULL_TREE, unshare_expr (new_name), t);
  vec = build_constructor_from_list (vectype, t);
  vec_step = vect_init_vector (stmt, vec, vectype);
  /* Create the following def-use cycle:
     loop prolog:
         vec_init = [X, X+S, X+2*S, X+3*S]
         vec_step = [VF*S, VF*S, VF*S, VF*S]
     loop:
         vec_iv = PHI <vec_init, vec_loop>
         ...
         STMT
         ...
         vec_loop = vec_iv + vec_step;  */

  /* Create the induction-phi that defines the induction-operand.  */
  vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
  add_referenced_var (vec_dest);
  induction_phi = create_phi_node (vec_dest, loop->header);
  set_stmt_info (get_stmt_ann (induction_phi),
                 new_stmt_vec_info (induction_phi, loop_vinfo));
  induc_def = PHI_RESULT (induction_phi);

  /* Create the iv update inside the loop.  */
  new_stmt = build_gimple_modify_stmt (NULL_TREE,
                                       build2 (PLUS_EXPR, vectype,
                                               induc_def, vec_step));
  vec_def = make_ssa_name (vec_dest, new_stmt);
  GIMPLE_STMT_OPERAND (new_stmt, 0) = vec_def;
  bsi = bsi_for_stmt (stmt);
  vect_finish_stmt_generation (stmt, new_stmt, &bsi);

  /* Set the arguments of the phi node:  */
  add_phi_arg (induction_phi, vec_init, loop_preheader_edge (loop));
  add_phi_arg (induction_phi, vec_def, loop_latch_edge (loop));

  /* In case the vectorization factor (VF) is bigger than the number
     of elements that we can fit in a vectype (nunits), we have to generate
     more than one vector stmt - i.e. - we need to "unroll" the
     vector stmt by a factor VF/nunits.  For more details see documentation
     in vectorizable_operation.  */
  if (ncopies > 1)
    {
      stmt_vec_info prev_stmt_vinfo;

      /* Create the vector that holds the step of the induction.  */
      expr = build_int_cst (scalar_type, nunits);
      new_name = fold_build2 (MULT_EXPR, scalar_type, expr, step_expr);
      t = NULL_TREE;
      for (i = 0; i < nunits; i++)
        t = tree_cons (NULL_TREE, unshare_expr (new_name), t);
      vec = build_constructor_from_list (vectype, t);
      vec_step = vect_init_vector (stmt, vec, vectype);

      prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
      for (i = 1; i < ncopies; i++)
        {
          tree tmp;

          /* vec_i = vec_prev + vec_{step*nunits}  */
          tmp = build2 (PLUS_EXPR, vectype, vec_def, vec_step);
          new_stmt = build_gimple_modify_stmt (NULL_TREE, tmp);
          vec_def = make_ssa_name (vec_dest, new_stmt);
          GIMPLE_STMT_OPERAND (new_stmt, 0) = vec_def;
          bsi = bsi_for_stmt (stmt);
          vect_finish_stmt_generation (stmt, new_stmt, &bsi);

          STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
          prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
        }
    }

  if (vect_print_dump_info (REPORT_DETAILS))
    {
      fprintf (vect_dump, "transform induction: created def-use cycle:");
      print_generic_expr (vect_dump, induction_phi, TDF_SLIM);
      fprintf (vect_dump, "\n");
      print_generic_expr (vect_dump, SSA_NAME_DEF_STMT (vec_def), TDF_SLIM);
    }

  STMT_VINFO_VEC_STMT (phi_info) = induction_phi;
  return induc_def;
}
/* Function vect_get_vec_def_for_operand.

   OP is an operand in STMT.  This function returns a (vector) def that will be
   used in the vectorized stmt for STMT.

   In the case that OP is an SSA_NAME which is defined in the loop, then
   STMT_VINFO_VEC_STMT of the defining stmt holds the relevant def.

   In case OP is an invariant or constant, a new stmt that creates a vector def
   needs to be introduced.  */
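
/* Illustrative sketch (hypothetical names): for the scalar stmt
   'x_1 = y_2 + 5' vectorized with V4SI, the constant operand '5' is handled
   by case 1 below, producing 'vect_cst_.0 = {5,5,5,5}' in the preheader and
   returning its SSA name.  */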
static tree
vect_get_vec_def_for_operand (tree op, tree stmt, tree *scalar_def)
{
  tree vec_oprnd;
  tree vec_stmt;
  tree def_stmt;
  stmt_vec_info def_stmt_info = NULL;
  stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
  tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
  int nunits = TYPE_VECTOR_SUBPARTS (vectype);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  tree def;
  bool is_simple_use;
  tree vector_type;
  enum vect_def_type dt;
  int i;

  if (vect_print_dump_info (REPORT_DETAILS))
    {
      fprintf (vect_dump, "vect_get_vec_def_for_operand: ");
      print_generic_expr (vect_dump, op, TDF_SLIM);
    }

  is_simple_use = vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt);
  gcc_assert (is_simple_use);
  if (vect_print_dump_info (REPORT_DETAILS))
    {
      if (def)
        {
          fprintf (vect_dump, "def = ");
          print_generic_expr (vect_dump, def, TDF_SLIM);
        }
      if (def_stmt)
        {
          fprintf (vect_dump, "  def_stmt = ");
          print_generic_expr (vect_dump, def_stmt, TDF_SLIM);
        }
    }
  switch (dt)
    {
    /* Case 1: operand is a constant.  */
    case vect_constant_def:
      {
        tree t = NULL_TREE;
        tree vec_cst;

        /* Create 'vect_cst_ = {cst,cst,...,cst}'  */
        if (vect_print_dump_info (REPORT_DETAILS))
          fprintf (vect_dump, "Create vector_cst. nunits = %d", nunits);

        for (i = nunits - 1; i >= 0; --i)
          t = tree_cons (NULL_TREE, op, t);

        vector_type = get_vectype_for_scalar_type (TREE_TYPE (op));
        vec_cst = build_vector (vector_type, t);
        return vect_init_vector (stmt, vec_cst, vector_type);
      }

    /* Case 2: operand is defined outside the loop - loop invariant.  */
    case vect_invariant_def:
      {
        tree t = NULL_TREE;
        tree vec_inv;

        /* Create 'vec_inv = {inv,inv,..,inv}'  */
        if (vect_print_dump_info (REPORT_DETAILS))
          fprintf (vect_dump, "Create vector_inv.");

        for (i = nunits - 1; i >= 0; --i)
          t = tree_cons (NULL_TREE, def, t);

        /* FIXME: use build_constructor directly.  */
        vector_type = get_vectype_for_scalar_type (TREE_TYPE (def));
        vec_inv = build_constructor_from_list (vector_type, t);
        return vect_init_vector (stmt, vec_inv, vector_type);
      }

    /* Case 3: operand is defined inside the loop.  */
    case vect_loop_def:
      {
        if (scalar_def)
          *scalar_def = def_stmt;

        /* Get the def from the vectorized stmt.  */
        def_stmt_info = vinfo_for_stmt (def_stmt);
        vec_stmt = STMT_VINFO_VEC_STMT (def_stmt_info);
        gcc_assert (vec_stmt);
        vec_oprnd = GIMPLE_STMT_OPERAND (vec_stmt, 0);
        return vec_oprnd;
      }

    /* Case 4: operand is defined by a loop header phi - reduction.  */
    case vect_reduction_def:
      {
        gcc_assert (TREE_CODE (def_stmt) == PHI_NODE);

        /* Get the def before the loop.  */
        op = PHI_ARG_DEF_FROM_EDGE (def_stmt, loop_preheader_edge (loop));
        return get_initial_def_for_reduction (stmt, op, scalar_def);
      }

    /* Case 5: operand is defined by loop-header phi - induction.  */
    case vect_induction_def:
      {
        gcc_assert (TREE_CODE (def_stmt) == PHI_NODE);

        /* Get the def before the loop.  */
        return get_initial_def_for_induction (stmt, def_stmt);
      }

    default:
      gcc_unreachable ();
    }
}
/* Function vect_get_vec_def_for_stmt_copy

   Return a vector-def for an operand.  This function is used when the
   vectorized stmt to be created (by the caller to this function) is a "copy"
   created in case the vectorized result cannot fit in one vector, and several
   copies of the vector-stmt are required.  In this case the vector-def is
   retrieved from the vector stmt recorded in the STMT_VINFO_RELATED_STMT field
   of the stmt that defines VEC_OPRND.
   DT is the type of the vector def VEC_OPRND.

   Context:
   In case the vectorization factor (VF) is bigger than the number
   of elements that can fit in a vectype (nunits), we have to generate
   more than one vector stmt to vectorize the scalar stmt.  This situation
   arises when there are multiple data-types operated upon in the loop; the
   smallest data-type determines the VF, and as a result, when vectorizing
   stmts operating on wider types we need to create 'VF/nunits' "copies" of the
   vector stmt (each computing a vector of 'nunits' results, and together
   computing 'VF' results in each iteration).  This function is called when
   vectorizing such a stmt (e.g. vectorizing S2 in the illustration below, in
   which VF=16 and nunits=4, so the number of copies required is 4):

   scalar stmt:         vectorized into:        STMT_VINFO_RELATED_STMT

   S1: x = load         VS1.0: vx.0 = memref0   VS1.1
                        VS1.1: vx.1 = memref1   VS1.2
                        VS1.2: vx.2 = memref2   VS1.3
                        VS1.3: vx.3 = memref3

   S2: z = x + ...      VSnew.0: vz0 = vx.0 + ...  VSnew.1
                        VSnew.1: vz1 = vx.1 + ...  VSnew.2
                        VSnew.2: vz2 = vx.2 + ...  VSnew.3
                        VSnew.3: vz3 = vx.3 + ...

   The vectorization of S1 is explained in vectorizable_load.
   The vectorization of S2:
        To create the first vector-stmt out of the 4 copies - VSnew.0 -
   the function 'vect_get_vec_def_for_operand' is called to
   get the relevant vector-def for each operand of S2.  For operand x it
   returns the vector-def 'vx.0'.

        To create the remaining copies of the vector-stmt (VSnew.j), this
   function is called to get the relevant vector-def for each operand.  It is
   obtained from the respective VS1.j stmt, which is recorded in the
   STMT_VINFO_RELATED_STMT field of the stmt that defines VEC_OPRND.

        For example, to obtain the vector-def 'vx.1' in order to create the
   vector stmt 'VSnew.1', this function is called with VEC_OPRND='vx.0'.
   Given 'vx.0' we obtain the stmt that defines it ('VS1.0'); from the
   STMT_VINFO_RELATED_STMT field of 'VS1.0' we obtain the next copy - 'VS1.1',
   and return its def ('vx.1').
   Overall, to create the above sequence this function will be called 3 times:
        vx.1 = vect_get_vec_def_for_stmt_copy (dt, vx.0);
        vx.2 = vect_get_vec_def_for_stmt_copy (dt, vx.1);
        vx.3 = vect_get_vec_def_for_stmt_copy (dt, vx.2);  */
static tree
vect_get_vec_def_for_stmt_copy (enum vect_def_type dt, tree vec_oprnd)
{
  tree vec_stmt_for_operand;
  stmt_vec_info def_stmt_info;

  /* Do nothing; can reuse same def.  */
  if (dt == vect_invariant_def || dt == vect_constant_def)
    return vec_oprnd;

  vec_stmt_for_operand = SSA_NAME_DEF_STMT (vec_oprnd);
  def_stmt_info = vinfo_for_stmt (vec_stmt_for_operand);
  gcc_assert (def_stmt_info);
  vec_stmt_for_operand = STMT_VINFO_RELATED_STMT (def_stmt_info);
  gcc_assert (vec_stmt_for_operand);
  vec_oprnd = GIMPLE_STMT_OPERAND (vec_stmt_for_operand, 0);
  return vec_oprnd;
}
/* Function vect_finish_stmt_generation.

   Insert a new stmt.  */

static void
vect_finish_stmt_generation (tree stmt, tree vec_stmt,
                             block_stmt_iterator *bsi)
{
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);

  bsi_insert_before (bsi, vec_stmt, BSI_SAME_STMT);
  set_stmt_info (get_stmt_ann (vec_stmt),
                 new_stmt_vec_info (vec_stmt, loop_vinfo));

  if (vect_print_dump_info (REPORT_DETAILS))
    {
      fprintf (vect_dump, "add new stmt: ");
      print_generic_expr (vect_dump, vec_stmt, TDF_SLIM);
    }

  /* Make sure bsi points to the stmt that is being vectorized.  */
  gcc_assert (stmt == bsi_stmt (*bsi));

#ifdef USE_MAPPED_LOCATION
  SET_EXPR_LOCATION (vec_stmt, EXPR_LOCATION (stmt));
#else
  SET_EXPR_LOCUS (vec_stmt, EXPR_LOCUS (stmt));
#endif
}
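
/* Explanatory note (added): when ADJUST_IN_EPILOG is defined,
   get_initial_def_for_reduction below initializes the vector of partial
   results with neutral elements only (e.g. all zeros for an add reduction)
   and folds the initial value back in at the loop epilog.  */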
#define ADJUST_IN_EPILOG 1

/* Function get_initial_def_for_reduction

   Input:
   STMT - a stmt that performs a reduction operation in the loop.
   INIT_VAL - the initial value of the reduction variable.

   Output:
   SCALAR_DEF - a tree that holds a value to be added to the final result
        of the reduction (used for "ADJUST_IN_EPILOG" - see below).
   Return a vector variable, initialized according to the operation that STMT
        performs.  This vector will be used as the initial value of the
        vector of partial results.

   Option1 ("ADJUST_IN_EPILOG"): Initialize the vector as follows:
     add:         [0,0,...,0,0]
     mult:        [1,1,...,1,1]
     min/max:     [init_val,init_val,..,init_val,init_val]
     bit and/or:  [init_val,init_val,..,init_val,init_val]
   and when necessary (e.g. add/mult case) let the caller know
   that it needs to adjust the result by init_val.

   Option2: Initialize the vector as follows:
     add:         [0,0,...,0,init_val]
     mult:        [1,1,...,1,init_val]
     min/max:     [init_val,init_val,...,init_val]
     bit and/or:  [init_val,init_val,...,init_val]
   and no adjustments are needed.

   For example, for the following code:

   s = init_val;
   for (i=0;i<n;i++)
     s = s + a[i];

   STMT is 's = s + a[i]', and the reduction variable is 's'.
   For a vector of 4 units, we want to return either [0,0,0,init_val],
   or [0,0,0,0] and let the caller know that it needs to adjust
   the result at the end by 'init_val'.

   FORNOW: We use the "ADJUST_IN_EPILOG" scheme.
   TODO: Use some cost-model to estimate which scheme is more profitable.  */
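
/* Illustrative sketch (values hypothetical): for 's += a[i]' with
   init_val = s.0 and nunits = 4, the ADJUST_IN_EPILOG scheme returns
   [0,0,0,0] and sets *SCALAR_DEF = s.0, so that the epilog code created
   by vect_create_epilog_for_reduction computes 's_out4 = s_out3 + s.0'.  */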
static tree
get_initial_def_for_reduction (tree stmt, tree init_val, tree *scalar_def)
{
  stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
  tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
  int nunits = TYPE_VECTOR_SUBPARTS (vectype);
  enum tree_code code = TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 1));
  tree type = TREE_TYPE (init_val);
  tree def;
  tree vec, t = NULL_TREE;
  bool need_epilog_adjust;
  int nelements;
  tree vector_type;
  int i;

  gcc_assert (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type));

  switch (code)
    {
    case WIDEN_SUM_EXPR:
    case DOT_PROD_EXPR:
    case PLUS_EXPR:
      if (INTEGRAL_TYPE_P (type))
        def = build_int_cst (type, 0);
      else
        def = build_real (type, dconst0);

#ifdef ADJUST_IN_EPILOG
      /* All the 'nunits' elements are set to 0.  The final result will be
         adjusted by 'init_val' at the loop epilog.  */
      nelements = nunits;
      need_epilog_adjust = true;
#else
      /* 'nunits - 1' elements are set to 0; the last element is set to
         'init_val'.  No further adjustments at the epilog are needed.  */
      nelements = nunits - 1;
      need_epilog_adjust = false;
#endif
      break;

    case MIN_EXPR:
    case MAX_EXPR:
      def = init_val;
      nelements = nunits;
      need_epilog_adjust = false;
      break;

    default:
      gcc_unreachable ();
    }

  for (i = nelements - 1; i >= 0; --i)
    t = tree_cons (NULL_TREE, def, t);

  if (nelements == nunits - 1)
    {
      /* Set the last element of the vector.  */
      t = tree_cons (NULL_TREE, init_val, t);
    }
  else
    gcc_assert (nelements == nunits);

  vector_type = get_vectype_for_scalar_type (TREE_TYPE (def));
  if (TREE_CODE (init_val) == INTEGER_CST || TREE_CODE (init_val) == REAL_CST)
    vec = build_vector (vector_type, t);
  else
    vec = build_constructor_from_list (vector_type, t);

  if (!need_epilog_adjust)
    *scalar_def = NULL_TREE;
  else
    *scalar_def = init_val;

  return vect_init_vector (stmt, vec, vector_type);
}
/* Function vect_create_epilog_for_reduction

   Create code at the loop-epilog to finalize the result of a reduction
   computation.

   Input:
   VECT_DEF is a vector of partial results.
   REDUC_CODE is the tree-code for the epilog reduction.
   STMT is the scalar reduction stmt that is being vectorized.
   REDUCTION_PHI is the phi-node that carries the reduction computation.

   This function:
   1. Creates the reduction def-use cycle: sets the arguments for
      REDUCTION_PHI:
      The loop-entry argument is the vectorized initial-value of the reduction.
      The loop-latch argument is VECT_DEF - the vector of partial sums.
   2. "Reduces" the vector of partial results VECT_DEF into a single result,
      by applying the operation specified by REDUC_CODE if available, or by
      other means (whole-vector shifts or a scalar loop).
      The function also creates a new phi node at the loop exit to preserve
      loop-closed form, as illustrated below.

   The flow at the entry to this function:

        loop:
          vec_def = phi <null, null>            # REDUCTION_PHI
          VECT_DEF = vector_stmt                # vectorized form of STMT
          s_loop = scalar_stmt                  # (scalar) STMT
        loop_exit:
          s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
          use <s_out0>
          use <s_out0>

   The above is transformed by this function into:

        loop:
          vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
          VECT_DEF = vector_stmt                # vectorized form of STMT
          s_loop = scalar_stmt                  # (scalar) STMT
        loop_exit:
          s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
          v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
          v_out2 = reduce <v_out1>
          s_out3 = extract_field <v_out2, 0>
          s_out4 = adjust_result <s_out3>
          use <s_out4>
          use <s_out4>  */
static void
vect_create_epilog_for_reduction (tree vect_def, tree stmt,
                                  enum tree_code reduc_code, tree reduction_phi)
{
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  tree vectype;
  enum machine_mode mode;
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  basic_block exit_bb;
  tree scalar_dest;
  tree scalar_type;
  block_stmt_iterator exit_bsi;
  tree vec_dest;
  tree new_temp;
  tree new_name;
  tree new_phi;
  tree epilog_stmt;
  tree new_scalar_dest, exit_phi;
  tree bitsize, bitpos, bytesize;
  enum tree_code code = TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 1));
  tree scalar_initial_def;
  tree vec_initial_def;
  tree orig_name;
  imm_use_iterator imm_iter;
  use_operand_p use_p;
  bool extract_scalar_result;
  tree reduction_op;
  tree orig_stmt;
  tree use_stmt;
  tree operation = GIMPLE_STMT_OPERAND (stmt, 1);
  int op_type;

  op_type = TREE_OPERAND_LENGTH (operation);
  reduction_op = TREE_OPERAND (operation, op_type-1);
  vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
  mode = TYPE_MODE (vectype);

  /*** 1. Create the reduction def-use cycle  ***/

  /* 1.1 set the loop-entry arg of the reduction-phi:  */
  /* For the case of reduction, vect_get_vec_def_for_operand returns
     the scalar def before the loop, that defines the initial value
     of the reduction variable.  */
  vec_initial_def = vect_get_vec_def_for_operand (reduction_op, stmt,
                                                  &scalar_initial_def);
  add_phi_arg (reduction_phi, vec_initial_def, loop_preheader_edge (loop));

  /* 1.2 set the loop-latch arg for the reduction-phi:  */
  add_phi_arg (reduction_phi, vect_def, loop_latch_edge (loop));

  if (vect_print_dump_info (REPORT_DETAILS))
    {
      fprintf (vect_dump, "transform reduction: created def-use cycle:");
      print_generic_expr (vect_dump, reduction_phi, TDF_SLIM);
      fprintf (vect_dump, "\n");
      print_generic_expr (vect_dump, SSA_NAME_DEF_STMT (vect_def), TDF_SLIM);
    }
  /*** 2. Create epilog code.
          The reduction epilog code operates across the elements of the vector
          of partial results computed by the vectorized loop.
          The reduction epilog code consists of:
          step 1: compute the scalar result in a vector (v_out2)
          step 2: extract the scalar result (s_out3) from the vector (v_out2)
          step 3: adjust the scalar result (s_out3) if needed.

          Step 1 can be accomplished using one of the following three schemes:
          (scheme 1) using reduc_code, if available.
          (scheme 2) using whole-vector shifts, if available.
          (scheme 3) using a scalar loop.  In this case steps 1+2 above are
                     combined.

          The overall epilog code looks like this:

          s_out0 = phi <s_loop>                 # original EXIT_PHI
          v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
          v_out2 = reduce <v_out1>              # step 1
          s_out3 = extract_field <v_out2, 0>    # step 2
          s_out4 = adjust_result <s_out3>       # step 3

          (step 3 is optional, and steps 1 and 2 may be combined).
          Lastly, the uses of s_out0 are replaced by s_out4.  ***/
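
  /* Illustrative sketch of scheme 2 for a V4SI add reduction (values
     hypothetical, not from the original sources):

       v_out1 = {a,b,c,d}
       t1     = vec_shift <v_out1, 64>     # {c,d,_,_}
       v      = v_out1 + t1                # {a+c,b+d,_,_}
       t2     = vec_shift <v, 32>          # {b+d,_,_,_}
       v_out2 = v + t2                     # {a+b+c+d,_,_,_}

     followed by a single extract_field for the scalar result.  */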
  /* 2.1 Create new loop-exit-phi to preserve loop-closed form:
         v_out1 = phi <v_loop>  */

  exit_bb = single_exit (loop)->dest;
  new_phi = create_phi_node (SSA_NAME_VAR (vect_def), exit_bb);
  SET_PHI_ARG_DEF (new_phi, single_exit (loop)->dest_idx, vect_def);
  exit_bsi = bsi_start (exit_bb);

  /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
     (i.e. when reduc_code is not available) and in the final adjustment
     code (if needed).  Also get the original scalar reduction variable as
     defined in the loop.  In case STMT is a "pattern-stmt" (i.e. - it
     represents a reduction pattern), the tree-code and scalar-def are
     taken from the original stmt that the pattern-stmt (STMT) replaces.
     Otherwise (it is a regular reduction) - the tree-code and scalar-def
     are taken from STMT.  */

  orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
  if (!orig_stmt)
    {
      /* Regular reduction  */
      orig_stmt = stmt;
    }
  else
    {
      /* Reduction pattern  */
      stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
      gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
      gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
    }
  code = TREE_CODE (GIMPLE_STMT_OPERAND (orig_stmt, 1));
  scalar_dest = GIMPLE_STMT_OPERAND (orig_stmt, 0);
  scalar_type = TREE_TYPE (scalar_dest);
  new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
  bitsize = TYPE_SIZE (scalar_type);
  bytesize = TYPE_SIZE_UNIT (scalar_type);
  /* 2.3 Create the reduction code, using one of the three schemes described
         above.  */

  if (reduc_code < NUM_TREE_CODES)
    {
      tree tmp;

      /*** Case 1:  Create:
           v_out2 = reduc_expr <v_out1>  */

      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "Reduce using direct vector reduction.");

      vec_dest = vect_create_destination_var (scalar_dest, vectype);
      tmp = build1 (reduc_code, vectype, PHI_RESULT (new_phi));
      epilog_stmt = build_gimple_modify_stmt (vec_dest, tmp);
      new_temp = make_ssa_name (vec_dest, epilog_stmt);
      GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_temp;
      bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT);

      extract_scalar_result = true;
    }
  else
    {
      enum tree_code shift_code = 0;
      bool have_whole_vector_shift = true;
      int bit_offset;
      int element_bitsize = tree_low_cst (bitsize, 1);
      int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
      tree vec_temp;

      if (vec_shr_optab->handlers[mode].insn_code != CODE_FOR_nothing)
        shift_code = VEC_RSHIFT_EXPR;
      else
        have_whole_vector_shift = false;

      /* Regardless of whether we have a whole vector shift, if we're
         emulating the operation via tree-vect-generic, we don't want
         to use it.  Only the first round of the reduction is likely
         to still be profitable via emulation.  */
      /* ??? It might be better to emit a reduction tree code here, so that
         tree-vect-generic can expand the first round via bit tricks.  */
      if (!VECTOR_MODE_P (mode))
        have_whole_vector_shift = false;
      else
        {
          optab optab = optab_for_tree_code (code, vectype);
          if (optab->handlers[mode].insn_code == CODE_FOR_nothing)
            have_whole_vector_shift = false;
        }

      if (have_whole_vector_shift)
        {
          /*** Case 2: Create:
             for (offset = VS/2; offset >= element_size; offset/=2)
                {
                  Create: va' = vec_shift <va, offset>
                  Create: va = vop <va, va'>
                }  */

          if (vect_print_dump_info (REPORT_DETAILS))
            fprintf (vect_dump, "Reduce using vector shifts");

          vec_dest = vect_create_destination_var (scalar_dest, vectype);
          new_temp = PHI_RESULT (new_phi);

          for (bit_offset = vec_size_in_bits/2;
               bit_offset >= element_bitsize;
               bit_offset /= 2)
            {
              tree bitpos = size_int (bit_offset);
              tree tmp = build2 (shift_code, vectype, new_temp, bitpos);
              epilog_stmt = build_gimple_modify_stmt (vec_dest, tmp);
              new_name = make_ssa_name (vec_dest, epilog_stmt);
              GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_name;
              bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT);

              tmp = build2 (code, vectype, new_name, new_temp);
              epilog_stmt = build_gimple_modify_stmt (vec_dest, tmp);
              new_temp = make_ssa_name (vec_dest, epilog_stmt);
              GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_temp;
              bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT);
            }

          extract_scalar_result = true;
        }
      else
        {
          /*** Case 3: Create:
             s = extract_field <v_out2, 0>
             for (offset = element_size;
                  offset < vector_size;
                  offset += element_size;)
               {
                 Create:  s' = extract_field <v_out2, offset>
                 Create:  s = op <s, s'>
               }  */

          tree rhs;

          if (vect_print_dump_info (REPORT_DETAILS))
            fprintf (vect_dump, "Reduce using scalar code. ");

          vec_temp = PHI_RESULT (new_phi);
          vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
          rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
                        bitsize_zero_node);
          BIT_FIELD_REF_UNSIGNED (rhs) = TYPE_UNSIGNED (scalar_type);
          epilog_stmt = build_gimple_modify_stmt (new_scalar_dest, rhs);
          new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
          GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_temp;
          bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT);

          for (bit_offset = element_bitsize;
               bit_offset < vec_size_in_bits;
               bit_offset += element_bitsize)
            {
              tree tmp;
              tree bitpos = bitsize_int (bit_offset);
              tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
                                 bitpos);

              BIT_FIELD_REF_UNSIGNED (rhs) = TYPE_UNSIGNED (scalar_type);
              epilog_stmt = build_gimple_modify_stmt (new_scalar_dest, rhs);
              new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
              GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_name;
              bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT);

              tmp = build2 (code, scalar_type, new_name, new_temp);
              epilog_stmt = build_gimple_modify_stmt (new_scalar_dest, tmp);
              new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
              GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_temp;
              bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT);
            }

          extract_scalar_result = false;
        }
    }
  /* 2.4 Extract the final scalar result.  Create:
         s_out3 = extract_field <v_out2, bitpos>  */

  if (extract_scalar_result)
    {
      tree rhs;

      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "extract scalar result");

      if (BYTES_BIG_ENDIAN)
        bitpos = size_binop (MULT_EXPR,
                             bitsize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1),
                             TYPE_SIZE (scalar_type));
      else
        bitpos = bitsize_zero_node;

      rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp, bitsize, bitpos);
      BIT_FIELD_REF_UNSIGNED (rhs) = TYPE_UNSIGNED (scalar_type);
      epilog_stmt = build_gimple_modify_stmt (new_scalar_dest, rhs);
      new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
      GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_temp;
      bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT);
    }

  /* 2.5 Adjust the final result by the initial value of the reduction
         variable.  (When such adjustment is not needed, then
         'scalar_initial_def' is zero).

         Create:
         s_out4 = scalar_expr <s_out3, scalar_initial_def>  */

  if (scalar_initial_def)
    {
      tree tmp = build2 (code, scalar_type, new_temp, scalar_initial_def);
      epilog_stmt = build_gimple_modify_stmt (new_scalar_dest, tmp);
      new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
      GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_temp;
      bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT);
    }
  /* 2.6 Replace uses of s_out0 with uses of s_out4.  */

  /* Find the loop-closed-use at the loop exit of the original scalar result.
     (The reduction result is expected to have two immediate uses - one at the
     latch block, and one at the loop exit).  */
  exit_phi = NULL;
  FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
    {
      if (!flow_bb_inside_loop_p (loop, bb_for_stmt (USE_STMT (use_p))))
        {
          exit_phi = USE_STMT (use_p);
          break;
        }
    }
  /* We expect to have found an exit_phi because of loop-closed-ssa form.  */
  gcc_assert (exit_phi);
  /* Replace the uses:  */
  orig_name = PHI_RESULT (exit_phi);
  FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
    FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
      SET_USE (use_p, new_temp);
}
/* Function vectorizable_reduction.

   Check if STMT performs a reduction operation that can be vectorized.
   If VEC_STMT is also passed, vectorize the STMT: create a vectorized
   stmt to replace it, put it in VEC_STMT, and insert it at BSI.
   Return FALSE if not a vectorizable STMT, TRUE otherwise.

   This function also handles reduction idioms (patterns) that have been
   recognized in advance during vect_pattern_recog.  In this case, STMT may be
   of this form:
     X = pattern_expr (arg0, arg1, ..., X)
   and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
   sequence that had been detected and replaced by the pattern-stmt (STMT).

   In some cases of reduction patterns, the type of the reduction variable X is
   different than the type of the other arguments of STMT.
   In such cases, the vectype that is used when transforming STMT into a vector
   stmt is different than the vectype that is used to determine the
   vectorization factor, because it consists of a different number of elements
   than the actual number of elements that are being operated upon in parallel.

   For example, consider an accumulation of shorts into an int accumulator.
   On some targets it's possible to vectorize this pattern operating on 8
   shorts at a time (hence, the vectype for purposes of determining the
   vectorization factor should be V8HI); on the other hand, the vectype that
   is used to create the vector form is actually V4SI (the type of the result).

   Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
   indicates what is the actual level of parallelism (V8HI in the example), so
   that the right vectorization factor would be derived.  This vectype
   corresponds to the type of arguments to the reduction stmt, and should *NOT*
   be used to create the vectorized stmt.  The right vectype for the vectorized
   stmt is obtained from the type of the result X:
      get_vectype_for_scalar_type (TREE_TYPE (X))

   This means that, contrary to "regular" reductions (or "regular" stmts in
   general), the following equation:
      STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
   does *NOT* necessarily hold for reduction patterns.  */
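
/* Illustrative sketch (numbers hypothetical): when accumulating shorts into
   an int with VF = 8, STMT_VINFO_VECTYPE is V8HI (it determines the VF),
   while the vectorized widen_sum stmt itself is created with vectype V4SI,
   obtained from the type of the int result.  */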
static bool
vectorizable_reduction (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
{
  tree vec_dest;
  tree scalar_dest;
  tree op;
  tree loop_vec_def0 = NULL_TREE, loop_vec_def1 = NULL_TREE;
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  tree operation;
  enum tree_code code, orig_code, epilog_reduc_code = 0;
  enum machine_mode vec_mode;
  int op_type;
  optab optab, reduc_optab;
  tree new_temp = NULL_TREE;
  tree def, def_stmt;
  enum vect_def_type dt;
  tree new_phi;
  tree scalar_type;
  bool is_simple_use;
  tree orig_stmt;
  stmt_vec_info orig_stmt_info;
  tree expr = NULL_TREE;
  int i;
  int nunits = TYPE_VECTOR_SUBPARTS (vectype);
  int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
  stmt_vec_info prev_stmt_info;
  tree reduc_def;
  tree new_stmt = NULL_TREE;
  int j;

  gcc_assert (ncopies >= 1);

  /* 1. Is vectorizable reduction?  */

  /* Not supportable if the reduction variable is used in the loop.  */
  if (STMT_VINFO_RELEVANT_P (stmt_info))
    return false;

  if (!STMT_VINFO_LIVE_P (stmt_info))
    return false;

  /* Make sure it was already recognized as a reduction computation.  */
  if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def)
    return false;

  /* 2. Has this been recognized as a reduction pattern?

     Check if STMT represents a pattern that has been recognized
     in earlier analysis stages.  For stmts that represent a pattern,
     the STMT_VINFO_RELATED_STMT field records the last stmt in
     the original sequence that constitutes the pattern.  */

  orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
  if (orig_stmt)
    {
      orig_stmt_info = vinfo_for_stmt (orig_stmt);
      gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt);
      gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
      gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
    }

  /* 3. Check the operands of the operation.  The first operands are defined
        inside the loop body.  The last operand is the reduction variable,
        which is defined by the loop-header-phi.  */

  gcc_assert (TREE_CODE (stmt) == GIMPLE_MODIFY_STMT);

  operation = GIMPLE_STMT_OPERAND (stmt, 1);
  code = TREE_CODE (operation);
  op_type = TREE_OPERAND_LENGTH (operation);
  if (op_type != binary_op && op_type != ternary_op)
    return false;
  scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
  scalar_type = TREE_TYPE (scalar_dest);

  /* All uses but the last are expected to be defined in the loop.
     The last use is the reduction variable.  */
  for (i = 0; i < op_type-1; i++)
    {
      op = TREE_OPERAND (operation, i);
      is_simple_use = vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt);
      gcc_assert (is_simple_use);
      if (dt != vect_loop_def
          && dt != vect_invariant_def
          && dt != vect_constant_def
          && dt != vect_induction_def)
        return false;
    }

  op = TREE_OPERAND (operation, i);
  is_simple_use = vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt);
  gcc_assert (is_simple_use);
  gcc_assert (dt == vect_reduction_def);
  gcc_assert (TREE_CODE (def_stmt) == PHI_NODE);
  if (orig_stmt)
    gcc_assert (orig_stmt == vect_is_simple_reduction (loop, def_stmt));
  else
    gcc_assert (stmt == vect_is_simple_reduction (loop, def_stmt));

  if (STMT_VINFO_LIVE_P (vinfo_for_stmt (def_stmt)))
    return false;
  /* 4. Supportable by target?  */

  /* 4.1. check support for the operation in the loop  */
  optab = optab_for_tree_code (code, vectype);
  if (!optab)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "no optab.");
      return false;
    }
  vec_mode = TYPE_MODE (vectype);
  if (optab->handlers[(int) vec_mode].insn_code == CODE_FOR_nothing)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "op not supported by target.");
      if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
          || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
             < vect_min_worthwhile_factor (code))
        return false;
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "proceeding using word mode.");
    }

  /* Worthwhile without SIMD support?  */
  if (!VECTOR_MODE_P (TYPE_MODE (vectype))
      && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
         < vect_min_worthwhile_factor (code))
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "not worthwhile without SIMD support.");
      return false;
    }
  /* 4.2. Check support for the epilog operation.

          If STMT represents a reduction pattern, then the type of the
          reduction variable may be different than the type of the rest
          of the arguments.  For example, consider the case of accumulation
          of shorts into an int accumulator; the original code:
                        S1: int_a = (int) short_a;
          orig_stmt->   S2: int_acc = plus <int_a ,int_acc>;

          was replaced by:
                        STMT: int_acc = widen_sum <short_a, int_acc>

          This means that:
          1. The tree-code that is used to create the vector operation in the
             epilog code (that reduces the partial results) is not the
             tree-code of STMT, but is rather the tree-code of the original
             stmt from the pattern that STMT is replacing.  I.e, in the example
             above we want to use 'widen_sum' in the loop, but 'plus' in the
             epilog.
          2. The type (mode) we use to check available target support
             for the vector operation to be created in the *epilog*, is
             determined by the type of the reduction variable (in the example
             above we'd check this: plus_optab[vect_int_mode]).
             However the type (mode) we use to check available target support
             for the vector operation to be created *inside the loop*, is
             determined by the type of the other arguments to STMT (in the
             example we'd check this: widen_sum_optab[vect_short_mode]).

          This is contrary to "regular" reductions, in which the types of all
          the arguments are the same as the type of the reduction variable.
          For "regular" reductions we can therefore use the same vector type
          (and also the same tree-code) when generating the epilog code and
          when generating the code inside the loop.  */
  if (orig_stmt)
    {
      /* This is a reduction pattern: get the vectype from the type of the
         reduction variable, and get the tree-code from orig_stmt.  */
      orig_code = TREE_CODE (GIMPLE_STMT_OPERAND (orig_stmt, 1));
      vectype = get_vectype_for_scalar_type (TREE_TYPE (def));
      vec_mode = TYPE_MODE (vectype);
    }
  else
    {
      /* Regular reduction: the vectype and tree-code used for the vector
         code inside the loop can also be used for the epilog code.  */
      orig_code = code;
    }

  if (!reduction_code_for_scalar_code (orig_code, &epilog_reduc_code))
    return false;
  reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype);
  if (!reduc_optab)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "no optab for reduction.");
      epilog_reduc_code = NUM_TREE_CODES;
    }
  if (reduc_optab
      && reduc_optab->handlers[(int) vec_mode].insn_code == CODE_FOR_nothing)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "reduc op not supported by target.");
      epilog_reduc_code = NUM_TREE_CODES;
    }
  if (!vec_stmt) /* transformation not required.  */
    {
      STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
      return true;
    }

  /** Transform.  **/

  if (vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump, "transform reduction.");

  /* Create the destination vector  */
  vec_dest = vect_create_destination_var (scalar_dest, vectype);

  /* Create the reduction-phi that defines the reduction-operand.  */
  new_phi = create_phi_node (vec_dest, loop->header);

  /* In case the vectorization factor (VF) is bigger than the number
     of elements that we can fit in a vectype (nunits), we have to generate
     more than one vector stmt - i.e. - we need to "unroll" the
     vector stmt by a factor VF/nunits.  For more details see documentation
     in vectorizable_operation.  */

  prev_stmt_info = NULL;
  for (j = 0; j < ncopies; j++)
    {
      /* Handle uses.  */
      if (j == 0)
        {
          op = TREE_OPERAND (operation, 0);
          loop_vec_def0 = vect_get_vec_def_for_operand (op, stmt, NULL);
          if (op_type == ternary_op)
            {
              op = TREE_OPERAND (operation, 1);
              loop_vec_def1 = vect_get_vec_def_for_operand (op, stmt, NULL);
            }

          /* Get the vector def for the reduction variable from the phi node */
          reduc_def = PHI_RESULT (new_phi);
        }
      else
        {
          enum vect_def_type dt = vect_unknown_def_type; /* Dummy */
          loop_vec_def0 = vect_get_vec_def_for_stmt_copy (dt, loop_vec_def0);
          if (op_type == ternary_op)
            loop_vec_def1 = vect_get_vec_def_for_stmt_copy (dt, loop_vec_def1);

          /* Get the vector def for the reduction variable from the vectorized
             reduction operation generated in the previous iteration (j-1)  */
          reduc_def = GIMPLE_STMT_OPERAND (new_stmt, 0);
        }

      /* Arguments are ready.  Create the new vector stmt.  */
      if (op_type == binary_op)
        expr = build2 (code, vectype, loop_vec_def0, reduc_def);
      else
        expr = build3 (code, vectype, loop_vec_def0, loop_vec_def1,
                       reduc_def);
      new_stmt = build_gimple_modify_stmt (vec_dest, expr);
      new_temp = make_ssa_name (vec_dest, new_stmt);
      GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
      vect_finish_stmt_generation (stmt, new_stmt, bsi);

      if (j == 0)
        STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
      else
        STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
      prev_stmt_info = vinfo_for_stmt (new_stmt);
    }

  /* Finalize the reduction-phi (set its arguments) and create the
     epilog reduction code.  */
  vect_create_epilog_for_reduction (new_temp, stmt, epilog_reduc_code, new_phi);
  return true;
}
/* Checks if CALL can be vectorized in type VECTYPE.  Returns
   a function declaration if the target has a vectorized version
   of the function, or NULL_TREE if the function cannot be vectorized.  */

static tree
vectorizable_function (tree call, tree vectype_out, tree vectype_in)
{
  tree fndecl = get_callee_fndecl (call);
  enum built_in_function code;

  /* We only handle functions that do not read or clobber memory -- i.e.
     const or novops ones.  */
  if (!(call_expr_flags (call) & (ECF_CONST | ECF_NOVOPS)))
    return NULL_TREE;

  if (!fndecl
      || TREE_CODE (fndecl) != FUNCTION_DECL
      || !DECL_BUILT_IN (fndecl))
    return NULL_TREE;

  code = DECL_FUNCTION_CODE (fndecl);
  return targetm.vectorize.builtin_vectorized_function (code, vectype_out,
                                                        vectype_in);
}
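
/* Usage sketch (hypothetical): for a call 'y = sqrt (x)' with V2DF input
   and output vectypes, the target hook may return the decl of a builtin
   such as an SSE2 sqrtpd intrinsic, which vectorizable_call below then
   emits in place of the scalar call.  */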
1797 /* Function vectorizable_call.
1799 Check if STMT performs a function call that can be vectorized.
1800 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
1801 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
1802 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
1805 vectorizable_call (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
1811 stmt_vec_info stmt_info = vinfo_for_stmt (stmt), prev_stmt_info;
1812 tree vectype_out, vectype_in;
1813 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1814 tree fndecl, rhs, new_temp, def, def_stmt, rhs_type, lhs_type;
1815 enum vect_def_type dt[2];
1816 int ncopies, j, nargs;
1817 call_expr_arg_iterator iter;
1819 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1822 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
1825 /* FORNOW: not yet supported. */
1826 if (STMT_VINFO_LIVE_P (stmt_info))
1828 if (vect_print_dump_info (REPORT_DETAILS))
1829 fprintf (vect_dump, "value used after loop.");
1833 /* Is STMT a vectorizable call? */
1834 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
1837 if (TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 0)) != SSA_NAME)
1840 operation = GIMPLE_STMT_OPERAND (stmt, 1);
1841 if (TREE_CODE (operation) != CALL_EXPR)
1844 /* Process function arguments. */
1845 rhs_type = NULL_TREE;
1847 FOR_EACH_CALL_EXPR_ARG (op, iter, operation)
1851 /* Bail out if the function has more than two arguments; we do
1852 not have interesting builtin functions to vectorize with
1853 more than two arguments.  */
1857 /* We can only handle calls with arguments of the same type. */
1859 && rhs_type != TREE_TYPE (op))
1861 if (vect_print_dump_info (REPORT_DETAILS))
1862 fprintf (vect_dump, "argument types differ.");
1865 rhs_type = TREE_TYPE (op);
1867 if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt[nargs-1]))
1869 if (vect_print_dump_info (REPORT_DETAILS))
1870 fprintf (vect_dump, "use not simple.");
1875 /* No arguments is also not good. */
1879 vectype_in = get_vectype_for_scalar_type (rhs_type);
1881 lhs_type = TREE_TYPE (GIMPLE_STMT_OPERAND (stmt, 0));
1882 vectype_out = get_vectype_for_scalar_type (lhs_type);
1884 /* Only handle the case of vectors with the same number of elements.
1885 FIXME: We need a way to handle for example the SSE2 cvtpd2dq
1886 instruction which converts V2DFmode to V4SImode but only
1887 using the lower half of the V4SImode result. */
1888 if (TYPE_VECTOR_SUBPARTS (vectype_in) != TYPE_VECTOR_SUBPARTS (vectype_out))
1891 /* For now, we only vectorize functions if a target specific builtin
1892 is available. TODO -- in some cases, it might be profitable to
1893 insert the calls for pieces of the vector, in order to be able
1894 to vectorize other operations in the loop. */
1895 fndecl = vectorizable_function (operation, vectype_out, vectype_in);
1896 if (fndecl == NULL_TREE)
1898 if (vect_print_dump_info (REPORT_DETAILS))
1899 fprintf (vect_dump, "function is not vectorizable.");
1904 gcc_assert (ZERO_SSA_OPERANDS (stmt, SSA_OP_ALL_VIRTUALS));
1906 if (!vec_stmt) /* transformation not required. */
1908 STMT_VINFO_TYPE (stmt_info) = call_vec_info_type;
1914 if (vect_print_dump_info (REPORT_DETAILS))
1915 fprintf (vect_dump, "transform operation.");
1917 ncopies = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
1918 / TYPE_VECTOR_SUBPARTS (vectype_out));
1919 gcc_assert (ncopies >= 1);
1922 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
1923 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
1925 prev_stmt_info = NULL;
1926 for (j = 0; j < ncopies; ++j)
1928 tree new_stmt, vargs;
1932 /* Build argument list for the vectorized call. */
1933 /* FIXME: Rewrite this so that it doesn't construct a temporary
1934 list.  */
1937 FOR_EACH_CALL_EXPR_ARG (op, iter, operation)
1942 vec_oprnd[n] = vect_get_vec_def_for_operand (op, stmt, NULL);
1944 vec_oprnd[n] = vect_get_vec_def_for_stmt_copy (dt[n], vec_oprnd[n]);
1946 vargs = tree_cons (NULL_TREE, vec_oprnd[n], vargs);
1948 vargs = nreverse (vargs);
1950 rhs = build_function_call_expr (fndecl, vargs);
1951 new_stmt = build_gimple_modify_stmt (vec_dest, rhs);
1952 new_temp = make_ssa_name (vec_dest, new_stmt);
1953 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
1955 vect_finish_stmt_generation (stmt, new_stmt, bsi);
1958 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
1960 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
1961 prev_stmt_info = vinfo_for_stmt (new_stmt);
1964 /* The call in STMT might prevent it from being removed in dce. We however
1965 cannot remove it here, due to the way the ssa name it defines is mapped
1966 to the new definition. So just replace rhs of the statement with something
1967 harmless.  */
1968 type = TREE_TYPE (scalar_dest);
1969 GIMPLE_STMT_OPERAND (stmt, 1) = fold_convert (type, integer_zero_node);
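/* Illustrative sketch, not part of the vectorizer: the shape of the code
   produced for a vectorized call when VF == 8 and the vector type holds 4
   elements, i.e. ncopies == 2.  Two builtin calls replace eight scalar calls
   per iteration.  The type v4sf and the function example_sin4 (standing in
   for the target's vectorized builtin) are hypothetical.  */
#if 0
typedef float v4sf __attribute__ ((vector_size (16)));
extern v4sf example_sin4 (v4sf);

static void
example_vectorized_call (v4sf *dst, const v4sf *src, int nvec)
{
  int j;
  for (j = 0; j + 1 < nvec; j += 2)
    {
      dst[j] = example_sin4 (src[j]);          /* copy j      */
      dst[j + 1] = example_sin4 (src[j + 1]);  /* copy j + 1  */
    }
}
#endif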
1975 /* Function vectorizable_conversion.
1977 Check if STMT performs a conversion operation, that can be vectorized.
1978 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
1979 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
1980 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
1983 vectorizable_conversion (tree stmt, block_stmt_iterator * bsi,
1990 tree vec_oprnd0 = NULL_TREE;
1991 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1992 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1993 enum tree_code code;
1996 enum vect_def_type dt0;
2001 tree vectype_out, vectype_in;
2002 tree rhs_type, lhs_type;
2004 stmt_vec_info prev_stmt_info;
2006 /* Is STMT a vectorizable conversion? */
2008 if (!STMT_VINFO_RELEVANT_P (stmt_info))
2011 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
2014 if (STMT_VINFO_LIVE_P (stmt_info))
2016 /* FORNOW: not yet supported. */
2017 if (vect_print_dump_info (REPORT_DETAILS))
2018 fprintf (vect_dump, "value used after loop.");
2022 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
2025 if (TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 0)) != SSA_NAME)
2028 operation = GIMPLE_STMT_OPERAND (stmt, 1);
2029 code = TREE_CODE (operation);
2030 if (code != FIX_TRUNC_EXPR && code != FLOAT_EXPR)
2033 /* Check types of lhs and rhs.  */
2034 op0 = TREE_OPERAND (operation, 0);
2035 rhs_type = TREE_TYPE (op0);
2036 vectype_in = get_vectype_for_scalar_type (rhs_type);
2037 nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
2039 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
2040 lhs_type = TREE_TYPE (scalar_dest);
2041 vectype_out = get_vectype_for_scalar_type (lhs_type);
2042 gcc_assert (STMT_VINFO_VECTYPE (stmt_info) == vectype_out);
2043 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
2045 /* FORNOW: need to extend to support short<->float conversions as well. */
2046 if (nunits_out != nunits_in)
2049 /* Bail out if the types are both integral or both non-integral.  */
2050 if ((INTEGRAL_TYPE_P (rhs_type) && INTEGRAL_TYPE_P (lhs_type))
2051 || (!INTEGRAL_TYPE_P (rhs_type) && !INTEGRAL_TYPE_P (lhs_type)))
2054 /* Sanity check: make sure that at least one copy of the vectorized stmt
2055 needs to be generated. */
2056 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in;
2057 gcc_assert (ncopies >= 1);
2059 if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt0))
2061 if (vect_print_dump_info (REPORT_DETAILS))
2062 fprintf (vect_dump, "use not simple.");
2066 /* Supportable by target? */
2067 if (!targetm.vectorize.builtin_conversion (code, vectype_in))
2069 if (vect_print_dump_info (REPORT_DETAILS))
2070 fprintf (vect_dump, "op not supported by target.");
2074 if (!vec_stmt) /* transformation not required. */
2076 STMT_VINFO_TYPE (stmt_info) = type_conversion_vec_info_type;
2082 if (vect_print_dump_info (REPORT_DETAILS))
2083 fprintf (vect_dump, "transform conversion.");
2086 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
2088 prev_stmt_info = NULL;
2089 for (j = 0; j < ncopies; j++)
2095 vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
2097 vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt0, vec_oprnd0);
2100 targetm.vectorize.builtin_conversion (code, vectype_in);
2101 new_stmt = build_call_expr (builtin_decl, 1, vec_oprnd0);
2103 /* Arguments are ready.  Create the new vector stmt.  */
2104 new_stmt = build_gimple_modify_stmt (vec_dest, new_stmt);
2105 new_temp = make_ssa_name (vec_dest, new_stmt);
2106 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
2107 vect_finish_stmt_generation (stmt, new_stmt, bsi);
2108 FOR_EACH_SSA_TREE_OPERAND (sym, new_stmt, iter, SSA_OP_ALL_VIRTUALS)
2110 if (TREE_CODE (sym) == SSA_NAME)
2111 sym = SSA_NAME_VAR (sym);
2112 mark_sym_for_renaming (sym);
2116 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
2118 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
2119 prev_stmt_info = vinfo_for_stmt (new_stmt);
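/* Illustrative sketch, not part of the vectorizer: the only conversion shape
   handled above, an int <-> float conversion with nunits_in == nunits_out
   (here 4), implemented through a target conversion builtin.  The vector
   types and the function example_cvt4 are hypothetical.  */
#if 0
typedef int   v4si __attribute__ ((vector_size (16)));
typedef float v4sf __attribute__ ((vector_size (16)));
extern v4sf example_cvt4 (v4si);   /* stands in for the target builtin  */

static void
example_vectorized_convert (v4sf *dst, const v4si *src, int nvec)
{
  int j;
  for (j = 0; j < nvec; j++)   /* one builtin call per copy  */
    dst[j] = example_cvt4 (src[j]);
}
#endif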
2125 /* Function vectorizable_assignment.
2127 Check if STMT performs an assignment (copy) that can be vectorized.
2128 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
2129 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
2130 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
2133 vectorizable_assignment (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
2139 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2140 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2141 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2144 enum vect_def_type dt;
2145 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
2146 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
2148 gcc_assert (ncopies >= 1);
2149 if (ncopies > 1)
2150 return false; /* FORNOW */
2152 if (!STMT_VINFO_RELEVANT_P (stmt_info))
2155 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
2158 /* FORNOW: not yet supported. */
2159 if (STMT_VINFO_LIVE_P (stmt_info))
2161 if (vect_print_dump_info (REPORT_DETAILS))
2162 fprintf (vect_dump, "value used after loop.");
2166 /* Is vectorizable assignment? */
2167 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
2170 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
2171 if (TREE_CODE (scalar_dest) != SSA_NAME)
2174 op = GIMPLE_STMT_OPERAND (stmt, 1);
2175 if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt))
2177 if (vect_print_dump_info (REPORT_DETAILS))
2178 fprintf (vect_dump, "use not simple.");
2182 if (!vec_stmt) /* transformation not required. */
2184 STMT_VINFO_TYPE (stmt_info) = assignment_vec_info_type;
2189 if (vect_print_dump_info (REPORT_DETAILS))
2190 fprintf (vect_dump, "transform assignment.");
2193 vec_dest = vect_create_destination_var (scalar_dest, vectype);
2196 op = GIMPLE_STMT_OPERAND (stmt, 1);
2197 vec_oprnd = vect_get_vec_def_for_operand (op, stmt, NULL);
2199 /* Arguments are ready.  Create the new vector stmt.  */
2200 *vec_stmt = build_gimple_modify_stmt (vec_dest, vec_oprnd);
2201 new_temp = make_ssa_name (vec_dest, *vec_stmt);
2202 GIMPLE_STMT_OPERAND (*vec_stmt, 0) = new_temp;
2203 vect_finish_stmt_generation (stmt, *vec_stmt, bsi);
2209 /* Function vect_min_worthwhile_factor.
2211 For a loop where we could vectorize the operation indicated by CODE,
2212 return the minimum vectorization factor that makes it worthwhile
2213 to use generic vectors. */
2215 vect_min_worthwhile_factor (enum tree_code code)
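/* The body of vect_min_worthwhile_factor is a per-code threshold table.
   The sketch below is illustrative only -- the concrete values GCC uses may
   differ -- but it shows the kind of table such a function implements and
   how vectorizable_operation consults it (INT_MAX is from <limits.h>).  */
#if 0
static int
example_min_worthwhile_factor (enum tree_code code)
{
  switch (code)
    {
    case PLUS_EXPR:
    case MINUS_EXPR:
      return 4;        /* piecewise emulation pays off only for wide VFs  */
    case BIT_AND_EXPR:
    case BIT_IOR_EXPR:
    case BIT_XOR_EXPR:
      return 2;        /* bitwise ops are cheap even in word mode         */
    default:
      return INT_MAX;  /* never worthwhile without real SIMD support      */
    }
}
/* Caller: if (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
              < example_min_worthwhile_factor (code))  give up.  */
#endif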
2236 /* Function vectorizable_operation.
2238 Check if STMT performs a binary or unary operation that can be vectorized.
2239 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
2240 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
2241 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
2244 vectorizable_operation (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
2249 tree op0, op1 = NULL;
2250 tree vec_oprnd0 = NULL_TREE, vec_oprnd1 = NULL_TREE;
2251 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2252 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2253 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2254 enum tree_code code;
2255 enum machine_mode vec_mode;
2260 enum machine_mode optab_op2_mode;
2262 enum vect_def_type dt0, dt1;
2264 stmt_vec_info prev_stmt_info;
2265 int nunits_in = TYPE_VECTOR_SUBPARTS (vectype);
2268 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in;
2271 gcc_assert (ncopies >= 1);
2273 if (!STMT_VINFO_RELEVANT_P (stmt_info))
2276 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
2279 /* FORNOW: not yet supported. */
2280 if (STMT_VINFO_LIVE_P (stmt_info))
2282 if (vect_print_dump_info (REPORT_DETAILS))
2283 fprintf (vect_dump, "value used after loop.");
2287 /* Is STMT a vectorizable binary/unary operation? */
2288 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
2291 if (TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 0)) != SSA_NAME)
2294 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
2295 vectype_out = get_vectype_for_scalar_type (TREE_TYPE (scalar_dest));
2296 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
2297 if (nunits_out != nunits_in)
2300 operation = GIMPLE_STMT_OPERAND (stmt, 1);
2301 code = TREE_CODE (operation);
2302 optab = optab_for_tree_code (code, vectype);
2304 /* Support only unary or binary operations. */
2305 op_type = TREE_OPERAND_LENGTH (operation);
2306 if (op_type != unary_op && op_type != binary_op)
2308 if (vect_print_dump_info (REPORT_DETAILS))
2309 fprintf (vect_dump, "num. args = %d (not unary/binary op).", op_type);
2313 op0 = TREE_OPERAND (operation, 0);
2314 if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt0))
2316 if (vect_print_dump_info (REPORT_DETAILS))
2317 fprintf (vect_dump, "use not simple.");
2321 if (op_type == binary_op)
2323 op1 = TREE_OPERAND (operation, 1);
2324 if (!vect_is_simple_use (op1, loop_vinfo, &def_stmt, &def, &dt1))
2326 if (vect_print_dump_info (REPORT_DETAILS))
2327 fprintf (vect_dump, "use not simple.");
2332 /* Supportable by target? */
2335 if (vect_print_dump_info (REPORT_DETAILS))
2336 fprintf (vect_dump, "no optab.");
2339 vec_mode = TYPE_MODE (vectype);
2340 icode = (int) optab->handlers[(int) vec_mode].insn_code;
2341 if (icode == CODE_FOR_nothing)
2343 if (vect_print_dump_info (REPORT_DETAILS))
2344 fprintf (vect_dump, "op not supported by target.");
2345 if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
2346 || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
2347 < vect_min_worthwhile_factor (code))
2348 return false;
2349 if (vect_print_dump_info (REPORT_DETAILS))
2350 fprintf (vect_dump, "proceeding using word mode.");
2353 /* Worthwhile without SIMD support? */
2354 if (!VECTOR_MODE_P (TYPE_MODE (vectype))
2355 && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
2356 < vect_min_worthwhile_factor (code))
2358 if (vect_print_dump_info (REPORT_DETAILS))
2359 fprintf (vect_dump, "not worthwhile without SIMD support.");
2363 if (code == LSHIFT_EXPR || code == RSHIFT_EXPR)
2365 /* FORNOW: not yet supported. */
2366 if (!VECTOR_MODE_P (vec_mode))
2369 /* Invariant argument is needed for a vector shift
2370 by a scalar shift operand. */
2371 optab_op2_mode = insn_data[icode].operand[2].mode;
2372 if (! (VECTOR_MODE_P (optab_op2_mode)
2373 || dt1 == vect_constant_def
2374 || dt1 == vect_invariant_def))
2376 if (vect_print_dump_info (REPORT_DETAILS))
2377 fprintf (vect_dump, "operand mode requires invariant argument.");
2382 if (!vec_stmt) /* transformation not required. */
2384 STMT_VINFO_TYPE (stmt_info) = op_vec_info_type;
2390 if (vect_print_dump_info (REPORT_DETAILS))
2391 fprintf (vect_dump, "transform binary/unary operation.");
2394 vec_dest = vect_create_destination_var (scalar_dest, vectype);
2396 /* In case the vectorization factor (VF) is bigger than the number
2397 of elements that we can fit in a vectype (nunits), we have to generate
2398 more than one vector stmt, i.e., we need to "unroll" the
2399 vector stmt by a factor VF/nunits. In doing so, we record a pointer
2400 from one copy of the vector stmt to the next, in the field
2401 STMT_VINFO_RELATED_STMT. This is necessary in order to allow following
2402 stages to find the correct vector defs to be used when vectorizing
2403 stmts that use the defs of the current stmt. The example below illustrates
2404 the vectorization process when VF=16 and nunits=4 (i.e., we need to create
2405 4 vectorized stmts):
2407 before vectorization:
2408 RELATED_STMT VEC_STMT
2412 step 1: vectorize stmt S1 (done in vectorizable_load. See more details
2414 RELATED_STMT VEC_STMT
2415 VS1_0: vx0 = memref0 VS1_1 -
2416 VS1_1: vx1 = memref1 VS1_2 -
2417 VS1_2: vx2 = memref2 VS1_3 -
2418 VS1_3: vx3 = memref3 - -
2419 S1: x = load - VS1_0
2422 step2: vectorize stmt S2 (done here):
2423 To vectorize stmt S2 we first need to find the relevant vector
2424 def for the first operand 'x'. This is, as usual, obtained from
2425 the vector stmt recorded in the STMT_VINFO_VEC_STMT of the stmt
2426 that defines 'x' (S1). This way we find the stmt VS1_0, and the
2427 relevant vector def 'vx0'. Having found 'vx0' we can generate
2428 the vector stmt VS2_0, and as usual, record it in the
2429 STMT_VINFO_VEC_STMT of stmt S2.
2430 When creating the second copy (VS2_1), we obtain the relevant vector
2431 def from the vector stmt recorded in the STMT_VINFO_RELATED_STMT of
2432 stmt VS1_0. This way we find the stmt VS1_1 and the relevant
2433 vector def 'vx1'. Using 'vx1' we create stmt VS2_1 and record a
2434 pointer to it in the STMT_VINFO_RELATED_STMT of the vector stmt VS2_0.
2435 Similarly when creating stmts VS2_2 and VS2_3. This is the resulting
2436 chain of stmts and pointers:
2437 RELATED_STMT VEC_STMT
2438 VS1_0: vx0 = memref0 VS1_1 -
2439 VS1_1: vx1 = memref1 VS1_2 -
2440 VS1_2: vx2 = memref2 VS1_3 -
2441 VS1_3: vx3 = memref3 - -
2442 S1: x = load - VS1_0
2443 VS2_0: vz0 = vx0 + v1 VS2_1 -
2444 VS2_1: vz1 = vx1 + v1 VS2_2 -
2445 VS2_2: vz2 = vx2 + v1 VS2_3 -
2446 VS2_3: vz3 = vx3 + v1 - -
2447 S2: z = x + 1 - VS2_0 */
2449 prev_stmt_info = NULL;
2450 for (j = 0; j < ncopies; j++)
2455 vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
2456 if (op_type == binary_op)
2458 if (code == LSHIFT_EXPR || code == RSHIFT_EXPR)
2460 /* Vector shl and shr insn patterns can be defined with
2461 scalar operand 2 (shift operand). In this case, use
2462 constant or loop invariant op1 directly, without
2463 extending it to vector mode first. */
2464 optab_op2_mode = insn_data[icode].operand[2].mode;
2465 if (!VECTOR_MODE_P (optab_op2_mode))
2467 if (vect_print_dump_info (REPORT_DETAILS))
2468 fprintf (vect_dump, "operand 1 using scalar mode.");
2473 vec_oprnd1 = vect_get_vec_def_for_operand (op1, stmt, NULL);
2478 vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt0, vec_oprnd0);
2479 if (op_type == binary_op)
2480 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt1, vec_oprnd1);
2483 /* Arguments are ready.  Create the new vector stmt.  */
2485 if (op_type == binary_op)
2486 new_stmt = build_gimple_modify_stmt (vec_dest,
2487 build2 (code, vectype, vec_oprnd0, vec_oprnd1));
2489 new_stmt = build_gimple_modify_stmt (vec_dest,
2490 build1 (code, vectype, vec_oprnd0));
2491 new_temp = make_ssa_name (vec_dest, new_stmt);
2492 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
2493 vect_finish_stmt_generation (stmt, new_stmt, bsi);
2496 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
2498 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
2499 prev_stmt_info = vinfo_for_stmt (new_stmt);
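/* Illustrative sketch, not part of the vectorizer: the two shift flavours
   distinguished above.  When the target pattern takes a scalar shift count
   (operand 2 not of vector mode), the invariant count is used directly for
   all lanes; otherwise each lane gets its own count.  Written element-wise
   in plain C; all names are hypothetical.  */
#if 0
static void
example_shift_by_scalar (int x[4], int amount)
{
  int i;
  for (i = 0; i < 4; i++)
    x[i] <<= amount;        /* one invariant count for every lane     */
}

static void
example_shift_by_vector (int x[4], const int amounts[4])
{
  int i;
  for (i = 0; i < 4; i++)
    x[i] <<= amounts[i];    /* per-lane counts from a vector operand  */
}
#endif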
2506 /* Function vectorizable_type_demotion
2508 Check if STMT performs a binary or unary operation that involves
2509 type demotion, and if it can be vectorized.
2510 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
2511 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
2512 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
2515 vectorizable_type_demotion (tree stmt, block_stmt_iterator *bsi,
2522 tree vec_oprnd0 = NULL, vec_oprnd1 = NULL;
2523 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2524 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2525 enum tree_code code;
2528 enum vect_def_type dt0;
2530 stmt_vec_info prev_stmt_info;
2540 enum machine_mode vec_mode;
2542 if (!STMT_VINFO_RELEVANT_P (stmt_info))
2545 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
2548 /* FORNOW: not yet supported. */
2549 if (STMT_VINFO_LIVE_P (stmt_info))
2551 if (vect_print_dump_info (REPORT_DETAILS))
2552 fprintf (vect_dump, "value used after loop.");
2556 /* Is STMT a vectorizable type-demotion operation? */
2557 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
2560 if (TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 0)) != SSA_NAME)
2563 operation = GIMPLE_STMT_OPERAND (stmt, 1);
2564 code = TREE_CODE (operation);
2565 if (code != NOP_EXPR && code != CONVERT_EXPR)
2568 op0 = TREE_OPERAND (operation, 0);
2569 vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op0));
2570 nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
2572 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
2573 scalar_type = TREE_TYPE (scalar_dest);
2574 vectype_out = get_vectype_for_scalar_type (scalar_type);
2575 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
2576 if (nunits_in != nunits_out / 2) /* FORNOW */
2579 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_out;
2580 gcc_assert (ncopies >= 1);
2582 if (!INTEGRAL_TYPE_P (scalar_type)
2583 || !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
2586 /* Check the operands of the operation. */
2587 if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt0))
2589 if (vect_print_dump_info (REPORT_DETAILS))
2590 fprintf (vect_dump, "use not simple.");
2594 /* Supportable by target? */
2595 code = VEC_PACK_MOD_EXPR;
2596 optab = optab_for_tree_code (VEC_PACK_MOD_EXPR, vectype_in);
2600 vec_mode = TYPE_MODE (vectype_in);
2601 if (optab->handlers[(int) vec_mode].insn_code == CODE_FOR_nothing)
2604 STMT_VINFO_VECTYPE (stmt_info) = vectype_in;
2606 if (!vec_stmt) /* transformation not required. */
2608 STMT_VINFO_TYPE (stmt_info) = type_demotion_vec_info_type;
2614 if (vect_print_dump_info (REPORT_DETAILS))
2615 fprintf (vect_dump, "transform type demotion operation. ncopies = %d.",
2619 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
2621 /* In case the vectorization factor (VF) is bigger than the number
2622 of elements that we can fit in a vectype (nunits), we have to generate
2623 more than one vector stmt, i.e., we need to "unroll" the
2624 vector stmt by a factor VF/nunits. */
2625 prev_stmt_info = NULL;
2626 for (j = 0; j < ncopies; j++)
2631 vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
2632 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt0, vec_oprnd0);
2636 vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt0, vec_oprnd1);
2637 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt0, vec_oprnd0);
2640 /* Arguments are ready. Create the new vector stmt. */
2641 expr = build2 (code, vectype_out, vec_oprnd0, vec_oprnd1);
2642 new_stmt = build_gimple_modify_stmt (vec_dest, expr);
2643 new_temp = make_ssa_name (vec_dest, new_stmt);
2644 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
2645 vect_finish_stmt_generation (stmt, new_stmt, bsi);
2648 STMT_VINFO_VEC_STMT (stmt_info) = new_stmt;
2650 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
2652 prev_stmt_info = vinfo_for_stmt (new_stmt);
2655 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
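/* Illustrative sketch, not part of the vectorizer: the effect of
   VEC_PACK_MOD_EXPR for nunits_in == 4 and nunits_out == 8.  Two int
   vectors are narrowed into one short vector by truncating each element
   (modulo arithmetic).  The lane order shown is schematic; the real order
   is decided by the target and by endianness.  */
#if 0
static void
example_pack_mod (short dst[8], const int a[4], const int b[4])
{
  int i;
  for (i = 0; i < 4; i++)
    {
      dst[i] = (short) a[i];      /* first operand fills lanes 0..3   */
      dst[i + 4] = (short) b[i];  /* second operand fills lanes 4..7  */
    }
}
#endif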
2660 /* Function vect_gen_widened_results_half
2662 Create a vector stmt whose code, type, number of arguments, and result
2663 variable are CODE, VECTYPE, OP_TYPE, and VEC_DEST, and its arguments are
2664 VEC_OPRND0 and VEC_OPRND1. The new vector stmt is to be inserted at BSI.
2665 In the case that CODE is a CALL_EXPR, this means that a call to DECL
2666 needs to be created (DECL is a function-decl of a target-builtin).
2667 STMT is the original scalar stmt that we are vectorizing. */
2670 vect_gen_widened_results_half (enum tree_code code, tree vectype, tree decl,
2671 tree vec_oprnd0, tree vec_oprnd1, int op_type,
2672 tree vec_dest, block_stmt_iterator *bsi,
2681 /* Generate half of the widened result: */
2682 if (code == CALL_EXPR)
2684 /* Target specific support */
2685 if (op_type == binary_op)
2686 expr = build_call_expr (decl, 2, vec_oprnd0, vec_oprnd1);
2688 expr = build_call_expr (decl, 1, vec_oprnd0);
2692 /* Generic support */
2693 gcc_assert (op_type == TREE_CODE_LENGTH (code));
2694 if (op_type == binary_op)
2695 expr = build2 (code, vectype, vec_oprnd0, vec_oprnd1);
2697 expr = build1 (code, vectype, vec_oprnd0);
2699 new_stmt = build_gimple_modify_stmt (vec_dest, expr);
2700 new_temp = make_ssa_name (vec_dest, new_stmt);
2701 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
2702 vect_finish_stmt_generation (stmt, new_stmt, bsi);
2704 if (code == CALL_EXPR)
2706 FOR_EACH_SSA_TREE_OPERAND (sym, new_stmt, iter, SSA_OP_ALL_VIRTUALS)
2708 if (TREE_CODE (sym) == SSA_NAME)
2709 sym = SSA_NAME_VAR (sym);
2710 mark_sym_for_renaming (sym);
2718 /* Function vectorizable_type_promotion
2720 Check if STMT performs a binary or unary operation that involves
2721 type promotion, and if it can be vectorized.
2722 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
2723 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
2724 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
2727 vectorizable_type_promotion (tree stmt, block_stmt_iterator *bsi,
2733 tree op0, op1 = NULL;
2734 tree vec_oprnd0 = NULL, vec_oprnd1 = NULL;
2735 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2736 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2737 enum tree_code code, code1 = ERROR_MARK, code2 = ERROR_MARK;
2738 tree decl1 = NULL_TREE, decl2 = NULL_TREE;
2741 enum vect_def_type dt0, dt1;
2743 stmt_vec_info prev_stmt_info;
2751 if (!STMT_VINFO_RELEVANT_P (stmt_info))
2754 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
2757 /* FORNOW: not yet supported. */
2758 if (STMT_VINFO_LIVE_P (stmt_info))
2760 if (vect_print_dump_info (REPORT_DETAILS))
2761 fprintf (vect_dump, "value used after loop.");
2765 /* Is STMT a vectorizable type-promotion operation? */
2766 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
2769 if (TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 0)) != SSA_NAME)
2772 operation = GIMPLE_STMT_OPERAND (stmt, 1);
2773 code = TREE_CODE (operation);
2774 if (code != NOP_EXPR && code != WIDEN_MULT_EXPR)
2777 op0 = TREE_OPERAND (operation, 0);
2778 vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op0));
2779 nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
2780 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in;
2781 gcc_assert (ncopies >= 1);
2783 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
2784 vectype_out = get_vectype_for_scalar_type (TREE_TYPE (scalar_dest));
2785 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
2786 if (nunits_out != nunits_in / 2) /* FORNOW */
2789 if (!INTEGRAL_TYPE_P (TREE_TYPE (scalar_dest))
2790 || !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
2793 /* Check the operands of the operation. */
2794 if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt0))
2796 if (vect_print_dump_info (REPORT_DETAILS))
2797 fprintf (vect_dump, "use not simple.");
2801 op_type = TREE_CODE_LENGTH (code);
2802 if (op_type == binary_op)
2804 op1 = TREE_OPERAND (operation, 1);
2805 if (!vect_is_simple_use (op1, loop_vinfo, &def_stmt, &def, &dt1))
2807 if (vect_print_dump_info (REPORT_DETAILS))
2808 fprintf (vect_dump, "use not simple.");
2813 /* Supportable by target? */
2814 if (!supportable_widening_operation (code, stmt, vectype_in,
2815 &decl1, &decl2, &code1, &code2))
2818 STMT_VINFO_VECTYPE (stmt_info) = vectype_in;
2820 if (!vec_stmt) /* transformation not required. */
2822 STMT_VINFO_TYPE (stmt_info) = type_promotion_vec_info_type;
2828 if (vect_print_dump_info (REPORT_DETAILS))
2829 fprintf (vect_dump, "transform type promotion operation. ncopies = %d.",
2833 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
2835 /* In case the vectorization factor (VF) is bigger than the number
2836 of elements that we can fit in a vectype (nunits), we have to generate
2837 more than one vector stmt, i.e., we need to "unroll" the
2838 vector stmt by a factor VF/nunits. */
2840 prev_stmt_info = NULL;
2841 for (j = 0; j < ncopies; j++)
2846 vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
2847 if (op_type == binary_op)
2848 vec_oprnd1 = vect_get_vec_def_for_operand (op1, stmt, NULL);
2852 vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt0, vec_oprnd0);
2853 if (op_type == binary_op)
2854 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt1, vec_oprnd1);
2857 /* Arguments are ready.  Create the new vector stmt.  We are creating
2858 two vector defs because the widened result does not fit in one vector.
2859 The vectorized stmt can be expressed as a call to a target builtin,
2860 or using a tree code.  */
2861 /* Generate first half of the widened result: */
2862 new_stmt = vect_gen_widened_results_half (code1, vectype_out, decl1,
2863 vec_oprnd0, vec_oprnd1, op_type, vec_dest, bsi, stmt);
2865 STMT_VINFO_VEC_STMT (stmt_info) = new_stmt;
2867 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
2868 prev_stmt_info = vinfo_for_stmt (new_stmt);
2870 /* Generate second half of the widened result: */
2871 new_stmt = vect_gen_widened_results_half (code2, vectype_out, decl2,
2872 vec_oprnd0, vec_oprnd1, op_type, vec_dest, bsi, stmt);
2873 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
2874 prev_stmt_info = vinfo_for_stmt (new_stmt);
2878 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
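/* Illustrative sketch, not part of the vectorizer: widening one vector of
   8 shorts into the two vectors of 4 ints produced by the two calls to
   vect_gen_widened_results_half above.  The hi/lo split is schematic; the
   real lane order comes from supportable_widening_operation and the
   target.  */
#if 0
static void
example_widen (int out_hi[4], int out_lo[4], const short in[8])
{
  int i;
  for (i = 0; i < 4; i++)
    {
      out_hi[i] = (int) in[i];      /* first half of the widened result  */
      out_lo[i] = (int) in[i + 4];  /* second half                       */
    }
}
#endif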
2883 /* Function vect_strided_store_supported.
2885 Returns TRUE if INTERLEAVE_HIGH and INTERLEAVE_LOW operations are supported,
2886 and FALSE otherwise. */
2889 vect_strided_store_supported (tree vectype)
2891 optab interleave_high_optab, interleave_low_optab;
2894 mode = (int) TYPE_MODE (vectype);
2896 /* Check that the operation is supported. */
2897 interleave_high_optab = optab_for_tree_code (VEC_INTERLEAVE_HIGH_EXPR,
2899 interleave_low_optab = optab_for_tree_code (VEC_INTERLEAVE_LOW_EXPR,
2901 if (!interleave_high_optab || !interleave_low_optab)
2903 if (vect_print_dump_info (REPORT_DETAILS))
2904 fprintf (vect_dump, "no optab for interleave.");
2908 if (interleave_high_optab->handlers[(int) mode].insn_code
2910 || interleave_low_optab->handlers[(int) mode].insn_code
2911 == CODE_FOR_nothing)
2913 if (vect_print_dump_info (REPORT_DETAILS))
2914 fprintf (vect_dump, "interleave op not supported by target.");
2921 /* Function vect_permute_store_chain.
2923 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
2924 a power of 2, generate interleave_high/low stmts to reorder the data
2925 correctly for the stores. Return the final references for stores in
2926 RESULT_CHAIN.
2928 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
2929 The input is 4 vectors each containing 8 elements. We assign a number to each
2930 element, the input sequence is:
2932 1st vec: 0 1 2 3 4 5 6 7
2933 2nd vec: 8 9 10 11 12 13 14 15
2934 3rd vec: 16 17 18 19 20 21 22 23
2935 4th vec: 24 25 26 27 28 29 30 31
2937 The output sequence should be:
2939 1st vec: 0 8 16 24 1 9 17 25
2940 2nd vec: 2 10 18 26 3 11 19 27
2941 3rd vec: 4 12 20 28 5 13 21 29
2942 4th vec: 6 14 22 30 7 15 23 31
2944 i.e., we interleave the contents of the four vectors in their order.
2946 We use interleave_high/low instructions to create such output. The input of
2947 each interleave_high/low operation is two vectors:
2948 1st vec    2nd vec
2949 0 1 2 3    4 5 6 7
2950 the even elements of the result vector are obtained left-to-right from the
2951 high/low elements of the first vector. The odd elements of the result are
2952 obtained left-to-right from the high/low elements of the second vector.
2953 The output of interleave_high will be: 0 4 1 5
2954 and of interleave_low: 2 6 3 7
2957 The permutation is done in log2 (LENGTH) stages. In each stage interleave_high
2958 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
2959 where the first argument is taken from the first half of DR_CHAIN and the
2960 second argument from its second half.
2963 I1: interleave_high (1st vec, 3rd vec)
2964 I2: interleave_low (1st vec, 3rd vec)
2965 I3: interleave_high (2nd vec, 4th vec)
2966 I4: interleave_low (2nd vec, 4th vec)
2968 The output for the first stage is:
2970 I1: 0 16 1 17 2 18 3 19
2971 I2: 4 20 5 21 6 22 7 23
2972 I3: 8 24 9 25 10 26 11 27
2973 I4: 12 28 13 29 14 30 15 31
2975 The output of the second stage, i.e. the final result is:
2977 I1: 0 8 16 24 1 9 17 25
2978 I2: 2 10 18 26 3 11 19 27
2979 I3: 4 12 20 28 5 13 21 29
2980 I4: 6 14 22 30 7 15 23 31. */
2983 vect_permute_store_chain (VEC(tree,heap) *dr_chain,
2984 unsigned int length,
2986 block_stmt_iterator *bsi,
2987 VEC(tree,heap) **result_chain)
2989 tree perm_dest, perm_stmt, vect1, vect2, high, low;
2990 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
2991 tree scalar_dest, tmp;
2994 VEC(tree,heap) *first, *second;
2996 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
2997 first = VEC_alloc (tree, heap, length/2);
2998 second = VEC_alloc (tree, heap, length/2);
3000 /* Check that the operation is supported. */
3001 if (!vect_strided_store_supported (vectype))
3004 *result_chain = VEC_copy (tree, heap, dr_chain);
3006 for (i = 0; i < exact_log2 (length); i++)
3008 for (j = 0; j < length/2; j++)
3010 vect1 = VEC_index (tree, dr_chain, j);
3011 vect2 = VEC_index (tree, dr_chain, j+length/2);
3013 /* Create interleaving stmt:
3014 in the case of big endian:
3015 high = interleave_high (vect1, vect2)
3016 and in the case of little endian:
3017 high = interleave_low (vect1, vect2). */
3018 perm_dest = create_tmp_var (vectype, "vect_inter_high");
3019 DECL_GIMPLE_REG_P (perm_dest) = 1;
3020 add_referenced_var (perm_dest);
3021 if (BYTES_BIG_ENDIAN)
3022 tmp = build2 (VEC_INTERLEAVE_HIGH_EXPR, vectype, vect1, vect2);
3024 tmp = build2 (VEC_INTERLEAVE_LOW_EXPR, vectype, vect1, vect2);
3025 perm_stmt = build_gimple_modify_stmt (perm_dest, tmp);
3026 high = make_ssa_name (perm_dest, perm_stmt);
3027 GIMPLE_STMT_OPERAND (perm_stmt, 0) = high;
3028 vect_finish_stmt_generation (stmt, perm_stmt, bsi);
3029 VEC_replace (tree, *result_chain, 2*j, high);
3031 /* Create interleaving stmt:
3032 in the case of big endian:
3033 low = interleave_low (vect1, vect2)
3034 and in the case of little endian:
3035 low = interleave_high (vect1, vect2). */
3036 perm_dest = create_tmp_var (vectype, "vect_inter_low");
3037 DECL_GIMPLE_REG_P (perm_dest) = 1;
3038 add_referenced_var (perm_dest);
3039 if (BYTES_BIG_ENDIAN)
3040 tmp = build2 (VEC_INTERLEAVE_LOW_EXPR, vectype, vect1, vect2);
3042 tmp = build2 (VEC_INTERLEAVE_HIGH_EXPR, vectype, vect1, vect2);
3043 perm_stmt = build_gimple_modify_stmt (perm_dest, tmp);
3044 low = make_ssa_name (perm_dest, perm_stmt);
3045 GIMPLE_STMT_OPERAND (perm_stmt, 0) = low;
3046 vect_finish_stmt_generation (stmt, perm_stmt, bsi);
3047 VEC_replace (tree, *result_chain, 2*j+1, low);
3049 dr_chain = VEC_copy (tree, heap, *result_chain);
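/* Illustrative sketch, not part of the vectorizer: one
   interleave_high/interleave_low step on 4-element vectors, reproducing the
   "0 4 1 5" / "2 6 3 7" example from the comment above
   vect_permute_store_chain.  Written element-wise in plain C.  */
#if 0
static void
example_interleave (int high[4], int low[4], const int a[4], const int b[4])
{
  /* HIGH alternates the upper halves of A and B ...  */
  high[0] = a[0];  high[1] = b[0];  high[2] = a[1];  high[3] = b[1];
  /* ... and LOW alternates the lower halves.  */
  low[0] = a[2];   low[1] = b[2];   low[2] = a[3];   low[3] = b[3];
}
/* With a = {0,1,2,3} and b = {4,5,6,7}: high = {0,4,1,5}, low = {2,6,3,7}.  */
#endif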
3055 /* Function vectorizable_store.
3057 Check if STMT defines a non-scalar data-ref (array/pointer/structure) that
3058 can be vectorized.
3059 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
3060 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
3061 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
3064 vectorizable_store (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
3069 tree vec_oprnd = NULL_TREE;
3070 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3071 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info), *first_dr = NULL;
3072 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3073 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3074 enum machine_mode vec_mode;
3076 enum dr_alignment_support alignment_support_cheme;
3078 def_operand_p def_p;
3080 enum vect_def_type dt;
3081 stmt_vec_info prev_stmt_info = NULL;
3082 tree dataref_ptr = NULL_TREE;
3083 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
3084 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
3086 tree next_stmt, first_stmt;
3087 bool strided_store = false;
3088 unsigned int group_size, i;
3089 VEC(tree,heap) *dr_chain = NULL, *oprnds = NULL, *result_chain = NULL;
3090 gcc_assert (ncopies >= 1);
3092 if (!STMT_VINFO_RELEVANT_P (stmt_info))
3095 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
3098 if (STMT_VINFO_LIVE_P (stmt_info))
3100 if (vect_print_dump_info (REPORT_DETAILS))
3101 fprintf (vect_dump, "value used after loop.");
3105 /* Is vectorizable store? */
3107 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
3110 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
3111 if (TREE_CODE (scalar_dest) != ARRAY_REF
3112 && TREE_CODE (scalar_dest) != INDIRECT_REF
3113 && !DR_GROUP_FIRST_DR (stmt_info))
3116 op = GIMPLE_STMT_OPERAND (stmt, 1);
3117 if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt))
3119 if (vect_print_dump_info (REPORT_DETAILS))
3120 fprintf (vect_dump, "use not simple.");
3124 vec_mode = TYPE_MODE (vectype);
3125 /* FORNOW. In some cases can vectorize even if data-type not supported
3126 (e.g., array initialization with 0).  */
3127 if (mov_optab->handlers[(int)vec_mode].insn_code == CODE_FOR_nothing)
3130 if (!STMT_VINFO_DATA_REF (stmt_info))
3133 if (DR_GROUP_FIRST_DR (stmt_info))
3135 strided_store = true;
3136 if (!vect_strided_store_supported (vectype))
3140 if (!vec_stmt) /* transformation not required. */
3142 STMT_VINFO_TYPE (stmt_info) = store_vec_info_type;
3148 if (vect_print_dump_info (REPORT_DETAILS))
3149 fprintf (vect_dump, "transform store. ncopies = %d",ncopies);
3153 first_stmt = DR_GROUP_FIRST_DR (stmt_info);
3154 first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
3155 group_size = DR_GROUP_SIZE (vinfo_for_stmt (first_stmt));
3157 DR_GROUP_STORE_COUNT (vinfo_for_stmt (first_stmt))++;
3159 /* We vectorize all the stmts of the interleaving group when we
3160 reach the last stmt in the group. */
3161 if (DR_GROUP_STORE_COUNT (vinfo_for_stmt (first_stmt))
3162 < DR_GROUP_SIZE (vinfo_for_stmt (first_stmt)))
3164 *vec_stmt = NULL_TREE;
3175 dr_chain = VEC_alloc (tree, heap, group_size);
3176 oprnds = VEC_alloc (tree, heap, group_size);
3178 alignment_support_cheme = vect_supportable_dr_alignment (first_dr);
3179 gcc_assert (alignment_support_cheme);
3180 gcc_assert (alignment_support_cheme == dr_aligned); /* FORNOW */
3182 /* In case the vectorization factor (VF) is bigger than the number
3183 of elements that we can fit in a vectype (nunits), we have to generate
3184 more than one vector stmt, i.e., we need to "unroll" the
3185 vector stmt by a factor VF/nunits. For more details see documentation in
3186 vect_get_vec_def_for_copy_stmt. */
3188 /* In case of interleaving (non-unit strided access):
3195 We create vectorized stores starting from the base address (the access of
3196 the first stmt in the chain, S2 in the above example) when the last store
3197 stmt of the chain (S4) is reached:
3200 VS2: &base + vec_size*1 = vx0
3201 VS3: &base + vec_size*2 = vx1
3202 VS4: &base + vec_size*3 = vx3
3204 Then permutation statements are generated:
3206 VS5: vx5 = VEC_INTERLEAVE_HIGH_EXPR < vx0, vx3 >
3207 VS6: vx6 = VEC_INTERLEAVE_LOW_EXPR < vx0, vx3 >
3210 And they are put in STMT_VINFO_VEC_STMT of the corresponding scalar stmts
3211 (the order of the data-refs in the output of vect_permute_store_chain
3212 corresponds to the order of scalar stmts in the interleaving chain - see
3213 the documentation of vect_permute_store_chain()).
3215 In case of both multiple types and interleaving, the vector stores and
3216 permutation stmts above are created for every copy. The result vector stmts are
3217 put in STMT_VINFO_VEC_STMT for the first copy and in the corresponding
3218 STMT_VINFO_RELATED_STMT for the next copies.
3221 prev_stmt_info = NULL;
3222 for (j = 0; j < ncopies; j++)
3229 /* For interleaved stores we collect vectorized defs for all the
3230 stores in the group in DR_CHAIN and OPRNDS. DR_CHAIN is then used
3231 as an input to vect_permute_store_chain(), and OPRNDS as an input
3232 to vect_get_vec_def_for_stmt_copy() for the next copy.
3233 If the store is not strided, GROUP_SIZE is 1, and DR_CHAIN and
3234 OPRNDS are of size 1. */
3235 next_stmt = first_stmt;
3236 for (i = 0; i < group_size; i++)
3238 /* Since gaps are not supported for interleaved stores, GROUP_SIZE
3239 is the exact number of stmts in the chain. Therefore, NEXT_STMT
3240 can't be NULL_TREE.  In case there is no interleaving,
3241 GROUP_SIZE is 1, and only one iteration of the loop will be
3242 executed.  */
3243 gcc_assert (next_stmt);
3244 op = GIMPLE_STMT_OPERAND (next_stmt, 1);
3245 vec_oprnd = vect_get_vec_def_for_operand (op, next_stmt, NULL);
3246 VEC_quick_push(tree, dr_chain, vec_oprnd);
3247 VEC_quick_push(tree, oprnds, vec_oprnd);
3248 next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt));
3250 dataref_ptr = vect_create_data_ref_ptr (first_stmt, bsi, NULL_TREE,
3251 &dummy, &ptr_incr, false,
3252 TREE_TYPE (vec_oprnd));
3256 /* For interleaved stores we created vectorized defs for all the
3257 defs stored in OPRNDS in the previous iteration (previous copy).
3258 DR_CHAIN is then used as an input to vect_permute_store_chain(),
3259 and OPRNDS as an input to vect_get_vec_def_for_stmt_copy() for the
3261 If the store is not strided, GROUP_SIZE is 1, and DR_CHAIN and
3262 OPRNDS are of size 1. */
3263 for (i = 0; i < group_size; i++)
3265 vec_oprnd = vect_get_vec_def_for_stmt_copy (dt,
3266 VEC_index (tree, oprnds, i));
3267 VEC_replace(tree, dr_chain, i, vec_oprnd);
3268 VEC_replace(tree, oprnds, i, vec_oprnd);
3270 dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, bsi, stmt);
3275 result_chain = VEC_alloc (tree, heap, group_size);
3277 if (!vect_permute_store_chain (dr_chain, group_size, stmt, bsi,
3282 next_stmt = first_stmt;
3283 for (i = 0; i < group_size; i++)
3285 /* For strided stores vectorized defs are interleaved in
3286 vect_permute_store_chain(). */
3288 vec_oprnd = VEC_index(tree, result_chain, i);
3290 data_ref = build_fold_indirect_ref (dataref_ptr);
3291 /* Arguments are ready. Create the new vector stmt. */
3292 new_stmt = build_gimple_modify_stmt (data_ref, vec_oprnd);
3293 vect_finish_stmt_generation (stmt, new_stmt, bsi);
3295 /* Set the VDEFs for the vector pointer. If this virtual def
3296 has a use outside the loop and a loop peel is performed
3297 then the def may be renamed by the peel. Mark it for
3298 renaming so the later use will also be renamed. */
3299 copy_virtual_operands (new_stmt, next_stmt);
3302 /* The original store is deleted so the same SSA_NAMEs
3303 can be used.  */
3304 FOR_EACH_SSA_TREE_OPERAND (def, next_stmt, iter, SSA_OP_VDEF)
3306 SSA_NAME_DEF_STMT (def) = new_stmt;
3307 mark_sym_for_renaming (SSA_NAME_VAR (def));
3310 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
3314 /* Create new names for all the definitions created by COPY and
3315 add replacement mappings for each new name. */
3316 FOR_EACH_SSA_DEF_OPERAND (def_p, new_stmt, iter, SSA_OP_VDEF)
3318 create_new_def_for (DEF_FROM_PTR (def_p), new_stmt, def_p);
3319 mark_sym_for_renaming (SSA_NAME_VAR (DEF_FROM_PTR (def_p)));
3322 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3325 prev_stmt_info = vinfo_for_stmt (new_stmt);
3326 next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt));
3329 /* Bump the vector pointer. */
3330 dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, bsi, stmt);
3338 /* Function vect_setup_realignment
3340 This function is called when vectorizing an unaligned load using
3341 the dr_unaligned_software_pipeline scheme.
3342 This function generates the following code at the loop prolog:
3345 msq_init = *(floor(p)); # prolog load
3346 realignment_token = call target_builtin;
3348 msq = phi (msq_init, ---)
3350 The code above sets up a new (vector) pointer, pointing to the first
3351 location accessed by STMT, and a "floor-aligned" load using that pointer.
3352 It also generates code to compute the "realignment-token" (if the relevant
3353 target hook was defined), and creates a phi-node at the loop-header bb
3354 whose arguments are the result of the prolog-load (created by this
3355 function) and the result of a load that takes place in the loop (to be
3356 created by the caller to this function).
3357 The caller to this function uses the phi-result (msq) to create the
3358 realignment code inside the loop, and sets up the missing phi argument,
3362 msq = phi (msq_init, lsq)
3363 lsq = *(floor(p')); # load in loop
3364 result = realign_load (msq, lsq, realignment_token);
3367 STMT - (scalar) load stmt to be vectorized. This load accesses
3368 a memory location that may be unaligned.
3369 BSI - place where new code is to be inserted.
3372 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
3373 target hook, if defined.
3374 Return value - the result of the loop-header phi node. */
3377 vect_setup_realignment (tree stmt, block_stmt_iterator *bsi,
3378 tree *realignment_token)
3380 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3381 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3382 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3383 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3384 edge pe = loop_preheader_edge (loop);
3385 tree scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
3398 /* 1. Create msq_init = *(floor(p1)) in the loop preheader */
3399 vec_dest = vect_create_destination_var (scalar_dest, vectype);
3400 ptr = vect_create_data_ref_ptr (stmt, bsi, NULL_TREE, &init_addr, &inc, true,
3402 data_ref = build1 (ALIGN_INDIRECT_REF, vectype, ptr);
3403 new_stmt = build_gimple_modify_stmt (vec_dest, data_ref);
3404 new_temp = make_ssa_name (vec_dest, new_stmt);
3405 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
3406 new_bb = bsi_insert_on_edge_immediate (pe, new_stmt);
3407 gcc_assert (!new_bb);
3408 msq_init = GIMPLE_STMT_OPERAND (new_stmt, 0);
3409 copy_virtual_operands (new_stmt, stmt);
3410 update_vuses_to_preheader (new_stmt, loop);
3412 /* 2. Create permutation mask, if required, in loop preheader. */
3413 if (targetm.vectorize.builtin_mask_for_load)
3417 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
3418 new_stmt = build_call_expr (builtin_decl, 1, init_addr);
3419 vec_dest = vect_create_destination_var (scalar_dest,
3420 TREE_TYPE (new_stmt));
3421 new_stmt = build_gimple_modify_stmt (vec_dest, new_stmt);
3422 new_temp = make_ssa_name (vec_dest, new_stmt);
3423 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
3424 new_bb = bsi_insert_on_edge_immediate (pe, new_stmt);
3425 gcc_assert (!new_bb);
3426 *realignment_token = GIMPLE_STMT_OPERAND (new_stmt, 0);
3428 /* The result of the CALL_EXPR to this builtin is determined from
3429 the value of the parameter and no global variables are touched
3430 which makes the builtin a "const" function. Requiring the
3431 builtin to have the "const" attribute makes it unnecessary
3432 to call mark_call_clobbered. */
3433 gcc_assert (TREE_READONLY (builtin_decl));
3436 /* 3. Create msq = phi <msq_init, lsq> in loop */
3437 vec_dest = vect_create_destination_var (scalar_dest, vectype);
3438 msq = make_ssa_name (vec_dest, NULL_TREE);
3439 phi_stmt = create_phi_node (msq, loop->header);
3440 SSA_NAME_DEF_STMT (msq) = phi_stmt;
3441 add_phi_arg (phi_stmt, msq_init, loop_preheader_edge (loop));
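/* Illustrative sketch, not part of the vectorizer: the software-pipelined
   realignment scheme set up above, in plain C.  msq is the aligned load
   issued one vector ahead; each iteration combines it with the next aligned
   load (lsq) to synthesize the unaligned value.  The type v4si and
   example_realign (standing in for the target's realign_load, steered by
   the realignment token) are hypothetical.  */
#if 0
typedef int v4si __attribute__ ((vector_size (16)));
extern v4si example_realign (v4si msq, v4si lsq, v4si token);

static void
example_realigned_loop (v4si *dst, const v4si *floor_p, v4si token, int n)
{
  v4si msq = floor_p[0];             /* prolog load: msq_init          */
  int i;
  for (i = 0; i < n; i++)
    {
      v4si lsq = floor_p[i + 1];     /* aligned load inside the loop   */
      dst[i] = example_realign (msq, lsq, token);
      msq = lsq;                     /* plays the loop-header phi      */
    }
}
#endif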
3447 /* Function vect_strided_load_supported.
3449 Returns TRUE if EXTRACT_EVEN and EXTRACT_ODD operations are supported,
3450 and FALSE otherwise. */
3453 vect_strided_load_supported (tree vectype)
3455 optab perm_even_optab, perm_odd_optab;
3458 mode = (int) TYPE_MODE (vectype);
3460 perm_even_optab = optab_for_tree_code (VEC_EXTRACT_EVEN_EXPR, vectype);
3461 if (!perm_even_optab)
3463 if (vect_print_dump_info (REPORT_DETAILS))
3464 fprintf (vect_dump, "no optab for perm_even.");
3468 if (perm_even_optab->handlers[mode].insn_code == CODE_FOR_nothing)
3470 if (vect_print_dump_info (REPORT_DETAILS))
3471 fprintf (vect_dump, "perm_even op not supported by target.");
3475 perm_odd_optab = optab_for_tree_code (VEC_EXTRACT_ODD_EXPR, vectype);
3476 if (!perm_odd_optab)
3478 if (vect_print_dump_info (REPORT_DETAILS))
3479 fprintf (vect_dump, "no optab for perm_odd.");
3483 if (perm_odd_optab->handlers[mode].insn_code == CODE_FOR_nothing)
3485 if (vect_print_dump_info (REPORT_DETAILS))
3486 fprintf (vect_dump, "perm_odd op not supported by target.");
3493 /* Function vect_permute_load_chain.
3495 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
3496 a power of 2, generate extract_even/odd stmts to reorder the input data
3497 correctly. Return the final references for loads in RESULT_CHAIN.
3499 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
3500 The input is 4 vectors each containing 8 elements. We assign a number to each
3501 element, the input sequence is:
3503 1st vec: 0 1 2 3 4 5 6 7
3504 2nd vec: 8 9 10 11 12 13 14 15
3505 3rd vec: 16 17 18 19 20 21 22 23
3506 4th vec: 24 25 26 27 28 29 30 31
3508 The output sequence should be:
3510 1st vec: 0 4 8 12 16 20 24 28
3511 2nd vec: 1 5 9 13 17 21 25 29
3512 3rd vec: 2 6 10 14 18 22 26 30
3513 4th vec: 3 7 11 15 19 23 27 31
3515 i.e., the first output vector should contain the first elements of each
3516 interleaving group, etc.
3518 We use extract_even/odd instructions to create such output. The input of each
3519 extract_even/odd operation is two vectors
3520 1st vec    2nd vec
3521 0 1 2 3    4 5 6 7
3523 and the output is the vector of extracted even/odd elements. The output of
3524 extract_even will be: 0 2 4 6
3525 and of extract_odd: 1 3 5 7
3528 The permutation is done in log2 (LENGTH) stages. In each stage extract_even and
3529 extract_odd stmts are created for each pair of vectors in DR_CHAIN in their
3530 order. In our example,
3532 E1: extract_even (1st vec, 2nd vec)
3533 E2: extract_odd (1st vec, 2nd vec)
3534 E3: extract_even (3rd vec, 4th vec)
3535 E4: extract_odd (3rd vec, 4th vec)
3537 The output for the first stage will be:
3539 E1: 0 2 4 6 8 10 12 14
3540 E2: 1 3 5 7 9 11 13 15
3541 E3: 16 18 20 22 24 26 28 30
3542 E4: 17 19 21 23 25 27 29 31
3544 In order to proceed and create the correct sequence for the next stage (or
3545 for the correct output, if the second stage is the last one, as in our
3546 example), we first put the output of extract_even operation and then the
3547 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
3548 The input for the second stage is:
3550 1st vec (E1): 0 2 4 6 8 10 12 14
3551 2nd vec (E3): 16 18 20 22 24 26 28 30
3552 3rd vec (E2): 1 3 5 7 9 11 13 15
3553 4th vec (E4): 17 19 21 23 25 27 29 31
3555 The output of the second stage:
3557 E1: 0 4 8 12 16 20 24 28
3558 E2: 2 6 10 14 18 22 26 30
3559 E3: 1 5 9 13 17 21 25 29
3560 E4: 3 7 11 15 19 23 27 31
3562 And RESULT_CHAIN after reordering:
3564 1st vec (E1): 0 4 8 12 16 20 24 28
3565 2nd vec (E3): 1 5 9 13 17 21 25 29
3566 3rd vec (E2): 2 6 10 14 18 22 26 30
3567 4th vec (E4): 3 7 11 15 19 23 27 31. */
3570 vect_permute_load_chain (VEC(tree,heap) *dr_chain,
3571 unsigned int length,
3573 block_stmt_iterator *bsi,
3574 VEC(tree,heap) **result_chain)
3576 tree perm_dest, perm_stmt, data_ref, first_vect, second_vect;
3577 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
3582 /* Check that the operation is supported. */
3583 if (!vect_strided_load_supported (vectype))
3586 *result_chain = VEC_copy (tree, heap, dr_chain);
3587 for (i = 0; i < exact_log2 (length); i++)
3589 for (j = 0; j < length; j +=2)
3591 first_vect = VEC_index (tree, dr_chain, j);
3592 second_vect = VEC_index (tree, dr_chain, j+1);
3594 /* data_ref = permute_even (first_data_ref, second_data_ref); */
3595 perm_dest = create_tmp_var (vectype, "vect_perm_even");
3596 DECL_GIMPLE_REG_P (perm_dest) = 1;
3597 add_referenced_var (perm_dest);
3599 tmp = build2 (VEC_EXTRACT_EVEN_EXPR, vectype,
3600 first_vect, second_vect);
3601 perm_stmt = build_gimple_modify_stmt (perm_dest, tmp);
3603 data_ref = make_ssa_name (perm_dest, perm_stmt);
3604 GIMPLE_STMT_OPERAND (perm_stmt, 0) = data_ref;
3605 vect_finish_stmt_generation (stmt, perm_stmt, bsi);
3606 mark_symbols_for_renaming (perm_stmt);
3608 VEC_replace (tree, *result_chain, j/2, data_ref);
3610 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
3611 perm_dest = create_tmp_var (vectype, "vect_perm_odd");
3612 DECL_GIMPLE_REG_P (perm_dest) = 1;
3613 add_referenced_var (perm_dest);
3615 tmp = build2 (VEC_EXTRACT_ODD_EXPR, vectype,
3616 first_vect, second_vect);
3617 perm_stmt = build_gimple_modify_stmt (perm_dest, tmp);
3618 data_ref = make_ssa_name (perm_dest, perm_stmt);
3619 GIMPLE_STMT_OPERAND (perm_stmt, 0) = data_ref;
3620 vect_finish_stmt_generation (stmt, perm_stmt, bsi);
3621 mark_symbols_for_renaming (perm_stmt);
3623 VEC_replace (tree, *result_chain, j/2+length/2, data_ref);
3625 dr_chain = VEC_copy (tree, heap, *result_chain);
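/* Illustrative sketch, not part of the vectorizer: one
   extract_even/extract_odd step on 4-element vectors, reproducing the
   "0 2 4 6" / "1 3 5 7" example from the comment above
   vect_permute_load_chain.  Written element-wise in plain C.  */
#if 0
static void
example_extract_even_odd (int even[4], int odd[4],
                          const int a[4], const int b[4])
{
  int i;
  for (i = 0; i < 2; i++)
    {
      even[i] = a[2 * i];          /* even lanes of A ...  */
      even[i + 2] = b[2 * i];      /* ... then of B        */
      odd[i] = a[2 * i + 1];       /* odd lanes of A ...   */
      odd[i + 2] = b[2 * i + 1];   /* ... then of B        */
    }
}
/* With a = {0,1,2,3} and b = {4,5,6,7}: even = {0,2,4,6}, odd = {1,3,5,7}.  */
#endif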
3631 /* Function vect_transform_strided_load.
3633 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
3634 to perform their permutation and attach the resulting vectorized statements to
3635 the scalar statements.
3639 vect_transform_strided_load (tree stmt, VEC(tree,heap) *dr_chain, int size,
3640 block_stmt_iterator *bsi)
3642 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3643 tree first_stmt = DR_GROUP_FIRST_DR (stmt_info);
3644 tree next_stmt, new_stmt;
3645 VEC(tree,heap) *result_chain = NULL;
3646 unsigned int i, gap_count;
3649 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
3650 RESULT_CHAIN is the output of vect_permute_load_chain; it contains permuted
3651 vectors that are ready for vector computation. */
3652 result_chain = VEC_alloc (tree, heap, size);
3654 if (!vect_permute_load_chain (dr_chain, size, stmt, bsi, &result_chain))
3657 /* Put a permuted data-ref in the VECTORIZED_STMT field.
3658 Since we scan the chain starting from its first node, their order
3659 corresponds to the order of data-refs in RESULT_CHAIN. */
3660 next_stmt = first_stmt;
3662 for (i = 0; VEC_iterate(tree, result_chain, i, tmp_data_ref); i++)
3667 /* Skip the gaps. Loads created for the gaps will be removed by the dead
3668 code elimination pass later.
3669 DR_GROUP_GAP is the number of steps in elements from the previous
3670 access (if there is no gap DR_GROUP_GAP is 1). We skip loads that
3671 correspond to the gaps.  */
3673 if (gap_count < DR_GROUP_GAP (vinfo_for_stmt (next_stmt)))
3681 new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
3682 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
3683 copies, and we put the new vector statement in the first available
3684 RELATED_STMT field.  */
3685 if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
3686 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
3689 tree prev_stmt = STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
3690 tree rel_stmt = STMT_VINFO_RELATED_STMT (
3691 vinfo_for_stmt (prev_stmt));
3694 prev_stmt = rel_stmt;
3695 rel_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
3697 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) = new_stmt;
3699 next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt));
3701 /* If NEXT_STMT accesses the same DR as the previous statement,
3702 put the same TMP_DATA_REF as its vectorized statement; otherwise
3703 get the next data-ref from RESULT_CHAIN. */
3704 if (!next_stmt || !DR_GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
3712 /* vectorizable_load.
3714 Check if STMT reads a non-scalar data-ref (array/pointer/structure) that
3715 can be vectorized.
3716 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
3717 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
3718 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
3721 vectorizable_load (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
3724 tree vec_dest = NULL;
3725 tree data_ref = NULL;
3727 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3728 stmt_vec_info prev_stmt_info;
3729 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3730 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3731 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info), *first_dr;
3732 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3735 tree new_stmt = NULL_TREE;
3737 enum dr_alignment_support alignment_support_cheme;
3738 tree dataref_ptr = NULL_TREE;
3740 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
3741 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
3742 int i, j, group_size;
3743 tree msq = NULL_TREE, lsq;
3744 tree offset = NULL_TREE;
3745 tree realignment_token = NULL_TREE;
3746 tree phi_stmt = NULL_TREE;
3747 VEC(tree,heap) *dr_chain = NULL;
3748 bool strided_load = false;
3751 if (!STMT_VINFO_RELEVANT_P (stmt_info))
3754 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
3757 /* FORNOW: not yet supported. */
3758 if (STMT_VINFO_LIVE_P (stmt_info))
3760 if (vect_print_dump_info (REPORT_DETAILS))
3761 fprintf (vect_dump, "value used after loop.");
3765 /* Is vectorizable load? */
3766 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
3769 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
3770 if (TREE_CODE (scalar_dest) != SSA_NAME)
3773 op = GIMPLE_STMT_OPERAND (stmt, 1);
3774 if (TREE_CODE (op) != ARRAY_REF
3775 && TREE_CODE (op) != INDIRECT_REF
3776 && !DR_GROUP_FIRST_DR (stmt_info))
3779 if (!STMT_VINFO_DATA_REF (stmt_info))
3782 mode = (int) TYPE_MODE (vectype);
3784 /* FORNOW. In some cases can vectorize even if data-type not supported
3785 (e.g., data copies). */
3786 if (mov_optab->handlers[mode].insn_code == CODE_FOR_nothing)
3788 if (vect_print_dump_info (REPORT_DETAILS))
3789 fprintf (vect_dump, "Aligned load, but unsupported type.");
3793 /* Check if the load is a part of an interleaving chain. */
3794 if (DR_GROUP_FIRST_DR (stmt_info))
3796 strided_load = true;
3798 /* Check if interleaving is supported. */
3799 if (!vect_strided_load_supported (vectype))
3803 if (!vec_stmt) /* transformation not required. */
3805 STMT_VINFO_TYPE (stmt_info) = load_vec_info_type;
3811 if (vect_print_dump_info (REPORT_DETAILS))
3812 fprintf (vect_dump, "transform load.");
3816 first_stmt = DR_GROUP_FIRST_DR (stmt_info);
3817 /* Check if the chain of loads is already vectorized. */
3818 if (STMT_VINFO_VEC_STMT (vinfo_for_stmt (first_stmt)))
3820 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
3823 first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
3824 group_size = DR_GROUP_SIZE (vinfo_for_stmt (first_stmt));
3825 dr_chain = VEC_alloc (tree, heap, group_size);
3834 alignment_support_cheme = vect_supportable_dr_alignment (first_dr);
3835 gcc_assert (alignment_support_cheme);
  /* In case the vectorization factor (VF) is bigger than the number
     of elements that we can fit in a vectype (nunits), we have to generate
     more than one vector stmt - i.e - we need to "unroll" the
     vector stmt by a factor VF/nunits.  In doing so, we record a pointer
     from one copy of the vector stmt to the next, in the field
     STMT_VINFO_RELATED_STMT.  This is necessary in order to allow following
     stages to find the correct vector defs to be used when vectorizing
     stmts that use the defs of the current stmt.  The example below
     illustrates the vectorization process when VF=16 and nunits=4 (i.e - we
     need to create 4 vectorized stmts):

     before vectorization:
                                RELATED_STMT    VEC_STMT
        S1:     x = memref      -               -
        S2:     z = x + 1       -               -

     step 1: vectorize stmt S1:
        We first create the vector stmt VS1_0, and, as usual, record a
        pointer to it in the STMT_VINFO_VEC_STMT of the scalar stmt S1.
        Next, we create the vector stmt VS1_1, and record a pointer to
        it in the STMT_VINFO_RELATED_STMT of the vector stmt VS1_0.
        Similarly, for VS1_2 and VS1_3.  This is the resulting chain of
        stmts and pointers:
                                RELATED_STMT    VEC_STMT
        VS1_0:  vx0 = memref0   VS1_1           -
        VS1_1:  vx1 = memref1   VS1_2           -
        VS1_2:  vx2 = memref2   VS1_3           -
        VS1_3:  vx3 = memref3   -               -
        S1:     x = load        -               VS1_0
        S2:     z = x + 1       -               -

     See the documentation in vect_get_vec_def_for_stmt_copy for how the
     information we recorded in the RELATED_STMT field is used to vectorize
     stmt S2.  */
  /* In case of interleaving (non-unit strided access):

     S1:  x2 = &base + 2
     S2:  x0 = &base
     S3:  x1 = &base + 1
     S4:  x3 = &base + 3

     Vectorized loads are created in the order of memory accesses
     starting from the access of the first stmt of the chain:

     VS1: vx0 = &base
     VS2: vx1 = &base + vec_size*1
     VS3: vx2 = &base + vec_size*2
     VS4: vx3 = &base + vec_size*3

     Then permutation statements are generated:

     VS5: vx4 = VEC_EXTRACT_EVEN_EXPR < vx0, vx1 >
     VS6: vx5 = VEC_EXTRACT_ODD_EXPR < vx0, vx1 >
       ...

     And they are put in STMT_VINFO_VEC_STMT of the corresponding scalar stmts
     (the order of the data-refs in the output of vect_permute_load_chain
     corresponds to the order of scalar stmts in the interleaving chain - see
     the documentation of vect_permute_load_chain()).
     The generation of permutation stmts and recording them in
     STMT_VINFO_VEC_STMT is done in vect_transform_strided_load().

     In case of both multiple types and interleaving, the vector loads and
     permutation stmts above are created for every copy.  The result vector
     stmts are put in STMT_VINFO_VEC_STMT for the first copy and in the
     corresponding STMT_VINFO_RELATED_STMT for the next copies.  */
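  /* As an illustration of the de-interleaving permutation above (values
     chosen for exposition, not taken from the original sources): for
     group_size == 2 with four elements per vector, the two loaded vectors
         vx0 = {a0,b0,a1,b1}    vx1 = {a2,b2,a3,b3}
     are de-interleaved by the extract-even/extract-odd permutations into
         vx4 = VEC_EXTRACT_EVEN_EXPR <vx0, vx1> = {a0,a1,a2,a3}
         vx5 = VEC_EXTRACT_ODD_EXPR  <vx0, vx1> = {b0,b1,b2,b3}
     i.e. one vector per scalar stmt of the interleaving chain.  */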
  /* If the data reference is aligned (dr_aligned) or potentially unaligned
     on a target that supports unaligned accesses (dr_unaligned_supported)
     we generate the following code:

         p = initial_addr;
         indx = 0;
         loop {
           p = p + indx * vectype_size;
           vec_dest = *(p);
           indx = indx + 1;
         }

     Otherwise, the data reference is potentially unaligned on a target that
     does not support unaligned accesses (dr_unaligned_software_pipeline) -
     then generate the following code, in which the data in each iteration is
     obtained by two vector loads, one from the previous iteration, and one
     from the current iteration:

         msq_init = *(floor(p1))
         p2 = initial_addr + VS - 1;
         realignment_token = call target_builtin;
         indx = 0;
         loop {
           p2 = p2 + indx * vectype_size
           lsq = *(floor(p2))
           vec_dest = realign_load (msq, lsq, realignment_token)
           indx = indx + 1;
           msq = lsq;
         }   */
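  /* A worked example of the realignment scheme (illustrative values only):
     assume 16-byte vectors of four ints, base 16-byte aligned, and
     p = base + 8.  Then
         msq = *floor(p)                   = *base        = {e0,e1,e2,e3}
         lsq = *floor(p + (VS-1) elements) = *(base + 16) = {e4,e5,e6,e7}
     and realign_load (msq, lsq, realignment_token) extracts the 16 bytes
     starting at byte offset 8, i.e. {e2,e3,e4,e5}.  */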
  if (alignment_support_scheme == dr_unaligned_software_pipeline)
    {
      msq = vect_setup_realignment (first_stmt, bsi, &realignment_token);
      phi_stmt = SSA_NAME_DEF_STMT (msq);
      offset = size_int (TYPE_VECTOR_SUBPARTS (vectype) - 1);
    }
  prev_stmt_info = NULL;
  for (j = 0; j < ncopies; j++)
    {
      /* 1. Create the vector pointer update chain.  */
      if (j == 0)
        dataref_ptr = vect_create_data_ref_ptr (first_stmt, bsi, offset, &dummy,
                                                &ptr_incr, false, NULL_TREE);
      else
        dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, bsi, stmt);

      for (i = 0; i < group_size; i++)
        {
          /* 2. Create the vector-load in the loop.  */
          switch (alignment_support_scheme)
            {
            case dr_aligned:
              gcc_assert (aligned_access_p (first_dr));
              data_ref = build_fold_indirect_ref (dataref_ptr);
              break;
            case dr_unaligned_supported:
              {
                int mis = DR_MISALIGNMENT (first_dr);
                tree tmis = (mis == -1 ? size_zero_node : size_int (mis));

                gcc_assert (!aligned_access_p (first_dr));
                tmis = size_binop (MULT_EXPR, tmis, size_int (BITS_PER_UNIT));
                data_ref =
                  build2 (MISALIGNED_INDIRECT_REF, vectype, dataref_ptr, tmis);
                break;
              }
            case dr_unaligned_software_pipeline:
              gcc_assert (!aligned_access_p (first_dr));
              data_ref = build1 (ALIGN_INDIRECT_REF, vectype, dataref_ptr);
              break;
            default:
              gcc_unreachable ();
            }
          vec_dest = vect_create_destination_var (scalar_dest, vectype);
          new_stmt = build_gimple_modify_stmt (vec_dest, data_ref);
          new_temp = make_ssa_name (vec_dest, new_stmt);
          GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
          vect_finish_stmt_generation (stmt, new_stmt, bsi);
          copy_virtual_operands (new_stmt, stmt);
          mark_symbols_for_renaming (new_stmt);

          /* 3. Handle explicit realignment if necessary/supported.  */
          if (alignment_support_scheme == dr_unaligned_software_pipeline)
            {
              /* Create in loop:
                 <vec_dest = realign_load (msq, lsq, realignment_token)>  */
              lsq = GIMPLE_STMT_OPERAND (new_stmt, 0);
              if (!realignment_token)
                realignment_token = dataref_ptr;
              vec_dest = vect_create_destination_var (scalar_dest, vectype);
              new_stmt =
                build3 (REALIGN_LOAD_EXPR, vectype, msq, lsq, realignment_token);
              new_stmt = build_gimple_modify_stmt (vec_dest, new_stmt);
              new_temp = make_ssa_name (vec_dest, new_stmt);
              GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
              vect_finish_stmt_generation (stmt, new_stmt, bsi);
              if (i == group_size - 1 && j == ncopies - 1)
                add_phi_arg (phi_stmt, lsq, loop_latch_edge (loop));
              msq = lsq;
            }

          if (strided_load)
            VEC_quick_push (tree, dr_chain, new_temp);
          if (i < group_size - 1)
            dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, bsi, stmt);
        }

      if (strided_load)
        {
          if (!vect_transform_strided_load (stmt, dr_chain, group_size, bsi))
            return false;
          *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
          dr_chain = VEC_alloc (tree, heap, group_size);
        }
      else
        {
          if (j == 0)
            STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
          else
            STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
          prev_stmt_info = vinfo_for_stmt (new_stmt);
        }
    }

  return true;
}
/* Function vectorizable_live_operation.

   STMT computes a value that is used outside the loop.  Check if
   it can be supported.  */

bool
vectorizable_live_operation (tree stmt,
                             block_stmt_iterator *bsi ATTRIBUTE_UNUSED,
                             tree *vec_stmt ATTRIBUTE_UNUSED)
{
  tree operation;
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  int i;
  int op_type;
  tree op;
  tree def, def_stmt;
  enum vect_def_type dt;

  gcc_assert (STMT_VINFO_LIVE_P (stmt_info));

  if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
    return false;

  if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
    return false;

  if (TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 0)) != SSA_NAME)
    return false;

  operation = GIMPLE_STMT_OPERAND (stmt, 1);
  op_type = TREE_OPERAND_LENGTH (operation);

  /* FORNOW: support only if all uses are invariant.  This means
     that the scalar operations can remain in place, unvectorized.
     The original last scalar value that they compute will be used.  */

  for (i = 0; i < op_type; i++)
    {
      op = TREE_OPERAND (operation, i);
      if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt))
        {
          if (vect_print_dump_info (REPORT_DETAILS))
            fprintf (vect_dump, "use not simple.");
          return false;
        }

      if (dt != vect_invariant_def && dt != vect_constant_def)
        return false;
    }

  /* No transformation is required for the cases we currently support.  */
  return true;
}
/* Function vect_is_simple_cond.

   Input:
   LOOP - the loop that is being vectorized.
   COND - Condition that is checked for simple use.

   Returns whether a COND can be vectorized.  Checks whether
   condition operands are supportable using vect_is_simple_use.  */

static bool
vect_is_simple_cond (tree cond, loop_vec_info loop_vinfo)
{
  tree lhs, rhs;
  tree def;
  enum vect_def_type dt;

  if (!COMPARISON_CLASS_P (cond))
    return false;

  lhs = TREE_OPERAND (cond, 0);
  rhs = TREE_OPERAND (cond, 1);

  if (TREE_CODE (lhs) == SSA_NAME)
    {
      tree lhs_def_stmt = SSA_NAME_DEF_STMT (lhs);
      if (!vect_is_simple_use (lhs, loop_vinfo, &lhs_def_stmt, &def, &dt))
        return false;
    }
  else if (TREE_CODE (lhs) != INTEGER_CST && TREE_CODE (lhs) != REAL_CST)
    return false;

  if (TREE_CODE (rhs) == SSA_NAME)
    {
      tree rhs_def_stmt = SSA_NAME_DEF_STMT (rhs);
      if (!vect_is_simple_use (rhs, loop_vinfo, &rhs_def_stmt, &def, &dt))
        return false;
    }
  else if (TREE_CODE (rhs) != INTEGER_CST && TREE_CODE (rhs) != REAL_CST)
    return false;

  return true;
}
/* vectorizable_condition.

   Check if STMT is a conditional modify expression that can be vectorized.
   If VEC_STMT is also passed, vectorize the STMT: create a vectorized
   stmt using VEC_COND_EXPR to replace it, put it in VEC_STMT, and insert it
   at BSI.

   Return FALSE if not a vectorizable STMT, TRUE otherwise.  */

bool
vectorizable_condition (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
{
  tree scalar_dest = NULL_TREE;
  tree vec_dest = NULL_TREE;
  tree op = NULL_TREE;
  tree cond_expr, then_clause, else_clause;
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
  tree vec_cond_lhs, vec_cond_rhs, vec_then_clause, vec_else_clause;
  tree vec_compare, vec_cond_expr;
  tree new_temp;
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  enum machine_mode vec_mode;
  tree def;
  enum vect_def_type dt;
  int nunits = TYPE_VECTOR_SUBPARTS (vectype);
  int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;

  gcc_assert (ncopies >= 1);
  if (ncopies > 1)
    return false; /* FORNOW */

  if (!STMT_VINFO_RELEVANT_P (stmt_info))
    return false;

  if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
    return false;

  /* FORNOW: not yet supported.  */
  if (STMT_VINFO_LIVE_P (stmt_info))
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "value used after loop.");
      return false;
    }

  /* Is vectorizable conditional operation?  */
  if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
    return false;

  op = GIMPLE_STMT_OPERAND (stmt, 1);

  if (TREE_CODE (op) != COND_EXPR)
    return false;

  cond_expr = TREE_OPERAND (op, 0);
  then_clause = TREE_OPERAND (op, 1);
  else_clause = TREE_OPERAND (op, 2);

  if (!vect_is_simple_cond (cond_expr, loop_vinfo))
    return false;

  /* We do not handle two different vector types for the condition
     and the values.  */
  if (TREE_TYPE (TREE_OPERAND (cond_expr, 0)) != TREE_TYPE (vectype))
    return false;

  if (TREE_CODE (then_clause) == SSA_NAME)
    {
      tree then_def_stmt = SSA_NAME_DEF_STMT (then_clause);
      if (!vect_is_simple_use (then_clause, loop_vinfo,
                               &then_def_stmt, &def, &dt))
        return false;
    }
  else if (TREE_CODE (then_clause) != INTEGER_CST
           && TREE_CODE (then_clause) != REAL_CST)
    return false;

  if (TREE_CODE (else_clause) == SSA_NAME)
    {
      tree else_def_stmt = SSA_NAME_DEF_STMT (else_clause);
      if (!vect_is_simple_use (else_clause, loop_vinfo,
                               &else_def_stmt, &def, &dt))
        return false;
    }
  else if (TREE_CODE (else_clause) != INTEGER_CST
           && TREE_CODE (else_clause) != REAL_CST)
    return false;

  vec_mode = TYPE_MODE (vectype);

  if (!vec_stmt)
    {
      STMT_VINFO_TYPE (stmt_info) = condition_vec_info_type;
      return expand_vec_cond_expr_p (op, vec_mode);
    }

  /* Transform.  */

  /* Handle def.  */
  scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
  vec_dest = vect_create_destination_var (scalar_dest, vectype);

  /* Handle cond expr.  */
  vec_cond_lhs =
    vect_get_vec_def_for_operand (TREE_OPERAND (cond_expr, 0), stmt, NULL);
  vec_cond_rhs =
    vect_get_vec_def_for_operand (TREE_OPERAND (cond_expr, 1), stmt, NULL);
  vec_then_clause = vect_get_vec_def_for_operand (then_clause, stmt, NULL);
  vec_else_clause = vect_get_vec_def_for_operand (else_clause, stmt, NULL);

  /* Arguments are ready.  Create the new vector stmt.  */
  vec_compare = build2 (TREE_CODE (cond_expr), vectype,
                        vec_cond_lhs, vec_cond_rhs);
  vec_cond_expr = build3 (VEC_COND_EXPR, vectype,
                          vec_compare, vec_then_clause, vec_else_clause);

  *vec_stmt = build_gimple_modify_stmt (vec_dest, vec_cond_expr);
  new_temp = make_ssa_name (vec_dest, *vec_stmt);
  GIMPLE_STMT_OPERAND (*vec_stmt, 0) = new_temp;
  vect_finish_stmt_generation (stmt, *vec_stmt, bsi);

  return true;
}
/* Function vect_transform_stmt.

   Create a vectorized stmt to replace STMT, and insert it at BSI.  */

bool
vect_transform_stmt (tree stmt, block_stmt_iterator *bsi, bool *strided_store)
{
  bool is_store = false;
  tree vec_stmt = NULL_TREE;
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  tree orig_stmt_in_pattern;
  bool done;

  switch (STMT_VINFO_TYPE (stmt_info))
    {
    case type_demotion_vec_info_type:
      done = vectorizable_type_demotion (stmt, bsi, &vec_stmt);
      gcc_assert (done);
      break;

    case type_promotion_vec_info_type:
      done = vectorizable_type_promotion (stmt, bsi, &vec_stmt);
      gcc_assert (done);
      break;

    case type_conversion_vec_info_type:
      done = vectorizable_conversion (stmt, bsi, &vec_stmt);
      gcc_assert (done);
      break;

    case op_vec_info_type:
      done = vectorizable_operation (stmt, bsi, &vec_stmt);
      gcc_assert (done);
      break;

    case assignment_vec_info_type:
      done = vectorizable_assignment (stmt, bsi, &vec_stmt);
      gcc_assert (done);
      break;

    case load_vec_info_type:
      done = vectorizable_load (stmt, bsi, &vec_stmt);
      gcc_assert (done);
      break;

    case store_vec_info_type:
      done = vectorizable_store (stmt, bsi, &vec_stmt);
      gcc_assert (done);
      if (DR_GROUP_FIRST_DR (stmt_info))
        {
          /* In case of interleaving, the whole chain is vectorized when the
             last store in the chain is reached.  Store stmts before the last
             one are skipped, and their vec_stmt_info shouldn't be freed
             meanwhile.  */
          *strided_store = true;
          if (STMT_VINFO_VEC_STMT (stmt_info))
            is_store = true;
        }
      else
        is_store = true;
      break;

    case condition_vec_info_type:
      done = vectorizable_condition (stmt, bsi, &vec_stmt);
      gcc_assert (done);
      break;

    case call_vec_info_type:
      done = vectorizable_call (stmt, bsi, &vec_stmt);
      break;

    case reduc_vec_info_type:
      done = vectorizable_reduction (stmt, bsi, &vec_stmt);
      gcc_assert (done);
      break;

    default:
      if (!STMT_VINFO_LIVE_P (stmt_info))
        {
          if (vect_print_dump_info (REPORT_DETAILS))
            fprintf (vect_dump, "stmt not supported.");
          gcc_unreachable ();
        }
    }

  if (STMT_VINFO_LIVE_P (stmt_info)
      && STMT_VINFO_TYPE (stmt_info) != reduc_vec_info_type)
    {
      done = vectorizable_live_operation (stmt, bsi, &vec_stmt);
      gcc_assert (done);
    }

  if (vec_stmt)
    {
      STMT_VINFO_VEC_STMT (stmt_info) = vec_stmt;
      orig_stmt_in_pattern = STMT_VINFO_RELATED_STMT (stmt_info);
      if (orig_stmt_in_pattern)
        {
          stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt_in_pattern);
          /* STMT was inserted by the vectorizer to replace a computation
             idiom.  ORIG_STMT_IN_PATTERN is a stmt in the original sequence
             that computed this idiom.  We need to record a pointer to
             VEC_STMT in the stmt_info of ORIG_STMT_IN_PATTERN.  See more
             details in the documentation of vect_pattern_recog.  */
          if (STMT_VINFO_IN_PATTERN_P (stmt_vinfo))
            {
              gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
              STMT_VINFO_VEC_STMT (stmt_vinfo) = vec_stmt;
            }
        }
    }

  return is_store;
}
/* This function builds ni_name = number of iterations loop executes
   on the loop preheader.  */

static tree
vect_build_loop_niters (loop_vec_info loop_vinfo)
{
  tree ni_name, stmt, var;
  edge pe;
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  tree ni = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo));

  var = create_tmp_var (TREE_TYPE (ni), "niters");
  add_referenced_var (var);
  ni_name = force_gimple_operand (ni, &stmt, false, var);

  pe = loop_preheader_edge (loop);
  if (stmt)
    {
      basic_block new_bb = bsi_insert_on_edge_immediate (pe, stmt);
      gcc_assert (!new_bb);
    }

  return ni_name;
}
/* This function generates the following statements:

   ni_name = number of iterations the loop executes
   ratio = ni_name / vf
   ratio_mult_vf_name = ratio * vf

   and places them at the loop preheader edge.  */
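/* Since the vectorization factor is always a power of two, the division and
   multiplication by VF are generated as shifts.  For illustration (values
   chosen for exposition): with ni_name = 37 and vf = 4 (log2(vf) = 2),
     ratio          = 37 >> 2 = 9
     ratio_mult_vf  =  9 << 2 = 36
   leaving 37 - 36 = 1 iteration for the epilog loop.  */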
static void
vect_generate_tmps_on_preheader (loop_vec_info loop_vinfo,
                                 tree *ni_name_ptr,
                                 tree *ratio_mult_vf_name_ptr,
                                 tree *ratio_name_ptr)
{
  edge pe;
  basic_block new_bb;
  tree stmt, ni_name;
  tree var;
  tree ratio_name;
  tree ratio_mult_vf_name;
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  tree ni = LOOP_VINFO_NITERS (loop_vinfo);
  int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
  tree log_vf;

  pe = loop_preheader_edge (loop);

  /* Generate temporary variable that contains
     number of iterations loop executes.  */
  ni_name = vect_build_loop_niters (loop_vinfo);
  log_vf = build_int_cst (TREE_TYPE (ni), exact_log2 (vf));

  /* Create: ratio = ni >> log2(vf)  */
  ratio_name = fold_build2 (RSHIFT_EXPR, TREE_TYPE (ni_name), ni_name, log_vf);
  if (!is_gimple_val (ratio_name))
    {
      var = create_tmp_var (TREE_TYPE (ni), "bnd");
      add_referenced_var (var);

      ratio_name = force_gimple_operand (ratio_name, &stmt, true, var);
      pe = loop_preheader_edge (loop);
      new_bb = bsi_insert_on_edge_immediate (pe, stmt);
      gcc_assert (!new_bb);
    }

  /* Create: ratio_mult_vf = ratio << log2 (vf).  */
  ratio_mult_vf_name = fold_build2 (LSHIFT_EXPR, TREE_TYPE (ratio_name),
                                    ratio_name, log_vf);
  if (!is_gimple_val (ratio_mult_vf_name))
    {
      var = create_tmp_var (TREE_TYPE (ni), "ratio_mult_vf");
      add_referenced_var (var);

      ratio_mult_vf_name = force_gimple_operand (ratio_mult_vf_name, &stmt,
                                                 true, var);
      pe = loop_preheader_edge (loop);
      new_bb = bsi_insert_on_edge_immediate (pe, stmt);
      gcc_assert (!new_bb);
    }

  *ni_name_ptr = ni_name;
  *ratio_mult_vf_name_ptr = ratio_mult_vf_name;
  *ratio_name_ptr = ratio_name;

  return;
}
/* Function update_vuses_to_preheader.

   Input:
   STMT - a statement with potential VUSEs.
   LOOP - the loop whose preheader will contain STMT.

   It's possible to vectorize a loop even though an SSA_NAME from a VUSE
   appears to be defined in a VDEF in another statement in a loop.
   One such case is when the VUSE is at the dereference of a __restricted__
   pointer in a load and the VDEF is at the dereference of a different
   __restricted__ pointer in a store.  Vectorization may result in
   copy_virtual_uses being called to copy the problematic VUSE to a new
   statement that is being inserted in the loop preheader.  This procedure
   is called to change the SSA_NAME in the new statement's VUSE from the
   SSA_NAME updated in the loop to the related SSA_NAME available on the
   path entering the loop.

   When this function is called, we have the following situation:

        # vuse <name1>
        S1: vload
    do {
        # name1 = phi <name0, name2>

        # vuse <name1>
        S2: vload

        # name2 = vdef <name1>
        S3: vstore

    } while...

   Stmt S1 was created in the loop preheader block as part of misaligned-load
   handling.  This function fixes the name of the vuse of S1 from 'name1' to
   'name0'.  */

static void
update_vuses_to_preheader (tree stmt, struct loop *loop)
{
  basic_block header_bb = loop->header;
  edge preheader_e = loop_preheader_edge (loop);
  ssa_op_iter iter;
  use_operand_p use_p;

  FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_VUSE)
    {
      tree ssa_name = USE_FROM_PTR (use_p);
      tree def_stmt = SSA_NAME_DEF_STMT (ssa_name);
      tree name_var = SSA_NAME_VAR (ssa_name);
      basic_block bb = bb_for_stmt (def_stmt);

      /* For a use before any definitions, def_stmt is a NOP_EXPR.  */
      if (!IS_EMPTY_STMT (def_stmt)
          && flow_bb_inside_loop_p (loop, bb))
        {
          /* If the block containing the statement defining the SSA_NAME
             is in the loop then it's necessary to find the definition
             outside the loop using the PHI nodes of the header.  */
          tree phi;
          bool updated = false;

          for (phi = phi_nodes (header_bb); phi; phi = PHI_CHAIN (phi))
            {
              if (SSA_NAME_VAR (PHI_RESULT (phi)) == name_var)
                {
                  SET_USE (use_p, PHI_ARG_DEF (phi, preheader_e->dest_idx));
                  updated = true;
                  break;
                }
            }
          gcc_assert (updated);
        }
    }
}
/* Function vect_update_ivs_after_vectorizer.

   "Advance" the induction variables of LOOP to the value they should take
   after the execution of LOOP.  This is currently necessary because the
   vectorizer does not handle induction variables that are used after the
   loop.  Such a situation occurs when the last iterations of LOOP are
   peeled, because:
   1. We introduced new uses after LOOP for IVs that were not originally used
      after LOOP: the IVs of LOOP are now used by an epilog loop.
   2. LOOP is going to be vectorized; this means that it will iterate N/VF
      times, whereas the loop IVs should be bumped N times.

   Input:
   - LOOP - a loop that is going to be vectorized.  The last few iterations
     of LOOP were peeled.
   - NITERS - the number of iterations that LOOP executes (before it is
     vectorized). i.e, the number of times the ivs should be bumped.
   - UPDATE_E - a successor edge of LOOP->exit that is on the (only) path
     coming out from LOOP on which there are uses of the LOOP ivs
     (this is the path from LOOP->exit to epilog_loop->preheader).

     The new definitions of the ivs are placed in LOOP->exit.
     The phi args associated with the edge UPDATE_E in the bb
     UPDATE_E->dest are updated accordingly.

   Assumption 1: Like the rest of the vectorizer, this function assumes
   a single loop exit that has a single predecessor.

   Assumption 2: The phi nodes in the LOOP header and in update_bb are
   organized in the same order.

   Assumption 3: The access function of the ivs is simple enough (see
   vect_can_advance_ivs_p).  This assumption will be relaxed in the future.

   Assumption 4: Exactly one of the successors of LOOP exit-bb is on a path
   coming out of LOOP on which the ivs of LOOP are used (this is the path
   that leads to the epilog loop; other paths skip the epilog loop).  This
   path starts with the edge UPDATE_E, and its destination (denoted update_bb)
   needs to have its phis updated.  */
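/* For illustration (an example, not from the original sources): for an iv
   with access function {init, +, step}_loop and NITERS = n, the value placed
   in LOOP->exit is
       ni = init + n * step
   so an iv i = {0, +, 4} is advanced to 4*n, the value it would have had
   after n scalar iterations.  */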
static void
vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo, tree niters,
                                  edge update_e)
{
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  basic_block exit_bb = single_exit (loop)->dest;
  tree phi, phi1;
  basic_block update_bb = update_e->dest;

  /* gcc_assert (vect_can_advance_ivs_p (loop_vinfo)); */

  /* Make sure there exists a single-predecessor exit bb:  */
  gcc_assert (single_pred_p (exit_bb));

  for (phi = phi_nodes (loop->header), phi1 = phi_nodes (update_bb);
       phi && phi1;
       phi = PHI_CHAIN (phi), phi1 = PHI_CHAIN (phi1))
    {
      tree access_fn = NULL;
      tree evolution_part;
      tree init_expr;
      tree step_expr;
      tree var, stmt, ni, ni_name;
      block_stmt_iterator last_bsi;

      if (vect_print_dump_info (REPORT_DETAILS))
        {
          fprintf (vect_dump, "vect_update_ivs_after_vectorizer: phi: ");
          print_generic_expr (vect_dump, phi, TDF_SLIM);
        }

      /* Skip virtual phi's.  */
      if (!is_gimple_reg (SSA_NAME_VAR (PHI_RESULT (phi))))
        {
          if (vect_print_dump_info (REPORT_DETAILS))
            fprintf (vect_dump, "virtual phi. skip.");
          continue;
        }

      /* Skip reduction phis.  */
      if (STMT_VINFO_DEF_TYPE (vinfo_for_stmt (phi)) == vect_reduction_def)
        {
          if (vect_print_dump_info (REPORT_DETAILS))
            fprintf (vect_dump, "reduc phi. skip.");
          continue;
        }

      access_fn = analyze_scalar_evolution (loop, PHI_RESULT (phi));
      gcc_assert (access_fn);
      evolution_part =
        unshare_expr (evolution_part_in_loop_num (access_fn, loop->num));
      gcc_assert (evolution_part != NULL_TREE);

      /* FORNOW: We do not support IVs whose evolution function is a polynomial
         of degree >= 2 or exponential.  */
      gcc_assert (!tree_is_chrec (evolution_part));

      step_expr = evolution_part;
      init_expr = unshare_expr (initial_condition_in_loop_num (access_fn,
                                                               loop->num));

      ni = fold_build2 (PLUS_EXPR, TREE_TYPE (init_expr),
                        fold_build2 (MULT_EXPR, TREE_TYPE (init_expr),
                                     fold_convert (TREE_TYPE (init_expr),
                                                   niters),
                                     step_expr),
                        init_expr);

      var = create_tmp_var (TREE_TYPE (init_expr), "tmp");
      add_referenced_var (var);

      ni_name = force_gimple_operand (ni, &stmt, false, var);

      /* Insert stmt into exit_bb.  */
      last_bsi = bsi_last (exit_bb);
      if (stmt)
        bsi_insert_before (&last_bsi, stmt, BSI_SAME_STMT);

      /* Fix phi expressions in the successor bb.  */
      SET_PHI_ARG_DEF (phi1, update_e->dest_idx, ni_name);
    }
}
/* Function vect_do_peeling_for_loop_bound

   Peel the last iterations of the loop represented by LOOP_VINFO.
   The peeled iterations form a new epilog loop.  Given that the loop now
   iterates NITERS times, the new epilog loop iterates
   NITERS % VECTORIZATION_FACTOR times.

   The original loop will later be made to iterate
   NITERS / VECTORIZATION_FACTOR times (this value is placed into RATIO).  */
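/* For example (illustrative values only): with NITERS = 103 and
   VECTORIZATION_FACTOR = 4, the vectorized loop executes 103 / 4 = 25
   iterations and the epilog loop executes the remaining 103 % 4 = 3
   iterations.  */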
static void
vect_do_peeling_for_loop_bound (loop_vec_info loop_vinfo, tree *ratio)
{
  tree ni_name, ratio_mult_vf_name;
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  struct loop *new_loop;
  edge update_e;
  basic_block preheader;
  int loop_num;
  unsigned int th;

  if (vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump, "=== vect_do_peeling_for_loop_bound ===");

  initialize_original_copy_tables ();

  /* Generate the following variables on the preheader of original loop:

     ni_name = number of iterations the original loop executes
     ratio = ni_name / vf
     ratio_mult_vf_name = ratio * vf  */
  vect_generate_tmps_on_preheader (loop_vinfo, &ni_name,
                                   &ratio_mult_vf_name, ratio);

  loop_num = loop->num;
  /* Threshold for vectorized loop.  */
  th = (PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)) *
       LOOP_VINFO_VECT_FACTOR (loop_vinfo);
  new_loop = slpeel_tree_peel_loop_to_edge (loop, single_exit (loop),
                                            ratio_mult_vf_name, ni_name,
                                            false, th);
  gcc_assert (new_loop);
  gcc_assert (loop_num == loop->num);
#ifdef ENABLE_CHECKING
  slpeel_verify_cfg_after_peeling (loop, new_loop);
#endif

  /* A guard that controls whether the new_loop is to be executed or skipped
     is placed in LOOP->exit.  LOOP->exit therefore has two successors - one
     is the preheader of NEW_LOOP, where the IVs from LOOP are used.  The
     other is a bb after NEW_LOOP, where these IVs are not used.  Find the
     edge that is on the path where the LOOP IVs are used and need to be
     updated.  */

  preheader = loop_preheader_edge (new_loop)->src;
  if (EDGE_PRED (preheader, 0)->src == single_exit (loop)->dest)
    update_e = EDGE_PRED (preheader, 0);
  else
    update_e = EDGE_PRED (preheader, 1);

  /* Update IVs of original loop as if they were advanced
     by ratio_mult_vf_name steps.  */
  vect_update_ivs_after_vectorizer (loop_vinfo, ratio_mult_vf_name, update_e);

  /* After peeling we have to reset scalar evolution analyzer.  */
  scev_reset ();

  free_original_copy_tables ();
}
/* Function vect_gen_niters_for_prolog_loop

   Set the number of iterations for the loop represented by LOOP_VINFO
   to the minimum between LOOP_NITERS (the original iteration count of the
   loop) and the misalignment of DR - the data reference recorded in
   LOOP_VINFO_UNALIGNED_DR (LOOP_VINFO).  As a result, after the execution of
   this loop, the data reference DR will refer to an aligned location.

   The following computation is generated:

   If the misalignment of DR is known at compile time:
     addr_mis = int mis = DR_MISALIGNMENT (dr);
   Else, compute address misalignment in bytes:
     addr_mis = addr & (vectype_size - 1)

   prolog_niters = min ( LOOP_NITERS , (VF - addr_mis/elem_size)&(VF-1) )

   (elem_size = element type size; an element is the scalar element
    whose type is the inner type of the vectype)

   For interleaving:

   prolog_niters = min ( LOOP_NITERS ,
                        (VF/group_size - addr_mis/elem_size)&(VF/group_size-1) )
   where group_size is the size of the interleaved group.  */
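/* A worked example (illustrative values only): for VF = 4, 4-byte elements
   (so 16-byte vectors), and an access that starts 8 bytes past a 16-byte
   boundary:
     addr_mis             = 8
     addr_mis / elem_size = 2
     prolog_niters        = (4 - 2) & (4 - 1) = 2
   and after 2 scalar iterations the access has advanced 8 bytes, onto the
   next 16-byte boundary.  */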
static tree
vect_gen_niters_for_prolog_loop (loop_vec_info loop_vinfo, tree loop_niters)
{
  struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
  int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  tree var, stmt;
  tree iters, iters_name;
  edge pe;
  basic_block new_bb;
  tree dr_stmt = DR_STMT (dr);
  stmt_vec_info stmt_info = vinfo_for_stmt (dr_stmt);
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
  int vectype_align = TYPE_ALIGN (vectype) / BITS_PER_UNIT;
  tree niters_type = TREE_TYPE (loop_niters);
  int group_size = 1;
  int element_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));

  if (DR_GROUP_FIRST_DR (stmt_info))
    {
      /* For interleaved access element size must be multiplied by the size of
         the interleaved group.  */
      group_size = DR_GROUP_SIZE (vinfo_for_stmt (
                                           DR_GROUP_FIRST_DR (stmt_info)));
      element_size *= group_size;
    }

  pe = loop_preheader_edge (loop);

  if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
    {
      int byte_misalign = LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo);
      int elem_misalign = byte_misalign / element_size;

      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "known alignment = %d.", byte_misalign);
      iters = build_int_cst (niters_type,
                             (vf - elem_misalign)&(vf/group_size-1));
    }
  else
    {
      tree new_stmts = NULL_TREE;
      tree start_addr =
        vect_create_addr_base_for_vector_ref (dr_stmt, &new_stmts, NULL_TREE);
      tree ptr_type = TREE_TYPE (start_addr);
      tree size = TYPE_SIZE (ptr_type);
      tree type = lang_hooks.types.type_for_size (tree_low_cst (size, 1), 1);
      tree vectype_size_minus_1 = build_int_cst (type, vectype_align - 1);
      tree elem_size_log =
        build_int_cst (type, exact_log2 (vectype_align/vf));
      tree vf_minus_1 = build_int_cst (type, vf - 1);
      tree vf_tree = build_int_cst (type, vf);
      tree byte_misalign;
      tree elem_misalign;

      new_bb = bsi_insert_on_edge_immediate (pe, new_stmts);
      gcc_assert (!new_bb);

      /* Create: byte_misalign = addr & (vectype_size - 1)  */
      byte_misalign =
        fold_build2 (BIT_AND_EXPR, type, start_addr, vectype_size_minus_1);

      /* Create: elem_misalign = byte_misalign / element_size  */
      elem_misalign =
        fold_build2 (RSHIFT_EXPR, type, byte_misalign, elem_size_log);

      /* Create: (niters_type) (VF - elem_misalign)&(VF - 1)  */
      iters = fold_build2 (MINUS_EXPR, type, vf_tree, elem_misalign);
      iters = fold_build2 (BIT_AND_EXPR, type, iters, vf_minus_1);
      iters = fold_convert (niters_type, iters);
    }

  /* Create: prolog_loop_niters = min (iters, loop_niters)  */
  /* If the loop bound is known at compile time we already verified that it is
     greater than vf; since the misalignment ('iters') is at most vf, there's
     no need to generate the MIN_EXPR in this case.  */
  if (TREE_CODE (loop_niters) != INTEGER_CST)
    iters = fold_build2 (MIN_EXPR, niters_type, iters, loop_niters);

  if (vect_print_dump_info (REPORT_DETAILS))
    {
      fprintf (vect_dump, "niters for prolog loop: ");
      print_generic_expr (vect_dump, iters, TDF_SLIM);
    }

  var = create_tmp_var (niters_type, "prolog_loop_niters");
  add_referenced_var (var);
  iters_name = force_gimple_operand (iters, &stmt, false, var);

  /* Insert stmt on loop preheader edge.  */
  if (stmt)
    {
      basic_block new_bb = bsi_insert_on_edge_immediate (pe, stmt);
      gcc_assert (!new_bb);
    }

  return iters_name;
}
/* Function vect_update_init_of_dr

   NITERS iterations were peeled from LOOP.  DR represents a data reference
   in LOOP.  This function updates the information recorded in DR to
   account for the fact that the first NITERS iterations had already been
   executed.  Specifically, it updates the OFFSET field of DR.  */
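/* E.g. (illustrative values only): for a data-ref with DR_STEP = 4, i.e.
   one advancing 4 bytes per iteration, and NITERS = 3 peeled iterations,
   the offset below is advanced by 3 * 4 = 12 bytes.  */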
static void
vect_update_init_of_dr (struct data_reference *dr, tree niters)
{
  tree offset = DR_OFFSET (dr);

  niters = fold_build2 (MULT_EXPR, TREE_TYPE (niters), niters, DR_STEP (dr));
  offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset), offset, niters);
  DR_OFFSET (dr) = offset;
}
/* Function vect_update_inits_of_drs

   NITERS iterations were peeled from the loop represented by LOOP_VINFO.
   This function updates the information recorded for the data references in
   the loop to account for the fact that the first NITERS iterations had
   already been executed.  Specifically, it updates the initial_condition of
   the access_function of all the data_references in the loop.  */

static void
vect_update_inits_of_drs (loop_vec_info loop_vinfo, tree niters)
{
  unsigned int i;
  VEC (data_reference_p, heap) *datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
  struct data_reference *dr;

  if (vect_dump && (dump_flags & TDF_DETAILS))
    fprintf (vect_dump, "=== vect_update_inits_of_drs ===");

  for (i = 0; VEC_iterate (data_reference_p, datarefs, i, dr); i++)
    vect_update_init_of_dr (dr, niters);
}
/* Function vect_do_peeling_for_alignment

   Peel the first 'niters' iterations of the loop represented by LOOP_VINFO.
   'niters' is set to the misalignment of one of the data references in the
   loop, thereby forcing it to refer to an aligned location at the beginning
   of the execution of this loop.  The data reference for which we are
   peeling is recorded in LOOP_VINFO_UNALIGNED_DR.  */

static void
vect_do_peeling_for_alignment (loop_vec_info loop_vinfo)
{
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  tree niters_of_prolog_loop, ni_name;
  tree n_iters;
  struct loop *new_loop;

  if (vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump, "=== vect_do_peeling_for_alignment ===");

  initialize_original_copy_tables ();

  ni_name = vect_build_loop_niters (loop_vinfo);
  niters_of_prolog_loop = vect_gen_niters_for_prolog_loop (loop_vinfo, ni_name);

  /* Peel the prolog loop and iterate it niters_of_prolog_loop times.  */
  new_loop =
    slpeel_tree_peel_loop_to_edge (loop, loop_preheader_edge (loop),
                                   niters_of_prolog_loop, ni_name, true, 0);
  gcc_assert (new_loop);
#ifdef ENABLE_CHECKING
  slpeel_verify_cfg_after_peeling (new_loop, loop);
#endif

  /* Update number of times loop executes.  */
  n_iters = LOOP_VINFO_NITERS (loop_vinfo);
  LOOP_VINFO_NITERS (loop_vinfo) = fold_build2 (MINUS_EXPR,
                TREE_TYPE (n_iters), n_iters, niters_of_prolog_loop);

  /* Update the init conditions of the access functions of all data refs.  */
  vect_update_inits_of_drs (loop_vinfo, niters_of_prolog_loop);

  /* After peeling we have to reset scalar evolution analyzer.  */
  scev_reset ();

  free_original_copy_tables ();
}
/* Function vect_create_cond_for_align_checks.

   Create a conditional expression that represents the alignment checks for
   all of the data references (array element references) whose alignment must
   be checked at runtime.

   Input:
   LOOP_VINFO - two fields of the loop information are used.
                LOOP_VINFO_PTR_MASK is the mask used to check the alignment.
                LOOP_VINFO_MAY_MISALIGN_STMTS contains the refs to be checked.

   Output:
   COND_EXPR_STMT_LIST - statements needed to construct the conditional
                         expression.
   The returned value is the conditional expression to be used in the if
   statement that controls which version of the loop gets executed at runtime.

   The algorithm makes two assumptions:
     1) The number of bytes "n" in a vector is a power of 2.
     2) An address "a" is aligned if a%n is zero, and this
        test can be done as a&(n-1) == 0.  For example, for 16
        byte vectors the test is a&0xf == 0.  */
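/* For example (illustrative values only): with 16 byte vectors, mask = 0xf,
   and addresses 0x1000 and 0x2008 to check, the generated test computes
       (0x1000 | 0x2008) & 0xf = 0x3008 & 0xf = 8 != 0
   so at least one address is misaligned and the scalar version of the loop
   is executed.  */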
static tree
vect_create_cond_for_align_checks (loop_vec_info loop_vinfo,
                                   tree *cond_expr_stmt_list)
{
  VEC(tree,heap) *may_misalign_stmts
    = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
  tree ref_stmt, tmp;
  int mask = LOOP_VINFO_PTR_MASK (loop_vinfo);
  tree mask_cst;
  unsigned int i;
  tree psize;
  tree int_ptrsize_type;
  char tmp_name[20];
  tree or_tmp_name = NULL_TREE;
  tree and_tmp, and_tmp_name, and_stmt;
  tree ptrsize_zero;

  /* Check that mask is one less than a power of 2, i.e., mask is
     all zeros followed by all ones.  */
  gcc_assert ((mask != 0) && ((mask & (mask+1)) == 0));

  /* CHECKME: what is the best integer or unsigned type to use to hold a
     cast from a pointer value?  */
  psize = TYPE_SIZE (ptr_type_node);
  int_ptrsize_type
    = lang_hooks.types.type_for_size (tree_low_cst (psize, 1), 0);

  /* Create expression (mask & (dr_1 || ... || dr_n)) where dr_i is the address
     of the first vector of the i'th data reference.  */

  for (i = 0; VEC_iterate (tree, may_misalign_stmts, i, ref_stmt); i++)
    {
      tree new_stmt_list = NULL_TREE;
      tree addr_base;
      tree addr_tmp, addr_tmp_name, addr_stmt;
      tree or_tmp, new_or_tmp_name, or_stmt;

      /* create: addr_tmp = (int)(address_of_first_vector) */
      addr_base = vect_create_addr_base_for_vector_ref (ref_stmt,
                                                        &new_stmt_list,
                                                        NULL_TREE);

      if (new_stmt_list != NULL_TREE)
        append_to_statement_list_force (new_stmt_list, cond_expr_stmt_list);

      sprintf (tmp_name, "%s%d", "addr2int", i);
      addr_tmp = create_tmp_var (int_ptrsize_type, tmp_name);
      add_referenced_var (addr_tmp);
      addr_tmp_name = make_ssa_name (addr_tmp, NULL_TREE);
      addr_stmt = fold_convert (int_ptrsize_type, addr_base);
      addr_stmt = build_gimple_modify_stmt (addr_tmp_name, addr_stmt);
      SSA_NAME_DEF_STMT (addr_tmp_name) = addr_stmt;
      append_to_statement_list_force (addr_stmt, cond_expr_stmt_list);

      /* The addresses are ORed together.  */

      if (or_tmp_name != NULL_TREE)
        {
          /* create: or_tmp = or_tmp | addr_tmp */
          sprintf (tmp_name, "%s%d", "orptrs", i);
          or_tmp = create_tmp_var (int_ptrsize_type, tmp_name);
          add_referenced_var (or_tmp);
          new_or_tmp_name = make_ssa_name (or_tmp, NULL_TREE);
          tmp = build2 (BIT_IOR_EXPR, int_ptrsize_type,
                        or_tmp_name, addr_tmp_name);
          or_stmt = build_gimple_modify_stmt (new_or_tmp_name, tmp);
          SSA_NAME_DEF_STMT (new_or_tmp_name) = or_stmt;
          append_to_statement_list_force (or_stmt, cond_expr_stmt_list);
          or_tmp_name = new_or_tmp_name;
        }
      else
        or_tmp_name = addr_tmp_name;
    } /* end for i */

  mask_cst = build_int_cst (int_ptrsize_type, mask);

  /* create: and_tmp = or_tmp & mask  */
  and_tmp = create_tmp_var (int_ptrsize_type, "andmask");
  add_referenced_var (and_tmp);
  and_tmp_name = make_ssa_name (and_tmp, NULL_TREE);

  tmp = build2 (BIT_AND_EXPR, int_ptrsize_type, or_tmp_name, mask_cst);
  and_stmt = build_gimple_modify_stmt (and_tmp_name, tmp);
  SSA_NAME_DEF_STMT (and_tmp_name) = and_stmt;
  append_to_statement_list_force (and_stmt, cond_expr_stmt_list);

  /* Make and_tmp the left operand of the conditional test against zero.
     If and_tmp has a nonzero bit then some address is unaligned.  */
  ptrsize_zero = build_int_cst (int_ptrsize_type, 0);
  return build2 (EQ_EXPR, boolean_type_node,
                 and_tmp_name, ptrsize_zero);
}
/* Function vect_transform_loop.

   The analysis phase has determined that the loop is vectorizable.
   Vectorize the loop - create vectorized stmts to replace the scalar
   stmts in the loop, and update the loop exit condition.  */

void
vect_transform_loop (loop_vec_info loop_vinfo)
{
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
  int nbbs = loop->num_nodes;
  block_stmt_iterator si, next_si;
  int i;
  tree ratio = NULL;
  int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
  bool strided_store;

  if (vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump, "=== vect_transform_loop ===");

  /* If the loop has data references that may or may not be aligned then
     two versions of the loop need to be generated, one which is vectorized
     and one which isn't.  A test is then generated to control which of the
     loops is executed.  The test checks for the alignment of all of the
     data references that may or may not be aligned.  */
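  /* Schematically (a sketch, not the actual generated IL):

         if (((addr_1 | ... | addr_n) & (vecsize - 1)) == 0)
           <vectorized loop>
         else
           <scalar loop>
  */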
  if (VEC_length (tree, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo)))
    {
      struct loop *nloop;
      tree cond_expr;
      tree cond_expr_stmt_list = NULL_TREE;
      basic_block condition_bb;
      block_stmt_iterator cond_exp_bsi;
      basic_block merge_bb;
      basic_block new_exit_bb;
      edge new_exit_e, e;
      tree orig_phi, new_phi, arg;
      unsigned prob = 4 * REG_BR_PROB_BASE / 5;

      cond_expr = vect_create_cond_for_align_checks (loop_vinfo,
                                                     &cond_expr_stmt_list);
      initialize_original_copy_tables ();
      nloop = loop_version (loop, cond_expr, &condition_bb,
                            prob, prob, REG_BR_PROB_BASE - prob, true);
      free_original_copy_tables ();

      /** Loop versioning violates an assumption we try to maintain during
          vectorization - that the loop exit block has a single predecessor.
          After versioning, the exit block of both loop versions is the same
          basic block (i.e. it has two predecessors).  Just in order to
          simplify following transformations in the vectorizer, we fix this
          situation here by adding a new (empty) block on the exit-edge of
          the loop, with the proper loop-exit phis to maintain
          loop-closed-form.  **/

      merge_bb = single_exit (loop)->dest;
      gcc_assert (EDGE_COUNT (merge_bb->preds) == 2);
      new_exit_bb = split_edge (single_exit (loop));
      new_exit_e = single_exit (loop);
      e = EDGE_SUCC (new_exit_bb, 0);

      for (orig_phi = phi_nodes (merge_bb); orig_phi;
           orig_phi = PHI_CHAIN (orig_phi))
        {
          new_phi = create_phi_node (SSA_NAME_VAR (PHI_RESULT (orig_phi)),
                                     new_exit_bb);
          arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, e);
          add_phi_arg (new_phi, arg, new_exit_e);
          SET_PHI_ARG_DEF (orig_phi, e->dest_idx, PHI_RESULT (new_phi));
        }

      /** end loop-exit-fixes after versioning  **/

      update_ssa (TODO_update_ssa);
      cond_exp_bsi = bsi_last (condition_bb);
      bsi_insert_before (&cond_exp_bsi, cond_expr_stmt_list, BSI_SAME_STMT);
    }
  /* CHECKME: we wouldn't need this if we called update_ssa once
     for all loops.  */
  bitmap_zero (vect_memsyms_to_rename);

  /* Peel the loop if there are data refs with unknown alignment.
     Only one data ref with unknown alignment is handled this way.  */

  if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
    vect_do_peeling_for_alignment (loop_vinfo);

  /* If the loop has a symbolic number of iterations 'n' (i.e. it's not a
     compile time constant), or it is a constant that doesn't divide by the
     vectorization factor, then an epilog loop needs to be created.
     We therefore duplicate the loop: the original loop will be vectorized,
     and will compute the first (n/VF) iterations.  The second copy of the
     loop will remain scalar and will compute the remaining (n%VF)
     iterations.  (VF is the vectorization factor.)  */

  if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
      || (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
          && LOOP_VINFO_INT_NITERS (loop_vinfo) % vectorization_factor != 0))
    vect_do_peeling_for_loop_bound (loop_vinfo, &ratio);
  else
    ratio = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
                LOOP_VINFO_INT_NITERS (loop_vinfo) / vectorization_factor);

  /* 1) Make sure the loop header has exactly two entries
     2) Make sure we have a preheader basic block.  */

  gcc_assert (EDGE_COUNT (loop->header->preds) == 2);

  split_edge (loop_preheader_edge (loop));

  /* FORNOW: the vectorizer supports only loops whose body consists
     of one basic block (header + empty latch).  When the vectorizer
     supports more involved loop forms, the order in which the BBs are
     traversed will need to be reconsidered.  */
  for (i = 0; i < nbbs; i++)
    {
      basic_block bb = bbs[i];

      for (si = bsi_start (bb); !bsi_end_p (si);)
        {
          tree stmt = bsi_stmt (si);
          stmt_vec_info stmt_info;
          stmt_ann_t ann;
          bool is_store;

          if (vect_print_dump_info (REPORT_DETAILS))
            {
              fprintf (vect_dump, "------>vectorizing statement: ");
              print_generic_expr (vect_dump, stmt, TDF_SLIM);
            }
          stmt_info = vinfo_for_stmt (stmt);
          gcc_assert (stmt_info);
          if (!STMT_VINFO_RELEVANT_P (stmt_info)
              && !STMT_VINFO_LIVE_P (stmt_info))
            {
              bsi_next (&si);
              continue;
            }

          if ((TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
               != (unsigned HOST_WIDE_INT) vectorization_factor)
              && vect_print_dump_info (REPORT_DETAILS))
            fprintf (vect_dump, "multiple-types.");

          /* -------- vectorize statement ------------ */
          if (vect_print_dump_info (REPORT_DETAILS))
            fprintf (vect_dump, "transform statement.");

          strided_store = false;
          is_store = vect_transform_stmt (stmt, &si, &strided_store);
          if (is_store)
            {
              if (DR_GROUP_FIRST_DR (stmt_info))
                {
                  /* Interleaving.  If IS_STORE is TRUE, the vectorization of
                     the interleaving chain was completed - free all the
                     stores in the chain.  */
                  tree next = DR_GROUP_FIRST_DR (stmt_info);
                  tree tmp;
                  stmt_vec_info next_stmt_info;

                  while (next)
                    {
                      next_si = bsi_for_stmt (next);
                      next_stmt_info = vinfo_for_stmt (next);
                      /* Free the attached stmt_vec_info and remove the
                         stmt.  */
                      ann = stmt_ann (next);
                      tmp = DR_GROUP_NEXT_DR (next_stmt_info);
                      free (next_stmt_info);
                      set_stmt_info (ann, NULL);
                      /* Skip the stmt SI points at; it is removed below so
                         that SI remains valid.  */
                      if (next != stmt)
                        bsi_remove (&next_si, true);
                      next = tmp;
                    }
                  bsi_remove (&si, true);
                  continue;
                }
              else
                {
                  /* Free the attached stmt_vec_info and remove the stmt.  */
                  ann = stmt_ann (stmt);
                  free (stmt_info);
                  set_stmt_info (ann, NULL);
                  bsi_remove (&si, true);
                  continue;
                }
            }
          bsi_next (&si);
        }                       /* stmts in BB */
    }                           /* BBs in loop */

  slpeel_make_loop_iterate_ntimes (loop, ratio);

  mark_set_for_renaming (vect_memsyms_to_rename);

  /* The memory tags and pointers in vectorized statements need to
     have their SSA forms updated.  FIXME, why can't this be delayed
     until all the loops have been transformed?  */
  update_ssa (TODO_update_ssa);

  if (vect_print_dump_info (REPORT_VECTORIZED_LOOPS))
    fprintf (vect_dump, "LOOP VECTORIZED.");
}