/* Transformation Utilities for Loop Vectorization.
   Copyright (C) 2003, 2004, 2005, 2006 Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 2, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING.  If not, write to the Free
Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA
02110-1301, USA.  */
#include "coretypes.h"
#include "basic-block.h"
#include "diagnostic.h"
#include "tree-flow.h"
#include "tree-dump.h"
#include "tree-data-ref.h"
#include "tree-chrec.h"
#include "tree-scalar-evolution.h"
#include "tree-vectorizer.h"
#include "langhooks.h"
#include "tree-pass.h"
/* Utility functions for the code transformation.  */
static bool vect_transform_stmt (tree, block_stmt_iterator *, bool *);
static tree vect_create_destination_var (tree, tree);
static tree vect_create_data_ref_ptr
  (tree, block_stmt_iterator *, tree, tree *, tree *, bool, tree);
static tree vect_create_addr_base_for_vector_ref (tree, tree *, tree);
static tree vect_setup_realignment (tree, block_stmt_iterator *, tree *);
static tree vect_get_new_vect_var (tree, enum vect_var_kind, const char *);
static tree vect_get_vec_def_for_operand (tree, tree, tree *);
static tree vect_init_vector (tree, tree, tree);
static void vect_finish_stmt_generation
  (tree stmt, tree vec_stmt, block_stmt_iterator *bsi);
static bool vect_is_simple_cond (tree, loop_vec_info);
static void update_vuses_to_preheader (tree, struct loop*);
static void vect_create_epilog_for_reduction (tree, tree, enum tree_code, tree);
static tree get_initial_def_for_reduction (tree, tree, tree *);
/* Utility functions dealing with loop peeling (not peeling itself).  */
static void vect_generate_tmps_on_preheader
  (loop_vec_info, tree *, tree *, tree *);
static tree vect_build_loop_niters (loop_vec_info);
static void vect_update_ivs_after_vectorizer (loop_vec_info, tree, edge);
static tree vect_gen_niters_for_prolog_loop (loop_vec_info, tree);
static void vect_update_init_of_dr (struct data_reference *, tree niters);
static void vect_update_inits_of_drs (loop_vec_info, tree);
static int vect_min_worthwhile_factor (enum tree_code);
/* Function vect_get_new_vect_var.

   Returns a name for a new variable.  The current naming scheme prepends the
   prefix "vect_" or "vect_p" (depending on the value of VAR_KIND) to NAME,
   when NAME is provided, to name vectorizer-generated variables.  */
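/* For illustration only (a sketch of the naming scheme described above, not
   additional behavior): given NAME "x", a vect_simple_var would be named
   "vect_x" and a vect_pointer_var "vect_px"; when no NAME is supplied, the
   bare prefix is used on its own.  */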
static tree
vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
{
  const char *prefix;
  tree new_vect_var;

  switch (var_kind)
    {
    case vect_simple_var:
      prefix = "vect_";
      break;
    case vect_scalar_var:
      prefix = "stmp_";
      break;
    case vect_pointer_var:
      prefix = "vect_p";
      break;
    default:
      gcc_unreachable ();
    }

  if (name)
    new_vect_var = create_tmp_var (type, concat (prefix, name, NULL));
  else
    new_vect_var = create_tmp_var (type, prefix);

  /* Mark vector typed variable as a gimple register variable.  */
  if (TREE_CODE (type) == VECTOR_TYPE)
    DECL_GIMPLE_REG_P (new_vect_var) = true;

  return new_vect_var;
}
/* Function vect_create_addr_base_for_vector_ref.

   Create an expression that computes the address of the first memory location
   that will be accessed for a data reference.

   Input:
   STMT: The statement containing the data reference.
   NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
   OFFSET: Optional.  If supplied, it is added to the initial address.

   Output:
   1. Return an SSA_NAME whose value is the address of the memory location of
      the first vector of the data reference.
   2. If new_stmt_list is not NULL_TREE after return then the caller must insert
      these statement(s) which define the returned SSA_NAME.

   FORNOW: We are only handling array accesses with step 1.  */
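/* For illustration (a sketch of what the comment above describes, using
   made-up names): for a data reference a[i] with DR_BASE_ADDRESS &a,
   DR_OFFSET 0 and DR_INIT init, and with OFFSET supplied, the returned
   SSA_NAME holds the gimplified value of
       &a + (0 + init + OFFSET * step),
   computed by stmts appended to NEW_STMT_LIST.  */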
static tree
vect_create_addr_base_for_vector_ref (tree stmt,
                                      tree *new_stmt_list,
                                      tree offset)
{
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
  tree data_ref_base = unshare_expr (DR_BASE_ADDRESS (dr));
  tree base_name = build_fold_indirect_ref (data_ref_base);
  tree vec_stmt;
  tree addr_base, addr_expr;
  tree dest, new_stmt;
  tree base_offset = unshare_expr (DR_OFFSET (dr));
  tree init = unshare_expr (DR_INIT (dr));
  tree vect_ptr_type, addr_expr2;

  /* Create base_offset.  */
  base_offset = size_binop (PLUS_EXPR, base_offset, init);
  dest = create_tmp_var (TREE_TYPE (base_offset), "base_off");
  add_referenced_var (dest);
  base_offset = force_gimple_operand (base_offset, &new_stmt, false, dest);
  append_to_statement_list_force (new_stmt, new_stmt_list);
  if (offset)
    {
      tree tmp = create_tmp_var (TREE_TYPE (base_offset), "offset");
      tree step;

      /* For interleaved accesses we divide STEP by the size of the
         interleaving group.  */
      if (DR_GROUP_SIZE (stmt_info))
        step = fold_build2 (TRUNC_DIV_EXPR, TREE_TYPE (offset), DR_STEP (dr),
                            build_int_cst (TREE_TYPE (offset),
                                           DR_GROUP_SIZE (stmt_info)));
      else
        step = DR_STEP (dr);

      add_referenced_var (tmp);
      offset = fold_build2 (MULT_EXPR, TREE_TYPE (offset), offset, step);
      base_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (base_offset),
                                 base_offset, offset);
      base_offset = force_gimple_operand (base_offset, &new_stmt, false, tmp);
      append_to_statement_list_force (new_stmt, new_stmt_list);
    }
  /* base + base_offset.  */
  addr_base = fold_build2 (PLUS_EXPR, TREE_TYPE (data_ref_base), data_ref_base,
                           base_offset);

  vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));

  /* addr_expr = addr_base.  */
  addr_expr = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var,
                                     get_name (base_name));
  add_referenced_var (addr_expr);
  vec_stmt = fold_convert (vect_ptr_type, addr_base);
  addr_expr2 = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var,
                                      get_name (base_name));
  add_referenced_var (addr_expr2);
  vec_stmt = force_gimple_operand (vec_stmt, &new_stmt, false, addr_expr2);
  append_to_statement_list_force (new_stmt, new_stmt_list);

  if (vect_print_dump_info (REPORT_DETAILS))
    {
      fprintf (vect_dump, "created ");
      print_generic_expr (vect_dump, vec_stmt, TDF_SLIM);
    }
  return vec_stmt;
}
/* Function vect_create_data_ref_ptr.

   Create a new pointer to vector type (vp), that points to the first location
   accessed in the loop by STMT, along with the def-use update chain to
   appropriately advance the pointer through the loop iterations.  Also set
   aliasing information for the pointer.  This vector pointer is used by the
   callers to this function to create a memory reference expression for vector
   load/store access.

   Input:
   1. STMT: a stmt that references memory.  Expected to be of the form
         GIMPLE_MODIFY_STMT <name, data-ref> or
         GIMPLE_MODIFY_STMT <data-ref, name>.
   2. BSI: block_stmt_iterator where new stmts can be added.
   3. OFFSET (optional): an offset to be added to the initial address accessed
        by the data-ref in STMT.
   4. ONLY_INIT: indicates whether vp is to be updated in the loop, or to
        remain pointing to the initial address.
   5. TYPE: if not NULL indicates the required type of the data-ref.

   Output:
   1. Declare a new ptr to vector_type, and have it point to the base of the
      data reference (initial address accessed by the data reference).
      For example, for vector of type V8HI, the following code is generated:

      v8hi *vp;
      vp = (v8hi *)initial_address;

      if OFFSET is not supplied:
         initial_address = &a[init];
      if OFFSET is supplied:
         initial_address = &a[init + OFFSET];

      Return the initial_address in INITIAL_ADDRESS.

   2. If ONLY_INIT is true, just return the initial pointer.  Otherwise, also
      update the pointer in each iteration of the loop.

      Return the increment stmt that updates the pointer in PTR_INCR.

   3. Return the pointer.  */
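/* A sketch of the code this function creates for a load from a[i] with
   vectype V8HI (names illustrative, not emitted verbatim):

     preheader:
       vp_init = (v8hi *) &a[init];            <-- steps (1), (3)
     loop:
       vp = PHI <vp_init (preheader), vp_next (latch)>
       ... *vp ...
       vp_next = vp + 16;                      <-- step (4): sizeof (v8hi)
*/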
static tree
vect_create_data_ref_ptr (tree stmt,
                          block_stmt_iterator *bsi ATTRIBUTE_UNUSED,
                          tree offset, tree *initial_address, tree *ptr_incr,
                          bool only_init, tree type)
{
  tree base_name;
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
  tree vect_ptr_type;
  tree vect_ptr;
  tree tag;
  tree new_temp;
  tree vec_stmt;
  tree new_stmt_list = NULL_TREE;
  edge pe = loop_preheader_edge (loop);
  basic_block new_bb;
  tree vect_ptr_init;
  struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);

  base_name = build_fold_indirect_ref (unshare_expr (DR_BASE_ADDRESS (dr)));

  if (vect_print_dump_info (REPORT_DETAILS))
    {
      tree data_ref_base = base_name;
      fprintf (vect_dump, "create vector-pointer variable to type: ");
      print_generic_expr (vect_dump, vectype, TDF_SLIM);
      if (TREE_CODE (data_ref_base) == VAR_DECL)
        fprintf (vect_dump, "  vectorizing a one dimensional array ref: ");
      else if (TREE_CODE (data_ref_base) == ARRAY_REF)
        fprintf (vect_dump, "  vectorizing a multidimensional array ref: ");
      else if (TREE_CODE (data_ref_base) == COMPONENT_REF)
        fprintf (vect_dump, "  vectorizing a record based array ref: ");
      else if (TREE_CODE (data_ref_base) == SSA_NAME)
        fprintf (vect_dump, "  vectorizing a pointer ref: ");
      print_generic_expr (vect_dump, base_name, TDF_SLIM);
    }
  /** (1) Create the new vector-pointer variable:  **/

  if (type)
    vect_ptr_type = build_pointer_type (type);
  else
    vect_ptr_type = build_pointer_type (vectype);
  vect_ptr = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var,
                                    get_name (base_name));
  add_referenced_var (vect_ptr);

  /** (2) Add aliasing information to the new vector-pointer:
          (The points-to info (DR_PTR_INFO) may be defined later.)  **/

  tag = DR_MEMTAG (dr);
  gcc_assert (tag);

  /* If tag is a variable (and NOT_A_TAG) then a new symbol memory
     tag must be created with tag added to its may alias list.  */
  if (!MTAG_P (tag))
    new_type_alias (vect_ptr, tag, DR_REF (dr));
  else
    set_symbol_mem_tag (vect_ptr, tag);

  var_ann (vect_ptr)->subvars = DR_SUBVARS (dr);
  /** (3) Calculate the initial address of the vector-pointer, and set
          the vector-pointer to point to it before the loop:  **/

  /* Create: &(base[init_val+offset]) in the loop preheader.  */
  new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
                                                   offset);
  pe = loop_preheader_edge (loop);
  new_bb = bsi_insert_on_edge_immediate (pe, new_stmt_list);
  gcc_assert (!new_bb);
  *initial_address = new_temp;

  /* Create: p = (vectype *) initial_base  */
  vec_stmt = fold_convert (vect_ptr_type, new_temp);
  vec_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node, vect_ptr, vec_stmt);
  vect_ptr_init = make_ssa_name (vect_ptr, vec_stmt);
  GIMPLE_STMT_OPERAND (vec_stmt, 0) = vect_ptr_init;
  new_bb = bsi_insert_on_edge_immediate (pe, vec_stmt);
  gcc_assert (!new_bb);

  /** (4) Handle the updating of the vector-pointer inside the loop:  **/

  if (only_init) /* No update in loop is required.  */
    {
      /* Copy the points-to information if it exists.  */
      if (DR_PTR_INFO (dr))
        duplicate_ssa_name_ptr_info (vect_ptr_init, DR_PTR_INFO (dr));
      return vect_ptr_init;
    }
  else
    {
      block_stmt_iterator incr_bsi;
      bool insert_after;
      tree indx_before_incr, indx_after_incr;
      tree incr;

      standard_iv_increment_position (loop, &incr_bsi, &insert_after);
      create_iv (vect_ptr_init,
                 fold_convert (vect_ptr_type, TYPE_SIZE_UNIT (vectype)),
                 NULL_TREE, loop, &incr_bsi, insert_after,
                 &indx_before_incr, &indx_after_incr);
      incr = bsi_stmt (incr_bsi);
      set_stmt_info (stmt_ann (incr),
                     new_stmt_vec_info (incr, loop_vinfo));

      /* Copy the points-to information if it exists.  */
      if (DR_PTR_INFO (dr))
        {
          duplicate_ssa_name_ptr_info (indx_before_incr, DR_PTR_INFO (dr));
          duplicate_ssa_name_ptr_info (indx_after_incr, DR_PTR_INFO (dr));
        }
      merge_alias_info (vect_ptr_init, indx_before_incr);
      merge_alias_info (vect_ptr_init, indx_after_incr);
      if (ptr_incr)
        *ptr_incr = incr;

      return indx_before_incr;
    }
}
/* Function bump_vector_ptr.

   Increment a pointer (to a vector type) by vector-size.  Connect the new
   increment stmt to the existing def-use update-chain of the pointer.

   The pointer def-use update-chain before this function:
                        DATAREF_PTR = phi (p_0, p_2)
                        ....
        PTR_INCR:       p_2 = DATAREF_PTR + step

   The pointer def-use update-chain after this function:
                        DATAREF_PTR = phi (p_0, p_2)
                        ....
        NEW_DATAREF_PTR = DATAREF_PTR + vector_size
                        ....
        PTR_INCR:       p_2 = NEW_DATAREF_PTR + step

   Input:
   DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
                 in the loop.
   PTR_INCR - the stmt that updates the pointer in each iteration of the loop.
              The increment amount across iterations is also expected to be
              vector_size.
   BSI - location where the new update stmt is to be placed.
   STMT - the original scalar memory-access stmt that is being vectorized.

   Output: Return NEW_DATAREF_PTR as illustrated above.  */
static tree
bump_vector_ptr (tree dataref_ptr, tree ptr_incr, block_stmt_iterator *bsi,
                 tree stmt)
{
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
  tree vptr_type = TREE_TYPE (dataref_ptr);
  tree ptr_var = SSA_NAME_VAR (dataref_ptr);
  tree update = fold_convert (vptr_type, TYPE_SIZE_UNIT (vectype));
  tree incr_stmt;
  ssa_op_iter iter;
  use_operand_p use_p;
  tree new_dataref_ptr;

  incr_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node, ptr_var,
                      build2 (PLUS_EXPR, vptr_type, dataref_ptr, update));
  new_dataref_ptr = make_ssa_name (ptr_var, incr_stmt);
  GIMPLE_STMT_OPERAND (incr_stmt, 0) = new_dataref_ptr;
  vect_finish_stmt_generation (stmt, incr_stmt, bsi);
  /* Update the vector-pointer's cross-iteration increment.  */
  FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
    {
      tree use = USE_FROM_PTR (use_p);

      if (use == dataref_ptr)
        SET_USE (use_p, new_dataref_ptr);
      else
        gcc_assert (tree_int_cst_compare (use, update) == 0);
    }

  /* Copy the points-to information if it exists.  */
  if (DR_PTR_INFO (dr))
    duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
  merge_alias_info (new_dataref_ptr, dataref_ptr);

  return new_dataref_ptr;
}
/* Function vect_create_destination_var.

   Create a new temporary of type VECTYPE.  */

static tree
vect_create_destination_var (tree scalar_dest, tree vectype)
{
  tree vec_dest;
  const char *new_name;
  tree type;
  enum vect_var_kind kind;

  kind = vectype ? vect_simple_var : vect_scalar_var;
  type = vectype ? vectype : TREE_TYPE (scalar_dest);

  gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);

  new_name = get_name (scalar_dest);
  if (!new_name)
    new_name = "var_";
  vec_dest = vect_get_new_vect_var (type, kind, new_name);
  add_referenced_var (vec_dest);

  return vec_dest;
}
/* Function vect_init_vector.

   Insert a new stmt (INIT_STMT) that initializes a new vector variable with
   the vector elements of VECTOR_VAR.  Return the DEF of INIT_STMT.  It will be
   used in the vectorization of STMT.  */

static tree
vect_init_vector (tree stmt, tree vector_var, tree vector_type)
{
  stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  tree new_var;
  tree init_stmt;
  tree vec_oprnd;
  edge pe;
  tree new_temp;
  basic_block new_bb;

  new_var = vect_get_new_vect_var (vector_type, vect_simple_var, "cst_");
  add_referenced_var (new_var);

  init_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node, new_var, vector_var);
  new_temp = make_ssa_name (new_var, init_stmt);
  GIMPLE_STMT_OPERAND (init_stmt, 0) = new_temp;

  pe = loop_preheader_edge (loop);
  new_bb = bsi_insert_on_edge_immediate (pe, init_stmt);
  gcc_assert (!new_bb);

  if (vect_print_dump_info (REPORT_DETAILS))
    {
      fprintf (vect_dump, "created new init_stmt: ");
      print_generic_expr (vect_dump, init_stmt, TDF_SLIM);
    }

  vec_oprnd = GIMPLE_STMT_OPERAND (init_stmt, 0);
  return vec_oprnd;
}
/* Function get_initial_def_for_induction.

   Input:
   STMT - a stmt that performs an induction operation in the loop.
   IV_PHI - the initial value of the induction variable.

   Output:
   Return a vector variable, initialized with the first VF values of
   the induction variable.  E.g., for an iv with IV_PHI='X' and
   evolution S, for a vector of 4 units, we want to return:
   [X, X + S, X + 2*S, X + 3*S].  */
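/* E.g. (illustrative numbers only): for IV_PHI = 0, step S = 1, VF = 4 and
   nunits = 4, the code below builds vec_init = {0,1,2,3} in the preheader
   and a step vector {4,4,4,4}, so that each vectorized iteration advances
   all four lanes by VF*S at once.  */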
static tree
get_initial_def_for_induction (tree stmt, tree iv_phi)
{
  stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  tree scalar_type = TREE_TYPE (iv_phi);
  tree vectype = get_vectype_for_scalar_type (scalar_type);
  int nunits = GET_MODE_NUNITS (TYPE_MODE (vectype));
  edge pe = loop_preheader_edge (loop);
  basic_block new_bb;
  block_stmt_iterator bsi;
  tree vec, vec_init, vec_step, t;
  tree access_fn;
  tree new_var;
  tree new_name;
  tree init_stmt;
  tree induction_phi, induc_def, new_stmt, vec_def, vec_dest;
  tree init_expr, step_expr;
  int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
  int i;
  bool ok;
  int ncopies = vf / nunits;
  tree expr;
  stmt_vec_info phi_info = vinfo_for_stmt (iv_phi);

  gcc_assert (phi_info);

  if (STMT_VINFO_VEC_STMT (phi_info))
    {
      induction_phi = STMT_VINFO_VEC_STMT (phi_info);
      gcc_assert (TREE_CODE (induction_phi) == PHI_NODE);

      if (vect_print_dump_info (REPORT_DETAILS))
        {
          fprintf (vect_dump, "induction already vectorized:");
          print_generic_expr (vect_dump, iv_phi, TDF_SLIM);
          fprintf (vect_dump, "\n");
          print_generic_expr (vect_dump, induction_phi, TDF_SLIM);
        }

      return PHI_RESULT (induction_phi);
    }
  gcc_assert (ncopies >= 1);

  access_fn = analyze_scalar_evolution (loop, PHI_RESULT (iv_phi));
  gcc_assert (access_fn);
  ok = vect_is_simple_iv_evolution (loop->num, access_fn, &init_expr,
                                    &step_expr);
  gcc_assert (ok);

  /* Create the vector that holds the initial_value of the induction.  */
  new_name = init_expr;
  t = NULL_TREE;
  t = tree_cons (NULL_TREE, init_expr, t);
  for (i = 1; i < nunits; i++)
    {
      /* Create: new_name = new_name + step_expr.  */
      new_var = vect_get_new_vect_var (scalar_type, vect_scalar_var, "var_");
      add_referenced_var (new_var);
      init_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node, new_var,
                          fold_build2 (PLUS_EXPR, scalar_type, new_name,
                                       step_expr));
      new_name = make_ssa_name (new_var, init_stmt);
      GIMPLE_STMT_OPERAND (init_stmt, 0) = new_name;

      new_bb = bsi_insert_on_edge_immediate (pe, init_stmt);
      gcc_assert (!new_bb);

      if (vect_print_dump_info (REPORT_DETAILS))
        {
          fprintf (vect_dump, "created new init_stmt: ");
          print_generic_expr (vect_dump, init_stmt, TDF_SLIM);
        }
      t = tree_cons (NULL_TREE, new_name, t);
    }
  vec = build_constructor_from_list (vectype, nreverse (t));
  vec_init = vect_init_vector (stmt, vec, vectype);
  /* Create the vector that holds the step of the induction.  */
  expr = build_int_cst (scalar_type, vf);
  new_name = fold_build2 (MULT_EXPR, scalar_type, expr, step_expr);
  t = NULL_TREE;
  for (i = 0; i < nunits; i++)
    t = tree_cons (NULL_TREE, unshare_expr (new_name), t);
  vec = build_constructor_from_list (vectype, t);
  vec_step = vect_init_vector (stmt, vec, vectype);
  /* Create the following def-use cycle:
     loop prolog:
         vec_init = [X, X+S, X+2*S, X+3*S]
         vec_step = [VF*S, VF*S, VF*S, VF*S]
     loop:
         vec_iv = PHI <vec_init, vec_loop>
         ...
         STMT
         ...
         vec_loop = vec_iv + vec_step;  */

  /* Create the induction-phi that defines the induction-operand.  */
  vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
  add_referenced_var (vec_dest);
  induction_phi = create_phi_node (vec_dest, loop->header);
  set_stmt_info (get_stmt_ann (induction_phi),
                 new_stmt_vec_info (induction_phi, loop_vinfo));
  induc_def = PHI_RESULT (induction_phi);

  /* Create the iv update inside the loop.  */
  new_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node, NULL_TREE,
                     build2 (PLUS_EXPR, vectype, induc_def, vec_step));
  vec_def = make_ssa_name (vec_dest, new_stmt);
  GIMPLE_STMT_OPERAND (new_stmt, 0) = vec_def;
  bsi = bsi_for_stmt (stmt);
  vect_finish_stmt_generation (stmt, new_stmt, &bsi);

  /* Set the arguments of the phi node:  */
  add_phi_arg (induction_phi, vec_init, loop_preheader_edge (loop));
  add_phi_arg (induction_phi, vec_def, loop_latch_edge (loop));
  /* In case the vectorization factor (VF) is bigger than the number
     of elements that we can fit in a vectype (nunits), we have to generate
     more than one vector stmt - i.e - we need to "unroll" the
     vector stmt by a factor VF/nunits.  For more details see documentation
     in vectorizable_operation.  */
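  /* Illustrative example of the unrolling described above (numbers made up):
     with VF = 8 (dictated, say, by short operands elsewhere in the loop) and
     a V4SI induction (nunits = 4), ncopies = 2, so one extra copy
     vec_iv.1 = vec_iv + {4*S,4*S,4*S,4*S} is emitted below and chained via
     STMT_VINFO_RELATED_STMT.  */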
  if (ncopies > 1)
    {
      stmt_vec_info prev_stmt_vinfo;

      /* Create the vector that holds the step of the induction.  */
      expr = build_int_cst (scalar_type, nunits);
      new_name = fold_build2 (MULT_EXPR, scalar_type, expr, step_expr);
      t = NULL_TREE;
      for (i = 0; i < nunits; i++)
        t = tree_cons (NULL_TREE, unshare_expr (new_name), t);
      vec = build_constructor_from_list (vectype, t);
      vec_step = vect_init_vector (stmt, vec, vectype);

      prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
      for (i = 1; i < ncopies; i++)
        {
          /* vec_i = vec_prev + vec_{step*nunits}  */
          new_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node, NULL_TREE,
                             build2 (PLUS_EXPR, vectype, vec_def, vec_step));
          vec_def = make_ssa_name (vec_dest, new_stmt);
          GIMPLE_STMT_OPERAND (new_stmt, 0) = vec_def;
          bsi = bsi_for_stmt (stmt);
          vect_finish_stmt_generation (stmt, new_stmt, &bsi);

          STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
          prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
        }
    }
  if (vect_print_dump_info (REPORT_DETAILS))
    {
      fprintf (vect_dump, "transform induction: created def-use cycle:");
      print_generic_expr (vect_dump, induction_phi, TDF_SLIM);
      fprintf (vect_dump, "\n");
      print_generic_expr (vect_dump, SSA_NAME_DEF_STMT (vec_def), TDF_SLIM);
    }

  STMT_VINFO_VEC_STMT (phi_info) = induction_phi;
  return induc_def;
}
/* Function vect_get_vec_def_for_operand.

   OP is an operand in STMT.  This function returns a (vector) def that will be
   used in the vectorized stmt for STMT.

   In the case that OP is an SSA_NAME which is defined in the loop, then
   STMT_VINFO_VEC_STMT of the defining stmt holds the relevant def.

   In case OP is an invariant or constant, a new stmt that creates a vector def
   needs to be introduced.  */
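/* For illustration (made-up stmts): when vectorizing  z = x + c  where x is
   loaded inside the loop and c is a constant, the def for x is taken from
   STMT_VINFO_VEC_STMT of the vectorized load (case 3 below), while c yields
   a new vector {c,c,...,c} initialized in the preheader (case 1 below).  */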
static tree
vect_get_vec_def_for_operand (tree op, tree stmt, tree *scalar_def)
{
  tree vec_oprnd;
  tree vec_stmt;
  tree def_stmt;
  stmt_vec_info def_stmt_info = NULL;
  stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
  tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
  int nunits = TYPE_VECTOR_SUBPARTS (vectype);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  tree def;
  tree vec_inv;
  tree vec_cst;
  tree t = NULL_TREE;
  tree vector_type;
  bool is_simple_use;
  enum vect_def_type dt;
  int i;

  if (vect_print_dump_info (REPORT_DETAILS))
    {
      fprintf (vect_dump, "vect_get_vec_def_for_operand: ");
      print_generic_expr (vect_dump, op, TDF_SLIM);
    }

  is_simple_use = vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt);
  gcc_assert (is_simple_use);
  if (vect_print_dump_info (REPORT_DETAILS))
    {
      if (def)
        {
          fprintf (vect_dump, "def = ");
          print_generic_expr (vect_dump, def, TDF_SLIM);
        }
      if (def_stmt)
        {
          fprintf (vect_dump, "  def_stmt = ");
          print_generic_expr (vect_dump, def_stmt, TDF_SLIM);
        }
    }
  switch (dt)
    {
    /* Case 1: operand is a constant.  */
    case vect_constant_def:
      {
        if (scalar_def)
          *scalar_def = op;

        /* Create 'vect_cst_ = {cst,cst,...,cst}'  */
        if (vect_print_dump_info (REPORT_DETAILS))
          fprintf (vect_dump, "Create vector_cst. nunits = %d", nunits);

        for (i = nunits - 1; i >= 0; --i)
          {
            t = tree_cons (NULL_TREE, op, t);
          }
        vector_type = get_vectype_for_scalar_type (TREE_TYPE (op));
        vec_cst = build_vector (vector_type, t);

        return vect_init_vector (stmt, vec_cst, vector_type);
      }

    /* Case 2: operand is defined outside the loop - loop invariant.  */
    case vect_invariant_def:
      {
        if (scalar_def)
          *scalar_def = def;

        /* Create 'vec_inv = {inv,inv,..,inv}'  */
        if (vect_print_dump_info (REPORT_DETAILS))
          fprintf (vect_dump, "Create vector_inv.");

        for (i = nunits - 1; i >= 0; --i)
          {
            t = tree_cons (NULL_TREE, def, t);
          }

        /* FIXME: use build_constructor directly.  */
        vector_type = get_vectype_for_scalar_type (TREE_TYPE (def));
        vec_inv = build_constructor_from_list (vector_type, t);
        return vect_init_vector (stmt, vec_inv, vector_type);
      }

    /* Case 3: operand is defined inside the loop.  */
    case vect_loop_def:
      {
        if (scalar_def)
          *scalar_def = def_stmt;

        /* Get the def from the vectorized stmt.  */
        def_stmt_info = vinfo_for_stmt (def_stmt);
        vec_stmt = STMT_VINFO_VEC_STMT (def_stmt_info);
        gcc_assert (vec_stmt);
        vec_oprnd = GIMPLE_STMT_OPERAND (vec_stmt, 0);
        return vec_oprnd;
      }

    /* Case 4: operand is defined by a loop header phi - reduction.  */
    case vect_reduction_def:
      {
        gcc_assert (TREE_CODE (def_stmt) == PHI_NODE);

        /* Get the def before the loop.  */
        op = PHI_ARG_DEF_FROM_EDGE (def_stmt, loop_preheader_edge (loop));
        return get_initial_def_for_reduction (stmt, op, scalar_def);
      }

    /* Case 5: operand is defined by loop-header phi - induction.  */
    case vect_induction_def:
      {
        gcc_assert (TREE_CODE (def_stmt) == PHI_NODE);

        /* Get the def before the loop.  */
        return get_initial_def_for_induction (stmt, def_stmt);
      }

    default:
      gcc_unreachable ();
    }
}
/* Function vect_get_vec_def_for_stmt_copy.

   Return a vector-def for an operand.  This function is used when the
   vectorized stmt to be created (by the caller to this function) is a "copy"
   created in case the vectorized result cannot fit in one vector, and several
   copies of the vector-stmt are required.  In this case the vector-def is
   retrieved from the vector stmt recorded in the STMT_VINFO_RELATED_STMT field
   of the stmt that defines VEC_OPRND.
   DT is the type of the vector def VEC_OPRND.

   Context:
        In case the vectorization factor (VF) is bigger than the number
   of elements that can fit in a vectype (nunits), we have to generate
   more than one vector stmt to vectorize the scalar stmt.  This situation
   arises when there are multiple data-types operated upon in the loop; the
   smallest data-type determines the VF, and as a result, when vectorizing
   stmts operating on wider types we need to create 'VF/nunits' "copies" of the
   vector stmt (each computing a vector of 'nunits' results, and together
   computing 'VF' results in each iteration).  This function is called when
   vectorizing such a stmt (e.g. vectorizing S2 in the illustration below, in
   which VF=16 and nunits=4, so the number of copies required is 4):

   scalar stmt:         vectorized into:            STMT_VINFO_RELATED_STMT

   S1: x = load         VS1.0: vx.0 = memref0       VS1.1
                        VS1.1: vx.1 = memref1       VS1.2
                        VS1.2: vx.2 = memref2       VS1.3
                        VS1.3: vx.3 = memref3       -

   S2: z = x + ...      VSnew.0: vz0 = vx.0 + ...   VSnew.1
                        VSnew.1: vz1 = vx.1 + ...   VSnew.2
                        VSnew.2: vz2 = vx.2 + ...   VSnew.3
                        VSnew.3: vz3 = vx.3 + ...   -

   The vectorization of S1 is explained in vectorizable_load.
   The vectorization of S2:
        To create the first vector-stmt out of the 4 copies - VSnew.0 -
   the function 'vect_get_vec_def_for_operand' is called to
   get the relevant vector-def for each operand of S2.  For operand x it
   returns the vector-def 'vx.0'.

        To create the remaining copies of the vector-stmt (VSnew.j), this
   function is called to get the relevant vector-def for each operand.  It is
   obtained from the respective VS1.j stmt, which is recorded in the
   STMT_VINFO_RELATED_STMT field of the stmt that defines VEC_OPRND.

        For example, to obtain the vector-def 'vx.1' in order to create the
   vector stmt 'VSnew.1', this function is called with VEC_OPRND='vx.0'.
   Given 'vx.0' we obtain the stmt that defines it ('VS1.0'); from the
   STMT_VINFO_RELATED_STMT field of 'VS1.0' we obtain the next copy - 'VS1.1',
   and return its def ('vx.1').
   Overall, to create the above sequence this function will be called 3 times:
        vx.1 = vect_get_vec_def_for_stmt_copy (dt, vx.0);
        vx.2 = vect_get_vec_def_for_stmt_copy (dt, vx.1);
        vx.3 = vect_get_vec_def_for_stmt_copy (dt, vx.2);  */
static tree
vect_get_vec_def_for_stmt_copy (enum vect_def_type dt, tree vec_oprnd)
{
  tree vec_stmt_for_operand;
  stmt_vec_info def_stmt_info;

  /* Do nothing; can reuse same def.  */
  if (dt == vect_invariant_def || dt == vect_constant_def)
    return vec_oprnd;

  vec_stmt_for_operand = SSA_NAME_DEF_STMT (vec_oprnd);
  def_stmt_info = vinfo_for_stmt (vec_stmt_for_operand);
  if (dt == vect_induction_def)
    gcc_assert (TREE_CODE (vec_stmt_for_operand) == PHI_NODE);
  gcc_assert (def_stmt_info);
  vec_stmt_for_operand = STMT_VINFO_RELATED_STMT (def_stmt_info);
  gcc_assert (vec_stmt_for_operand);
  vec_oprnd = GIMPLE_STMT_OPERAND (vec_stmt_for_operand, 0);
  return vec_oprnd;
}
/* Function vect_finish_stmt_generation.

   Insert a new stmt.  */

static void
vect_finish_stmt_generation (tree stmt, tree vec_stmt,
                             block_stmt_iterator *bsi)
{
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);

  bsi_insert_before (bsi, vec_stmt, BSI_SAME_STMT);
  set_stmt_info (get_stmt_ann (vec_stmt),
                 new_stmt_vec_info (vec_stmt, loop_vinfo));

  if (vect_print_dump_info (REPORT_DETAILS))
    {
      fprintf (vect_dump, "add new stmt: ");
      print_generic_expr (vect_dump, vec_stmt, TDF_SLIM);
    }

  /* Make sure bsi points to the stmt that is being vectorized.  */
  gcc_assert (stmt == bsi_stmt (*bsi));

#ifdef USE_MAPPED_LOCATION
  SET_EXPR_LOCATION (vec_stmt, EXPR_LOCATION (stmt));
#else
  SET_EXPR_LOCUS (vec_stmt, EXPR_LOCUS (stmt));
#endif
}
#define ADJUST_IN_EPILOG 1

/* Function get_initial_def_for_reduction.

   Input:
   STMT - a stmt that performs a reduction operation in the loop.
   INIT_VAL - the initial value of the reduction variable.

   Output:
   SCALAR_DEF - a tree that holds a value to be added to the final result
                of the reduction (used for "ADJUST_IN_EPILOG" - see below).
   Return a vector variable, initialized according to the operation that STMT
   performs.  This vector will be used as the initial value of the
   vector of partial results.

   Option1 ("ADJUST_IN_EPILOG"): Initialize the vector as follows:
     add:        [0,0,...,0,0]
     mult:       [1,1,...,1,1]
     min/max:    [init_val,init_val,..,init_val,init_val]
     bit and/or: [init_val,init_val,..,init_val,init_val]
   and when necessary (e.g. add/mult case) let the caller know
   that it needs to adjust the result by init_val.

   Option2: Initialize the vector as follows:
     add:        [0,0,...,0,init_val]
     mult:       [1,1,...,1,init_val]
     min/max:    [init_val,init_val,...,init_val]
     bit and/or: [init_val,init_val,...,init_val]
   and no adjustments are needed.

   For example, for the following code:

   s = init_val;
   for (i=0;i<n;i++)
     s = s + a[i];

   STMT is 's = s + a[i]', and the reduction variable is 's'.
   For a vector of 4 units, we want to return either [0,0,0,init_val],
   or [0,0,0,0] and let the caller know that it needs to adjust
   the result at the end by 'init_val'.

   FORNOW: We use the "ADJUST_IN_EPILOG" scheme.
   TODO: Use some cost-model to estimate which scheme is more profitable.  */
static tree
get_initial_def_for_reduction (tree stmt, tree init_val, tree *scalar_def)
{
  stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
  tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
  int nunits = GET_MODE_NUNITS (TYPE_MODE (vectype));
  tree vector_type;
  enum tree_code code = TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 1));
  tree type = TREE_TYPE (init_val);
  tree def;
  tree vec, t = NULL_TREE;
  bool need_epilog_adjust;
  int nelements;
  int i;

  gcc_assert (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type));

  switch (code)
    {
    case WIDEN_SUM_EXPR:
    case DOT_PROD_EXPR:
    case PLUS_EXPR:
      if (INTEGRAL_TYPE_P (type))
        def = build_int_cst (type, 0);
      else
        def = build_real (type, dconst0);

#ifdef ADJUST_IN_EPILOG
      /* All the 'nunits' elements are set to 0.  The final result will be
         adjusted by 'init_val' at the loop epilog.  */
      nelements = nunits;
      need_epilog_adjust = true;
#else
      /* 'nunits - 1' elements are set to 0; The last element is set to
         'init_val'.  No further adjustments at the epilog are needed.  */
      nelements = nunits - 1;
      need_epilog_adjust = false;
#endif
      break;

    case MIN_EXPR:
    case MAX_EXPR:
      def = init_val;
      nelements = nunits;
      need_epilog_adjust = false;
      break;

    default:
      gcc_unreachable ();
    }

  for (i = nelements - 1; i >= 0; --i)
    t = tree_cons (NULL_TREE, def, t);

  if (nelements == nunits - 1)
    {
      /* Set the last element of the vector.  */
      t = tree_cons (NULL_TREE, init_val, t);
      nelements += 1;
    }
  gcc_assert (nelements == nunits);

  vector_type = get_vectype_for_scalar_type (TREE_TYPE (def));
  if (TREE_CODE (init_val) == INTEGER_CST || TREE_CODE (init_val) == REAL_CST)
    vec = build_vector (vector_type, t);
  else
    vec = build_constructor_from_list (vector_type, t);

  if (!need_epilog_adjust)
    *scalar_def = NULL_TREE;
  else
    *scalar_def = init_val;

  return vect_init_vector (stmt, vec, vector_type);
}
/* Function vect_create_epilog_for_reduction.

   Create code at the loop-epilog to finalize the result of a reduction
   computation.

   VECT_DEF is a vector of partial results.
   REDUC_CODE is the tree-code for the epilog reduction.
   STMT is the scalar reduction stmt that is being vectorized.
   REDUCTION_PHI is the phi-node that carries the reduction computation.

   This function:
   1. Creates the reduction def-use cycle: sets the arguments for
      REDUCTION_PHI:
      The loop-entry argument is the vectorized initial-value of the reduction.
      The loop-latch argument is VECT_DEF - the vector of partial sums.
   2. "Reduces" the vector of partial results VECT_DEF into a single result,
      by applying the operation specified by REDUC_CODE if available, or by
      other means (whole-vector shifts or a scalar loop).
      The function also creates a new phi node at the loop exit to preserve
      loop-closed form, as illustrated below.

   The flow at the entry to this function:

        loop:
          vec_def = phi <null, null>            # REDUCTION_PHI
          VECT_DEF = vector_stmt                # vectorized form of STMT
          s_loop = scalar_stmt                  # (scalar) STMT
        loop_exit:
          s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
          use <s_out0>
          use <s_out0>

   The above is transformed by this function into:

        loop:
          vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
          VECT_DEF = vector_stmt                # vectorized form of STMT
          s_loop = scalar_stmt                  # (scalar) STMT
        loop_exit:
          s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
          v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
          v_out2 = reduce <v_out1>
          s_out3 = extract_field <v_out2, 0>
          s_out4 = adjust_result <s_out3>
          use <s_out4>
          use <s_out4>
*/
static void
vect_create_epilog_for_reduction (tree vect_def, tree stmt,
                                  enum tree_code reduc_code, tree reduction_phi)
{
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  tree vectype;
  enum machine_mode mode;
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  basic_block exit_bb;
  tree scalar_dest;
  tree scalar_type;
  tree new_phi;
  block_stmt_iterator exit_bsi;
  tree vec_dest;
  tree new_temp = NULL_TREE;
  tree new_name;
  tree epilog_stmt = NULL_TREE;
  tree new_scalar_dest, exit_phi;
  tree bitsize, bitpos, bytesize;
  enum tree_code code = TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 1));
  tree scalar_initial_def;
  tree vec_initial_def;
  tree orig_name;
  imm_use_iterator imm_iter;
  use_operand_p use_p;
  bool extract_scalar_result;
  tree reduction_op;
  tree orig_stmt;
  tree use_stmt;
  tree operation = GIMPLE_STMT_OPERAND (stmt, 1);
  int op_type;

  op_type = TREE_CODE_LENGTH (TREE_CODE (operation));
  reduction_op = TREE_OPERAND (operation, op_type - 1);
  vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
  mode = TYPE_MODE (vectype);
  /*** 1. Create the reduction def-use cycle  ***/

  /* 1.1 set the loop-entry arg of the reduction-phi:  */
  /* For the case of reduction, vect_get_vec_def_for_operand returns
     the scalar def before the loop, that defines the initial value
     of the reduction variable.  */
  vec_initial_def = vect_get_vec_def_for_operand (reduction_op, stmt,
                                                  &scalar_initial_def);
  add_phi_arg (reduction_phi, vec_initial_def, loop_preheader_edge (loop));

  /* 1.2 set the loop-latch arg for the reduction-phi:  */
  add_phi_arg (reduction_phi, vect_def, loop_latch_edge (loop));

  if (vect_print_dump_info (REPORT_DETAILS))
    {
      fprintf (vect_dump, "transform reduction: created def-use cycle:");
      print_generic_expr (vect_dump, reduction_phi, TDF_SLIM);
      fprintf (vect_dump, "\n");
      print_generic_expr (vect_dump, SSA_NAME_DEF_STMT (vect_def), TDF_SLIM);
    }
  /*** 2. Create epilog code.
          The reduction epilog code operates across the elements of the vector
          of partial results computed by the vectorized loop.
          The reduction epilog code consists of:
          step 1: compute the scalar result in a vector (v_out2)
          step 2: extract the scalar result (s_out3) from the vector (v_out2)
          step 3: adjust the scalar result (s_out3) if needed.

          Step 1 can be accomplished using one of the following three schemes:
          (scheme 1) using reduc_code, if available.
          (scheme 2) using whole-vector shifts, if available.
          (scheme 3) using a scalar loop.  In this case steps 1+2 above are
                     combined.

          The overall epilog code looks like this:

          s_out0 = phi <s_loop>                 # original EXIT_PHI
          v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
          v_out2 = reduce <v_out1>              # step 1
          s_out3 = extract_field <v_out2, 0>    # step 2
          s_out4 = adjust_result <s_out3>       # step 3

          (step 3 is optional, and steps 1 and 2 may be combined).
          Lastly, the uses of s_out0 are replaced by s_out4.  ***/
  /* 2.1 Create new loop-exit-phi to preserve loop-closed form:
         v_out1 = phi <v_loop>  */

  exit_bb = single_exit (loop)->dest;
  new_phi = create_phi_node (SSA_NAME_VAR (vect_def), exit_bb);
  SET_PHI_ARG_DEF (new_phi, single_exit (loop)->dest_idx, vect_def);
  exit_bsi = bsi_start (exit_bb);
  /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
         (i.e. when reduc_code is not available) and in the final adjustment
         code (if needed).  Also get the original scalar reduction variable as
         defined in the loop.  In case STMT is a "pattern-stmt" (i.e. - it
         represents a reduction pattern), the tree-code and scalar-def are
         taken from the original stmt that the pattern-stmt (STMT) replaces.
         Otherwise (it is a regular reduction) - the tree-code and scalar-def
         are taken from STMT.  */

  orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
  if (!orig_stmt)
    {
      /* Regular reduction  */
      orig_stmt = stmt;
    }
  else
    {
      /* Reduction pattern  */
      stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
      gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
      gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
    }
  code = TREE_CODE (GIMPLE_STMT_OPERAND (orig_stmt, 1));
  scalar_dest = GIMPLE_STMT_OPERAND (orig_stmt, 0);
  scalar_type = TREE_TYPE (scalar_dest);
  new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
  bitsize = TYPE_SIZE (scalar_type);
  bytesize = TYPE_SIZE_UNIT (scalar_type);
  /* 2.3 Create the reduction code, using one of the three schemes described
         above.  */

  if (reduc_code < NUM_TREE_CODES)
    {
      /*** Case 1:  Create:
           v_out2 = reduc_expr <v_out1>  */

      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "Reduce using direct vector reduction.");

      vec_dest = vect_create_destination_var (scalar_dest, vectype);
      epilog_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node, vec_dest,
                            build1 (reduc_code, vectype, PHI_RESULT (new_phi)));
      new_temp = make_ssa_name (vec_dest, epilog_stmt);
      GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_temp;
      bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT);

      extract_scalar_result = true;
    }
  else
    {
      enum tree_code shift_code = 0;
      bool have_whole_vector_shift = true;
      int bit_offset;
      int element_bitsize = tree_low_cst (bitsize, 1);
      int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
      tree vec_temp;

      if (vec_shr_optab->handlers[mode].insn_code != CODE_FOR_nothing)
        shift_code = VEC_RSHIFT_EXPR;
      else
        have_whole_vector_shift = false;

      /* Regardless of whether we have a whole vector shift, if we're
         emulating the operation via tree-vect-generic, we don't want
         to use it.  Only the first round of the reduction is likely
         to still be profitable via emulation.  */
      /* ??? It might be better to emit a reduction tree code here, so that
         tree-vect-generic can expand the first round via bit tricks.  */
      if (!VECTOR_MODE_P (mode))
        have_whole_vector_shift = false;
      else
        {
          optab optab = optab_for_tree_code (code, vectype);
          if (optab->handlers[mode].insn_code == CODE_FOR_nothing)
            have_whole_vector_shift = false;
        }

      if (have_whole_vector_shift)
        {
          /*** Case 2: Create:
             for (offset = VS/2; offset >= element_size; offset/=2)
                {
                  Create:  va' = vec_shift <va, offset>
                  Create:  va = vop <va, va'>
                }  */

          if (vect_print_dump_info (REPORT_DETAILS))
            fprintf (vect_dump, "Reduce using vector shifts");

          vec_dest = vect_create_destination_var (scalar_dest, vectype);
          new_temp = PHI_RESULT (new_phi);

          for (bit_offset = vec_size_in_bits/2;
               bit_offset >= element_bitsize;
               bit_offset /= 2)
            {
              tree bitpos = size_int (bit_offset);

              epilog_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node,
                                    vec_dest,
                                    build2 (shift_code, vectype,
                                            new_temp, bitpos));
              new_name = make_ssa_name (vec_dest, epilog_stmt);
              GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_name;
              bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT);

              epilog_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node,
                                    vec_dest,
                                    build2 (code, vectype,
                                            new_name, new_temp));
              new_temp = make_ssa_name (vec_dest, epilog_stmt);
              GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_temp;
              bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT);
            }

          extract_scalar_result = true;
        }
      else
        {
          tree rhs;

          /*** Case 3: Create:
             s = extract_field <v_out2, 0>
             for (offset = element_size;
                  offset < vector_size;
                  offset += element_size;)
               {
                 Create:  s' = extract_field <v_out2, offset>
                 Create:  s = op <s, s'>
               }  */

          if (vect_print_dump_info (REPORT_DETAILS))
            fprintf (vect_dump, "Reduce using scalar code. ");

          vec_temp = PHI_RESULT (new_phi);
          vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
          rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
                        bitsize_zero_node);
          BIT_FIELD_REF_UNSIGNED (rhs) = TYPE_UNSIGNED (scalar_type);
          epilog_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node,
                                new_scalar_dest, rhs);
          new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
          GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_temp;
          bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT);

          for (bit_offset = element_bitsize;
               bit_offset < vec_size_in_bits;
               bit_offset += element_bitsize)
            {
              tree bitpos = bitsize_int (bit_offset);
              tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
                                 bitpos);

              BIT_FIELD_REF_UNSIGNED (rhs) = TYPE_UNSIGNED (scalar_type);
              epilog_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node,
                                    new_scalar_dest, rhs);
              new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
              GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_name;
              bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT);

              epilog_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node,
                                    new_scalar_dest,
                                    build2 (code, scalar_type, new_name,
                                            new_temp));
              new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
              GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_temp;
              bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT);
            }

          extract_scalar_result = false;
        }
    }
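  /* Illustration of scheme 2 above (a sketch with made-up names; assumes a
     V4SI add reduction, so vec_size_in_bits = 128, element_bitsize = 32):
         v.1 = vec_shift <v_out1, 64>    # v.1 = { v[2], v[3], -, - }
         v.2 = v_out1 + v.1              # lanes 0,1 hold pairwise sums
         v.3 = vec_shift <v.2, 32>
         v_out2 = v.2 + v.3              # lane 0 holds the full sum
     On a little-endian target, lane 0 is then extracted by the code in
     step 2.4 below.  */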
  /* 2.4  Extract the final scalar result.  Create:
          s_out3 = extract_field <v_out2, bitpos>  */

  if (extract_scalar_result)
    {
      tree rhs;

      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "extract scalar result");

      if (BYTES_BIG_ENDIAN)
        bitpos = size_binop (MULT_EXPR,
                             bitsize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1),
                             TYPE_SIZE (scalar_type));
      else
        bitpos = bitsize_zero_node;

      rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp, bitsize, bitpos);
      BIT_FIELD_REF_UNSIGNED (rhs) = TYPE_UNSIGNED (scalar_type);
      epilog_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node,
                            new_scalar_dest, rhs);
      new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
      GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_temp;
      bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT);
    }
  /* 2.5 Adjust the final result by the initial value of the reduction
         variable. (When such adjustment is not needed, then
         'scalar_initial_def' is zero).

         Create:
         s_out4 = scalar_expr <s_out3, scalar_initial_def>  */

  if (scalar_initial_def)
    {
      epilog_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node,
                            new_scalar_dest,
                            build2 (code, scalar_type, new_temp,
                                    scalar_initial_def));
      new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
      GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_temp;
      bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT);
    }
  /* 2.6 Replace uses of s_out0 with uses of s_out4.  */

  /* Find the loop-closed-use at the loop exit of the original scalar result.
     (The reduction result is expected to have two immediate uses - one at the
     latch block, and one at the loop exit).  */
  exit_phi = NULL;
  FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
    {
      if (!flow_bb_inside_loop_p (loop, bb_for_stmt (USE_STMT (use_p))))
        {
          exit_phi = USE_STMT (use_p);
          break;
        }
    }
  /* We expect to have found an exit_phi because of loop-closed-ssa form.  */
  gcc_assert (exit_phi);
  /* Replace the uses:  */
  orig_name = PHI_RESULT (exit_phi);
  FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
    FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
      SET_USE (use_p, new_temp);
}
/* Function vectorizable_reduction.

   Check if STMT performs a reduction operation that can be vectorized.
   If VEC_STMT is also passed, vectorize the STMT: create a vectorized
   stmt to replace it, put it in VEC_STMT, and insert it at BSI.
   Return FALSE if not a vectorizable STMT, TRUE otherwise.

   This function also handles reduction idioms (patterns) that have been
   recognized in advance during vect_pattern_recog.  In this case, STMT may be
   of this form:
     X = pattern_expr (arg0, arg1, ..., X)
   and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
   sequence that had been detected and replaced by the pattern-stmt (STMT).

   In some cases of reduction patterns, the type of the reduction variable X is
   different than the type of the other arguments of STMT.
   In such cases, the vectype that is used when transforming STMT into a vector
   stmt is different than the vectype that is used to determine the
   vectorization factor, because it consists of a different number of elements
   than the actual number of elements that are being operated upon in parallel.

   For example, consider an accumulation of shorts into an int accumulator.
   On some targets it's possible to vectorize this pattern operating on 8
   shorts at a time (hence, the vectype for purposes of determining the
   vectorization factor should be V8HI); on the other hand, the vectype that
   is used to create the vector form is actually V4SI (the type of the result).

   Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
   indicates what is the actual level of parallelism (V8HI in the example), so
   that the right vectorization factor would be derived.  This vectype
   corresponds to the type of arguments to the reduction stmt, and should *NOT*
   be used to create the vectorized stmt.  The right vectype for the vectorized
   stmt is obtained from the type of the result X:
      get_vectype_for_scalar_type (TREE_TYPE (X))

   This means that, contrary to "regular" reductions (or "regular" stmts in
   general), the following equation:
      STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
   does *NOT* necessarily hold for reduction patterns.  */
bool
vectorizable_reduction (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
{
  tree vec_dest;
  tree scalar_dest;
  tree op;
  tree loop_vec_def0 = NULL_TREE, loop_vec_def1 = NULL_TREE;
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  tree operation;
  enum tree_code code, orig_code, epilog_reduc_code = 0;
  enum machine_mode vec_mode;
  int op_type;
  optab optab, reduc_optab;
  tree new_temp = NULL_TREE;
  tree def, def_stmt;
  enum vect_def_type dt;
  tree new_phi;
  tree scalar_type;
  bool is_simple_use;
  tree orig_stmt;
  stmt_vec_info orig_stmt_info;
  tree expr = NULL_TREE;
  int i;
  int nunits = TYPE_VECTOR_SUBPARTS (vectype);
  int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
  stmt_vec_info prev_stmt_info;
  tree reduc_def;
  tree new_stmt = NULL_TREE;
  int j;

  gcc_assert (ncopies >= 1);
  /* 1. Is vectorizable reduction?  */

  /* Not supportable if the reduction variable is used in the loop.  */
  if (STMT_VINFO_RELEVANT_P (stmt_info))
    return false;

  if (!STMT_VINFO_LIVE_P (stmt_info))
    return false;

  /* Make sure it was already recognized as a reduction computation.  */
  if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def)
    return false;

  /* 2. Has this been recognized as a reduction pattern?

     Check if STMT represents a pattern that has been recognized
     in earlier analysis stages.  For stmts that represent a pattern,
     the STMT_VINFO_RELATED_STMT field records the last stmt in
     the original sequence that constitutes the pattern.  */

  orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
  if (orig_stmt)
    {
      orig_stmt_info = vinfo_for_stmt (orig_stmt);
      gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt);
      gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
      gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
    }
  /* 3. Check the operands of the operation.  The first operands are defined
        inside the loop body.  The last operand is the reduction variable,
        which is defined by the loop-header-phi.  */

  gcc_assert (TREE_CODE (stmt) == GIMPLE_MODIFY_STMT);

  operation = GIMPLE_STMT_OPERAND (stmt, 1);
  code = TREE_CODE (operation);
  op_type = TREE_CODE_LENGTH (code);
  if (op_type != binary_op && op_type != ternary_op)
    return false;
  scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
  scalar_type = TREE_TYPE (scalar_dest);

  /* All uses but the last are expected to be defined in the loop.
     The last use is the reduction variable.  */
  for (i = 0; i < op_type - 1; i++)
    {
      op = TREE_OPERAND (operation, i);
      is_simple_use = vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt);
      gcc_assert (is_simple_use);
      if (dt != vect_loop_def
          && dt != vect_invariant_def
          && dt != vect_constant_def
          && dt != vect_induction_def)
        return false;
    }

  op = TREE_OPERAND (operation, i);
  is_simple_use = vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt);
  gcc_assert (is_simple_use);
  gcc_assert (dt == vect_reduction_def);
  gcc_assert (TREE_CODE (def_stmt) == PHI_NODE);
  if (orig_stmt)
    gcc_assert (orig_stmt == vect_is_simple_reduction (loop, def_stmt));
  else
    gcc_assert (stmt == vect_is_simple_reduction (loop, def_stmt));

  if (STMT_VINFO_LIVE_P (vinfo_for_stmt (def_stmt)))
    return false;
  /* 4. Supportable by target?  */

  /* 4.1. check support for the operation in the loop  */
  optab = optab_for_tree_code (code, vectype);
  if (!optab)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "no optab.");
      return false;
    }
  vec_mode = TYPE_MODE (vectype);
  if (optab->handlers[(int) vec_mode].insn_code == CODE_FOR_nothing)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "op not supported by target.");
      if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
          || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
             < vect_min_worthwhile_factor (code))
        return false;
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "proceeding using word mode.");
    }

  /* Worthwhile without SIMD support?  */
  if (!VECTOR_MODE_P (TYPE_MODE (vectype))
      && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
         < vect_min_worthwhile_factor (code))
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "not worthwhile without SIMD support.");
      return false;
    }
  /* 4.2. Check support for the epilog operation.

          If STMT represents a reduction pattern, then the type of the
          reduction variable may be different than the type of the rest
          of the arguments.  For example, consider the case of accumulation
          of shorts into an int accumulator; The original code:
                        S1: int_a = (int) short_a;
          orig_stmt->   S2: int_acc = plus <int_a ,int_acc>;

          was replaced with:
                        STMT: int_acc = widen_sum <short_a, int_acc>

          This means that:
          1. The tree-code that is used to create the vector operation in the
             epilog code (that reduces the partial results) is not the
             tree-code of STMT, but is rather the tree-code of the original
             stmt from the pattern that STMT is replacing.  I.e, in the example
             above we want to use 'widen_sum' in the loop, but 'plus' in the
             epilog.
          2. The type (mode) we use to check available target support
             for the vector operation to be created in the *epilog*, is
             determined by the type of the reduction variable (in the example
             above we'd check this: plus_optab[vect_int_mode]).
             However the type (mode) we use to check available target support
             for the vector operation to be created *inside the loop*, is
             determined by the type of the other arguments to STMT (in the
             example we'd check this: widen_sum_optab[vect_short_mode]).

          This is contrary to "regular" reductions, in which the types of all
          the arguments are the same as the type of the reduction variable.
          For "regular" reductions we can therefore use the same vector type
          (and also the same tree-code) when generating the epilog code and
          when generating the code inside the loop.  */

  if (orig_stmt)
    {
      /* This is a reduction pattern: get the vectype from the type of the
         reduction variable, and get the tree-code from orig_stmt.  */
      orig_code = TREE_CODE (GIMPLE_STMT_OPERAND (orig_stmt, 1));
      vectype = get_vectype_for_scalar_type (TREE_TYPE (def));
      vec_mode = TYPE_MODE (vectype);
    }
  else
    {
      /* Regular reduction: the same vectype and tree-code as used for
         the vector code inside the loop can be used for the epilog code.  */
      orig_code = code;
    }
  if (!reduction_code_for_scalar_code (orig_code, &epilog_reduc_code))
    epilog_reduc_code = NUM_TREE_CODES;
  reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype);
  if (!reduc_optab)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "no optab for reduction.");
      epilog_reduc_code = NUM_TREE_CODES;
    }
  if (reduc_optab->handlers[(int) vec_mode].insn_code == CODE_FOR_nothing)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "reduc op not supported by target.");
      epilog_reduc_code = NUM_TREE_CODES;
    }
  if (!vec_stmt) /* transformation not required.  */
    {
      STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
      return true;
    }

  /** Transform.  **/

  if (vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump, "transform reduction.");

  /* Create the destination vector  */
  vec_dest = vect_create_destination_var (scalar_dest, vectype);

  /* Create the reduction-phi that defines the reduction-operand.  */
  new_phi = create_phi_node (vec_dest, loop->header);

  /* In case the vectorization factor (VF) is bigger than the number
     of elements that we can fit in a vectype (nunits), we have to generate
     more than one vector stmt - i.e - we need to "unroll" the
     vector stmt by a factor VF/nunits.  For more details see documentation
     in vectorizable_operation.  */
  prev_stmt_info = NULL;
  for (j = 0; j < ncopies; j++)
    {
      /* Handle uses.  */
      if (j == 0)
        {
          op = TREE_OPERAND (operation, 0);
          loop_vec_def0 = vect_get_vec_def_for_operand (op, stmt, NULL);
          if (op_type == ternary_op)
            {
              op = TREE_OPERAND (operation, 1);
              loop_vec_def1 = vect_get_vec_def_for_operand (op, stmt, NULL);
            }

          /* Get the vector def for the reduction variable from the phi
             node.  */
          reduc_def = PHI_RESULT (new_phi);
        }
      else
        {
          enum vect_def_type dt = vect_unknown_def_type; /* Dummy */
          loop_vec_def0 = vect_get_vec_def_for_stmt_copy (dt, loop_vec_def0);
          if (op_type == ternary_op)
            loop_vec_def1 = vect_get_vec_def_for_stmt_copy (dt, loop_vec_def1);

          /* Get the vector def for the reduction variable from the vectorized
             reduction operation generated in the previous iteration (j-1).  */
          reduc_def = GIMPLE_STMT_OPERAND (new_stmt, 0);
        }

      /* Arguments are ready.  Create the new vector stmt.  */
      if (op_type == binary_op)
        expr = build2 (code, vectype, loop_vec_def0, reduc_def);
      else
        expr = build3 (code, vectype, loop_vec_def0, loop_vec_def1,
                       reduc_def);
      new_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node, vec_dest, expr);
      new_temp = make_ssa_name (vec_dest, new_stmt);
      GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
      vect_finish_stmt_generation (stmt, new_stmt, bsi);

      if (j == 0)
        STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
      else
        STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
      prev_stmt_info = vinfo_for_stmt (new_stmt);
    }

  /* Finalize the reduction-phi (set its arguments) and create the
     epilog reduction code.  */
  vect_create_epilog_for_reduction (new_temp, stmt, epilog_reduc_code, new_phi);
  return true;
}
/* Checks if CALL can be vectorized in type VECTYPE.  Returns
   a function declaration if the target has a vectorized version
   of the function, or NULL_TREE if the function cannot be vectorized.  */

tree
vectorizable_function (tree call, tree vectype_out, tree vectype_in)
{
  tree fndecl = get_callee_fndecl (call);
  enum built_in_function code;

  /* We only handle functions that do not read or clobber memory -- i.e.
     const or novops ones.  */
  if (!(call_expr_flags (call) & (ECF_CONST | ECF_NOVOPS)))
    return NULL_TREE;

  if (!fndecl
      || TREE_CODE (fndecl) != FUNCTION_DECL
      || !DECL_BUILT_IN (fndecl))
    return NULL_TREE;

  code = DECL_FUNCTION_CODE (fndecl);
  return targetm.vectorize.builtin_vectorized_function (code, vectype_out,
                                                        vectype_in);
}
1793 /* Function vectorizable_call.
1795 Check if STMT performs a function call that can be vectorized.
1796 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
1797 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
1798 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
1801 vectorizable_call (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
1807 stmt_vec_info stmt_info = vinfo_for_stmt (stmt), prev_stmt_info;
1808 tree vectype_out, vectype_in;
1809 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1810 tree fndecl, rhs, new_temp, def, def_stmt, rhs_type, lhs_type;
1811 enum vect_def_type dt[2];
1812 int ncopies, j, nargs;
1814 /* Is STMT a vectorizable call? */
1815 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
1818 if (TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 0)) != SSA_NAME)
1821 operation = GIMPLE_STMT_OPERAND (stmt, 1);
1822 if (TREE_CODE (operation) != CALL_EXPR)
1825 /* Process function arguments. */
1826 rhs_type = NULL_TREE;
1827 for (args = TREE_OPERAND (operation, 1), nargs = 0;
1828 args; args = TREE_CHAIN (args), ++nargs)
1830 tree op = TREE_VALUE (args);
1832       /* Bail out if the function has more than two arguments; we
1833 do not have interesting builtin functions to vectorize with
1834 more than two arguments. */
1838 /* We can only handle calls with arguments of the same type. */
1840 && rhs_type != TREE_TYPE (op))
1842 if (vect_print_dump_info (REPORT_DETAILS))
1843 fprintf (vect_dump, "argument types differ.");
1846 rhs_type = TREE_TYPE (op);
1848 if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt[nargs]))
1850 if (vect_print_dump_info (REPORT_DETAILS))
1851 fprintf (vect_dump, "use not simple.");
1856   /* Having no arguments is also not good.  */
1860 vectype_in = get_vectype_for_scalar_type (rhs_type);
1862 lhs_type = TREE_TYPE (GIMPLE_STMT_OPERAND (stmt, 0));
1863 vectype_out = get_vectype_for_scalar_type (lhs_type);
1865 /* Only handle the case of vectors with the same number of elements.
1866 FIXME: We need a way to handle for example the SSE2 cvtpd2dq
1867 instruction which converts V2DFmode to V4SImode but only
1868 using the lower half of the V4SImode result. */
1869 if (TYPE_VECTOR_SUBPARTS (vectype_in) != TYPE_VECTOR_SUBPARTS (vectype_out))
1872 /* For now, we only vectorize functions if a target specific builtin
1873 is available. TODO -- in some cases, it might be profitable to
1874 insert the calls for pieces of the vector, in order to be able
1875 to vectorize other operations in the loop. */
1876 fndecl = vectorizable_function (operation, vectype_out, vectype_in);
1877 if (fndecl == NULL_TREE)
1879 if (vect_print_dump_info (REPORT_DETAILS))
1880 fprintf (vect_dump, "function is not vectorizable.");
1885 gcc_assert (ZERO_SSA_OPERANDS (stmt, SSA_OP_ALL_VIRTUALS));
1887 if (!vec_stmt) /* transformation not required. */
1889 STMT_VINFO_TYPE (stmt_info) = call_vec_info_type;
1895 if (vect_print_dump_info (REPORT_DETAILS))
1896 fprintf (vect_dump, "transform operation.");
1898 ncopies = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
1899 / TYPE_VECTOR_SUBPARTS (vectype_out));
1900 gcc_assert (ncopies >= 1);
1903 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
1904 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
1906 prev_stmt_info = NULL;
1907 for (j = 0; j < ncopies; ++j)
1909 tree new_stmt, vargs;
1913 /* Build argument list for the vectorized call. */
1915 for (args = TREE_OPERAND (operation, 1), n = 0;
1916 args; args = TREE_CHAIN (args), ++n)
1918 tree op = TREE_VALUE (args);
1921 vec_oprnd[n] = vect_get_vec_def_for_operand (op, stmt, NULL);
1923 vec_oprnd[n] = vect_get_vec_def_for_stmt_copy (dt[n], vec_oprnd[n]);
1925 vargs = tree_cons (NULL_TREE, vec_oprnd[n], vargs);
1927 vargs = nreverse (vargs);
1929 rhs = build_function_call_expr (fndecl, vargs);
1930 new_stmt = build2 (GIMPLE_MODIFY_STMT, NULL_TREE, vec_dest, rhs);
1931 new_temp = make_ssa_name (vec_dest, new_stmt);
1932 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
1934 vect_finish_stmt_generation (stmt, new_stmt, bsi);
1937 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
1939 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
1940 prev_stmt_info = vinfo_for_stmt (new_stmt);
1943 /* The call in STMT might prevent it from being removed in dce. We however
1944 cannot remove it here, due to the way the ssa name it defines is mapped
1945      to the new definition.  So just replace the rhs of the statement with something harmless.  */
1947 type = TREE_TYPE (scalar_dest);
1948 GIMPLE_STMT_OPERAND (stmt, 1) = fold_convert (type, integer_zero_node);
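
/* For intuition only -- an illustrative C model, not GCC code, of what
   vectorizing a call does: one call to a vector builtin replaces nunits
   scalar calls.  The type 'v4sf' and the function 'v4_sinf' are
   hypothetical stand-ins for whatever
   targetm.vectorize.builtin_vectorized_function returns.  */
#include <math.h>

typedef struct { float e[4]; } v4sf;

/* Hypothetical vectorized builtin: applies sinf lane-wise.  */
static v4sf
v4_sinf (v4sf x)
{
  v4sf r;
  int i;
  for (i = 0; i < 4; i++)
    r.e[i] = sinf (x.e[i]);
  return r;
}

/* The vectorized loop; N is assumed to be a multiple of 4.  */
static void
model_vectorized_call (float *out, const float *in, int n)
{
  int i, k;
  for (i = 0; i < n; i += 4)
    {
      v4sf x, y;
      for (k = 0; k < 4; k++)
        x.e[k] = in[i + k];
      y = v4_sinf (x);              /* replaces 4 scalar sinf calls */
      for (k = 0; k < 4; k++)
        out[i + k] = y.e[k];
    }
}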
1954 /* Function vectorizable_assignment.
1956 Check if STMT performs an assignment (copy) that can be vectorized.
1957 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
1958 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
1959 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
1962 vectorizable_assignment (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
1968 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1969 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1970 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1973 enum vect_def_type dt;
1974 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
1975 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
1977 gcc_assert (ncopies >= 1);
1979 return false; /* FORNOW */
1981 /* Is vectorizable assignment? */
1982 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1985 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_loop_def);
1987 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
1990 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
1991 if (TREE_CODE (scalar_dest) != SSA_NAME)
1994 op = GIMPLE_STMT_OPERAND (stmt, 1);
1995 if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt))
1997 if (vect_print_dump_info (REPORT_DETAILS))
1998 fprintf (vect_dump, "use not simple.");
2002 if (!vec_stmt) /* transformation not required. */
2004 STMT_VINFO_TYPE (stmt_info) = assignment_vec_info_type;
2009 if (vect_print_dump_info (REPORT_DETAILS))
2010 fprintf (vect_dump, "transform assignment.");
2013 vec_dest = vect_create_destination_var (scalar_dest, vectype);
2016 op = GIMPLE_STMT_OPERAND (stmt, 1);
2017 vec_oprnd = vect_get_vec_def_for_operand (op, stmt, NULL);
2019   /* Arguments are ready.  Create the new vector stmt.  */
2020 *vec_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node, vec_dest, vec_oprnd);
2021 new_temp = make_ssa_name (vec_dest, *vec_stmt);
2022 GIMPLE_STMT_OPERAND (*vec_stmt, 0) = new_temp;
2023 vect_finish_stmt_generation (stmt, *vec_stmt, bsi);
2029 /* Function vect_min_worthwhile_factor.
2031 For a loop where we could vectorize the operation indicated by CODE,
2032 return the minimum vectorization factor that makes it worthwhile
2033 to use generic vectors. */
2035 vect_min_worthwhile_factor (enum tree_code code)
2056 /* Function vectorizable_operation.
2058 Check if STMT performs a binary or unary operation that can be vectorized.
2059 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
2060 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
2061 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
2064 vectorizable_operation (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
2069 tree op0, op1 = NULL;
2070 tree vec_oprnd0 = NULL_TREE, vec_oprnd1 = NULL_TREE;
2071 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2072 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2073 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2074 enum tree_code code;
2075 enum machine_mode vec_mode;
2080 enum machine_mode optab_op2_mode;
2082 enum vect_def_type dt0, dt1;
2084 stmt_vec_info prev_stmt_info;
2085 int nunits_in = TYPE_VECTOR_SUBPARTS (vectype);
2088 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in;
2091 gcc_assert (ncopies >= 1);
2093 /* Is STMT a vectorizable binary/unary operation? */
2094 if (!STMT_VINFO_RELEVANT_P (stmt_info))
2097 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_loop_def);
2099 if (STMT_VINFO_LIVE_P (stmt_info))
2101 /* FORNOW: not yet supported. */
2102 if (vect_print_dump_info (REPORT_DETAILS))
2103 fprintf (vect_dump, "value used after loop.");
2107 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
2110 if (TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 0)) != SSA_NAME)
2113 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
2114 vectype_out = get_vectype_for_scalar_type (TREE_TYPE (scalar_dest));
2115 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
2116 if (nunits_out != nunits_in)
2119 operation = GIMPLE_STMT_OPERAND (stmt, 1);
2120 code = TREE_CODE (operation);
2121 optab = optab_for_tree_code (code, vectype);
2123 /* Support only unary or binary operations. */
2124 op_type = TREE_CODE_LENGTH (code);
2125 if (op_type != unary_op && op_type != binary_op)
2127 if (vect_print_dump_info (REPORT_DETAILS))
2128 fprintf (vect_dump, "num. args = %d (not unary/binary op).", op_type);
2132 op0 = TREE_OPERAND (operation, 0);
2133 if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt0))
2135 if (vect_print_dump_info (REPORT_DETAILS))
2136 fprintf (vect_dump, "use not simple.");
2140 if (op_type == binary_op)
2142 op1 = TREE_OPERAND (operation, 1);
2143 if (!vect_is_simple_use (op1, loop_vinfo, &def_stmt, &def, &dt1))
2145 if (vect_print_dump_info (REPORT_DETAILS))
2146 fprintf (vect_dump, "use not simple.");
2151 /* Supportable by target? */
2154 if (vect_print_dump_info (REPORT_DETAILS))
2155 fprintf (vect_dump, "no optab.");
2158 vec_mode = TYPE_MODE (vectype);
2159 icode = (int) optab->handlers[(int) vec_mode].insn_code;
2160 if (icode == CODE_FOR_nothing)
2162 if (vect_print_dump_info (REPORT_DETAILS))
2163 fprintf (vect_dump, "op not supported by target.");
2164 if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
2165 || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
2166 < vect_min_worthwhile_factor (code))
2168 if (vect_print_dump_info (REPORT_DETAILS))
2169 fprintf (vect_dump, "proceeding using word mode.");
2172 /* Worthwhile without SIMD support? */
2173 if (!VECTOR_MODE_P (TYPE_MODE (vectype))
2174 && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
2175 < vect_min_worthwhile_factor (code))
2177 if (vect_print_dump_info (REPORT_DETAILS))
2178 fprintf (vect_dump, "not worthwhile without SIMD support.");
2182 if (code == LSHIFT_EXPR || code == RSHIFT_EXPR)
2184 /* FORNOW: not yet supported. */
2185 if (!VECTOR_MODE_P (vec_mode))
2188 /* Invariant argument is needed for a vector shift
2189 by a scalar shift operand. */
2190 optab_op2_mode = insn_data[icode].operand[2].mode;
2191 if (! (VECTOR_MODE_P (optab_op2_mode)
2192 || dt1 == vect_constant_def
2193 || dt1 == vect_invariant_def))
2195 if (vect_print_dump_info (REPORT_DETAILS))
2196 fprintf (vect_dump, "operand mode requires invariant argument.");
2201 if (!vec_stmt) /* transformation not required. */
2203 STMT_VINFO_TYPE (stmt_info) = op_vec_info_type;
2209 if (vect_print_dump_info (REPORT_DETAILS))
2210 fprintf (vect_dump, "transform binary/unary operation.");
2213 vec_dest = vect_create_destination_var (scalar_dest, vectype);
2215 /* In case the vectorization factor (VF) is bigger than the number
2216 of elements that we can fit in a vectype (nunits), we have to generate
2217      more than one vector stmt - i.e., we need to "unroll" the
2218 vector stmt by a factor VF/nunits. In doing so, we record a pointer
2219 from one copy of the vector stmt to the next, in the field
2220 STMT_VINFO_RELATED_STMT. This is necessary in order to allow following
2221 stages to find the correct vector defs to be used when vectorizing
2222 stmts that use the defs of the current stmt. The example below illustrates
2223      the vectorization process when VF=16 and nunits=4 (i.e., we need to create
2224 4 vectorized stmts):
2226 before vectorization:
2227 RELATED_STMT VEC_STMT
2231 step 1: vectorize stmt S1 (done in vectorizable_load. See more details
2233 RELATED_STMT VEC_STMT
2234 VS1_0: vx0 = memref0 VS1_1 -
2235 VS1_1: vx1 = memref1 VS1_2 -
2236 VS1_2: vx2 = memref2 VS1_3 -
2237 VS1_3: vx3 = memref3 - -
2238 S1: x = load - VS1_0
2241 step2: vectorize stmt S2 (done here):
2242 To vectorize stmt S2 we first need to find the relevant vector
2243 def for the first operand 'x'. This is, as usual, obtained from
2244 the vector stmt recorded in the STMT_VINFO_VEC_STMT of the stmt
2245 that defines 'x' (S1). This way we find the stmt VS1_0, and the
2246 relevant vector def 'vx0'. Having found 'vx0' we can generate
2247 the vector stmt VS2_0, and as usual, record it in the
2248 STMT_VINFO_VEC_STMT of stmt S2.
2249 When creating the second copy (VS2_1), we obtain the relevant vector
2250 def from the vector stmt recorded in the STMT_VINFO_RELATED_STMT of
2251 stmt VS1_0. This way we find the stmt VS1_1 and the relevant
2252 vector def 'vx1'. Using 'vx1' we create stmt VS2_1 and record a
2253 pointer to it in the STMT_VINFO_RELATED_STMT of the vector stmt VS2_0.
2254 Similarly when creating stmts VS2_2 and VS2_3. This is the resulting
2255 chain of stmts and pointers:
2256 RELATED_STMT VEC_STMT
2257 VS1_0: vx0 = memref0 VS1_1 -
2258 VS1_1: vx1 = memref1 VS1_2 -
2259 VS1_2: vx2 = memref2 VS1_3 -
2260 VS1_3: vx3 = memref3 - -
2261 S1: x = load - VS1_0
2262 VS2_0: vz0 = vx0 + v1 VS2_1 -
2263 VS2_1: vz1 = vx1 + v1 VS2_2 -
2264 VS2_2: vz2 = vx2 + v1 VS2_3 -
2265 VS2_3: vz3 = vx3 + v1 - -
2266 S2: z = x + 1 - VS2_0 */
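
     For intuition, a minimal C model (not GCC's actual types) of this
     chaining -- the copies of a vector stmt form a singly linked chain,
     and the def used by copy J of a consumer is found by walking J
     links from the VEC_STMT of the producer:

        struct model_vec_stmt
        {
          struct model_vec_stmt *related;  /- models STMT_VINFO_RELATED_STMT -/
        };

        static struct model_vec_stmt *
        model_get_copy (struct model_vec_stmt *vec_stmt, int j)
        {
          while (j-- > 0)
            vec_stmt = vec_stmt->related;  /- VS1_0 -> VS1_1 -> ...  -/
          return vec_stmt;
        }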
2268 prev_stmt_info = NULL;
2269 for (j = 0; j < ncopies; j++)
2274 vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
2275 if (op_type == binary_op)
2277 if (code == LSHIFT_EXPR || code == RSHIFT_EXPR)
2279 /* Vector shl and shr insn patterns can be defined with
2280 scalar operand 2 (shift operand). In this case, use
2281 constant or loop invariant op1 directly, without
2282 extending it to vector mode first. */
2283 optab_op2_mode = insn_data[icode].operand[2].mode;
2284 if (!VECTOR_MODE_P (optab_op2_mode))
2286 if (vect_print_dump_info (REPORT_DETAILS))
2287 fprintf (vect_dump, "operand 1 using scalar mode.");
2292 vec_oprnd1 = vect_get_vec_def_for_operand (op1, stmt, NULL);
2297 vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt0, vec_oprnd0);
2298 if (op_type == binary_op)
2299 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt1, vec_oprnd1);
2302       /* Arguments are ready.  Create the new vector stmt.  */
2304 if (op_type == binary_op)
2305 new_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node, vec_dest,
2306 build2 (code, vectype, vec_oprnd0, vec_oprnd1));
2308 new_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node, vec_dest,
2309 build1 (code, vectype, vec_oprnd0));
2310 new_temp = make_ssa_name (vec_dest, new_stmt);
2311 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
2312 vect_finish_stmt_generation (stmt, new_stmt, bsi);
2315 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
2317 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
2318 prev_stmt_info = vinfo_for_stmt (new_stmt);
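
/* For intuition only -- an illustrative C model, not GCC code, of the
   shift special case handled above: when the target's shift pattern
   takes a scalar shift operand, a loop-invariant shift amount is used
   directly, one scalar count for all lanes.  Assumes 4 lanes.  */

static void
model_shift_by_invariant (int v[4], int amount)
{
  int i;
  for (i = 0; i < 4; i++)
    v[i] <<= amount;  /* every lane shifts by the same invariant */
}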
2325 /* Function vectorizable_type_demotion
2327 Check if STMT performs a binary or unary operation that involves
2328 type demotion, and if it can be vectorized.
2329 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
2330 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
2331 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
2334 vectorizable_type_demotion (tree stmt, block_stmt_iterator *bsi,
2341   tree vec_oprnd0 = NULL, vec_oprnd1 = NULL;
2342 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2343 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2344 enum tree_code code;
2347 enum vect_def_type dt0;
2349 stmt_vec_info prev_stmt_info;
2359 enum machine_mode vec_mode;
2361 /* Is STMT a vectorizable type-demotion operation? */
2363 if (!STMT_VINFO_RELEVANT_P (stmt_info))
2366 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_loop_def);
2368 if (STMT_VINFO_LIVE_P (stmt_info))
2370 /* FORNOW: not yet supported. */
2371 if (vect_print_dump_info (REPORT_DETAILS))
2372 fprintf (vect_dump, "value used after loop.");
2376 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
2379 if (TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 0)) != SSA_NAME)
2382 operation = GIMPLE_STMT_OPERAND (stmt, 1);
2383 code = TREE_CODE (operation);
2384 if (code != NOP_EXPR && code != CONVERT_EXPR)
2387 op0 = TREE_OPERAND (operation, 0);
2388 vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op0));
2389 nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
2391 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
2392 scalar_type = TREE_TYPE (scalar_dest);
2393 vectype_out = get_vectype_for_scalar_type (scalar_type);
2394 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
2395 if (nunits_in != nunits_out / 2) /* FORNOW */
2398 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_out;
2399 gcc_assert (ncopies >= 1);
2401   if (!INTEGRAL_TYPE_P (scalar_type)
2402 || !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
2405 /* Check the operands of the operation. */
2406 if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt0))
2408 if (vect_print_dump_info (REPORT_DETAILS))
2409 fprintf (vect_dump, "use not simple.");
2413 /* Supportable by target? */
2414 code = VEC_PACK_MOD_EXPR;
2415 optab = optab_for_tree_code (VEC_PACK_MOD_EXPR, vectype_in);
2419 vec_mode = TYPE_MODE (vectype_in);
2420 if (optab->handlers[(int) vec_mode].insn_code == CODE_FOR_nothing)
2423 STMT_VINFO_VECTYPE (stmt_info) = vectype_in;
2425 if (!vec_stmt) /* transformation not required. */
2427 STMT_VINFO_TYPE (stmt_info) = type_demotion_vec_info_type;
2433 if (vect_print_dump_info (REPORT_DETAILS))
2434 fprintf (vect_dump, "transform type demotion operation. ncopies = %d.",
2438 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
2440 /* In case the vectorization factor (VF) is bigger than the number
2441 of elements that we can fit in a vectype (nunits), we have to generate
2442      more than one vector stmt - i.e., we need to "unroll" the
2443 vector stmt by a factor VF/nunits. */
2444 prev_stmt_info = NULL;
2445 for (j = 0; j < ncopies; j++)
2450 vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
2451 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt0, vec_oprnd0);
2455 vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt0, vec_oprnd1);
2456 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt0, vec_oprnd0);
2459 /* Arguments are ready. Create the new vector stmt. */
2460 expr = build2 (code, vectype_out, vec_oprnd0, vec_oprnd1);
2461 new_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node, vec_dest, expr);
2462 new_temp = make_ssa_name (vec_dest, new_stmt);
2463 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
2464 vect_finish_stmt_generation (stmt, new_stmt, bsi);
2467 STMT_VINFO_VEC_STMT (stmt_info) = new_stmt;
2469 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
2471 prev_stmt_info = vinfo_for_stmt (new_stmt);
2474 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
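
/* For intuition only -- an illustrative C model of VEC_PACK_MOD_EXPR
   as used above: two vectors of nunits_in wide elements are packed,
   with modulo truncation, into one vector of nunits_out = 2 * nunits_in
   narrow elements.  Lane order is target/endian dependent;
   first-operand-first is shown for illustration.  */
#include <stdint.h>

static void
model_vec_pack_mod (const int32_t a[4], const int32_t b[4], int16_t out[8])
{
  int i;
  for (i = 0; i < 4; i++)
    {
      out[i] = (int16_t) a[i];      /* truncated modulo 2^16 */
      out[i + 4] = (int16_t) b[i];
    }
}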
2479 /* Function vect_gen_widened_results_half
2481 Create a vector stmt whose code, type, number of arguments, and result
2482 variable are CODE, VECTYPE, OP_TYPE, and VEC_DEST, and its arguments are
2483 VEC_OPRND0 and VEC_OPRND1. The new vector stmt is to be inserted at BSI.
2484 In the case that CODE is a CALL_EXPR, this means that a call to DECL
2485 needs to be created (DECL is a function-decl of a target-builtin).
2486 STMT is the original scalar stmt that we are vectorizing. */
2489 vect_gen_widened_results_half (enum tree_code code, tree vectype, tree decl,
2490 tree vec_oprnd0, tree vec_oprnd1, int op_type,
2491 tree vec_dest, block_stmt_iterator *bsi,
2501 /* Generate half of the widened result: */
2502 if (code == CALL_EXPR)
2504 /* Target specific support */
2505 vec_params = build_tree_list (NULL_TREE, vec_oprnd0);
2506 if (op_type == binary_op)
2507 vec_params = tree_cons (NULL_TREE, vec_oprnd1, vec_params);
2508 expr = build_function_call_expr (decl, vec_params);
2512 /* Generic support */
2513 gcc_assert (op_type == TREE_CODE_LENGTH (code));
2514 if (op_type == binary_op)
2515 expr = build2 (code, vectype, vec_oprnd0, vec_oprnd1);
2517 expr = build1 (code, vectype, vec_oprnd0);
2519 new_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node, vec_dest, expr);
2520 new_temp = make_ssa_name (vec_dest, new_stmt);
2521 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
2522 vect_finish_stmt_generation (stmt, new_stmt, bsi);
2524 if (code == CALL_EXPR)
2526 FOR_EACH_SSA_TREE_OPERAND (sym, new_stmt, iter, SSA_OP_ALL_VIRTUALS)
2528 if (TREE_CODE (sym) == SSA_NAME)
2529 sym = SSA_NAME_VAR (sym);
2530 mark_sym_for_renaming (sym);
2538 /* Function vectorizable_type_promotion
2540 Check if STMT performs a binary or unary operation that involves
2541 type promotion, and if it can be vectorized.
2542 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
2543 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
2544 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
2547 vectorizable_type_promotion (tree stmt, block_stmt_iterator *bsi,
2553 tree op0, op1 = NULL;
2554   tree vec_oprnd0 = NULL, vec_oprnd1 = NULL;
2555 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2556 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2557   enum tree_code code, code1 = ERROR_MARK, code2 = ERROR_MARK;
2558 tree decl1 = NULL_TREE, decl2 = NULL_TREE;
2561 enum vect_def_type dt0, dt1;
2563 stmt_vec_info prev_stmt_info;
2571 /* Is STMT a vectorizable type-promotion operation? */
2573 if (!STMT_VINFO_RELEVANT_P (stmt_info))
2576 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_loop_def);
2578 if (STMT_VINFO_LIVE_P (stmt_info))
2580 /* FORNOW: not yet supported. */
2581 if (vect_print_dump_info (REPORT_DETAILS))
2582 fprintf (vect_dump, "value used after loop.");
2586 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
2589 if (TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 0)) != SSA_NAME)
2592 operation = GIMPLE_STMT_OPERAND (stmt, 1);
2593 code = TREE_CODE (operation);
2594 if (code != NOP_EXPR && code != WIDEN_MULT_EXPR)
2597 op0 = TREE_OPERAND (operation, 0);
2598 vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op0));
2599 nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
2600 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in;
2601 gcc_assert (ncopies >= 1);
2603 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
2604 vectype_out = get_vectype_for_scalar_type (TREE_TYPE (scalar_dest));
2605 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
2606 if (nunits_out != nunits_in / 2) /* FORNOW */
2609   if (!INTEGRAL_TYPE_P (TREE_TYPE (scalar_dest))
2610 || !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
2613 /* Check the operands of the operation. */
2614 if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt0))
2616 if (vect_print_dump_info (REPORT_DETAILS))
2617 fprintf (vect_dump, "use not simple.");
2621 op_type = TREE_CODE_LENGTH (code);
2622 if (op_type == binary_op)
2624 op1 = TREE_OPERAND (operation, 1);
2625 if (!vect_is_simple_use (op1, loop_vinfo, &def_stmt, &def, &dt1))
2627 if (vect_print_dump_info (REPORT_DETAILS))
2628 fprintf (vect_dump, "use not simple.");
2633 /* Supportable by target? */
2634 if (!supportable_widening_operation (code, stmt, vectype_in,
2635 &decl1, &decl2, &code1, &code2))
2638 STMT_VINFO_VECTYPE (stmt_info) = vectype_in;
2640 if (!vec_stmt) /* transformation not required. */
2642 STMT_VINFO_TYPE (stmt_info) = type_promotion_vec_info_type;
2648 if (vect_print_dump_info (REPORT_DETAILS))
2649 fprintf (vect_dump, "transform type promotion operation. ncopies = %d.",
2653 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
2655 /* In case the vectorization factor (VF) is bigger than the number
2656 of elements that we can fit in a vectype (nunits), we have to generate
2657      more than one vector stmt - i.e., we need to "unroll" the
2658 vector stmt by a factor VF/nunits. */
2660 prev_stmt_info = NULL;
2661 for (j = 0; j < ncopies; j++)
2666 vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
2667 if (op_type == binary_op)
2668 vec_oprnd1 = vect_get_vec_def_for_operand (op1, stmt, NULL);
2672 vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt0, vec_oprnd0);
2673 if (op_type == binary_op)
2674 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt1, vec_oprnd1);
2677 /* Arguments are ready. Create the new vector stmt. We are creating
2678 two vector defs because the widened result does not fit in one vector.
2679          The vectorized stmt can be expressed as a call to a target builtin,
2680          or using a tree-code.  */
2681 /* Generate first half of the widened result: */
2682 new_stmt = vect_gen_widened_results_half (code1, vectype_out, decl1,
2683 vec_oprnd0, vec_oprnd1, op_type, vec_dest, bsi, stmt);
2685 STMT_VINFO_VEC_STMT (stmt_info) = new_stmt;
2687 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
2688 prev_stmt_info = vinfo_for_stmt (new_stmt);
2690 /* Generate second half of the widened result: */
2691 new_stmt = vect_gen_widened_results_half (code2, vectype_out, decl2,
2692 vec_oprnd0, vec_oprnd1, op_type, vec_dest, bsi, stmt);
2693 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
2694 prev_stmt_info = vinfo_for_stmt (new_stmt);
2698 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
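
/* For intuition only -- an illustrative C model of the two halves
   generated above (by CODE1/CODE2, e.g. an unpack-lo/unpack-hi pair):
   one input vector of nunits_in narrow elements yields two vectors of
   nunits_out = nunits_in / 2 wide elements.  Which half comes first is
   target/endian dependent; low-half-first is shown.  */
#include <stdint.h>

static void
model_unpack_lo (const int16_t in[8], int32_t out[4])
{
  int i;
  for (i = 0; i < 4; i++)
    out[i] = in[i];          /* first half, sign-extended */
}

static void
model_unpack_hi (const int16_t in[8], int32_t out[4])
{
  int i;
  for (i = 0; i < 4; i++)
    out[i] = in[i + 4];      /* second half, sign-extended */
}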
2703 /* Function vect_strided_store_supported.
2705    Returns TRUE if INTERLEAVE_HIGH and INTERLEAVE_LOW operations are supported,
2706 and FALSE otherwise. */
2709 vect_strided_store_supported (tree vectype)
2711 optab interleave_high_optab, interleave_low_optab;
2714 mode = (int) TYPE_MODE (vectype);
2716 /* Check that the operation is supported. */
2717 interleave_high_optab = optab_for_tree_code (VEC_INTERLEAVE_HIGH_EXPR,
2719 interleave_low_optab = optab_for_tree_code (VEC_INTERLEAVE_LOW_EXPR,
2721 if (!interleave_high_optab || !interleave_low_optab)
2723 if (vect_print_dump_info (REPORT_DETAILS))
2724 fprintf (vect_dump, "no optab for interleave.");
2728 if (interleave_high_optab->handlers[(int) mode].insn_code
2730 || interleave_low_optab->handlers[(int) mode].insn_code
2731 == CODE_FOR_nothing)
2733 if (vect_print_dump_info (REPORT_DETAILS))
2734 fprintf (vect_dump, "interleave op not supported by target.");
2741 /* Function vect_permute_store_chain.
2743 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
2744 a power of 2, generate interleave_high/low stmts to reorder the data
2745 correctly for the stores. Return the final references for stores in
2748 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
2749 The input is 4 vectors each containing 8 elements. We assign a number to each
2750    element; the input sequence is:
2752 1st vec: 0 1 2 3 4 5 6 7
2753 2nd vec: 8 9 10 11 12 13 14 15
2754 3rd vec: 16 17 18 19 20 21 22 23
2755 4th vec: 24 25 26 27 28 29 30 31
2757 The output sequence should be:
2759 1st vec: 0 8 16 24 1 9 17 25
2760 2nd vec: 2 10 18 26 3 11 19 27
2761    3rd vec: 4 12 20 28 5 13 21 29
2762 4th vec: 6 14 22 30 7 15 23 31
2764 i.e., we interleave the contents of the four vectors in their order.
2766 We use interleave_high/low instructions to create such output. The input of
2767 each interleave_high/low operation is two vectors:
2770 the even elements of the result vector are obtained left-to-right from the
2771 high/low elements of the first vector. The odd elements of the result are
2772 obtained left-to-right from the high/low elements of the second vector.
2773 The output of interleave_high will be: 0 4 1 5
2774 and of interleave_low: 2 6 3 7
2777    The permutation is done in log2(LENGTH) stages.  In each stage interleave_high
2778 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
2779 where the first argument is taken from the first half of DR_CHAIN and the
2780    second argument from its second half.
2783 I1: interleave_high (1st vec, 3rd vec)
2784 I2: interleave_low (1st vec, 3rd vec)
2785 I3: interleave_high (2nd vec, 4th vec)
2786 I4: interleave_low (2nd vec, 4th vec)
2788 The output for the first stage is:
2790 I1: 0 16 1 17 2 18 3 19
2791 I2: 4 20 5 21 6 22 7 23
2792 I3: 8 24 9 25 10 26 11 27
2793 I4: 12 28 13 29 14 30 15 31
2795    The output of the second stage, i.e., the final result, is:
2797 I1: 0 8 16 24 1 9 17 25
2798 I2: 2 10 18 26 3 11 19 27
2799    I3: 4 12 20 28 5 13 21 29
2800 I4: 6 14 22 30 7 15 23 31. */
2803 vect_permute_store_chain (VEC(tree,heap) *dr_chain,
2804 unsigned int length,
2806 block_stmt_iterator *bsi,
2807 VEC(tree,heap) **result_chain)
2809 tree perm_dest, perm_stmt, vect1, vect2, high, low;
2810 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
2814 VEC(tree,heap) *first, *second;
2816 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
2817 first = VEC_alloc (tree, heap, length/2);
2818 second = VEC_alloc (tree, heap, length/2);
2820 /* Check that the operation is supported. */
2821 if (!vect_strided_store_supported (vectype))
2824 *result_chain = VEC_copy (tree, heap, dr_chain);
2826 for (i = 0; i < exact_log2 (length); i++)
2828 for (j = 0; j < length/2; j++)
2830 vect1 = VEC_index (tree, dr_chain, j);
2831 vect2 = VEC_index (tree, dr_chain, j+length/2);
2833 /* Create interleaving stmt:
2834 in the case of big endian:
2835 high = interleave_high (vect1, vect2)
2836 and in the case of little endian:
2837 high = interleave_low (vect1, vect2). */
2838 perm_dest = create_tmp_var (vectype, "vect_inter_high");
2839 DECL_GIMPLE_REG_P (perm_dest) = 1;
2840 add_referenced_var (perm_dest);
2841 if (BYTES_BIG_ENDIAN)
2842 perm_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node, perm_dest,
2843 build2 (VEC_INTERLEAVE_HIGH_EXPR, vectype,
2846 perm_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node, perm_dest,
2847 build2 (VEC_INTERLEAVE_LOW_EXPR, vectype,
2849 high = make_ssa_name (perm_dest, perm_stmt);
2850 GIMPLE_STMT_OPERAND (perm_stmt, 0) = high;
2851 vect_finish_stmt_generation (stmt, perm_stmt, bsi);
2852 VEC_replace (tree, *result_chain, 2*j, high);
2854 /* Create interleaving stmt:
2855 in the case of big endian:
2856 low = interleave_low (vect1, vect2)
2857 and in the case of little endian:
2858 low = interleave_high (vect1, vect2). */
2859 perm_dest = create_tmp_var (vectype, "vect_inter_low");
2860 DECL_GIMPLE_REG_P (perm_dest) = 1;
2861 add_referenced_var (perm_dest);
2862 if (BYTES_BIG_ENDIAN)
2863 perm_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node, perm_dest,
2864 build2 (VEC_INTERLEAVE_LOW_EXPR, vectype,
2867 perm_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node, perm_dest,
2868 build2 (VEC_INTERLEAVE_HIGH_EXPR, vectype,
2870 low = make_ssa_name (perm_dest, perm_stmt);
2871 GIMPLE_STMT_OPERAND (perm_stmt, 0) = low;
2872 vect_finish_stmt_generation (stmt, perm_stmt, bsi);
2873 VEC_replace (tree, *result_chain, 2*j+1, low);
2875 dr_chain = VEC_copy (tree, heap, *result_chain);
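
/* For intuition only -- an illustrative C model of one interleave stage
   on 8-element vectors, matching the worked example above: the "high"
   result interleaves the first halves of the two inputs, the "low"
   result their second halves (which of VEC_INTERLEAVE_HIGH/LOW
   produces which depends on endianness, as the code above shows).
   Applying this to the pairs (j, j + length/2) for log2(length) stages
   yields the documented output order.  */

static void
model_interleave_stage (const int v1[8], const int v2[8],
                        int high[8], int low[8])
{
  int i;
  for (i = 0; i < 4; i++)
    {
      high[2 * i] = v1[i];          /* even lanes from v1's first half */
      high[2 * i + 1] = v2[i];      /* odd lanes from v2's first half */
      low[2 * i] = v1[4 + i];
      low[2 * i + 1] = v2[4 + i];
    }
}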
2881 /* Function vectorizable_store.
2883    Check if STMT defines a non-scalar data-ref (array/pointer/structure) that
2885 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
2886 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
2887 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
2890 vectorizable_store (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
2895 tree vec_oprnd = NULL_TREE;
2896 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2897 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info), *first_dr = NULL;
2898 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2899 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2900 enum machine_mode vec_mode;
2902 enum dr_alignment_support alignment_support_cheme;
2904 def_operand_p def_p;
2906 enum vect_def_type dt;
2907 stmt_vec_info prev_stmt_info = NULL;
2908 tree dataref_ptr = NULL_TREE;
2909 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
2910 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
2912 tree next_stmt, first_stmt;
2913 bool strided_store = false;
2914 unsigned int group_size, i;
2915 VEC(tree,heap) *dr_chain = NULL, *oprnds = NULL, *result_chain = NULL;
2916 gcc_assert (ncopies >= 1);
2918 /* Is vectorizable store? */
2920 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
2923 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
2924 if (TREE_CODE (scalar_dest) != ARRAY_REF
2925 && TREE_CODE (scalar_dest) != INDIRECT_REF
2926 && !DR_GROUP_FIRST_DR (stmt_info))
2929 op = GIMPLE_STMT_OPERAND (stmt, 1);
2930 if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt))
2932 if (vect_print_dump_info (REPORT_DETAILS))
2933 fprintf (vect_dump, "use not simple.");
2937 vec_mode = TYPE_MODE (vectype);
2938   /* FORNOW.  In some cases we can vectorize even if the data-type is not
2939      supported (e.g., array initialization with 0).  */
2940   if (mov_optab->handlers[(int) vec_mode].insn_code == CODE_FOR_nothing)
2943 if (!STMT_VINFO_DATA_REF (stmt_info))
2946 if (DR_GROUP_FIRST_DR (stmt_info))
2948 strided_store = true;
2949 if (!vect_strided_store_supported (vectype))
2953 if (!vec_stmt) /* transformation not required. */
2955 STMT_VINFO_TYPE (stmt_info) = store_vec_info_type;
2961 if (vect_print_dump_info (REPORT_DETAILS))
2962 fprintf (vect_dump, "transform store. ncopies = %d",ncopies);
2966 first_stmt = DR_GROUP_FIRST_DR (stmt_info);
2967 first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2968 group_size = DR_GROUP_SIZE (vinfo_for_stmt (first_stmt));
2970 DR_GROUP_STORE_COUNT (vinfo_for_stmt (first_stmt))++;
2972 /* We vectorize all the stmts of the interleaving group when we
2973 reach the last stmt in the group. */
2974 if (DR_GROUP_STORE_COUNT (vinfo_for_stmt (first_stmt))
2975 < DR_GROUP_SIZE (vinfo_for_stmt (first_stmt)))
2977 *vec_stmt = NULL_TREE;
2988 dr_chain = VEC_alloc (tree, heap, group_size);
2989 oprnds = VEC_alloc (tree, heap, group_size);
2991 alignment_support_cheme = vect_supportable_dr_alignment (first_dr);
2992 gcc_assert (alignment_support_cheme);
2993 gcc_assert (alignment_support_cheme == dr_aligned); /* FORNOW */
2995 /* In case the vectorization factor (VF) is bigger than the number
2996 of elements that we can fit in a vectype (nunits), we have to generate
2997      more than one vector stmt - i.e., we need to "unroll" the
2998 vector stmt by a factor VF/nunits. For more details see documentation in
2999      vect_get_vec_def_for_stmt_copy.  */
3001 /* In case of interleaving (non-unit strided access):
3008      We create vectorized stores starting from the base address (the access of
3009      the first stmt in the chain, S2 in the above example) when the last store stmt
3010 of the chain (S4) is reached:
3013 VS2: &base + vec_size*1 = vx0
3014 VS3: &base + vec_size*2 = vx1
3015 VS4: &base + vec_size*3 = vx3
3017 Then permutation statements are generated:
3019 VS5: vx5 = VEC_INTERLEAVE_HIGH_EXPR < vx0, vx3 >
3020 VS6: vx6 = VEC_INTERLEAVE_LOW_EXPR < vx0, vx3 >
3023 And they are put in STMT_VINFO_VEC_STMT of the corresponding scalar stmts
3024 (the order of the data-refs in the output of vect_permute_store_chain
3025 corresponds to the order of scalar stmts in the interleaving chain - see
3026 the documentation of vect_permute_store_chain()).
3028 In case of both multiple types and interleaving, above vector stores and
3029 permutation stmts are created for every copy. The result vector stmts are
3030 put in STMT_VINFO_VEC_STMT for the first copy and in the corresponding
3031 STMT_VINFO_RELATED_STMT for the next copies.
3034 prev_stmt_info = NULL;
3035 for (j = 0; j < ncopies; j++)
3042 /* For interleaved stores we collect vectorized defs for all the
3043 stores in the group in DR_CHAIN and OPRNDS. DR_CHAIN is then used
3044 as an input to vect_permute_store_chain(), and OPRNDS as an input
3045 to vect_get_vec_def_for_stmt_copy() for the next copy.
3046 If the store is not strided, GROUP_SIZE is 1, and DR_CHAIN and
3047 OPRNDS are of size 1. */
3048 next_stmt = first_stmt;
3049 for (i = 0; i < group_size; i++)
3051 /* Since gaps are not supported for interleaved stores, GROUP_SIZE
3052 is the exact number of stmts in the chain. Therefore, NEXT_STMT
3053              can't be NULL_TREE.  In case there is no interleaving,
3054              GROUP_SIZE is 1, and only one iteration of the loop will be executed.  */
3056 gcc_assert (next_stmt);
3057 op = GIMPLE_STMT_OPERAND (next_stmt, 1);
3058 vec_oprnd = vect_get_vec_def_for_operand (op, next_stmt, NULL);
3059              VEC_quick_push (tree, dr_chain, vec_oprnd);
3060              VEC_quick_push (tree, oprnds, vec_oprnd);
3061 next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt));
3063 dataref_ptr = vect_create_data_ref_ptr (first_stmt, bsi, NULL_TREE,
3064 &dummy, &ptr_incr, false,
3065 TREE_TYPE (vec_oprnd));
3069 /* For interleaved stores we created vectorized defs for all the
3070 defs stored in OPRNDS in the previous iteration (previous copy).
3071 DR_CHAIN is then used as an input to vect_permute_store_chain(),
3072 and OPRNDS as an input to vect_get_vec_def_for_stmt_copy() for the
3074 If the store is not strided, GROUP_SIZE is 1, and DR_CHAIN and
3075 OPRNDS are of size 1. */
3076 for (i = 0; i < group_size; i++)
3078 vec_oprnd = vect_get_vec_def_for_stmt_copy (dt,
3079 VEC_index (tree, oprnds, i));
3080              VEC_replace (tree, dr_chain, i, vec_oprnd);
3081              VEC_replace (tree, oprnds, i, vec_oprnd);
3083 dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, bsi, stmt);
3088 result_chain = VEC_alloc (tree, heap, group_size);
3090 if (!vect_permute_store_chain (dr_chain, group_size, stmt, bsi,
3095 next_stmt = first_stmt;
3096 for (i = 0; i < group_size; i++)
3098 /* For strided stores vectorized defs are interleaved in
3099 vect_permute_store_chain(). */
3101            vec_oprnd = VEC_index (tree, result_chain, i);
3103 data_ref = build_fold_indirect_ref (dataref_ptr);
3104 /* Arguments are ready. Create the new vector stmt. */
3105 new_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node, data_ref,
3107 vect_finish_stmt_generation (stmt, new_stmt, bsi);
3109 /* Set the VDEFs for the vector pointer. If this virtual def
3110 has a use outside the loop and a loop peel is performed
3111 then the def may be renamed by the peel. Mark it for
3112 renaming so the later use will also be renamed. */
3113 copy_virtual_operands (new_stmt, next_stmt);
3116 /* The original store is deleted so the same SSA_NAMEs
3118 FOR_EACH_SSA_TREE_OPERAND (def, next_stmt, iter, SSA_OP_VDEF)
3120 SSA_NAME_DEF_STMT (def) = new_stmt;
3121 mark_sym_for_renaming (SSA_NAME_VAR (def));
3124 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
3128 /* Create new names for all the definitions created by COPY and
3129 add replacement mappings for each new name. */
3130 FOR_EACH_SSA_DEF_OPERAND (def_p, new_stmt, iter, SSA_OP_VDEF)
3132 create_new_def_for (DEF_FROM_PTR (def_p), new_stmt, def_p);
3133 mark_sym_for_renaming (SSA_NAME_VAR (DEF_FROM_PTR (def_p)));
3136 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3139 prev_stmt_info = vinfo_for_stmt (new_stmt);
3140 next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt));
3143 /* Bump the vector pointer. */
3144 dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, bsi, stmt);
3152 /* Function vect_setup_realignment
3154 This function is called when vectorizing an unaligned load using
3155 the dr_unaligned_software_pipeline scheme.
3156 This function generates the following code at the loop prolog:
3159 msq_init = *(floor(p)); # prolog load
3160 realignment_token = call target_builtin;
3162 msq = phi (msq_init, ---)
3164 The code above sets up a new (vector) pointer, pointing to the first
3165 location accessed by STMT, and a "floor-aligned" load using that pointer.
3166 It also generates code to compute the "realignment-token" (if the relevant
3167 target hook was defined), and creates a phi-node at the loop-header bb
3168 whose arguments are the result of the prolog-load (created by this
3169 function) and the result of a load that takes place in the loop (to be
3170 created by the caller to this function).
3171 The caller to this function uses the phi-result (msq) to create the
3172 realignment code inside the loop, and sets up the missing phi argument,
3176 msq = phi (msq_init, lsq)
3177 lsq = *(floor(p')); # load in loop
3178 result = realign_load (msq, lsq, realignment_token);
3181 STMT - (scalar) load stmt to be vectorized. This load accesses
3182 a memory location that may be unaligned.
3183 BSI - place where new code is to be inserted.
3186 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
3187 target hook, if defined.
3188 Return value - the result of the loop-header phi node. */
3191 vect_setup_realignment (tree stmt, block_stmt_iterator *bsi,
3192 tree *realignment_token)
3194 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3195 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3196 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3197 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3198 edge pe = loop_preheader_edge (loop);
3199 tree scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
3212 /* 1. Create msq_init = *(floor(p1)) in the loop preheader */
3213 vec_dest = vect_create_destination_var (scalar_dest, vectype);
3214 ptr = vect_create_data_ref_ptr (stmt, bsi, NULL_TREE, &init_addr, &inc, true,
3216 data_ref = build1 (ALIGN_INDIRECT_REF, vectype, ptr);
3217 new_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node, vec_dest, data_ref);
3218 new_temp = make_ssa_name (vec_dest, new_stmt);
3219 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
3220 new_bb = bsi_insert_on_edge_immediate (pe, new_stmt);
3221 gcc_assert (!new_bb);
3222 msq_init = GIMPLE_STMT_OPERAND (new_stmt, 0);
3223 copy_virtual_operands (new_stmt, stmt);
3224 update_vuses_to_preheader (new_stmt, loop);
3226 /* 2. Create permutation mask, if required, in loop preheader. */
3227 if (targetm.vectorize.builtin_mask_for_load)
3230 tree params = build_tree_list (NULL_TREE, init_addr);
3232 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
3233 new_stmt = build_function_call_expr (builtin_decl, params);
3234 vec_dest = vect_create_destination_var (scalar_dest,
3235 TREE_TYPE (new_stmt));
3236 new_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node, vec_dest,
3238 new_temp = make_ssa_name (vec_dest, new_stmt);
3239 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
3240 new_bb = bsi_insert_on_edge_immediate (pe, new_stmt);
3241 gcc_assert (!new_bb);
3242 *realignment_token = GIMPLE_STMT_OPERAND (new_stmt, 0);
3244 /* The result of the CALL_EXPR to this builtin is determined from
3245          the value of the parameter, and no global variables are touched,
3246 which makes the builtin a "const" function. Requiring the
3247 builtin to have the "const" attribute makes it unnecessary
3248 to call mark_call_clobbered. */
3249 gcc_assert (TREE_READONLY (builtin_decl));
3252 /* 3. Create msq = phi <msq_init, lsq> in loop */
3253 vec_dest = vect_create_destination_var (scalar_dest, vectype);
3254 msq = make_ssa_name (vec_dest, NULL_TREE);
3255 phi_stmt = create_phi_node (msq, loop->header);
3256 SSA_NAME_DEF_STMT (msq) = phi_stmt;
3257 add_phi_arg (phi_stmt, msq_init, loop_preheader_edge (loop));
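
/* For intuition only -- an illustrative C model of the realignment
   scheme set up above, for 16-byte vectors: MSQ is the aligned vector
   at or below the unaligned address P, LSQ the next aligned vector,
   and realign_load extracts the 16 unaligned bytes starting at P's
   offset within MSQ.  The realignment token abstracts this offset
   (e.g. a permute mask on targets with builtin_mask_for_load).  In the
   loop, each iteration's LSQ becomes the next iteration's MSQ, so only
   one new load is needed per iteration.  */
#include <stdint.h>
#include <string.h>

typedef struct { uint8_t b[16]; } model_vec16;

static model_vec16
model_realign_load (model_vec16 msq, model_vec16 lsq, unsigned offset)
{
  uint8_t buf[32];
  model_vec16 res;

  memcpy (buf, msq.b, 16);
  memcpy (buf + 16, lsq.b, 16);
  memcpy (res.b, buf + offset, 16);  /* unaligned window, offset = P % 16 */
  return res;
}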
3263 /* Function vect_strided_load_supported.
3265    Returns TRUE if EXTRACT_EVEN and EXTRACT_ODD operations are supported,
3266 and FALSE otherwise. */
3269 vect_strided_load_supported (tree vectype)
3271 optab perm_even_optab, perm_odd_optab;
3274 mode = (int) TYPE_MODE (vectype);
3276 perm_even_optab = optab_for_tree_code (VEC_EXTRACT_EVEN_EXPR, vectype);
3277 if (!perm_even_optab)
3279 if (vect_print_dump_info (REPORT_DETAILS))
3280 fprintf (vect_dump, "no optab for perm_even.");
3284 if (perm_even_optab->handlers[mode].insn_code == CODE_FOR_nothing)
3286 if (vect_print_dump_info (REPORT_DETAILS))
3287 fprintf (vect_dump, "perm_even op not supported by target.");
3291 perm_odd_optab = optab_for_tree_code (VEC_EXTRACT_ODD_EXPR, vectype);
3292 if (!perm_odd_optab)
3294 if (vect_print_dump_info (REPORT_DETAILS))
3295 fprintf (vect_dump, "no optab for perm_odd.");
3299 if (perm_odd_optab->handlers[mode].insn_code == CODE_FOR_nothing)
3301 if (vect_print_dump_info (REPORT_DETAILS))
3302 fprintf (vect_dump, "perm_odd op not supported by target.");
3309 /* Function vect_permute_load_chain.
3311 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
3312 a power of 2, generate extract_even/odd stmts to reorder the input data
3313 correctly. Return the final references for loads in RESULT_CHAIN.
3315 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
3316 The input is 4 vectors each containing 8 elements. We assign a number to each
3317    element; the input sequence is:
3319 1st vec: 0 1 2 3 4 5 6 7
3320 2nd vec: 8 9 10 11 12 13 14 15
3321 3rd vec: 16 17 18 19 20 21 22 23
3322 4th vec: 24 25 26 27 28 29 30 31
3324 The output sequence should be:
3326 1st vec: 0 4 8 12 16 20 24 28
3327 2nd vec: 1 5 9 13 17 21 25 29
3328 3rd vec: 2 6 10 14 18 22 26 30
3329 4th vec: 3 7 11 15 19 23 27 31
3331 i.e., the first output vector should contain the first elements of each
3332 interleaving group, etc.
3334 We use extract_even/odd instructions to create such output. The input of each
3335 extract_even/odd operation is two vectors
3339 and the output is the vector of extracted even/odd elements. The output of
3340 extract_even will be: 0 2 4 6
3341 and of extract_odd: 1 3 5 7
3344    The permutation is done in log2(LENGTH) stages.  In each stage extract_even and
3345 extract_odd stmts are created for each pair of vectors in DR_CHAIN in their
3346 order. In our example,
3348 E1: extract_even (1st vec, 2nd vec)
3349 E2: extract_odd (1st vec, 2nd vec)
3350 E3: extract_even (3rd vec, 4th vec)
3351 E4: extract_odd (3rd vec, 4th vec)
3353 The output for the first stage will be:
3355 E1: 0 2 4 6 8 10 12 14
3356 E2: 1 3 5 7 9 11 13 15
3357 E3: 16 18 20 22 24 26 28 30
3358 E4: 17 19 21 23 25 27 29 31
3360 In order to proceed and create the correct sequence for the next stage (or
3361 for the correct output, if the second stage is the last one, as in our
3362    example), we first put the output of the extract_even operation and then the
3363 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
3364 The input for the second stage is:
3366 1st vec (E1): 0 2 4 6 8 10 12 14
3367 2nd vec (E3): 16 18 20 22 24 26 28 30
3368 3rd vec (E2): 1 3 5 7 9 11 13 15
3369 4th vec (E4): 17 19 21 23 25 27 29 31
3371 The output of the second stage:
3373 E1: 0 4 8 12 16 20 24 28
3374 E2: 2 6 10 14 18 22 26 30
3375 E3: 1 5 9 13 17 21 25 29
3376 E4: 3 7 11 15 19 23 27 31
3378 And RESULT_CHAIN after reordering:
3380 1st vec (E1): 0 4 8 12 16 20 24 28
3381 2nd vec (E3): 1 5 9 13 17 21 25 29
3382 3rd vec (E2): 2 6 10 14 18 22 26 30
3383 4th vec (E4): 3 7 11 15 19 23 27 31. */
3386 vect_permute_load_chain (VEC(tree,heap) *dr_chain,
3387 unsigned int length,
3389 block_stmt_iterator *bsi,
3390 VEC(tree,heap) **result_chain)
3392 tree perm_dest, perm_stmt, data_ref, first_vect, second_vect;
3393 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
3397 /* Check that the operation is supported. */
3398 if (!vect_strided_load_supported (vectype))
3401 *result_chain = VEC_copy (tree, heap, dr_chain);
3402 for (i = 0; i < exact_log2 (length); i++)
3404       for (j = 0; j < length; j += 2)
3406 first_vect = VEC_index (tree, dr_chain, j);
3407 second_vect = VEC_index (tree, dr_chain, j+1);
3409 /* data_ref = permute_even (first_data_ref, second_data_ref); */
3410 perm_dest = create_tmp_var (vectype, "vect_perm_even");
3411 DECL_GIMPLE_REG_P (perm_dest) = 1;
3412 add_referenced_var (perm_dest);
3414 perm_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node, perm_dest,
3415 build2 (VEC_EXTRACT_EVEN_EXPR, vectype,
3416 first_vect, second_vect));
3418 data_ref = make_ssa_name (perm_dest, perm_stmt);
3419 GIMPLE_STMT_OPERAND (perm_stmt, 0) = data_ref;
3420 vect_finish_stmt_generation (stmt, perm_stmt, bsi);
3421 mark_symbols_for_renaming (perm_stmt);
3423 VEC_replace (tree, *result_chain, j/2, data_ref);
3425 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
3426 perm_dest = create_tmp_var (vectype, "vect_perm_odd");
3427 DECL_GIMPLE_REG_P (perm_dest) = 1;
3428 add_referenced_var (perm_dest);
3430 perm_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node, perm_dest,
3431 build2 (VEC_EXTRACT_ODD_EXPR, vectype,
3432 first_vect, second_vect));
3433 data_ref = make_ssa_name (perm_dest, perm_stmt);
3434 GIMPLE_STMT_OPERAND (perm_stmt, 0) = data_ref;
3435 vect_finish_stmt_generation (stmt, perm_stmt, bsi);
3436 mark_symbols_for_renaming (perm_stmt);
3438 VEC_replace (tree, *result_chain, j/2+length/2, data_ref);
3440 dr_chain = VEC_copy (tree, heap, *result_chain);
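
/* For intuition only -- an illustrative C model of one
   extract_even/extract_odd stage on 8-element vectors, matching the
   worked example above: EVEN takes lanes 0,2,4,... of both inputs in
   order, ODD takes lanes 1,3,5,...  */

static void
model_extract_stage (const int v1[8], const int v2[8],
                     int even[8], int odd[8])
{
  int i;
  for (i = 0; i < 4; i++)
    {
      even[i] = v1[2 * i];
      even[4 + i] = v2[2 * i];
      odd[i] = v1[2 * i + 1];
      odd[4 + i] = v2[2 * i + 1];
    }
}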
3446 /* Function vect_transform_strided_load.
3448 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
3449 to perform their permutation and ascribe the result vectorized statements to
3450 the scalar statements.
3454 vect_transform_strided_load (tree stmt, VEC(tree,heap) *dr_chain, int size,
3455 block_stmt_iterator *bsi)
3457 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3458 tree first_stmt = DR_GROUP_FIRST_DR (stmt_info);
3459 tree next_stmt, new_stmt;
3460 VEC(tree,heap) *result_chain = NULL;
3461 unsigned int i, gap_count;
3464 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
3465 RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
3466 vectors, that are ready for vector computation. */
3467 result_chain = VEC_alloc (tree, heap, size);
3469 if (!vect_permute_load_chain (dr_chain, size, stmt, bsi, &result_chain))
3472 /* Put a permuted data-ref in the VECTORIZED_STMT field.
3473      Since we scan the chain starting from its first node, their order
3474      corresponds to the order of data-refs in RESULT_CHAIN.  */
3475 next_stmt = first_stmt;
3477 for (i = 0; VEC_iterate(tree, result_chain, i, tmp_data_ref); i++)
3482       /* Skip the gaps.  Loads created for the gaps will be removed by the dead
3483 code elimination pass later.
3484 DR_GROUP_GAP is the number of steps in elements from the previous
3485 access (if there is no gap DR_GROUP_GAP is 1). We skip loads that
3486 correspond to the gaps.
3488 if (gap_count < DR_GROUP_GAP (vinfo_for_stmt (next_stmt)))
3496 new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
3497 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
3498          copies, and we put the new vector statement in the first available RELATED_STMT field.  */
3500 if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
3501 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
3504 tree prev_stmt = STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
3505 tree rel_stmt = STMT_VINFO_RELATED_STMT (
3506 vinfo_for_stmt (prev_stmt));
3509 prev_stmt = rel_stmt;
3510 rel_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
3512 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) = new_stmt;
3514 next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt));
3516 /* If NEXT_STMT accesses the same DR as the previous statement,
3517 put the same TMP_DATA_REF as its vectorized statement; otherwise
3518 get the next data-ref from RESULT_CHAIN. */
3519 if (!next_stmt || !DR_GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
3527 /* vectorizable_load.
3529    Check if STMT reads a non-scalar data-ref (array/pointer/structure) that
3531 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
3532 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
3533 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
3536 vectorizable_load (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
3539 tree vec_dest = NULL;
3540 tree data_ref = NULL;
3542 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3543 stmt_vec_info prev_stmt_info;
3544 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3545 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3546 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info), *first_dr;
3547 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3550 tree new_stmt = NULL_TREE;
3552 enum dr_alignment_support alignment_support_cheme;
3553 tree dataref_ptr = NULL_TREE;
3555 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
3556 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
3557 int i, j, group_size;
3558 tree msq = NULL_TREE, lsq;
3559 tree offset = NULL_TREE;
3560 tree realignment_token = NULL_TREE;
3561 tree phi_stmt = NULL_TREE;
3562 VEC(tree,heap) *dr_chain = NULL;
3563 bool strided_load = false;
3566 /* Is vectorizable load? */
3567 if (!STMT_VINFO_RELEVANT_P (stmt_info))
3570 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_loop_def);
3572 if (STMT_VINFO_LIVE_P (stmt_info))
3574 /* FORNOW: not yet supported. */
3575 if (vect_print_dump_info (REPORT_DETAILS))
3576 fprintf (vect_dump, "value used after loop.");
3580 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
3583 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
3584 if (TREE_CODE (scalar_dest) != SSA_NAME)
3587 op = GIMPLE_STMT_OPERAND (stmt, 1);
3588 if (TREE_CODE (op) != ARRAY_REF
3589 && TREE_CODE (op) != INDIRECT_REF
3590 && !DR_GROUP_FIRST_DR (stmt_info))
3593 if (!STMT_VINFO_DATA_REF (stmt_info))
3596 mode = (int) TYPE_MODE (vectype);
3598   /* FORNOW.  In some cases we can vectorize even if the data-type is not
3599      supported (e.g., data copies).  */
3600 if (mov_optab->handlers[mode].insn_code == CODE_FOR_nothing)
3602 if (vect_print_dump_info (REPORT_DETAILS))
3603 fprintf (vect_dump, "Aligned load, but unsupported type.");
3607 /* Check if the load is a part of an interleaving chain. */
3608 if (DR_GROUP_FIRST_DR (stmt_info))
3610 strided_load = true;
3612 /* Check if interleaving is supported. */
3613 if (!vect_strided_load_supported (vectype))
3617 if (!vec_stmt) /* transformation not required. */
3619 STMT_VINFO_TYPE (stmt_info) = load_vec_info_type;
3625 if (vect_print_dump_info (REPORT_DETAILS))
3626 fprintf (vect_dump, "transform load.");
3630 first_stmt = DR_GROUP_FIRST_DR (stmt_info);
3631 /* Check if the chain of loads is already vectorized. */
3632 if (STMT_VINFO_VEC_STMT (vinfo_for_stmt (first_stmt)))
3634 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
3637 first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
3638 group_size = DR_GROUP_SIZE (vinfo_for_stmt (first_stmt));
3639 dr_chain = VEC_alloc (tree, heap, group_size);
3648 alignment_support_cheme = vect_supportable_dr_alignment (first_dr);
3649 gcc_assert (alignment_support_cheme);
3652 /* In case the vectorization factor (VF) is bigger than the number
3653 of elements that we can fit in a vectype (nunits), we have to generate
3654      more than one vector stmt - i.e., we need to "unroll" the
3655 vector stmt by a factor VF/nunits. In doing so, we record a pointer
3656 from one copy of the vector stmt to the next, in the field
3657 STMT_VINFO_RELATED_STMT. This is necessary in order to allow following
3658 stages to find the correct vector defs to be used when vectorizing
3659 stmts that use the defs of the current stmt. The example below illustrates
3660      the vectorization process when VF=16 and nunits=4 (i.e., we need to create
3661 4 vectorized stmts):
3663 before vectorization:
3664 RELATED_STMT VEC_STMT
3668 step 1: vectorize stmt S1:
3669 We first create the vector stmt VS1_0, and, as usual, record a
3670 pointer to it in the STMT_VINFO_VEC_STMT of the scalar stmt S1.
3671 Next, we create the vector stmt VS1_1, and record a pointer to
3672 it in the STMT_VINFO_RELATED_STMT of the vector stmt VS1_0.
3673 Similarly, for VS1_2 and VS1_3. This is the resulting chain of
3675 RELATED_STMT VEC_STMT
3676 VS1_0: vx0 = memref0 VS1_1 -
3677 VS1_1: vx1 = memref1 VS1_2 -
3678 VS1_2: vx2 = memref2 VS1_3 -
3679 VS1_3: vx3 = memref3 - -
3680 S1: x = load - VS1_0
3683      See the documentation of vect_get_vec_def_for_stmt_copy for how the
3684      information we recorded in the RELATED_STMT field is used to vectorize stmt copies.  */
3687 /* In case of interleaving (non-unit strided access):
3694 Vectorized loads are created in the order of memory accesses
3695 starting from the access of the first stmt of the chain:
3698 VS2: vx1 = &base + vec_size*1
3699 VS3: vx3 = &base + vec_size*2
3700 VS4: vx4 = &base + vec_size*3
3702 Then permutation statements are generated:
3704 VS5: vx5 = VEC_EXTRACT_EVEN_EXPR < vx0, vx1 >
3705 VS6: vx6 = VEC_EXTRACT_ODD_EXPR < vx0, vx1 >
3708 And they are put in STMT_VINFO_VEC_STMT of the corresponding scalar stmts
3709 (the order of the data-refs in the output of vect_permute_load_chain
3710 corresponds to the order of scalar stmts in the interleaving chain - see
3711 the documentation of vect_permute_load_chain()).
3712 The generation of permutation stmts and recording them in
3713 STMT_VINFO_VEC_STMT is done in vect_transform_strided_load().
3715 In case of both multiple types and interleaving, the vector loads and
3716 permutation stmts above are created for every copy. The result vector stmts
3717 are put in STMT_VINFO_VEC_STMT for the first copy and in the corresponding
3718 STMT_VINFO_RELATED_STMT for the next copies. */
3720 /* If the data reference is aligned (dr_aligned) or potentially unaligned
3721 on a target that supports unaligned accesses (dr_unaligned_supported)
3722 we generate the following code:
3726 p = p + indx * vectype_size;
3731 Otherwise, the data reference is potentially unaligned on a target that
3732 does not support unaligned accesses (dr_unaligned_software_pipeline) -
3733 then generate the following code, in which the data in each iteration is
3734 obtained by two vector loads, one from the previous iteration, and one
3735 from the current iteration:
3737 msq_init = *(floor(p1))
3738 p2 = initial_addr + VS - 1;
3739 realignment_token = call target_builtin;
3742 p2 = p2 + indx * vectype_size
3744 vec_dest = realign_load (msq, lsq, realignment_token)  */
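/* A worked example of the software-pipelined scheme (editor's
   illustration, assuming 16-byte vectors and p % 16 == 4, so that
   floor(p) = p - 4):

     msq = *(p - 4);                          (aligned load, preheader)
     loop:
       lsq = *(p + 12);                       (the next aligned chunk)
       vec = realign_load (msq, lsq, token);  (extracts bytes 4..19)
       msq = lsq;                             (carried by the loop phi)

   i.e. each misaligned access is composed from two aligned loads.  */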
3749 if (alignment_support_scheme == dr_unaligned_software_pipeline)
3751 msq = vect_setup_realignment (first_stmt, bsi, &realignment_token);
3752 phi_stmt = SSA_NAME_DEF_STMT (msq);
3753 offset = size_int (TYPE_VECTOR_SUBPARTS (vectype) - 1);
3756 prev_stmt_info = NULL;
3757 for (j = 0; j < ncopies; j++)
3759 /* 1. Create the vector pointer update chain. */
3761 dataref_ptr = vect_create_data_ref_ptr (first_stmt, bsi, offset, &dummy,
3762 &ptr_incr, false, NULL_TREE);
3764 dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, bsi, stmt);
3766 for (i = 0; i < group_size; i++)
3768 /* 2. Create the vector-load in the loop. */
3769 switch (alignment_support_scheme)
3772 gcc_assert (aligned_access_p (first_dr));
3773 data_ref = build_fold_indirect_ref (dataref_ptr);
3775 case dr_unaligned_supported:
3777 int mis = DR_MISALIGNMENT (first_dr);
3778 tree tmis = (mis == -1 ? size_zero_node : size_int (mis));
3780 gcc_assert (!aligned_access_p (first_dr));
3781 tmis = size_binop (MULT_EXPR, tmis, size_int (BITS_PER_UNIT));
3783 build2 (MISALIGNED_INDIRECT_REF, vectype, dataref_ptr, tmis);
3786 case dr_unaligned_software_pipeline:
3787 gcc_assert (!aligned_access_p (first_dr));
3788 data_ref = build1 (ALIGN_INDIRECT_REF, vectype, dataref_ptr);
3793 vec_dest = vect_create_destination_var (scalar_dest, vectype);
3794 new_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node, vec_dest,
3796 new_temp = make_ssa_name (vec_dest, new_stmt);
3797 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
3798 vect_finish_stmt_generation (stmt, new_stmt, bsi);
3799 copy_virtual_operands (new_stmt, stmt);
3800 mark_symbols_for_renaming (new_stmt);
3802 /* 3. Handle explicit realignment if necessary/supported. */
3803 if (alignment_support_scheme == dr_unaligned_software_pipeline)
3806 <vec_dest = realign_load (msq, lsq, realignment_token)> */
3807 lsq = GIMPLE_STMT_OPERAND (new_stmt, 0);
3808 if (!realignment_token)
3809 realignment_token = dataref_ptr;
3810 vec_dest = vect_create_destination_var (scalar_dest, vectype);
3812 build3 (REALIGN_LOAD_EXPR, vectype, msq, lsq, realignment_token);
3813 new_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node, vec_dest,
3815 new_temp = make_ssa_name (vec_dest, new_stmt);
3816 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
3817 vect_finish_stmt_generation (stmt, new_stmt, bsi);
3818 if (i == group_size - 1 && j == ncopies - 1)
3819 add_phi_arg (phi_stmt, lsq, loop_latch_edge (loop));
3823 VEC_quick_push (tree, dr_chain, new_temp);
3824 if (i < group_size - 1)
3825 dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, bsi, stmt);
3830 if (!vect_transform_strided_load (stmt, dr_chain, group_size, bsi))
3832 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
3833 dr_chain = VEC_alloc (tree, heap, group_size);
3838 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
3840 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3841 prev_stmt_info = vinfo_for_stmt (new_stmt);
3849 /* Function vectorizable_live_operation.
3851 STMT computes a value that is used outside the loop. Check if
3852 it can be supported. */
3855 vectorizable_live_operation (tree stmt,
3856 block_stmt_iterator *bsi ATTRIBUTE_UNUSED,
3857 tree *vec_stmt ATTRIBUTE_UNUSED)
3860 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3861 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3863 enum tree_code code;
3867 enum vect_def_type dt;
3869 if (!STMT_VINFO_LIVE_P (stmt_info))
3872 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
3875 if (TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 0)) != SSA_NAME)
3878 operation = GIMPLE_STMT_OPERAND (stmt, 1);
3879 code = TREE_CODE (operation);
3881 op_type = TREE_CODE_LENGTH (code);
3883 /* FORNOW: support only if all uses are invariant. This means
3884 that the scalar operations can remain in place, unvectorized.
3885 The original last scalar value that they compute will be used. */
3887 for (i = 0; i < op_type; i++)
3889 op = TREE_OPERAND (operation, i);
3890 if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt))
3892 if (vect_print_dump_info (REPORT_DETAILS))
3893 fprintf (vect_dump, "use not simple.");
3897 if (dt != vect_invariant_def && dt != vect_constant_def)
3901 /* No transformation is required for the cases we currently support. */
3906 /* Function vect_is_simple_cond.
3909 LOOP - the loop that is being vectorized.
3910 COND - Condition that is checked for simple use.
3912 Returns whether a COND can be vectorized. Checks whether
3913 condition operands are supportable using vect_is_simple_use.  */
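/* Editor's example: "a_1 < b_2" and "x_3 != 5" are simple conds;
   an operand that is neither an SSA_NAME with an analyzable def nor
   an INTEGER_CST/REAL_CST causes the condition to be rejected.  */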
3916 vect_is_simple_cond (tree cond, loop_vec_info loop_vinfo)
3920 enum vect_def_type dt;
3922 if (!COMPARISON_CLASS_P (cond))
3925 lhs = TREE_OPERAND (cond, 0);
3926 rhs = TREE_OPERAND (cond, 1);
3928 if (TREE_CODE (lhs) == SSA_NAME)
3930 tree lhs_def_stmt = SSA_NAME_DEF_STMT (lhs);
3931 if (!vect_is_simple_use (lhs, loop_vinfo, &lhs_def_stmt, &def, &dt))
3934 else if (TREE_CODE (lhs) != INTEGER_CST && TREE_CODE (lhs) != REAL_CST)
3937 if (TREE_CODE (rhs) == SSA_NAME)
3939 tree rhs_def_stmt = SSA_NAME_DEF_STMT (rhs);
3940 if (!vect_is_simple_use (rhs, loop_vinfo, &rhs_def_stmt, &def, &dt))
3943 else if (TREE_CODE (rhs) != INTEGER_CST && TREE_CODE (rhs) != REAL_CST)
3949 /* vectorizable_condition.
3951 Check if STMT is a conditional modify expression that can be vectorized.
3952 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
3953 stmt using VEC_COND_EXPR to replace it, put it in VEC_STMT, and insert it
3956 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
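/* Editor's illustration of the intended transformation (the names are
   hypothetical):

     scalar:  x_5 = a_1 < b_2 ? c_3 : d_4
     vector:  vx_5 = VEC_COND_EXPR <va_1 < vb_2, vc_3, vd_4>

   where va_1..vd_4 are the vector defs returned by
   vect_get_vec_def_for_operand for the corresponding operands.  */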
3959 vectorizable_condition (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
3961 tree scalar_dest = NULL_TREE;
3962 tree vec_dest = NULL_TREE;
3963 tree op = NULL_TREE;
3964 tree cond_expr, then_clause, else_clause;
3965 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3966 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3967 tree vec_cond_lhs, vec_cond_rhs, vec_then_clause, vec_else_clause;
3968 tree vec_compare, vec_cond_expr;
3970 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3971 enum machine_mode vec_mode;
3973 enum vect_def_type dt;
3974 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
3975 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
3977 gcc_assert (ncopies >= 1);
3979 return false; /* FORNOW */
3981 if (!STMT_VINFO_RELEVANT_P (stmt_info))
3984 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_loop_def);
3986 if (STMT_VINFO_LIVE_P (stmt_info))
3988 /* FORNOW: not yet supported. */
3989 if (vect_print_dump_info (REPORT_DETAILS))
3990 fprintf (vect_dump, "value used after loop.");
3994 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
3997 op = GIMPLE_STMT_OPERAND (stmt, 1);
3999 if (TREE_CODE (op) != COND_EXPR)
4002 cond_expr = TREE_OPERAND (op, 0);
4003 then_clause = TREE_OPERAND (op, 1);
4004 else_clause = TREE_OPERAND (op, 2);
4006 if (!vect_is_simple_cond (cond_expr, loop_vinfo))
4009 /* We do not handle two different vector types for the condition and values.  */
4011 if (TREE_TYPE (TREE_OPERAND (cond_expr, 0)) != TREE_TYPE (vectype))
4014 if (TREE_CODE (then_clause) == SSA_NAME)
4016 tree then_def_stmt = SSA_NAME_DEF_STMT (then_clause);
4017 if (!vect_is_simple_use (then_clause, loop_vinfo,
4018 &then_def_stmt, &def, &dt))
4021 else if (TREE_CODE (then_clause) != INTEGER_CST
4022 && TREE_CODE (then_clause) != REAL_CST)
4025 if (TREE_CODE (else_clause) == SSA_NAME)
4027 tree else_def_stmt = SSA_NAME_DEF_STMT (else_clause);
4028 if (!vect_is_simple_use (else_clause, loop_vinfo,
4029 &else_def_stmt, &def, &dt))
4032 else if (TREE_CODE (else_clause) != INTEGER_CST
4033 && TREE_CODE (else_clause) != REAL_CST)
4037 vec_mode = TYPE_MODE (vectype);
4041 STMT_VINFO_TYPE (stmt_info) = condition_vec_info_type;
4042 return expand_vec_cond_expr_p (op, vec_mode);
4048 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
4049 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4051 /* Handle cond expr. */
4053 vect_get_vec_def_for_operand (TREE_OPERAND (cond_expr, 0), stmt, NULL);
4055 vect_get_vec_def_for_operand (TREE_OPERAND (cond_expr, 1), stmt, NULL);
4056 vec_then_clause = vect_get_vec_def_for_operand (then_clause, stmt, NULL);
4057 vec_else_clause = vect_get_vec_def_for_operand (else_clause, stmt, NULL);
4059 /* Arguments are ready.  Create the new vector stmt.  */
4060 vec_compare = build2 (TREE_CODE (cond_expr), vectype,
4061 vec_cond_lhs, vec_cond_rhs);
4062 vec_cond_expr = build3 (VEC_COND_EXPR, vectype,
4063 vec_compare, vec_then_clause, vec_else_clause);
4065 *vec_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node, vec_dest,
4067 new_temp = make_ssa_name (vec_dest, *vec_stmt);
4068 GIMPLE_STMT_OPERAND (*vec_stmt, 0) = new_temp;
4069 vect_finish_stmt_generation (stmt, *vec_stmt, bsi);
4074 /* Function vect_transform_stmt.
4076 Create a vectorized stmt to replace STMT, and insert it at BSI. */
4079 vect_transform_stmt (tree stmt, block_stmt_iterator *bsi, bool *strided_store)
4081 bool is_store = false;
4082 tree vec_stmt = NULL_TREE;
4083 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4084 tree orig_stmt_in_pattern;
4087 if (STMT_VINFO_RELEVANT_P (stmt_info))
4089 switch (STMT_VINFO_TYPE (stmt_info))
4091 case type_demotion_vec_info_type:
4092 done = vectorizable_type_demotion (stmt, bsi, &vec_stmt);
4096 case type_promotion_vec_info_type:
4097 done = vectorizable_type_promotion (stmt, bsi, &vec_stmt);
4101 case op_vec_info_type:
4102 done = vectorizable_operation (stmt, bsi, &vec_stmt);
4106 case assignment_vec_info_type:
4107 done = vectorizable_assignment (stmt, bsi, &vec_stmt);
4111 case load_vec_info_type:
4112 done = vectorizable_load (stmt, bsi, &vec_stmt);
4116 case store_vec_info_type:
4117 done = vectorizable_store (stmt, bsi, &vec_stmt);
4119 if (DR_GROUP_FIRST_DR (stmt_info))
4121 /* In case of interleaving, the whole chain is vectorized when the
4122 last store in the chain is reached. Store stmts before the last
4123 one are skipped, and their vec_stmt_info shouldn't be freed meanwhile.  */
4125 *strided_store = true;
4126 if (STMT_VINFO_VEC_STMT (stmt_info))
4133 case condition_vec_info_type:
4134 done = vectorizable_condition (stmt, bsi, &vec_stmt);
4138 case call_vec_info_type:
4139 done = vectorizable_call (stmt, bsi, &vec_stmt);
4143 if (vect_print_dump_info (REPORT_DETAILS))
4144 fprintf (vect_dump, "stmt not supported.");
4148 gcc_assert (vec_stmt || *strided_store);
4151 STMT_VINFO_VEC_STMT (stmt_info) = vec_stmt;
4152 orig_stmt_in_pattern = STMT_VINFO_RELATED_STMT (stmt_info);
4153 if (orig_stmt_in_pattern)
4155 stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt_in_pattern);
4156 if (STMT_VINFO_IN_PATTERN_P (stmt_vinfo))
4158 gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
4160 /* STMT was inserted by the vectorizer to replace a
4161 computation idiom. ORIG_STMT_IN_PATTERN is a stmt in the
4162 original sequence that computed this idiom. We need to
4163 record a pointer to VEC_STMT in the stmt_info of
4164 ORIG_STMT_IN_PATTERN. See more details in the
4165 documentation of vect_pattern_recog. */
4167 STMT_VINFO_VEC_STMT (stmt_vinfo) = vec_stmt;
4173 if (STMT_VINFO_LIVE_P (stmt_info))
4175 switch (STMT_VINFO_TYPE (stmt_info))
4177 case reduc_vec_info_type:
4178 done = vectorizable_reduction (stmt, bsi, &vec_stmt);
4183 done = vectorizable_live_operation (stmt, bsi, &vec_stmt);
4192 /* This function builds ni_name = the number of iterations the loop
4193 executes, placing the computation on the loop preheader.  */
4196 vect_build_loop_niters (loop_vec_info loop_vinfo)
4198 tree ni_name, stmt, var;
4200 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4201 tree ni = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo));
4203 var = create_tmp_var (TREE_TYPE (ni), "niters");
4204 add_referenced_var (var);
4205 ni_name = force_gimple_operand (ni, &stmt, false, var);
4207 pe = loop_preheader_edge (loop);
4210 basic_block new_bb = bsi_insert_on_edge_immediate (pe, stmt);
4211 gcc_assert (!new_bb);
4218 /* This function generates the following statements:
4220 ni_name = number of iterations loop executes
4221 ratio = ni_name / vf
4222 ratio_mult_vf_name = ratio * vf
4224 and places them at the loop preheader edge. */
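/* Editor's example: vf is a power of two, so both operations are
   emitted as shifts.  For vf = 4 and a symbolic iteration count n:

     ni_name            = n
     ratio              = n >> 2       (n / 4)
     ratio_mult_vf_name = ratio << 2   (ratio * 4)

   e.g. n = 103 gives ratio = 25 and ratio_mult_vf = 100.  */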
4227 vect_generate_tmps_on_preheader (loop_vec_info loop_vinfo,
4229 tree *ratio_mult_vf_name_ptr,
4230 tree *ratio_name_ptr)
4238 tree ratio_mult_vf_name;
4239 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4240 tree ni = LOOP_VINFO_NITERS (loop_vinfo);
4241 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
4244 pe = loop_preheader_edge (loop);
4246 /* Generate temporary variable that contains
4247 number of iterations loop executes. */
4249 ni_name = vect_build_loop_niters (loop_vinfo);
4250 log_vf = build_int_cst (TREE_TYPE (ni), exact_log2 (vf));
4252 /* Create: ratio = ni >> log2(vf) */
4254 ratio_name = fold_build2 (RSHIFT_EXPR, TREE_TYPE (ni_name), ni_name, log_vf);
4255 if (!is_gimple_val (ratio_name))
4257 var = create_tmp_var (TREE_TYPE (ni), "bnd");
4258 add_referenced_var (var);
4260 ratio_name = force_gimple_operand (ratio_name, &stmt, true, var);
4261 pe = loop_preheader_edge (loop);
4262 new_bb = bsi_insert_on_edge_immediate (pe, stmt);
4263 gcc_assert (!new_bb);
4266 /* Create: ratio_mult_vf = ratio << log2 (vf). */
4268 ratio_mult_vf_name = fold_build2 (LSHIFT_EXPR, TREE_TYPE (ratio_name),
4269 ratio_name, log_vf);
4270 if (!is_gimple_val (ratio_mult_vf_name))
4272 var = create_tmp_var (TREE_TYPE (ni), "ratio_mult_vf");
4273 add_referenced_var (var);
4275 ratio_mult_vf_name = force_gimple_operand (ratio_mult_vf_name, &stmt,
4277 pe = loop_preheader_edge (loop);
4278 new_bb = bsi_insert_on_edge_immediate (pe, stmt);
4279 gcc_assert (!new_bb);
4282 *ni_name_ptr = ni_name;
4283 *ratio_mult_vf_name_ptr = ratio_mult_vf_name;
4284 *ratio_name_ptr = ratio_name;
4290 /* Function update_vuses_to_preheader.
4293 STMT - a statement with potential VUSEs.
4294 LOOP - the loop whose preheader will contain STMT.
4296 It's possible to vectorize a loop even though an SSA_NAME from a VUSE
4297 appears to be defined in a VDEF in another statement in a loop.
4298 One such case is when the VUSE is at the dereference of a __restrict__
4299 pointer in a load and the VDEF is at the dereference of a different
4300 __restrict__ pointer in a store.  Vectorization may result in
4301 copy_virtual_operands being called to copy the problematic VUSE to a new
4302 statement that is being inserted in the loop preheader. This procedure
4303 is called to change the SSA_NAME in the new statement's VUSE from the
4304 SSA_NAME updated in the loop to the related SSA_NAME available on the
4305 path entering the loop.
4307 When this function is called, we have the following situation:
4312 # name1 = phi < name0 , name2>
4317 # name2 = vdef <name1>
4322 Stmt S1 was created in the loop preheader block as part of misaligned-load
4323 handling.  This function fixes the name of the vuse of S1 from 'name1' to 'name0'.  */
4327 update_vuses_to_preheader (tree stmt, struct loop *loop)
4329 basic_block header_bb = loop->header;
4330 edge preheader_e = loop_preheader_edge (loop);
4332 use_operand_p use_p;
4334 FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_VUSE)
4336 tree ssa_name = USE_FROM_PTR (use_p);
4337 tree def_stmt = SSA_NAME_DEF_STMT (ssa_name);
4338 tree name_var = SSA_NAME_VAR (ssa_name);
4339 basic_block bb = bb_for_stmt (def_stmt);
4341 /* For a use before any definitions, def_stmt is a NOP_EXPR. */
4342 if (!IS_EMPTY_STMT (def_stmt)
4343 && flow_bb_inside_loop_p (loop, bb))
4345 /* If the block containing the statement defining the SSA_NAME
4346 is in the loop then it's necessary to find the definition
4347 outside the loop using the PHI nodes of the header. */
4349 bool updated = false;
4351 for (phi = phi_nodes (header_bb); phi; phi = TREE_CHAIN (phi))
4353 if (SSA_NAME_VAR (PHI_RESULT (phi)) == name_var)
4355 SET_USE (use_p, PHI_ARG_DEF (phi, preheader_e->dest_idx));
4360 gcc_assert (updated);
4366 /* Function vect_update_ivs_after_vectorizer.
4368 "Advance" the induction variables of LOOP to the value they should take
4369 after the execution of LOOP. This is currently necessary because the
4370 vectorizer does not handle induction variables that are used after the
4371 loop.  Such a situation occurs when the last iterations of LOOP are peeled, because:
4373 1. We introduced new uses after LOOP for IVs that were not originally used
4374 after LOOP: the IVs of LOOP are now used by an epilog loop.
4375 2. LOOP is going to be vectorized; this means that it will iterate N/VF
4376 times, whereas the loop IVs should be bumped N times.
4379 - LOOP - a loop that is going to be vectorized. The last few iterations
4380 of LOOP were peeled.
4381 - NITERS - the number of iterations that LOOP executes (before it is
4382 vectorized). i.e., the number of times the ivs should be bumped.
4383 - UPDATE_E - a successor edge of LOOP->exit that is on the (only) path
4384 coming out from LOOP on which there are uses of the LOOP ivs
4385 (this is the path from LOOP->exit to epilog_loop->preheader).
4387 The new definitions of the ivs are placed in LOOP->exit.
4388 The phi args associated with the edge UPDATE_E in the bb
4389 UPDATE_E->dest are updated accordingly.
4391 Assumption 1: Like the rest of the vectorizer, this function assumes
4392 a single loop exit that has a single predecessor.
4394 Assumption 2: The phi nodes in the LOOP header and in update_bb are
4395 organized in the same order.
4397 Assumption 3: The access function of the ivs is simple enough (see
4398 vect_can_advance_ivs_p). This assumption will be relaxed in the future.
4400 Assumption 4: Exactly one of the successors of LOOP exit-bb is on a path
4401 coming out of LOOP on which the ivs of LOOP are used (this is the path
4402 that leads to the epilog loop; other paths skip the epilog loop). This
4403 path starts with the edge UPDATE_E, and its destination (denoted update_bb)
4404 needs to have its phis updated.  */
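/* Editor's illustration: for an iv with initial value INIT and step
   STEP, the value after NITERS scalar iterations is emitted in the
   exit bb as

     ni = INIT + NITERS * STEP

   and the phi arg of the iv on UPDATE_E is replaced by ni, so the
   epilog loop resumes the iv where the vectorized loop left off.  */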
4408 vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo, tree niters,
4411 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4412 basic_block exit_bb = single_exit (loop)->dest;
4414 basic_block update_bb = update_e->dest;
4416 /* gcc_assert (vect_can_advance_ivs_p (loop_vinfo)); */
4418 /* Make sure there exists a single-predecessor exit bb: */
4419 gcc_assert (single_pred_p (exit_bb));
4421 for (phi = phi_nodes (loop->header), phi1 = phi_nodes (update_bb);
4423 phi = PHI_CHAIN (phi), phi1 = PHI_CHAIN (phi1))
4425 tree access_fn = NULL;
4426 tree evolution_part;
4429 tree var, stmt, ni, ni_name;
4430 block_stmt_iterator last_bsi;
4432 if (vect_print_dump_info (REPORT_DETAILS))
4434 fprintf (vect_dump, "vect_update_ivs_after_vectorizer: phi: ");
4435 print_generic_expr (vect_dump, phi, TDF_SLIM);
4438 /* Skip virtual phi's. */
4439 if (!is_gimple_reg (SSA_NAME_VAR (PHI_RESULT (phi))))
4441 if (vect_print_dump_info (REPORT_DETAILS))
4442 fprintf (vect_dump, "virtual phi. skip.");
4446 /* Skip reduction phis. */
4447 if (STMT_VINFO_DEF_TYPE (vinfo_for_stmt (phi)) == vect_reduction_def)
4449 if (vect_print_dump_info (REPORT_DETAILS))
4450 fprintf (vect_dump, "reduc phi. skip.");
4454 access_fn = analyze_scalar_evolution (loop, PHI_RESULT (phi));
4455 gcc_assert (access_fn);
4457 unshare_expr (evolution_part_in_loop_num (access_fn, loop->num));
4458 gcc_assert (evolution_part != NULL_TREE);
4460 /* FORNOW: We do not support IVs whose evolution function is a polynomial
4461 of degree >= 2 or exponential. */
4462 gcc_assert (!tree_is_chrec (evolution_part));
4464 step_expr = evolution_part;
4465 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn,
4468 ni = fold_build2 (PLUS_EXPR, TREE_TYPE (init_expr),
4469 fold_build2 (MULT_EXPR, TREE_TYPE (init_expr),
4470 fold_convert (TREE_TYPE (init_expr),
4475 var = create_tmp_var (TREE_TYPE (init_expr), "tmp");
4476 add_referenced_var (var);
4478 ni_name = force_gimple_operand (ni, &stmt, false, var);
4480 /* Insert stmt into exit_bb. */
4481 last_bsi = bsi_last (exit_bb);
4483 bsi_insert_before (&last_bsi, stmt, BSI_SAME_STMT);
4485 /* Fix phi expressions in the successor bb. */
4486 SET_PHI_ARG_DEF (phi1, update_e->dest_idx, ni_name);
4491 /* Function vect_do_peeling_for_loop_bound
4493 Peel the last iterations of the loop represented by LOOP_VINFO.
4494 The peeled iterations form a new epilog loop. Given that the loop now
4495 iterates NITERS times, the new epilog loop iterates
4496 NITERS % VECTORIZATION_FACTOR times.
4498 The original loop will later be made to iterate
4499 NITERS / VECTORIZATION_FACTOR times (this value is placed into RATIO). */
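/* Editor's example: for NITERS = 103 and a vectorization factor of 4,
   the vectorized loop executes RATIO = 25 iterations (covering 100
   scalar iterations) and the epilog loop executes 103 % 4 = 3 scalar
   iterations.  */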
4502 vect_do_peeling_for_loop_bound (loop_vec_info loop_vinfo, tree *ratio)
4504 tree ni_name, ratio_mult_vf_name;
4505 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4506 struct loop *new_loop;
4508 basic_block preheader;
4512 if (vect_print_dump_info (REPORT_DETAILS))
4513 fprintf (vect_dump, "=== vect_do_peeling_for_loop_bound ===");
4515 initialize_original_copy_tables ();
4517 /* Generate the following variables on the preheader of original loop:
4519 ni_name = number of iterations the original loop executes
4520 ratio = ni_name / vf
4521 ratio_mult_vf_name = ratio * vf */
4522 vect_generate_tmps_on_preheader (loop_vinfo, &ni_name,
4523 &ratio_mult_vf_name, ratio);
4525 loop_num = loop->num;
4526 /* Threshold for vectorized loop. */
4527 th = (PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)) *
4528 LOOP_VINFO_VECT_FACTOR (loop_vinfo);
4529 new_loop = slpeel_tree_peel_loop_to_edge (loop, single_exit (loop),
4530 ratio_mult_vf_name, ni_name, false, th);
4531 gcc_assert (new_loop);
4532 gcc_assert (loop_num == loop->num);
4533 #ifdef ENABLE_CHECKING
4534 slpeel_verify_cfg_after_peeling (loop, new_loop);
4537 /* A guard that controls whether the new_loop is to be executed or skipped
4538 is placed in LOOP->exit. LOOP->exit therefore has two successors - one
4539 is the preheader of NEW_LOOP, where the IVs from LOOP are used. The other
4540 is a bb after NEW_LOOP, where these IVs are not used. Find the edge that
4541 is on the path where the LOOP IVs are used and need to be updated. */
4543 preheader = loop_preheader_edge (new_loop)->src;
4544 if (EDGE_PRED (preheader, 0)->src == single_exit (loop)->dest)
4545 update_e = EDGE_PRED (preheader, 0);
4547 update_e = EDGE_PRED (preheader, 1);
4549 /* Update IVs of original loop as if they were advanced
4550 by ratio_mult_vf_name steps. */
4551 vect_update_ivs_after_vectorizer (loop_vinfo, ratio_mult_vf_name, update_e);
4553 /* After peeling we have to reset scalar evolution analyzer. */
4556 free_original_copy_tables ();
4560 /* Function vect_gen_niters_for_prolog_loop
4562 Set the number of iterations for the loop represented by LOOP_VINFO
4563 to the minimum between LOOP_NITERS (the original iteration count of the loop)
4564 and the misalignment of DR - the data reference recorded in
4565 LOOP_VINFO_UNALIGNED_DR (LOOP_VINFO). As a result, after the execution of
4566 this loop, the data reference DR will refer to an aligned location.
4568 The following computation is generated:
4570 If the misalignment of DR is known at compile time:
4571 addr_mis = DR_MISALIGNMENT (dr);
4572 Else, compute address misalignment in bytes:
4573 addr_mis = addr & (vectype_size - 1)
4575 prolog_niters = min ( LOOP_NITERS , (VF - addr_mis/elem_size)&(VF-1) )
4577 (elem_size = element type size; an element is the scalar element
4578 whose type is the inner type of the vectype)
4582 prolog_niters = min ( LOOP_NITERS ,
4583 (VF/group_size - addr_mis/elem_size)&(VF/group_size-1) )
4584 where group_size is the size of the interleaved group.  */
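/* Editor's worked example: for V4SI vectors (VF = 4, elem_size = 4,
   no interleaving, so group_size = 1) and a data reference that is 8
   bytes past a 16-byte boundary, addr_mis/elem_size = 2 and

     prolog_niters = min (LOOP_NITERS, (4 - 2) & 3) = min (LOOP_NITERS, 2)

   i.e. two peeled scalar iterations align the access.  */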
4588 vect_gen_niters_for_prolog_loop (loop_vec_info loop_vinfo, tree loop_niters)
4590 struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
4591 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
4592 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4594 tree iters, iters_name;
4597 tree dr_stmt = DR_STMT (dr);
4598 stmt_vec_info stmt_info = vinfo_for_stmt (dr_stmt);
4599 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4600 int vectype_align = TYPE_ALIGN (vectype) / BITS_PER_UNIT;
4601 tree niters_type = TREE_TYPE (loop_niters);
4603 int element_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
4605 if (DR_GROUP_FIRST_DR (stmt_info))
4607 /* For interleaved access element size must be multiplied by the size of
4608 the interleaved group. */
4609 group_size = DR_GROUP_SIZE (vinfo_for_stmt (
4610 DR_GROUP_FIRST_DR (stmt_info)));
4611 element_size *= group_size;
4614 pe = loop_preheader_edge (loop);
4616 if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
4618 int byte_misalign = LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo);
4619 int elem_misalign = byte_misalign / element_size;
4621 if (vect_print_dump_info (REPORT_DETAILS))
4622 fprintf (vect_dump, "known misalignment = %d.", byte_misalign);
4623 iters = build_int_cst (niters_type,
4624 (vf - elem_misalign) & (vf / group_size - 1));
4628 tree new_stmts = NULL_TREE;
4630 vect_create_addr_base_for_vector_ref (dr_stmt, &new_stmts, NULL_TREE);
4631 tree ptr_type = TREE_TYPE (start_addr);
4632 tree size = TYPE_SIZE (ptr_type);
4633 tree type = lang_hooks.types.type_for_size (tree_low_cst (size, 1), 1);
4634 tree vectype_size_minus_1 = build_int_cst (type, vectype_align - 1);
4635 tree elem_size_log =
4636 build_int_cst (type, exact_log2 (vectype_align / vf));
4637 tree vf_minus_1 = build_int_cst (type, vf - 1);
4638 tree vf_tree = build_int_cst (type, vf);
4642 new_bb = bsi_insert_on_edge_immediate (pe, new_stmts);
4643 gcc_assert (!new_bb);
4645 /* Create: byte_misalign = addr & (vectype_size - 1) */
4647 fold_build2 (BIT_AND_EXPR, type, start_addr, vectype_size_minus_1);
4649 /* Create: elem_misalign = byte_misalign / element_size */
4651 fold_build2 (RSHIFT_EXPR, type, byte_misalign, elem_size_log);
4653 /* Create: (niters_type) (VF - elem_misalign)&(VF - 1) */
4654 iters = fold_build2 (MINUS_EXPR, type, vf_tree, elem_misalign);
4655 iters = fold_build2 (BIT_AND_EXPR, type, iters, vf_minus_1);
4656 iters = fold_convert (niters_type, iters);
4659 /* Create: prolog_loop_niters = min (iters, loop_niters) */
4660 /* If the loop bound is known at compile time we already verified that it is
4661 greater than vf; since the misalignment ('iters') is at most vf, there's
4662 no need to generate the MIN_EXPR in this case. */
4663 if (TREE_CODE (loop_niters) != INTEGER_CST)
4664 iters = fold_build2 (MIN_EXPR, niters_type, iters, loop_niters);
4666 if (vect_print_dump_info (REPORT_DETAILS))
4668 fprintf (vect_dump, "niters for prolog loop: ");
4669 print_generic_expr (vect_dump, iters, TDF_SLIM);
4672 var = create_tmp_var (niters_type, "prolog_loop_niters");
4673 add_referenced_var (var);
4674 iters_name = force_gimple_operand (iters, &stmt, false, var);
4676 /* Insert stmt on loop preheader edge. */
4679 basic_block new_bb = bsi_insert_on_edge_immediate (pe, stmt);
4680 gcc_assert (!new_bb);
4687 /* Function vect_update_init_of_dr
4689 NITERS iterations were peeled from LOOP. DR represents a data reference
4690 in LOOP. This function updates the information recorded in DR to
4691 account for the fact that the first NITERS iterations had already been
4692 executed. Specifically, it updates the OFFSET field of DR. */
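/* Editor's example: for an int access (DR_STEP = 4) with NITERS = 3
   peeled iterations, DR_OFFSET grows by 3 * 4 = 12 bytes, so the
   vectorized loop's address computation starts at the fourth element.  */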
4695 vect_update_init_of_dr (struct data_reference *dr, tree niters)
4697 tree offset = DR_OFFSET (dr);
4699 niters = fold_build2 (MULT_EXPR, TREE_TYPE (niters), niters, DR_STEP (dr));
4700 offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset), offset, niters);
4701 DR_OFFSET (dr) = offset;
4705 /* Function vect_update_inits_of_drs
4707 NITERS iterations were peeled from the loop represented by LOOP_VINFO.
4708 This function updates the information recorded for the data references in
4709 the loop to account for the fact that the first NITERS iterations had
4710 already been executed. Specifically, it updates the initial_condition of the
4711 access_function of all the data_references in the loop. */
4714 vect_update_inits_of_drs (loop_vec_info loop_vinfo, tree niters)
4717 VEC (data_reference_p, heap) *datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
4718 struct data_reference *dr;
4720 if (vect_dump && (dump_flags & TDF_DETAILS))
4721 fprintf (vect_dump, "=== vect_update_inits_of_drs ===");
4723 for (i = 0; VEC_iterate (data_reference_p, datarefs, i, dr); i++)
4724 vect_update_init_of_dr (dr, niters);
4728 /* Function vect_do_peeling_for_alignment
4730 Peel the first 'niters' iterations of the loop represented by LOOP_VINFO.
4731 'niters' is set to the misalignment of one of the data references in the
4732 loop, thereby forcing it to refer to an aligned location at the beginning
4733 of the execution of this loop. The data reference for which we are
4734 peeling is recorded in LOOP_VINFO_UNALIGNED_DR. */
4737 vect_do_peeling_for_alignment (loop_vec_info loop_vinfo)
4739 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4740 tree niters_of_prolog_loop, ni_name;
4742 struct loop *new_loop;
4744 if (vect_print_dump_info (REPORT_DETAILS))
4745 fprintf (vect_dump, "=== vect_do_peeling_for_alignment ===");
4747 initialize_original_copy_tables ();
4749 ni_name = vect_build_loop_niters (loop_vinfo);
4750 niters_of_prolog_loop = vect_gen_niters_for_prolog_loop (loop_vinfo, ni_name);
4752 /* Peel the prolog loop and iterate it niters_of_prolog_loop times.  */
4754 slpeel_tree_peel_loop_to_edge (loop, loop_preheader_edge (loop),
4755 niters_of_prolog_loop, ni_name, true, 0);
4756 gcc_assert (new_loop);
4757 #ifdef ENABLE_CHECKING
4758 slpeel_verify_cfg_after_peeling (new_loop, loop);
4761 /* Update number of times loop executes. */
4762 n_iters = LOOP_VINFO_NITERS (loop_vinfo);
4763 LOOP_VINFO_NITERS (loop_vinfo) = fold_build2 (MINUS_EXPR,
4764 TREE_TYPE (n_iters), n_iters, niters_of_prolog_loop);
4766 /* Update the init conditions of the access functions of all data refs. */
4767 vect_update_inits_of_drs (loop_vinfo, niters_of_prolog_loop);
4769 /* After peeling we have to reset scalar evolution analyzer. */
4772 free_original_copy_tables ();
4776 /* Function vect_create_cond_for_align_checks.
4778 Create a conditional expression that represents the alignment checks for
4779 all data references (array element references) whose alignment must be checked at runtime.
4783 LOOP_VINFO - two fields of the loop information are used.
4784 LOOP_VINFO_PTR_MASK is the mask used to check the alignment.
4785 LOOP_VINFO_MAY_MISALIGN_STMTS contains the refs to be checked.
4788 COND_EXPR_STMT_LIST - statements needed to construct the conditional
4790 The returned value is the conditional expression to be used in the if
4791 statement that controls which version of the loop gets executed at runtime.
4793 The algorithm makes two assumptions:
4794 1) The number of bytes "n" in a vector is a power of 2.
4795 2) An address "a" is aligned if a%n is zero, and this
4796 test can be done as a&(n-1) == 0. For example, for 16
4797 byte vectors the test is a&0xf == 0. */
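/* Editor's example: for two addresses a and b that must be 16-byte
   aligned (mask = 15), the returned condition is

     ((a | b) & 15) == 0

   a single test that fails if either address is misaligned.  */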
4800 vect_create_cond_for_align_checks (loop_vec_info loop_vinfo,
4801 tree *cond_expr_stmt_list)
4803 VEC(tree,heap) *may_misalign_stmts
4804 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
4806 int mask = LOOP_VINFO_PTR_MASK (loop_vinfo);
4810 tree int_ptrsize_type;
4812 tree or_tmp_name = NULL_TREE;
4813 tree and_tmp, and_tmp_name, and_stmt;
4816 /* Check that mask is one less than a power of 2, i.e., mask is
4817 all zeros followed by all ones. */
4818 gcc_assert ((mask != 0) && ((mask & (mask+1)) == 0));
4820 /* CHECKME: what is the best integer or unsigned type to use to hold a
4821 cast from a pointer value? */
4822 psize = TYPE_SIZE (ptr_type_node);
4824 = lang_hooks.types.type_for_size (tree_low_cst (psize, 1), 0);
4826 /* Create expression (mask & (dr_1 | ... | dr_n)) where dr_i is the address
4827 of the first vector of the i'th data reference. */
4829 for (i = 0; VEC_iterate (tree, may_misalign_stmts, i, ref_stmt); i++)
4831 tree new_stmt_list = NULL_TREE;
4833 tree addr_tmp, addr_tmp_name, addr_stmt;
4834 tree or_tmp, new_or_tmp_name, or_stmt;
4836 /* create: addr_tmp = (int)(address_of_first_vector) */
4837 addr_base = vect_create_addr_base_for_vector_ref (ref_stmt,
4841 if (new_stmt_list != NULL_TREE)
4842 append_to_statement_list_force (new_stmt_list, cond_expr_stmt_list);
4844 sprintf (tmp_name, "%s%d", "addr2int", i);
4845 addr_tmp = create_tmp_var (int_ptrsize_type, tmp_name);
4846 add_referenced_var (addr_tmp);
4847 addr_tmp_name = make_ssa_name (addr_tmp, NULL_TREE);
4848 addr_stmt = fold_convert (int_ptrsize_type, addr_base);
4849 addr_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node,
4850 addr_tmp_name, addr_stmt);
4851 SSA_NAME_DEF_STMT (addr_tmp_name) = addr_stmt;
4852 append_to_statement_list_force (addr_stmt, cond_expr_stmt_list);
4854 /* The addresses are ORed together.  */
4856 if (or_tmp_name != NULL_TREE)
4858 /* create: or_tmp = or_tmp | addr_tmp */
4859 sprintf (tmp_name, "%s%d", "orptrs", i);
4860 or_tmp = create_tmp_var (int_ptrsize_type, tmp_name);
4861 add_referenced_var (or_tmp);
4862 new_or_tmp_name = make_ssa_name (or_tmp, NULL_TREE);
4863 or_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node,
4865 build2 (BIT_IOR_EXPR, int_ptrsize_type,
4868 SSA_NAME_DEF_STMT (new_or_tmp_name) = or_stmt;
4869 append_to_statement_list_force (or_stmt, cond_expr_stmt_list);
4870 or_tmp_name = new_or_tmp_name;
4873 or_tmp_name = addr_tmp_name;
4877 mask_cst = build_int_cst (int_ptrsize_type, mask);
4879 /* create: and_tmp = or_tmp & mask */
4880 and_tmp = create_tmp_var (int_ptrsize_type, "andmask" );
4881 add_referenced_var (and_tmp);
4882 and_tmp_name = make_ssa_name (and_tmp, NULL_TREE);
4884 and_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node,
4886 build2 (BIT_AND_EXPR, int_ptrsize_type,
4887 or_tmp_name, mask_cst));
4888 SSA_NAME_DEF_STMT (and_tmp_name) = and_stmt;
4889 append_to_statement_list_force (and_stmt, cond_expr_stmt_list);
4891 /* Make and_tmp the left operand of the conditional test against zero.
4892 If and_tmp has a nonzero bit, then some address is unaligned.  */
4893 ptrsize_zero = build_int_cst (int_ptrsize_type, 0);
4894 return build2 (EQ_EXPR, boolean_type_node,
4895 and_tmp_name, ptrsize_zero);
4899 /* Function vect_transform_loop.
4901 The analysis phase has determined that the loop is vectorizable.
4902 Vectorize the loop - created vectorized stmts to replace the scalar
4903 stmts in the loop, and update the loop exit condition. */
4906 vect_transform_loop (loop_vec_info loop_vinfo)
4908 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4909 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
4910 int nbbs = loop->num_nodes;
4911 block_stmt_iterator si;
4914 int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
4917 if (vect_print_dump_info (REPORT_DETAILS))
4918 fprintf (vect_dump, "=== vect_transform_loop ===");
4920 /* If the loop has data references that may or may not be aligned then
4921 two versions of the loop need to be generated, one which is vectorized
4922 and one which isn't. A test is then generated to control which of the
4923 loops is executed. The test checks for the alignment of all of the
4924 data references that may or may not be aligned. */
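/* Editor's sketch of the control flow produced by the versioning:

     if (((addr_1 | ... | addr_n) & mask) == 0)
       <vectorized loop>
     else
       <original scalar loop>
*/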
4926 if (VEC_length (tree, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo)))
4930 tree cond_expr_stmt_list = NULL_TREE;
4931 basic_block condition_bb;
4932 block_stmt_iterator cond_exp_bsi;
4933 basic_block merge_bb;
4934 basic_block new_exit_bb;
4936 tree orig_phi, new_phi, arg;
4937 unsigned prob = 4 * REG_BR_PROB_BASE / 5;
4939 cond_expr = vect_create_cond_for_align_checks (loop_vinfo,
4940 &cond_expr_stmt_list);
4941 initialize_original_copy_tables ();
4942 nloop = loop_version (loop, cond_expr, &condition_bb,
4943 prob, prob, REG_BR_PROB_BASE - prob, true);
4944 free_original_copy_tables ();
4946 /** Loop versioning violates an assumption we try to maintain during
4947 vectorization - that the loop exit block has a single predecessor.
4948 After versioning, the exit block of both loop versions is the same
4949 basic block (i.e. it has two predecessors). Just in order to simplify
4950 following transformations in the vectorizer, we fix this situation
4951 here by adding a new (empty) block on the exit-edge of the loop,
4952 with the proper loop-exit phis to maintain loop-closed-form. **/
4954 merge_bb = single_exit (loop)->dest;
4955 gcc_assert (EDGE_COUNT (merge_bb->preds) == 2);
4956 new_exit_bb = split_edge (single_exit (loop));
4957 new_exit_e = single_exit (loop);
4958 e = EDGE_SUCC (new_exit_bb, 0);
4960 for (orig_phi = phi_nodes (merge_bb); orig_phi;
4961 orig_phi = PHI_CHAIN (orig_phi))
4963 new_phi = create_phi_node (SSA_NAME_VAR (PHI_RESULT (orig_phi)),
4965 arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, e);
4966 add_phi_arg (new_phi, arg, new_exit_e);
4967 SET_PHI_ARG_DEF (orig_phi, e->dest_idx, PHI_RESULT (new_phi));
4970 /** end loop-exit-fixes after versioning **/
4972 update_ssa (TODO_update_ssa);
4973 cond_exp_bsi = bsi_last (condition_bb);
4974 bsi_insert_before (&cond_exp_bsi, cond_expr_stmt_list, BSI_SAME_STMT);
4977 /* CHECKME: we wouldn't need this if we called update_ssa once for all loops.  */
4979 bitmap_zero (vect_memsyms_to_rename);
4981 /* Peel the loop if there are data refs with unknown alignment.
4982 Only one data ref with unknown alignment is allowed.  */
4984 if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
4985 vect_do_peeling_for_alignment (loop_vinfo);
4987 /* If the loop has a symbolic number of iterations 'n' (i.e. it's not a
4988 compile time constant), or it is a constant that is not a multiple of the
4989 vectorization factor, then an epilog loop needs to be created.
4990 We therefore duplicate the loop: the original loop will be vectorized,
4991 and will compute the first (n/VF) iterations. The second copy of the loop
4992 will remain scalar and will compute the remaining (n%VF) iterations.
4993 (VF is the vectorization factor). */
4995 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
4996 || (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
4997 && LOOP_VINFO_INT_NITERS (loop_vinfo) % vectorization_factor != 0))
4998 vect_do_peeling_for_loop_bound (loop_vinfo, &ratio);
5000 ratio = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
5001 LOOP_VINFO_INT_NITERS (loop_vinfo) / vectorization_factor);
5003 /* 1) Make sure the loop header has exactly two entries
5004 2) Make sure we have a preheader basic block. */
5006 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
5008 split_edge (loop_preheader_edge (loop));
5010 /* FORNOW: the vectorizer supports only loops whose body consists
5011 of one basic block (header + empty latch). When the vectorizer
5012 supports more involved loop forms, the order in which the BBs are
5013 traversed will need to be reconsidered.  */
5015 for (i = 0; i < nbbs; i++)
5017 basic_block bb = bbs[i];
5019 for (si = bsi_start (bb); !bsi_end_p (si);)
5021 tree stmt = bsi_stmt (si);
5022 stmt_vec_info stmt_info;
5025 if (vect_print_dump_info (REPORT_DETAILS))
5027 fprintf (vect_dump, "------>vectorizing statement: ");
5028 print_generic_expr (vect_dump, stmt, TDF_SLIM);
5030 stmt_info = vinfo_for_stmt (stmt);
5031 gcc_assert (stmt_info);
5032 if (!STMT_VINFO_RELEVANT_P (stmt_info)
5033 && !STMT_VINFO_LIVE_P (stmt_info))
5039 if ((TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
5040 != (unsigned HOST_WIDE_INT) vectorization_factor)
5041 && vect_print_dump_info (REPORT_DETAILS))
5042 fprintf (vect_dump, "multiple-types.");
5044 /* -------- vectorize statement ------------ */
5045 if (vect_print_dump_info (REPORT_DETAILS))
5046 fprintf (vect_dump, "transform statement.");
5048 strided_store = false;
5049 is_store = vect_transform_stmt (stmt, &si, &strided_store);
5053 if (DR_GROUP_FIRST_DR (stmt_info))
5055 /* Interleaving. If IS_STORE is TRUE, the vectorization of the
5056 interleaving chain was completed - free all the stores in the chain.  */
5058 tree next = DR_GROUP_FIRST_DR (stmt_info);
5060 stmt_vec_info next_stmt_info;
5064 next_stmt_info = vinfo_for_stmt (next);
5065 /* Free the attached stmt_vec_info and remove the stmt. */
5066 ann = stmt_ann (next);
5067 tmp = DR_GROUP_NEXT_DR (next_stmt_info);
5068 free (next_stmt_info);
5069 set_stmt_info (ann, NULL);
5072 bsi_remove (&si, true);
5077 /* Free the attached stmt_vec_info and remove the stmt. */
5078 ann = stmt_ann (stmt);
5080 set_stmt_info (ann, NULL);
5081 bsi_remove (&si, true);
5089 /* This is the case of a skipped interleaved store.  We don't free
5090 its stmt_vec_info.  */
5091 bsi_remove (&si, true);
5099 slpeel_make_loop_iterate_ntimes (loop, ratio);
5101 mark_set_for_renaming (vect_memsyms_to_rename);
5103 /* The memory tags and pointers in vectorized statements need to
5104 have their SSA forms updated. FIXME, why can't this be delayed
5105 until all the loops have been transformed? */
5106 update_ssa (TODO_update_ssa);
5108 if (vect_print_dump_info (REPORT_VECTORIZED_LOOPS))
5109 fprintf (vect_dump, "LOOP VECTORIZED.");