2 Copyright (C) 2003, 2004, 2005, 2006, 2007 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
5 This file is part of GCC.
7 GCC is free software; you can redistribute it and/or modify it under
8 the terms of the GNU General Public License as published by the Free
9 Software Foundation; either version 3, or (at your option) any later
12 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
13 WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
21 /* Loop Vectorization Pass.
23 This pass tries to vectorize loops. This first implementation focuses on
24 simple inner-most loops, with no conditional control flow, and a set of
25 simple operations which vector form can be expressed using existing
26 tree codes (PLUS, MULT etc).
28 For example, the vectorizer transforms the following simple loop:
30 short a[N]; short b[N]; short c[N]; int i;
36 as if it was manually vectorized by rewriting the source code into:
38 typedef int __attribute__((mode(V8HI))) v8hi;
39 short a[N]; short b[N]; short c[N]; int i;
40 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
43 for (i=0; i<N/8; i++){
50 The main entry to this pass is vectorize_loops(), in which
51 the vectorizer applies a set of analyses on a given set of loops,
52 followed by the actual vectorization transformation for the loops that
53 had successfully passed the analysis phase.
55 Throughout this pass we make a distinction between two types of
56 data: scalars (which are represented by SSA_NAMES), and memory references
57 ("data-refs"). These two types of data require different handling both
58 during analysis and transformation. The types of data-refs that the
59 vectorizer currently supports are ARRAY_REFS which base is an array DECL
60 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
61 accesses are required to have a simple (consecutive) access pattern.
65 The driver for the analysis phase is vect_analyze_loop_nest().
66 It applies a set of analyses, some of which rely on the scalar evolution
67 analyzer (scev) developed by Sebastian Pop.
69 During the analysis phase the vectorizer records some information
70 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
71 loop, as well as general information about the loop as a whole, which is
72 recorded in a "loop_vec_info" struct attached to each loop.
76 The loop transformation phase scans all the stmts in the loop, and
77 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
78 the loop that needs to be vectorized. It inserts the vector code sequence
79 just before the scalar stmt S, and records a pointer to the vector code
80 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
81 attached to S). This pointer will be used for the vectorization of following
82 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
83 otherwise, we rely on dead code elimination for removing it.
85 For example, say stmt S1 was vectorized into stmt VS1:
88 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
91 To vectorize stmt S2, the vectorizer first finds the stmt that defines
92 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
93 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
94 resulting sequence would be:
97 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
99 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
101 Operands that are not SSA_NAMEs, are data-refs that appear in
102 load/store operations (like 'x[i]' in S1), and are handled differently.
106 Currently the only target specific information that is used is the
107 size of the vector (in bytes) - "UNITS_PER_SIMD_WORD". Targets that can
108 support different sizes of vectors, for now will need to specify one value
109 for "UNITS_PER_SIMD_WORD". More flexibility will be added in the future.
111 Since we only vectorize operations which vector form can be
112 expressed using existing tree codes, to verify that an operation is
113 supported, the vectorizer checks the relevant optab at the relevant
114 machine_mode (e.g, optab_handler (add_optab, V8HImode)->insn_code). If
115 the value found is CODE_FOR_nothing, then there's no target support, and
116 we can't vectorize the stmt.
118 For additional information on this project see:
119 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
124 #include "coretypes.h"
130 #include "basic-block.h"
131 #include "diagnostic.h"
132 #include "tree-flow.h"
133 #include "tree-dump.h"
136 #include "cfglayout.h"
142 #include "tree-chrec.h"
143 #include "tree-data-ref.h"
144 #include "tree-scalar-evolution.h"
146 #include "tree-vectorizer.h"
147 #include "tree-pass.h"
149 /*************************************************************************
150 General Vectorization Utilities
151 *************************************************************************/
153 /* vect_dump will be set to stderr or dump_file if it exists. */
156 /* vect_verbosity_level set to an invalid value
157 to mark that it's uninitialized. */
158 enum verbosity_levels vect_verbosity_level = MAX_VERBOSITY_LEVEL;
/* NOTE(review): presumably the source location of the loop currently being
   vectorized, used for dump messages -- confirm against find_loop_location
   callers.  */
161 static LOC vect_loop_location;
163 /* Bitmap of virtual variables to be renamed. */
164 bitmap vect_memsyms_to_rename;
166 /*************************************************************************
167 Simple Loop Peeling Utilities
169 Utilities to support loop peeling for vectorization purposes.
170 *************************************************************************/
173 /* Renames the use *OP_P.  Replaces the use with its current reaching
   definition, as recorded by get_current_def, when one is available.  */
176 rename_use_op (use_operand_p op_p)
/* Only SSA_NAME uses can be renamed; non-SSA operands (e.g. constants)
   are skipped -- the guarded statement is not visible in this fragment.  */
180 if (TREE_CODE (USE_FROM_PTR (op_p)) != SSA_NAME)
/* Look up the current reaching definition recorded for this name.  */
183 new_name = get_current_def (USE_FROM_PTR (op_p));
185 /* Something defined outside of the loop. */
189 /* An ordinary ssa name defined in the loop. */
191 SET_USE (op_p, new_name);
195 /* Renames the variables in basic block BB, and the PHI arguments that
   BB feeds into successor blocks outside its loop.  */
198 rename_variables_in_bb (basic_block bb)
201 block_stmt_iterator bsi;
207 struct loop *loop = bb->loop_father;
/* Rename every SSA use operand in every statement of BB.  */
209 for (bsi = bsi_start (bb); !bsi_end_p (bsi); bsi_next (&bsi))
211 stmt = bsi_stmt (bsi);
212 FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_ALL_USES)
213 rename_use_op (use_p);
/* Also rename the PHI arguments carried on edges that leave BB's loop.  */
216 FOR_EACH_EDGE (e, ei, bb->succs)
218 if (!flow_bb_inside_loop_p (loop, e->dest))
220 for (phi = phi_nodes (e->dest); phi; phi = PHI_CHAIN (phi))
221 rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (phi, e));
226 /* Renames variables in new generated LOOP. */
229 rename_variables_in_loop (struct loop *loop)
234 bbs = get_loop_body (loop);
/* Walk every basic block of LOOP and rename the variables used in it.  */
236 for (i = 0; i < loop->num_nodes; i++)
237 rename_variables_in_bb (bbs[i]);
243 /* Update the PHI nodes of NEW_LOOP.
245 NEW_LOOP is a duplicate of ORIG_LOOP.
246 AFTER indicates whether NEW_LOOP executes before or after ORIG_LOOP:
247 AFTER is true if NEW_LOOP executes after ORIG_LOOP, and false if it
248 executes before it. */
251 slpeel_update_phis_for_duplicate_loop (struct loop *orig_loop,
252 struct loop *new_loop, bool after)
255 tree phi_new, phi_orig;
257 edge orig_loop_latch = loop_latch_edge (orig_loop);
258 edge orig_entry_e = loop_preheader_edge (orig_loop);
259 edge new_loop_exit_e = single_exit (new_loop);
260 edge new_loop_entry_e = loop_preheader_edge (new_loop);
/* The edge of ORIG_LOOP that feeds values into the entry of NEW_LOOP:
   the original latch if NEW_LOOP runs after ORIG_LOOP, otherwise the
   original preheader edge.  */
261 edge entry_arg_e = (after ? orig_loop_latch : orig_entry_e);
264 step 1. For each loop-header-phi:
265 Add the first phi argument for the phi in NEW_LOOP
266 (the one associated with the entry of NEW_LOOP)
268 step 2. For each loop-header-phi:
269 Add the second phi argument for the phi in NEW_LOOP
270 (the one associated with the latch of NEW_LOOP)
272 step 3. Update the phis in the successor block of NEW_LOOP.
274 case 1: NEW_LOOP was placed before ORIG_LOOP:
275 The successor block of NEW_LOOP is the header of ORIG_LOOP.
276 Updating the phis in the successor block can therefore be done
277 along with the scanning of the loop header phis, because the
278 header blocks of ORIG_LOOP and NEW_LOOP have exactly the same
279 phi nodes, organized in the same order.
281 case 2: NEW_LOOP was placed after ORIG_LOOP:
282 The successor block of NEW_LOOP is the original exit block of
283 ORIG_LOOP - the phis to be updated are the loop-closed-ssa phis.
284 We postpone updating these phis to a later stage (when
285 loop guards are added).
289 /* Scan the phis in the headers of the old and new loops
290 (they are organized in exactly the same order). */
292 for (phi_new = phi_nodes (new_loop->header),
293 phi_orig = phi_nodes (orig_loop->header);
295 phi_new = PHI_CHAIN (phi_new), phi_orig = PHI_CHAIN (phi_orig))
/* step 1: add the entry argument of the new loop-header phi.  */
298 def = PHI_ARG_DEF_FROM_EDGE (phi_orig, entry_arg_e);
299 add_phi_arg (phi_new, def, new_loop_entry_e);
/* step 2: add the latch argument of the new loop-header phi.  */
302 def = PHI_ARG_DEF_FROM_EDGE (phi_orig, orig_loop_latch);
303 if (TREE_CODE (def) != SSA_NAME)
306 new_ssa_name = get_current_def (def);
309 /* This only happens if there are no definitions
310 inside the loop. use the phi_result in this case. */
311 new_ssa_name = PHI_RESULT (phi_new);
314 /* An ordinary ssa name defined in the loop. */
315 add_phi_arg (phi_new, new_ssa_name, loop_latch_edge (new_loop));
317 /* step 3 (case 1). */
320 gcc_assert (new_loop_exit_e == orig_entry_e);
321 SET_PHI_ARG_DEF (phi_orig,
322 new_loop_exit_e->dest_idx,
329 /* Update PHI nodes for a guard of the LOOP.
332 - LOOP, GUARD_EDGE: LOOP is a loop for which we added guard code that
333 controls whether LOOP is to be executed. GUARD_EDGE is the edge that
334 originates from the guard-bb, skips LOOP and reaches the (unique) exit
335 bb of LOOP. This loop-exit-bb is an empty bb with one successor.
336 We denote this bb NEW_MERGE_BB because before the guard code was added
337 it had a single predecessor (the LOOP header), and now it became a merge
338 point of two paths - the path that ends with the LOOP exit-edge, and
339 the path that ends with GUARD_EDGE.
340 - NEW_EXIT_BB: New basic block that is added by this function between LOOP
341 and NEW_MERGE_BB. It is used to place loop-closed-ssa-form exit-phis.
343 ===> The CFG before the guard-code was added:
346 if (exit_loop) goto update_bb
347 else goto LOOP_header_bb
350 ==> The CFG after the guard-code was added:
352 if (LOOP_guard_condition) goto new_merge_bb
353 else goto LOOP_header_bb
356 if (exit_loop_condition) goto new_merge_bb
357 else goto LOOP_header_bb
362 ==> The CFG after this function:
364 if (LOOP_guard_condition) goto new_merge_bb
365 else goto LOOP_header_bb
368 if (exit_loop_condition) goto new_exit_bb
369 else goto LOOP_header_bb
376 1. creates and updates the relevant phi nodes to account for the new
377 incoming edge (GUARD_EDGE) into NEW_MERGE_BB. This involves:
378 1.1. Create phi nodes at NEW_MERGE_BB.
379 1.2. Update the phi nodes at the successor of NEW_MERGE_BB (denoted
380 UPDATE_BB). UPDATE_BB was the exit-bb of LOOP before NEW_MERGE_BB
381 2. preserves loop-closed-ssa-form by creating the required phi nodes
382 at the exit of LOOP (i.e, in NEW_EXIT_BB).
384 There are two flavors to this function:
386 slpeel_update_phi_nodes_for_guard1:
387 Here the guard controls whether we enter or skip LOOP, where LOOP is a
388 prolog_loop (loop1 below), and the new phis created in NEW_MERGE_BB are
389 for variables that have phis in the loop header.
391 slpeel_update_phi_nodes_for_guard2:
392 Here the guard controls whether we enter or skip LOOP, where LOOP is an
393 epilog_loop (loop2 below), and the new phis created in NEW_MERGE_BB are
394 for variables that have phis in the loop exit.
396 I.E., the overall structure is:
399 guard1 (goto loop1/merge1_bb)
402 guard2 (goto merge1_bb/merge2_bb)
409 slpeel_update_phi_nodes_for_guard1 takes care of creating phis in
410 loop1_exit_bb and merge1_bb. These are entry phis (phis for the vars
411 that have phis in loop1->header).
413 slpeel_update_phi_nodes_for_guard2 takes care of creating phis in
414 loop2_exit_bb and merge2_bb. These are exit phis (phis for the vars
415 that have phis in next_bb). It also adds some of these phis to
418 slpeel_update_phi_nodes_for_guard1 is always called before
419 slpeel_update_phi_nodes_for_guard2. They are both needed in order
420 to create correct data-flow and loop-closed-ssa-form.
422 Generally slpeel_update_phi_nodes_for_guard1 creates phis for variables
423 that change between iterations of a loop (and therefore have a phi-node
424 at the loop entry), whereas slpeel_update_phi_nodes_for_guard2 creates
425 phis for variables that are used out of the loop (and therefore have
426 loop-closed exit phis). Some variables may be both updated between
427 iterations and used after the loop. This is why in loop1_exit_bb we
428 may need both entry_phis (created by slpeel_update_phi_nodes_for_guard1)
429 and exit phis (created by slpeel_update_phi_nodes_for_guard2).
431 - IS_NEW_LOOP: if IS_NEW_LOOP is true, then LOOP is a newly created copy of
432 an original loop. i.e., we have:
435 guard_bb (goto LOOP/new_merge)
441 If IS_NEW_LOOP is false, then LOOP is an original loop, in which case we
445 guard_bb (goto LOOP/new_merge)
451 The SSA names defined in the original loop have a current
452 reaching definition that records the corresponding new
453 ssa-name used in the new duplicated loop copy.
456 /* Function slpeel_update_phi_nodes_for_guard1
459 - GUARD_EDGE, LOOP, IS_NEW_LOOP, NEW_EXIT_BB - as explained above.
460 - DEFS - a bitmap of ssa names to mark new names for which we recorded
463 In the context of the overall structure, we have:
466 guard1 (goto loop1/merge1_bb)
469 guard2 (goto merge1_bb/merge2_bb)
476 For each name updated between loop iterations (i.e - for each name that has
477 an entry (loop-header) phi in LOOP) we create a new phi in:
478 1. merge1_bb (to account for the edge from guard1)
479 2. loop1_exit_bb (an exit-phi to keep LOOP in loop-closed form)
483 slpeel_update_phi_nodes_for_guard1 (edge guard_edge, struct loop *loop,
484 bool is_new_loop, basic_block *new_exit_bb,
487 tree orig_phi, new_phi;
488 tree update_phi, update_phi2;
489 tree guard_arg, loop_arg;
490 basic_block new_merge_bb = guard_edge->dest;
491 edge e = EDGE_SUCC (new_merge_bb, 0);
492 basic_block update_bb = e->dest;
493 basic_block orig_bb = loop->header;
495 tree current_new_name;
498 /* Create new bb between loop and new_merge_bb. */
499 *new_exit_bb = split_edge (single_exit (loop));
501 new_exit_e = EDGE_SUCC (*new_exit_bb, 0);
/* Walk the loop-header phis of LOOP and the phis of UPDATE_BB in lockstep;
   they are expected to correspond pairwise.  */
503 for (orig_phi = phi_nodes (orig_bb), update_phi = phi_nodes (update_bb);
504 orig_phi && update_phi;
505 orig_phi = PHI_CHAIN (orig_phi), update_phi = PHI_CHAIN (update_phi))
507 /* Virtual phi; Mark it for renaming. We actually want to call
508 mark_sym_for_renaming, but since all ssa renaming datastructures
509 are going to be freed before we get to call ssa_update, we just
510 record this name for now in a bitmap, and will mark it for
512 name = PHI_RESULT (orig_phi);
513 if (!is_gimple_reg (SSA_NAME_VAR (name)))
514 bitmap_set_bit (vect_memsyms_to_rename, DECL_UID (SSA_NAME_VAR (name)));
516 /** 1. Handle new-merge-point phis **/
518 /* 1.1. Generate new phi node in NEW_MERGE_BB: */
519 new_phi = create_phi_node (SSA_NAME_VAR (PHI_RESULT (orig_phi)),
522 /* 1.2. NEW_MERGE_BB has two incoming edges: GUARD_EDGE and the exit-edge
523 of LOOP. Set the two phi args in NEW_PHI for these edges: */
524 loop_arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, EDGE_SUCC (loop->latch, 0));
525 guard_arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, loop_preheader_edge (loop));
527 add_phi_arg (new_phi, loop_arg, new_exit_e);
528 add_phi_arg (new_phi, guard_arg, guard_edge);
530 /* 1.3. Update phi in successor block. */
531 gcc_assert (PHI_ARG_DEF_FROM_EDGE (update_phi, e) == loop_arg
532 || PHI_ARG_DEF_FROM_EDGE (update_phi, e) == guard_arg);
533 SET_PHI_ARG_DEF (update_phi, e->dest_idx, PHI_RESULT (new_phi));
534 update_phi2 = new_phi;
537 /** 2. Handle loop-closed-ssa-form phis **/
/* NOTE(review): this test presumably skips virtual (non-register) phis
   for the loop-closed handling below -- the guarded statement is not
   visible in this fragment.  */
539 if (!is_gimple_reg (PHI_RESULT (orig_phi)))
542 /* 2.1. Generate new phi node in NEW_EXIT_BB: */
543 new_phi = create_phi_node (SSA_NAME_VAR (PHI_RESULT (orig_phi)),
546 /* 2.2. NEW_EXIT_BB has one incoming edge: the exit-edge of the loop. */
547 add_phi_arg (new_phi, loop_arg, single_exit (loop));
549 /* 2.3. Update phi in successor of NEW_EXIT_BB: */
550 gcc_assert (PHI_ARG_DEF_FROM_EDGE (update_phi2, new_exit_e) == loop_arg);
551 SET_PHI_ARG_DEF (update_phi2, new_exit_e->dest_idx, PHI_RESULT (new_phi));
553 /* 2.4. Record the newly created name with set_current_def.
554 We want to find a name such that
555 name = get_current_def (orig_loop_name)
556 and to set its current definition as follows:
557 set_current_def (name, new_phi_name)
559 If LOOP is a new loop then loop_arg is already the name we're
560 looking for. If LOOP is the original loop, then loop_arg is
561 the orig_loop_name and the relevant name is recorded in its
562 current reaching definition. */
564 current_new_name = loop_arg;
567 current_new_name = get_current_def (loop_arg);
568 /* current_def is not available only if the variable does not
569 change inside the loop, in which case we also don't care
570 about recording a current_def for it because we won't be
571 trying to create loop-exit-phis for it. */
572 if (!current_new_name)
575 gcc_assert (get_current_def (current_new_name) == NULL_TREE);
577 set_current_def (current_new_name, PHI_RESULT (new_phi));
578 bitmap_set_bit (*defs, SSA_NAME_VERSION (current_new_name));
/* Restore the order of the phis created at NEW_MERGE_BB (presumably
   create_phi_node chains them in reverse).  */
581 set_phi_nodes (new_merge_bb, phi_reverse (phi_nodes (new_merge_bb)));
585 /* Function slpeel_update_phi_nodes_for_guard2
588 - GUARD_EDGE, LOOP, IS_NEW_LOOP, NEW_EXIT_BB - as explained above.
590 In the context of the overall structure, we have:
593 guard1 (goto loop1/merge1_bb)
596 guard2 (goto merge1_bb/merge2_bb)
603 For each name used outside the loop (i.e - for each name that has an exit
604 phi in next_bb) we create a new phi in:
605 1. merge2_bb (to account for the edge from guard_bb)
606 2. loop2_exit_bb (an exit-phi to keep LOOP in loop-closed form)
607 3. guard2 bb (an exit phi to keep the preceding loop in loop-closed form),
608 if needed (if it wasn't handled by slpeel_update_phi_nodes_for_guard1).
612 slpeel_update_phi_nodes_for_guard2 (edge guard_edge, struct loop *loop,
613 bool is_new_loop, basic_block *new_exit_bb)
615 tree orig_phi, new_phi;
616 tree update_phi, update_phi2;
617 tree guard_arg, loop_arg;
618 basic_block new_merge_bb = guard_edge->dest;
619 edge e = EDGE_SUCC (new_merge_bb, 0);
620 basic_block update_bb = e->dest;
622 tree orig_def, orig_def_new_name;
623 tree new_name, new_name2;
626 /* Create new bb between loop and new_merge_bb. */
627 *new_exit_bb = split_edge (single_exit (loop));
629 new_exit_e = EDGE_SUCC (*new_exit_bb, 0);
/* Walk the loop-closed-ssa exit phis of UPDATE_BB.  */
631 for (update_phi = phi_nodes (update_bb); update_phi;
632 update_phi = PHI_CHAIN (update_phi))
634 orig_phi = update_phi;
635 orig_def = PHI_ARG_DEF_FROM_EDGE (orig_phi, e);
636 /* This loop-closed-phi actually doesn't represent a use
637 out of the loop - the phi arg is a constant. */
638 if (TREE_CODE (orig_def) != SSA_NAME)
640 orig_def_new_name = get_current_def (orig_def);
643 /** 1. Handle new-merge-point phis **/
645 /* 1.1. Generate new phi node in NEW_MERGE_BB: */
646 new_phi = create_phi_node (SSA_NAME_VAR (PHI_RESULT (orig_phi)),
649 /* 1.2. NEW_MERGE_BB has two incoming edges: GUARD_EDGE and the exit-edge
650 of LOOP. Set the two PHI args in NEW_PHI for these edges: */
652 new_name2 = NULL_TREE;
653 if (orig_def_new_name)
655 new_name = orig_def_new_name;
656 /* Some variables have both loop-entry-phis and loop-exit-phis.
657 Such variables were given yet newer names by phis placed in
658 guard_bb by slpeel_update_phi_nodes_for_guard1. I.e:
659 new_name2 = get_current_def (get_current_def (orig_name)). */
660 new_name2 = get_current_def (new_name);
/* Select the guard-edge argument: the original name, its renamed copy,
   or the second-level renamed copy -- the conditions choosing between
   these three assignments are not visible in this fragment.  */
665 guard_arg = orig_def;
670 guard_arg = new_name;
674 guard_arg = new_name2;
676 add_phi_arg (new_phi, loop_arg, new_exit_e);
677 add_phi_arg (new_phi, guard_arg, guard_edge);
679 /* 1.3. Update phi in successor block. */
680 gcc_assert (PHI_ARG_DEF_FROM_EDGE (update_phi, e) == orig_def);
681 SET_PHI_ARG_DEF (update_phi, e->dest_idx, PHI_RESULT (new_phi));
682 update_phi2 = new_phi;
685 /** 2. Handle loop-closed-ssa-form phis **/
687 /* 2.1. Generate new phi node in NEW_EXIT_BB: */
688 new_phi = create_phi_node (SSA_NAME_VAR (PHI_RESULT (orig_phi)),
691 /* 2.2. NEW_EXIT_BB has one incoming edge: the exit-edge of the loop. */
692 add_phi_arg (new_phi, loop_arg, single_exit (loop));
694 /* 2.3. Update phi in successor of NEW_EXIT_BB: */
695 gcc_assert (PHI_ARG_DEF_FROM_EDGE (update_phi2, new_exit_e) == loop_arg);
696 SET_PHI_ARG_DEF (update_phi2, new_exit_e->dest_idx, PHI_RESULT (new_phi));
699 /** 3. Handle loop-closed-ssa-form phis for first loop **/
701 /* 3.1. Find the relevant names that need an exit-phi in
702 GUARD_BB, i.e. names for which
703 slpeel_update_phi_nodes_for_guard1 had not already created a
704 phi node. This is the case for names that are used outside
705 the loop (and therefore need an exit phi) but are not updated
706 across loop iterations (and therefore don't have a
709 slpeel_update_phi_nodes_for_guard1 is responsible for
710 creating loop-exit phis in GUARD_BB for names that have a
711 loop-header-phi. When such a phi is created we also record
712 the new name in its current definition. If this new name
713 exists, then guard_arg was set to this new name (see 1.2
714 above). Therefore, if guard_arg is not this new name, this
715 is an indication that an exit-phi in GUARD_BB was not yet
716 created, so we take care of it here. */
717 if (guard_arg == new_name2)
721 /* 3.2. Generate new phi node in GUARD_BB: */
722 new_phi = create_phi_node (SSA_NAME_VAR (PHI_RESULT (orig_phi)),
725 /* 3.3. GUARD_BB has one incoming edge: */
726 gcc_assert (EDGE_COUNT (guard_edge->src->preds) == 1);
727 add_phi_arg (new_phi, arg, EDGE_PRED (guard_edge->src, 0));
729 /* 3.4. Update phi in successor of GUARD_BB: */
730 gcc_assert (PHI_ARG_DEF_FROM_EDGE (update_phi2, guard_edge)
732 SET_PHI_ARG_DEF (update_phi2, guard_edge->dest_idx, PHI_RESULT (new_phi));
/* Restore the order of the phis created at NEW_MERGE_BB (presumably
   create_phi_node chains them in reverse).  */
735 set_phi_nodes (new_merge_bb, phi_reverse (phi_nodes (new_merge_bb)));
739 /* Make the LOOP iterate NITERS times. This is done by adding a new IV
740 that starts at zero, increases by one and its limit is NITERS.
742 Assumption: the exit-condition of LOOP is the last stmt in the loop. */
745 slpeel_make_loop_iterate_ntimes (struct loop *loop, tree niters)
747 tree indx_before_incr, indx_after_incr, cond_stmt, cond;
749 edge exit_edge = single_exit (loop);
750 block_stmt_iterator loop_cond_bsi;
751 block_stmt_iterator incr_bsi;
/* The new IV counts from 0 by steps of 1, in the type of NITERS.  */
753 tree init = build_int_cst (TREE_TYPE (niters), 0);
754 tree step = build_int_cst (TREE_TYPE (niters), 1);
/* Locate the existing exit condition; the new test is inserted at its
   position and the old one removed below.  */
757 orig_cond = get_loop_exit_condition (loop);
758 gcc_assert (orig_cond);
759 loop_cond_bsi = bsi_for_stmt (orig_cond);
761 standard_iv_increment_position (loop, &incr_bsi, &insert_after);
762 create_iv (init, step, NULL_TREE, loop,
763 &incr_bsi, insert_after, &indx_before_incr, &indx_after_incr);
/* Build the exit test on the incremented IV, oriented to match which
   branch of the original condition exits the loop.  */
765 if (exit_edge->flags & EDGE_TRUE_VALUE) /* 'then' edge exits the loop. */
766 cond = build2 (GE_EXPR, boolean_type_node, indx_after_incr, niters);
767 else /* 'then' edge loops back. */
768 cond = build2 (LT_EXPR, boolean_type_node, indx_after_incr, niters);
770 cond_stmt = build3 (COND_EXPR, TREE_TYPE (orig_cond), cond,
771 NULL_TREE, NULL_TREE);
772 bsi_insert_before (&loop_cond_bsi, cond_stmt, BSI_SAME_STMT);
774 /* Remove old loop exit test: */
775 bsi_remove (&loop_cond_bsi, true);
/* Dump the new exit condition, with the loop's source location when known.  */
777 loop_loc = find_loop_location (loop);
778 if (dump_file && (dump_flags & TDF_DETAILS))
780 if (loop_loc != UNKNOWN_LOC)
781 fprintf (dump_file, "\nloop at %s:%d: ",
782 LOC_FILE (loop_loc), LOC_LINE (loop_loc));
783 print_generic_expr (dump_file, cond_stmt, TDF_SLIM);
/* Record the (known) iteration count on the loop structure.  */
786 loop->nb_iterations = niters;
790 /* Given LOOP this function generates a new copy of it and puts it
791 on E which is either the entry or exit of LOOP. */
794 slpeel_tree_duplicate_loop_to_edge_cfg (struct loop *loop, edge e)
796 struct loop *new_loop;
797 basic_block *new_bbs, *bbs;
800 basic_block exit_dest;
/* E must be either the single exit edge or the preheader edge of LOOP.  */
804 at_exit = (e == single_exit (loop));
805 if (!at_exit && e != loop_preheader_edge (loop))
808 bbs = get_loop_body (loop);
810 /* Check whether duplication is possible. */
811 if (!can_copy_bbs_p (bbs, loop->num_nodes))
817 /* Generate new loop structure. */
818 new_loop = duplicate_loop (loop, loop_outer (loop))
825 exit_dest = single_exit (loop)->dest;
/* Remember whether the exit destination was immediately dominated by the
   loop header, so dominators can be fixed up after the copy.  */
826 was_imm_dom = (get_immediate_dominator (CDI_DOMINATORS,
827 exit_dest) == loop->header ?
830 new_bbs = XNEWVEC (basic_block, loop->num_nodes);
832 exit = single_exit (loop);
833 copy_bbs (bbs, loop->num_nodes, new_bbs,
834 &exit, 1, &new_exit, NULL,
837 /* Duplicating phi args at exit bbs as coming
838 also from exit of duplicated loop. */
839 for (phi = phi_nodes (exit_dest); phi; phi = PHI_CHAIN (phi))
841 phi_arg = PHI_ARG_DEF_FROM_EDGE (phi, single_exit (loop));
844 edge new_loop_exit_edge;
/* Pick whichever successor of the copied header leaves the loop
   (the other one goes to the latch).  */
846 if (EDGE_SUCC (new_loop->header, 0)->dest == new_loop->latch)
847 new_loop_exit_edge = EDGE_SUCC (new_loop->header, 1);
849 new_loop_exit_edge = EDGE_SUCC (new_loop->header, 0);
851 add_phi_arg (phi, phi_arg, new_loop_exit_edge);
855 if (at_exit) /* Add the loop copy at exit. */
857 redirect_edge_and_branch_force (e, new_loop->header);
858 PENDING_STMT (e) = NULL;
859 set_immediate_dominator (CDI_DOMINATORS, new_loop->header, e->src);
861 set_immediate_dominator (CDI_DOMINATORS, exit_dest, new_loop->header);
863 else /* Add the copy at entry. */
866 edge entry_e = loop_preheader_edge (loop);
867 basic_block preheader = entry_e->src;
/* Find the exit edge of the copied loop: the header successor whose
   destination lies outside the new loop.  */
869 if (!flow_bb_inside_loop_p (new_loop,
870 EDGE_SUCC (new_loop->header, 0)->dest))
871 new_exit_e = EDGE_SUCC (new_loop->header, 0);
873 new_exit_e = EDGE_SUCC (new_loop->header, 1);
875 redirect_edge_and_branch_force (new_exit_e, loop->header);
876 PENDING_STMT (new_exit_e) = NULL;
877 set_immediate_dominator (CDI_DOMINATORS, loop->header,
880 /* We have to add phi args to the loop->header here as coming
881 from new_exit_e edge. */
882 for (phi = phi_nodes (loop->header); phi; phi = PHI_CHAIN (phi))
884 phi_arg = PHI_ARG_DEF_FROM_EDGE (phi, entry_e);
886 add_phi_arg (phi, phi_arg, new_exit_e);
889 redirect_edge_and_branch_force (entry_e, new_loop->header);
890 PENDING_STMT (entry_e) = NULL;
891 set_immediate_dominator (CDI_DOMINATORS, new_loop->header, preheader);
901 /* Given the condition statement COND, put it as the last statement
902 of GUARD_BB; EXIT_BB is the basic block to skip the loop;
903 Assumes that this is the single exit of the guarded loop.
904 Returns the skip edge. */
907 slpeel_add_loop_guard (basic_block guard_bb, tree cond, basic_block exit_bb,
910 block_stmt_iterator bsi;
913 tree gimplify_stmt_list;
/* The existing fall-through successor of GUARD_BB becomes the
   false (enter-the-loop) edge of the new guard.  */
915 enter_e = EDGE_SUCC (guard_bb, 0);
916 enter_e->flags &= ~EDGE_FALLTHRU;
917 enter_e->flags |= EDGE_FALSE_VALUE;
918 bsi = bsi_last (guard_bb);
/* Gimplify COND; any statements it produces are inserted before the
   COND_EXPR itself.  */
921 force_gimple_operand (cond, &gimplify_stmt_list, true,
923 cond_stmt = build3 (COND_EXPR, void_type_node, cond,
924 NULL_TREE, NULL_TREE);
925 if (gimplify_stmt_list)
926 bsi_insert_after (&bsi, gimplify_stmt_list, BSI_NEW_STMT);
928 bsi = bsi_last (guard_bb);
929 bsi_insert_after (&bsi, cond_stmt, BSI_NEW_STMT);
931 /* Add new edge to connect guard block to the merge/loop-exit block. */
932 new_e = make_edge (guard_bb, exit_bb, EDGE_TRUE_VALUE);
933 set_immediate_dominator (CDI_DOMINATORS, exit_bb, dom_bb);
938 /* This function verifies that the following restrictions apply to LOOP:
940 (2) it consists of exactly 2 basic blocks - header, and an empty latch.
941 (3) it is single entry, single exit
942 (4) its exit condition is the last stmt in the header
943 (5) E is the entry/exit edge of LOOP.
947 slpeel_can_duplicate_loop_p (const struct loop *loop, const_edge e)
949 edge exit_e = single_exit (loop);
950 edge entry_e = loop_preheader_edge (loop);
951 tree orig_cond = get_loop_exit_condition (loop);
952 block_stmt_iterator loop_exit_bsi = bsi_last (exit_e->src);
/* Peeling is refused when there are pending SSA updates, or when any
   of the structural restrictions below is violated.  */
954 if (need_ssa_update_p ())
958 /* All loops have an outer scope; the only case loop->outer is NULL is for
959 the function itself. */
960 || !loop_outer (loop)
961 || loop->num_nodes != 2
962 || !empty_block_p (loop->latch)
963 || !single_exit (loop)
964 /* Verify that new loop exit condition can be trivially modified. */
965 || (!orig_cond || orig_cond != bsi_stmt (loop_exit_bsi))
966 || (e != exit_e && e != entry_e))
972 #ifdef ENABLE_CHECKING
/* Sanity-checks (via gcc_assert) the CFG shape produced by loop peeling:
   FIRST_LOOP's exit block must branch both to SECOND_LOOP's preheader
   and to the skip path.  Compiled only with ENABLE_CHECKING.  */
974 slpeel_verify_cfg_after_peeling (struct loop *first_loop,
975 struct loop *second_loop)
977 basic_block loop1_exit_bb = single_exit (first_loop)->dest;
978 basic_block loop2_entry_bb = loop_preheader_edge (second_loop)->src;
979 basic_block loop1_entry_bb = loop_preheader_edge (first_loop)->src;
981 /* A guard that controls whether the second_loop is to be executed or skipped
982 is placed in first_loop->exit. first_loop->exit therefore has two
983 successors - one is the preheader of second_loop, and the other is a bb
986 gcc_assert (EDGE_COUNT (loop1_exit_bb->succs) == 2);
988 /* 1. Verify that one of the successors of first_loop->exit is the preheader
991 /* The preheader of new_loop is expected to have two predecessors:
992 first_loop->exit and the block that precedes first_loop. */
994 gcc_assert (EDGE_COUNT (loop2_entry_bb->preds) == 2
995 && ((EDGE_PRED (loop2_entry_bb, 0)->src == loop1_exit_bb
996 && EDGE_PRED (loop2_entry_bb, 1)->src == loop1_entry_bb)
997 || (EDGE_PRED (loop2_entry_bb, 1)->src == loop1_exit_bb
998 && EDGE_PRED (loop2_entry_bb, 0)->src == loop1_entry_bb)));
1000 /* Verify that the other successor of first_loop->exit is after the
1006 /* If the run time cost model check determines that vectorization is
1007 not profitable and hence scalar loop should be generated then set
1008 FIRST_NITERS to prologue peeled iterations. This will allow all the
1009 iterations to be executed in the prologue peeled scalar loop. */
1012 set_prologue_iterations (basic_block bb_before_first_loop,
1018 basic_block cond_bb, then_bb;
1019 tree var, prologue_after_cost_adjust_name, stmt;
1020 block_stmt_iterator bsi;
1022 edge e_true, e_false, e_fallthru;
1024 tree gimplify_stmt_list;
1025 tree cost_pre_condition = NULL_TREE;
/* The unchanged scalar iteration count of the loop, unshared so it can
   be re-gimplified below.  */
1026 tree scalar_loop_iters =
1027 unshare_expr (LOOP_VINFO_NITERS_UNCHANGED (loop_vec_info_for_loop (loop)));
/* Split the edge before the first loop twice, creating COND_BB (holds the
   cost-model test) and THEN_BB (taken when vectorization is unprofitable).  */
1029 e = single_pred_edge (bb_before_first_loop);
1030 cond_bb = split_edge(e);
1032 e = single_pred_edge (bb_before_first_loop);
1033 then_bb = split_edge(e);
1034 set_immediate_dominator (CDI_DOMINATORS, then_bb, cond_bb);
1036 e_false = make_single_succ_edge (cond_bb, bb_before_first_loop,
1038 set_immediate_dominator (CDI_DOMINATORS, bb_before_first_loop, cond_bb);
1040 e_true = EDGE_PRED (then_bb, 0);
1041 e_true->flags &= ~EDGE_FALLTHRU;
1042 e_true->flags |= EDGE_TRUE_VALUE;
1044 e_fallthru = EDGE_SUCC (then_bb, 0);
/* Test: scalar_loop_iters <= TH, i.e. the loop is too small to be
   profitably vectorized.  */
1046 cost_pre_condition =
1047 build2 (LE_EXPR, boolean_type_node, scalar_loop_iters,
1048 build_int_cst (TREE_TYPE (scalar_loop_iters), th));
1049 cost_pre_condition =
1050 force_gimple_operand (cost_pre_condition, &gimplify_stmt_list,
1052 cond_stmt = build3 (COND_EXPR, void_type_node, cost_pre_condition,
1053 NULL_TREE, NULL_TREE);
1055 bsi = bsi_last (cond_bb);
1056 if (gimplify_stmt_list)
1057 bsi_insert_after (&bsi, gimplify_stmt_list, BSI_NEW_STMT);
1059 bsi = bsi_last (cond_bb);
1060 bsi_insert_after (&bsi, cond_stmt, BSI_NEW_STMT);
/* Materialize the full scalar iteration count in THEN_BB.  */
1062 var = create_tmp_var (TREE_TYPE (scalar_loop_iters),
1063 "prologue_after_cost_adjust");
1064 add_referenced_var (var);
1065 prologue_after_cost_adjust_name =
1066 force_gimple_operand (scalar_loop_iters, &stmt, false, var);
1068 bsi = bsi_last (then_bb);
1070 bsi_insert_after (&bsi, stmt, BSI_NEW_STMT);
/* Merge the two values with a phi: the full scalar count when the loop is
   unprofitable to vectorize, the original FIRST_NITERS otherwise.  */
1072 newphi = create_phi_node (var, bb_before_first_loop);
1073 add_phi_arg (newphi, prologue_after_cost_adjust_name, e_fallthru);
1074 add_phi_arg (newphi, first_niters, e_false);
1076 first_niters = PHI_RESULT (newphi);
1080 /* Function slpeel_tree_peel_loop_to_edge.
1082 Peel the first (last) iterations of LOOP into a new prolog (epilog) loop
1083 that is placed on the entry (exit) edge E of LOOP. After this transformation
1084 we have two loops one after the other - first-loop iterates FIRST_NITERS
1085 times, and second-loop iterates the remainder NITERS - FIRST_NITERS times.
1086 If the cost model indicates that it is profitable to emit a scalar
1087 loop instead of the vector one, then the prolog (epilog) loop will iterate
1088 for the entire unchanged scalar iterations of the loop.
1091 - LOOP: the loop to be peeled.
1092 - E: the exit or entry edge of LOOP.
1093 If it is the entry edge, we peel the first iterations of LOOP. In this
1094 case first-loop is LOOP, and second-loop is the newly created loop.
1095 If it is the exit edge, we peel the last iterations of LOOP. In this
1096 case, first-loop is the newly created loop, and second-loop is LOOP.
1097 - NITERS: the number of iterations that LOOP iterates.
1098 - FIRST_NITERS: the number of iterations that the first-loop should iterate.
1099    - UPDATE_FIRST_LOOP_COUNT: specifies whether this function is responsible
1100 for updating the loop bound of the first-loop to FIRST_NITERS. If it
1101 is false, the caller of this function may want to take care of this
1102 (this can be useful if we don't want new stmts added to first-loop).
1103 - TH: cost model profitability threshold of iterations for vectorization.
1104 - CHECK_PROFITABILITY: specify whether cost model check has not occured
1105 during versioning and hence needs to occur during
1106 prologue generation or whether cost model check
1107 has not occured during prologue generation and hence
1108 needs to occur during epilogue generation.
1112 The function returns a pointer to the new loop-copy, or NULL if it failed
1113 to perform the transformation.
1115 The function generates two if-then-else guards: one before the first loop,
1116 and the other before the second loop:
1118 if (FIRST_NITERS == 0) then skip the first loop,
1119 and go directly to the second loop.
1120 The second guard is:
1121 if (FIRST_NITERS == NITERS) then skip the second loop.
1123 FORNOW only simple loops are supported (see slpeel_can_duplicate_loop_p).
1124 FORNOW the resulting code will not be in loop-closed-ssa form.
1128 slpeel_tree_peel_loop_to_edge (struct loop *loop,
1129 edge e, tree first_niters,
1130 tree niters, bool update_first_loop_count,
1131 unsigned int th, bool check_profitability)
1133 struct loop *new_loop = NULL, *first_loop, *second_loop;
1135 tree pre_condition = NULL_TREE;
1137 basic_block bb_before_second_loop, bb_after_second_loop;
1138 basic_block bb_before_first_loop;
1139 basic_block bb_between_loops;
1140 basic_block new_exit_bb;
1141 edge exit_e = single_exit (loop);
1143 tree cost_pre_condition = NULL_TREE;
1145 if (!slpeel_can_duplicate_loop_p (loop, e))
1148 /* We have to initialize cfg_hooks. Then, when calling
1149 cfg_hooks->split_edge, the function tree_split_edge
1150 is actually called and, when calling cfg_hooks->duplicate_block,
1151 the function tree_duplicate_bb is called. */
1152 tree_register_cfg_hooks ();
1155 /* 1. Generate a copy of LOOP and put it on E (E is the entry/exit of LOOP).
1156 Resulting CFG would be:
1169 if (!(new_loop = slpeel_tree_duplicate_loop_to_edge_cfg (loop, e)))
1171 loop_loc = find_loop_location (loop);
1172 if (dump_file && (dump_flags & TDF_DETAILS))
1174 if (loop_loc != UNKNOWN_LOC)
1175 fprintf (dump_file, "\n%s:%d: note: ",
1176 LOC_FILE (loop_loc), LOC_LINE (loop_loc));
1177 fprintf (dump_file, "tree_duplicate_loop_to_edge_cfg failed.\n");
1184 /* NEW_LOOP was placed after LOOP. */
1186 second_loop = new_loop;
1190 /* NEW_LOOP was placed before LOOP. */
1191 first_loop = new_loop;
1195 definitions = ssa_names_to_replace ();
1196 slpeel_update_phis_for_duplicate_loop (loop, new_loop, e == exit_e);
1197 rename_variables_in_loop (new_loop);
1200 /* 2. Add the guard code in one of the following ways:
1202 2.a Add the guard that controls whether the first loop is executed.
1203 This occurs when this function is invoked for prologue or epilogiue
1204 generation and when the cost model check can be done at compile time.
1206 Resulting CFG would be:
1208 bb_before_first_loop:
1209 if (FIRST_NITERS == 0) GOTO bb_before_second_loop
1216 bb_before_second_loop:
1224 2.b Add the cost model check that allows the prologue
1225 to iterate for the entire unchanged scalar
1226 iterations of the loop in the event that the cost
1227 model indicates that the scalar loop is more
1228 profitable than the vector one. This occurs when
1229 this function is invoked for prologue generation
1230 and the cost model check needs to be done at run
1233 Resulting CFG after prologue peeling would be:
1235 if (scalar_loop_iterations <= th)
1236 FIRST_NITERS = scalar_loop_iterations
1238 bb_before_first_loop:
1239 if (FIRST_NITERS == 0) GOTO bb_before_second_loop
1246 bb_before_second_loop:
1254 2.c Add the cost model check that allows the epilogue
1255 to iterate for the entire unchanged scalar
1256 iterations of the loop in the event that the cost
1257 model indicates that the scalar loop is more
1258 profitable than the vector one. This occurs when
1259 this function is invoked for epilogue generation
1260 and the cost model check needs to be done at run
1263 Resulting CFG after prologue peeling would be:
1265 bb_before_first_loop:
1266 if ((scalar_loop_iterations <= th)
1268 FIRST_NITERS == 0) GOTO bb_before_second_loop
1275 bb_before_second_loop:
1284 bb_before_first_loop = split_edge (loop_preheader_edge (first_loop));
1285 bb_before_second_loop = split_edge (single_exit (first_loop));
1287 /* Epilogue peeling. */
1288 if (!update_first_loop_count)
1291 fold_build2 (LE_EXPR, boolean_type_node, first_niters,
1292 build_int_cst (TREE_TYPE (first_niters), 0));
1293 if (check_profitability)
1295 tree scalar_loop_iters
1296 = unshare_expr (LOOP_VINFO_NITERS_UNCHANGED
1297 (loop_vec_info_for_loop (loop)));
1298 cost_pre_condition =
1299 build2 (LE_EXPR, boolean_type_node, scalar_loop_iters,
1300 build_int_cst (TREE_TYPE (scalar_loop_iters), th));
1302 pre_condition = fold_build2 (TRUTH_OR_EXPR, boolean_type_node,
1303 cost_pre_condition, pre_condition);
1307 /* Prologue peeling. */
1310 if (check_profitability)
1311 set_prologue_iterations (bb_before_first_loop, first_niters,
1315 fold_build2 (LE_EXPR, boolean_type_node, first_niters,
1316 build_int_cst (TREE_TYPE (first_niters), 0));
1319 skip_e = slpeel_add_loop_guard (bb_before_first_loop, pre_condition,
1320 bb_before_second_loop, bb_before_first_loop);
1321 slpeel_update_phi_nodes_for_guard1 (skip_e, first_loop,
1322 first_loop == new_loop,
1323 &new_exit_bb, &definitions);
1326 /* 3. Add the guard that controls whether the second loop is executed.
1327 Resulting CFG would be:
1329 bb_before_first_loop:
1330 if (FIRST_NITERS == 0) GOTO bb_before_second_loop (skip first loop)
1338 if (FIRST_NITERS == NITERS) GOTO bb_after_second_loop (skip second loop)
1339 GOTO bb_before_second_loop
1341 bb_before_second_loop:
1347 bb_after_second_loop:
1352 bb_between_loops = new_exit_bb;
1353 bb_after_second_loop = split_edge (single_exit (second_loop));
1356 fold_build2 (EQ_EXPR, boolean_type_node, first_niters, niters);
1357 skip_e = slpeel_add_loop_guard (bb_between_loops, pre_condition,
1358 bb_after_second_loop, bb_before_first_loop);
1359 slpeel_update_phi_nodes_for_guard2 (skip_e, second_loop,
1360 second_loop == new_loop, &new_exit_bb);
1362 /* 4. Make first-loop iterate FIRST_NITERS times, if requested.
1364 if (update_first_loop_count)
1365 slpeel_make_loop_iterate_ntimes (first_loop, first_niters);
1367 BITMAP_FREE (definitions);
1368 delete_update_ssa ();
1373 /* Function vect_get_loop_location.
1375 Extract the location of the loop in the source code.
1376 If the loop is not well formed for vectorization, an estimated
1377 location is calculated.
1378 Return the loop location if succeed and NULL if not. */
1381 find_loop_location (struct loop *loop)
1383 tree node = NULL_TREE;
1385 block_stmt_iterator si;
1390 node = get_loop_exit_condition (loop);
1392 if (node && CAN_HAVE_LOCATION_P (node) && EXPR_HAS_LOCATION (node)
1393 && EXPR_FILENAME (node) && EXPR_LINENO (node))
1394 return EXPR_LOC (node);
1396 /* If we got here the loop is probably not "well formed",
1397 try to estimate the loop location */
1404 for (si = bsi_start (bb); !bsi_end_p (si); bsi_next (&si))
1406 node = bsi_stmt (si);
1407 if (node && CAN_HAVE_LOCATION_P (node) && EXPR_HAS_LOCATION (node))
1408 return EXPR_LOC (node);
1415 /*************************************************************************
1416 Vectorization Debug Information.
1417 *************************************************************************/
1419 /* Function vect_set_verbosity_level.
1421 Called from toplev.c upon detection of the
1422 -ftree-vectorizer-verbose=N option. */
1425 vect_set_verbosity_level (const char *val)
1430 if (vl < MAX_VERBOSITY_LEVEL)
1431 vect_verbosity_level = vl;
1433 vect_verbosity_level = MAX_VERBOSITY_LEVEL - 1;
1437 /* Function vect_set_dump_settings.
1439 Fix the verbosity level of the vectorizer if the
1440 requested level was not set explicitly using the flag
1441 -ftree-vectorizer-verbose=N.
1442 Decide where to print the debugging information (dump_file/stderr).
1443 If the user defined the verbosity level, but there is no dump file,
1444 print to stderr, otherwise print to the dump file. */
1447 vect_set_dump_settings (void)
1449 vect_dump = dump_file;
1451 /* Check if the verbosity level was defined by the user: */
1452 if (vect_verbosity_level != MAX_VERBOSITY_LEVEL)
1454 /* If there is no dump file, print to stderr. */
1460 /* User didn't specify verbosity level: */
1461 if (dump_file && (dump_flags & TDF_DETAILS))
1462 vect_verbosity_level = REPORT_DETAILS;
1463 else if (dump_file && (dump_flags & TDF_STATS))
1464 vect_verbosity_level = REPORT_UNVECTORIZED_LOOPS;
1466 vect_verbosity_level = REPORT_NONE;
1468 gcc_assert (dump_file || vect_verbosity_level == REPORT_NONE);
1472 /* Function vect_print_dump_info.
1474 For vectorization debug dumps. */
1477 vect_print_dump_info (enum verbosity_levels vl)
1479 if (vl > vect_verbosity_level)
1482 if (!current_function_decl || !vect_dump)
1485 if (vect_loop_location == UNKNOWN_LOC)
1486 fprintf (vect_dump, "\n%s:%d: note: ",
1487 DECL_SOURCE_FILE (current_function_decl),
1488 DECL_SOURCE_LINE (current_function_decl));
1490 fprintf (vect_dump, "\n%s:%d: note: ",
1491 LOC_FILE (vect_loop_location), LOC_LINE (vect_loop_location));
1497 /*************************************************************************
1498 Vectorization Utilities.
1499 *************************************************************************/
1501 /* Function new_stmt_vec_info.
1503 Create and initialize a new stmt_vec_info struct for STMT. */
1506 new_stmt_vec_info (tree stmt, loop_vec_info loop_vinfo)
1509 res = (stmt_vec_info) xcalloc (1, sizeof (struct _stmt_vec_info));
1511 STMT_VINFO_TYPE (res) = undef_vec_info_type;
1512 STMT_VINFO_STMT (res) = stmt;
1513 STMT_VINFO_LOOP_VINFO (res) = loop_vinfo;
1514 STMT_VINFO_RELEVANT (res) = 0;
1515 STMT_VINFO_LIVE_P (res) = false;
1516 STMT_VINFO_VECTYPE (res) = NULL;
1517 STMT_VINFO_VEC_STMT (res) = NULL;
1518 STMT_VINFO_IN_PATTERN_P (res) = false;
1519 STMT_VINFO_RELATED_STMT (res) = NULL;
1520 STMT_VINFO_DATA_REF (res) = NULL;
1522 STMT_VINFO_DR_BASE_ADDRESS (res) = NULL;
1523 STMT_VINFO_DR_OFFSET (res) = NULL;
1524 STMT_VINFO_DR_INIT (res) = NULL;
1525 STMT_VINFO_DR_STEP (res) = NULL;
1526 STMT_VINFO_DR_ALIGNED_TO (res) = NULL;
1528 if (TREE_CODE (stmt) == PHI_NODE && is_loop_header_bb_p (bb_for_stmt (stmt)))
1529 STMT_VINFO_DEF_TYPE (res) = vect_unknown_def_type;
1531 STMT_VINFO_DEF_TYPE (res) = vect_loop_def;
1532 STMT_VINFO_SAME_ALIGN_REFS (res) = VEC_alloc (dr_p, heap, 5);
1533 STMT_VINFO_INSIDE_OF_LOOP_COST (res) = 0;
1534 STMT_VINFO_OUTSIDE_OF_LOOP_COST (res) = 0;
1535 STMT_SLP_TYPE (res) = 0;
1536 DR_GROUP_FIRST_DR (res) = NULL_TREE;
1537 DR_GROUP_NEXT_DR (res) = NULL_TREE;
1538 DR_GROUP_SIZE (res) = 0;
1539 DR_GROUP_STORE_COUNT (res) = 0;
1540 DR_GROUP_GAP (res) = 0;
1541 DR_GROUP_SAME_DR_STMT (res) = NULL_TREE;
1542 DR_GROUP_READ_WRITE_DEPENDENCE (res) = false;
1548 /* Function bb_in_loop_p
1550 Used as predicate for dfs order traversal of the loop bbs. */
1553 bb_in_loop_p (const_basic_block bb, const void *data)
1555 const struct loop *const loop = (const struct loop *)data;
1556 if (flow_bb_inside_loop_p (loop, bb))
1562 /* Function new_loop_vec_info.
1564 Create and initialize a new loop_vec_info struct for LOOP, as well as
1565 stmt_vec_info structs for all the stmts in LOOP. */
1568 new_loop_vec_info (struct loop *loop)
1572 block_stmt_iterator si;
1573 unsigned int i, nbbs;
1575 res = (loop_vec_info) xcalloc (1, sizeof (struct _loop_vec_info));
1576 LOOP_VINFO_LOOP (res) = loop;
1578 bbs = get_loop_body (loop);
1580 /* Create/Update stmt_info for all stmts in the loop. */
1581 for (i = 0; i < loop->num_nodes; i++)
1583 basic_block bb = bbs[i];
1586 /* BBs in a nested inner-loop will have been already processed (because
1587 we will have called vect_analyze_loop_form for any nested inner-loop).
1588 Therefore, for stmts in an inner-loop we just want to update the
1589 STMT_VINFO_LOOP_VINFO field of their stmt_info to point to the new
1590 loop_info of the outer-loop we are currently considering to vectorize
1591 (instead of the loop_info of the inner-loop).
1592 For stmts in other BBs we need to create a stmt_info from scratch. */
1593 if (bb->loop_father != loop)
1595 /* Inner-loop bb. */
1596 gcc_assert (loop->inner && bb->loop_father == loop->inner);
1597 for (phi = phi_nodes (bb); phi; phi = PHI_CHAIN (phi))
1599 stmt_vec_info stmt_info = vinfo_for_stmt (phi);
1600 loop_vec_info inner_loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1601 gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
1602 STMT_VINFO_LOOP_VINFO (stmt_info) = res;
1604 for (si = bsi_start (bb); !bsi_end_p (si); bsi_next (&si))
1606 tree stmt = bsi_stmt (si);
1607 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1608 loop_vec_info inner_loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1609 gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
1610 STMT_VINFO_LOOP_VINFO (stmt_info) = res;
1615 /* bb in current nest. */
1616 for (phi = phi_nodes (bb); phi; phi = PHI_CHAIN (phi))
1618 stmt_ann_t ann = get_stmt_ann (phi);
1619 set_stmt_info (ann, new_stmt_vec_info (phi, res));
1622 for (si = bsi_start (bb); !bsi_end_p (si); bsi_next (&si))
1624 tree stmt = bsi_stmt (si);
1625 stmt_ann_t ann = stmt_ann (stmt);
1626 set_stmt_info (ann, new_stmt_vec_info (stmt, res));
1631 /* CHECKME: We want to visit all BBs before their successors (except for
1632 latch blocks, for which this assertion wouldn't hold). In the simple
1633 case of the loop forms we allow, a dfs order of the BBs would the same
1634 as reversed postorder traversal, so we are safe. */
1637 bbs = XCNEWVEC (basic_block, loop->num_nodes);
1638 nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
1639 bbs, loop->num_nodes, loop);
1640 gcc_assert (nbbs == loop->num_nodes);
1642 LOOP_VINFO_BBS (res) = bbs;
1643 LOOP_VINFO_NITERS (res) = NULL;
1644 LOOP_VINFO_NITERS_UNCHANGED (res) = NULL;
1645 LOOP_VINFO_COST_MODEL_MIN_ITERS (res) = 0;
1646 LOOP_VINFO_VECTORIZABLE_P (res) = 0;
1647 LOOP_PEELING_FOR_ALIGNMENT (res) = 0;
1648 LOOP_VINFO_VECT_FACTOR (res) = 0;
1649 LOOP_VINFO_DATAREFS (res) = VEC_alloc (data_reference_p, heap, 10);
1650 LOOP_VINFO_DDRS (res) = VEC_alloc (ddr_p, heap, 10 * 10);
1651 LOOP_VINFO_UNALIGNED_DR (res) = NULL;
1652 LOOP_VINFO_MAY_MISALIGN_STMTS (res) =
1653 VEC_alloc (tree, heap, PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS));
1654 LOOP_VINFO_MAY_ALIAS_DDRS (res) =
1655 VEC_alloc (ddr_p, heap, PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
1656 LOOP_VINFO_STRIDED_STORES (res) = VEC_alloc (tree, heap, 10);
1657 LOOP_VINFO_SLP_INSTANCES (res) = VEC_alloc (slp_instance, heap, 10);
1658 LOOP_VINFO_SLP_UNROLLING_FACTOR (res) = 1;
1664 /* Function destroy_loop_vec_info.
1666 Free LOOP_VINFO struct, as well as all the stmt_vec_info structs of all the
1667 stmts in the loop. */
1670 destroy_loop_vec_info (loop_vec_info loop_vinfo, bool clean_stmts)
1675 block_stmt_iterator si;
1677 VEC (slp_instance, heap) *slp_instances;
1678 slp_instance instance;
1683 loop = LOOP_VINFO_LOOP (loop_vinfo);
1685 bbs = LOOP_VINFO_BBS (loop_vinfo);
1686 nbbs = loop->num_nodes;
1690 free (LOOP_VINFO_BBS (loop_vinfo));
1691 free_data_refs (LOOP_VINFO_DATAREFS (loop_vinfo));
1692 free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
1693 VEC_free (tree, heap, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));
1700 for (j = 0; j < nbbs; j++)
1702 basic_block bb = bbs[j];
1704 stmt_vec_info stmt_info;
1706 for (phi = phi_nodes (bb); phi; phi = PHI_CHAIN (phi))
1708 stmt_ann_t ann = stmt_ann (phi);
1710 stmt_info = vinfo_for_stmt (phi);
1712 set_stmt_info (ann, NULL);
1715 for (si = bsi_start (bb); !bsi_end_p (si); )
1717 tree stmt = bsi_stmt (si);
1718 stmt_ann_t ann = stmt_ann (stmt);
1719 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1723 /* Check if this is a "pattern stmt" (introduced by the
1724 vectorizer during the pattern recognition pass). */
1725 bool remove_stmt_p = false;
1726 tree orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
1729 stmt_vec_info orig_stmt_info = vinfo_for_stmt (orig_stmt);
1731 && STMT_VINFO_IN_PATTERN_P (orig_stmt_info))
1732 remove_stmt_p = true;
1735 /* Free stmt_vec_info. */
1736 VEC_free (dr_p, heap, STMT_VINFO_SAME_ALIGN_REFS (stmt_info));
1738 set_stmt_info (ann, NULL);
1740 /* Remove dead "pattern stmts". */
1742 bsi_remove (&si, true);
1748 free (LOOP_VINFO_BBS (loop_vinfo));
1749 free_data_refs (LOOP_VINFO_DATAREFS (loop_vinfo));
1750 free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
1751 VEC_free (tree, heap, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));
1752 VEC_free (ddr_p, heap, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo));
1753 slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
1754 for (j = 0; VEC_iterate (slp_instance, slp_instances, j, instance); j++)
1755 vect_free_slp_tree (SLP_INSTANCE_TREE (instance));
1756 VEC_free (slp_instance, heap, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
1763 /* Function vect_can_force_dr_alignment_p.
1765 Returns whether the alignment of a DECL can be forced to be aligned
1766 on ALIGNMENT bit boundary. */
1769 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
1771 if (TREE_CODE (decl) != VAR_DECL)
1774 if (DECL_EXTERNAL (decl))
1777 if (TREE_ASM_WRITTEN (decl))
1780 if (TREE_STATIC (decl))
1781 return (alignment <= MAX_OFILE_ALIGNMENT);
1783 /* This used to be PREFERRED_STACK_BOUNDARY, however, that is not 100%
1784 correct until someone implements forced stack alignment. */
1785 return (alignment <= STACK_BOUNDARY);
1789 /* Function get_vectype_for_scalar_type.
1791 Returns the vector type corresponding to SCALAR_TYPE as supported
1795 get_vectype_for_scalar_type (tree scalar_type)
1797 enum machine_mode inner_mode = TYPE_MODE (scalar_type);
1798 int nbytes = GET_MODE_SIZE (inner_mode);
1802 if (nbytes == 0 || nbytes >= UNITS_PER_SIMD_WORD)
1805 /* FORNOW: Only a single vector size per target (UNITS_PER_SIMD_WORD)
1807 nunits = UNITS_PER_SIMD_WORD / nbytes;
1809 vectype = build_vector_type (scalar_type, nunits);
1810 if (vect_print_dump_info (REPORT_DETAILS))
1812 fprintf (vect_dump, "get vectype with %d units of type ", nunits);
1813 print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
1819 if (vect_print_dump_info (REPORT_DETAILS))
1821 fprintf (vect_dump, "vectype: ");
1822 print_generic_expr (vect_dump, vectype, TDF_SLIM);
1825 if (!VECTOR_MODE_P (TYPE_MODE (vectype))
1826 && !INTEGRAL_MODE_P (TYPE_MODE (vectype)))
1828 if (vect_print_dump_info (REPORT_DETAILS))
1829 fprintf (vect_dump, "mode not supported by target.");
1837 /* Function vect_supportable_dr_alignment
1839 Return whether the data reference DR is supported with respect to its
1842 enum dr_alignment_support
1843 vect_supportable_dr_alignment (struct data_reference *dr)
1845 tree stmt = DR_STMT (dr);
1846 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1847 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1848 enum machine_mode mode = (int) TYPE_MODE (vectype);
1849 struct loop *vect_loop = LOOP_VINFO_LOOP (STMT_VINFO_LOOP_VINFO (stmt_info));
1850 bool nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
1851 bool invariant_in_outerloop = false;
1853 if (aligned_access_p (dr))
1856 if (nested_in_vect_loop)
1858 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
1859 invariant_in_outerloop =
1860 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
1863 /* Possibly unaligned access. */
1865 /* We can choose between using the implicit realignment scheme (generating
1866 a misaligned_move stmt) and the explicit realignment scheme (generating
1867 aligned loads with a REALIGN_LOAD). There are two variants to the explicit
1868 realignment scheme: optimized, and unoptimized.
1869 We can optimize the realignment only if the step between consecutive
1870 vector loads is equal to the vector size. Since the vector memory
1871 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
1872 is guaranteed that the misalignment amount remains the same throughout the
1873 execution of the vectorized loop. Therefore, we can create the
1874 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
1875 at the loop preheader.
1877 However, in the case of outer-loop vectorization, when vectorizing a
1878 memory access in the inner-loop nested within the LOOP that is now being
1879 vectorized, while it is guaranteed that the misalignment of the
1880 vectorized memory access will remain the same in different outer-loop
1881 iterations, it is *not* guaranteed that is will remain the same throughout
1882 the execution of the inner-loop. This is because the inner-loop advances
1883 with the original scalar step (and not in steps of VS). If the inner-loop
1884 step happens to be a multiple of VS, then the misalignment remains fixed
1885 and we can use the optimized realignment scheme. For example:
1891 When vectorizing the i-loop in the above example, the step between
1892 consecutive vector loads is 1, and so the misalignment does not remain
1893 fixed across the execution of the inner-loop, and the realignment cannot
1894 be optimized (as illustrated in the following pseudo vectorized loop):
1896 for (i=0; i<N; i+=4)
1897 for (j=0; j<M; j++){
1898 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
1899 // when j is {0,1,2,3,4,5,6,7,...} respectively.
1900 // (assuming that we start from an aligned address).
1903 We therefore have to use the unoptimized realignment scheme:
1905 for (i=0; i<N; i+=4)
1906 for (j=k; j<M; j+=4)
1907 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
1908 // that the misalignment of the initial address is
1911 The loop can then be vectorized as follows:
1913 for (k=0; k<4; k++){
1914 rt = get_realignment_token (&vp[k]);
1915 for (i=0; i<N; i+=4){
1917 for (j=k; j<M; j+=4){
1919 va = REALIGN_LOAD <v1,v2,rt>;
1926 if (DR_IS_READ (dr))
1928 if (optab_handler (vec_realign_load_optab, mode)->insn_code !=
1930 && (!targetm.vectorize.builtin_mask_for_load
1931 || targetm.vectorize.builtin_mask_for_load ()))
1933 if (nested_in_vect_loop
1934 && TREE_INT_CST_LOW (DR_STEP (dr)) != UNITS_PER_SIMD_WORD)
1935 return dr_explicit_realign;
1937 return dr_explicit_realign_optimized;
1940 if (optab_handler (movmisalign_optab, mode)->insn_code !=
1942 /* Can't software pipeline the loads, but can at least do them. */
1943 return dr_unaligned_supported;
1947 return dr_unaligned_unsupported;
1951 /* Function vect_is_simple_use.
1954 LOOP - the loop that is being vectorized.
1955 OPERAND - operand of a stmt in LOOP.
1956 DEF - the defining stmt in case OPERAND is an SSA_NAME.
1958 Returns whether a stmt with OPERAND can be vectorized.
1959 Supportable operands are constants, loop invariants, and operands that are
1960 defined by the current iteration of the loop. Unsupportable operands are
1961 those that are defined by a previous iteration of the loop (as is the case
1962 in reduction/induction computations). */
1965 vect_is_simple_use (tree operand, loop_vec_info loop_vinfo, tree *def_stmt,
1966 tree *def, enum vect_def_type *dt)
1969 stmt_vec_info stmt_vinfo;
1970 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1972 *def_stmt = NULL_TREE;
1975 if (vect_print_dump_info (REPORT_DETAILS))
1977 fprintf (vect_dump, "vect_is_simple_use: operand ");
1978 print_generic_expr (vect_dump, operand, TDF_SLIM);
1981 if (TREE_CODE (operand) == INTEGER_CST || TREE_CODE (operand) == REAL_CST)
1983 *dt = vect_constant_def;
1986 if (is_gimple_min_invariant (operand))
1989 *dt = vect_invariant_def;
1993 if (TREE_CODE (operand) == PAREN_EXPR)
1995 if (vect_print_dump_info (REPORT_DETAILS))
1996 fprintf (vect_dump, "non-associatable copy.");
1997 operand = TREE_OPERAND (operand, 0);
1999 if (TREE_CODE (operand) != SSA_NAME)
2001 if (vect_print_dump_info (REPORT_DETAILS))
2002 fprintf (vect_dump, "not ssa-name.");
2006 *def_stmt = SSA_NAME_DEF_STMT (operand);
2007 if (*def_stmt == NULL_TREE )
2009 if (vect_print_dump_info (REPORT_DETAILS))
2010 fprintf (vect_dump, "no def_stmt.");
2014 if (vect_print_dump_info (REPORT_DETAILS))
2016 fprintf (vect_dump, "def_stmt: ");
2017 print_generic_expr (vect_dump, *def_stmt, TDF_SLIM);
2020 /* empty stmt is expected only in case of a function argument.
2021 (Otherwise - we expect a phi_node or a GIMPLE_MODIFY_STMT). */
2022 if (IS_EMPTY_STMT (*def_stmt))
2024 tree arg = TREE_OPERAND (*def_stmt, 0);
2025 if (is_gimple_min_invariant (arg))
2028 *dt = vect_invariant_def;
2032 if (vect_print_dump_info (REPORT_DETAILS))
2033 fprintf (vect_dump, "Unexpected empty stmt.");
2037 bb = bb_for_stmt (*def_stmt);
2038 if (!flow_bb_inside_loop_p (loop, bb))
2039 *dt = vect_invariant_def;
2042 stmt_vinfo = vinfo_for_stmt (*def_stmt);
2043 *dt = STMT_VINFO_DEF_TYPE (stmt_vinfo);
2046 if (*dt == vect_unknown_def_type)
2048 if (vect_print_dump_info (REPORT_DETAILS))
2049 fprintf (vect_dump, "Unsupported pattern.");
2053 if (vect_print_dump_info (REPORT_DETAILS))
2054 fprintf (vect_dump, "type of def: %d.",*dt);
2056 switch (TREE_CODE (*def_stmt))
2059 *def = PHI_RESULT (*def_stmt);
2062 case GIMPLE_MODIFY_STMT:
2063 *def = GIMPLE_STMT_OPERAND (*def_stmt, 0);
2067 if (vect_print_dump_info (REPORT_DETAILS))
2068 fprintf (vect_dump, "unsupported defining stmt: ");
2076 /* Function supportable_widening_operation
2078 Check whether an operation represented by the code CODE is a
2079 widening operation that is supported by the target platform in
2080 vector form (i.e., when operating on arguments of type VECTYPE).
2082 Widening operations we currently support are NOP (CONVERT), FLOAT
2083 and WIDEN_MULT. This function checks if these operations are supported
2084 by the target platform either directly (via vector tree-codes), or via
2088 - CODE1 and CODE2 are codes of vector operations to be used when
2089 vectorizing the operation, if available.
2090 - DECL1 and DECL2 are decls of target builtin functions to be used
2091 when vectorizing the operation, if available. In this case,
2092 CODE1 and CODE2 are CALL_EXPR. */
2095 supportable_widening_operation (enum tree_code code, tree stmt, tree vectype,
2096 tree *decl1, tree *decl2,
2097 enum tree_code *code1, enum tree_code *code2)
2099 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2100 loop_vec_info loop_info = STMT_VINFO_LOOP_VINFO (stmt_info);
2101 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2103 enum machine_mode vec_mode;
2104 enum insn_code icode1, icode2;
2105 optab optab1, optab2;
2106 tree expr = GIMPLE_STMT_OPERAND (stmt, 1);
2107 tree type = TREE_TYPE (expr);
2108 tree wide_vectype = get_vectype_for_scalar_type (type);
2109 enum tree_code c1, c2;
2111 /* The result of a vectorized widening operation usually requires two vectors
2112 (because the widened results do not fit int one vector). The generated
2113 vector results would normally be expected to be generated in the same
2114 order as in the original scalar computation. i.e. if 8 results are
2115 generated in each vector iteration, they are to be organized as follows:
2116 vect1: [res1,res2,res3,res4], vect2: [res5,res6,res7,res8].
2118 However, in the special case that the result of the widening operation is
2119 used in a reduction computation only, the order doesn't matter (because
2120 when vectorizing a reduction we change the order of the computation).
2121 Some targets can take advantage of this and generate more efficient code.
2122 For example, targets like Altivec, that support widen_mult using a sequence
2123 of {mult_even,mult_odd} generate the following vectors:
2124 vect1: [res1,res3,res5,res7], vect2: [res2,res4,res6,res8].
2126 When vectorizaing outer-loops, we execute the inner-loop sequentially
2127 (each vectorized inner-loop iteration contributes to VF outer-loop
2128 iterations in parallel). We therefore don't allow to change the order
2129 of the computation in the inner-loop during outer-loop vectorization. */
2131 if (STMT_VINFO_RELEVANT (stmt_info) == vect_used_by_reduction
2132 && !nested_in_vect_loop_p (vect_loop, stmt))
2138 && code == WIDEN_MULT_EXPR
2139 && targetm.vectorize.builtin_mul_widen_even
2140 && targetm.vectorize.builtin_mul_widen_even (vectype)
2141 && targetm.vectorize.builtin_mul_widen_odd
2142 && targetm.vectorize.builtin_mul_widen_odd (vectype))
2144 if (vect_print_dump_info (REPORT_DETAILS))
2145 fprintf (vect_dump, "Unordered widening operation detected.");
2147 *code1 = *code2 = CALL_EXPR;
2148 *decl1 = targetm.vectorize.builtin_mul_widen_even (vectype);
2149 *decl2 = targetm.vectorize.builtin_mul_widen_odd (vectype);
2155 case WIDEN_MULT_EXPR:
2156 if (BYTES_BIG_ENDIAN)
2158 c1 = VEC_WIDEN_MULT_HI_EXPR;
2159 c2 = VEC_WIDEN_MULT_LO_EXPR;
2163 c2 = VEC_WIDEN_MULT_HI_EXPR;
2164 c1 = VEC_WIDEN_MULT_LO_EXPR;
2170 if (BYTES_BIG_ENDIAN)
2172 c1 = VEC_UNPACK_HI_EXPR;
2173 c2 = VEC_UNPACK_LO_EXPR;
2177 c2 = VEC_UNPACK_HI_EXPR;
2178 c1 = VEC_UNPACK_LO_EXPR;
2183 if (BYTES_BIG_ENDIAN)
2185 c1 = VEC_UNPACK_FLOAT_HI_EXPR;
2186 c2 = VEC_UNPACK_FLOAT_LO_EXPR;
2190 c2 = VEC_UNPACK_FLOAT_HI_EXPR;
2191 c1 = VEC_UNPACK_FLOAT_LO_EXPR;
2195 case FIX_TRUNC_EXPR:
2196 /* ??? Not yet implemented due to missing VEC_UNPACK_FIX_TRUNC_HI_EXPR/
2197 VEC_UNPACK_FIX_TRUNC_LO_EXPR tree codes and optabs used for
2198 computing the operation. */
2205 if (code == FIX_TRUNC_EXPR)
2207 /* The signedness is determined from output operand. */
2208 optab1 = optab_for_tree_code (c1, type);
2209 optab2 = optab_for_tree_code (c2, type);
2213 optab1 = optab_for_tree_code (c1, vectype);
2214 optab2 = optab_for_tree_code (c2, vectype);
2217 if (!optab1 || !optab2)
2220 vec_mode = TYPE_MODE (vectype);
2221 if ((icode1 = optab_handler (optab1, vec_mode)->insn_code) == CODE_FOR_nothing
2222 || insn_data[icode1].operand[0].mode != TYPE_MODE (wide_vectype)
2223 || (icode2 = optab_handler (optab2, vec_mode)->insn_code)
2225 || insn_data[icode2].operand[0].mode != TYPE_MODE (wide_vectype))
2234 /* Function supportable_narrowing_operation
2236    Check whether an operation represented by the code CODE is a
2237    narrowing operation that is supported by the target platform in
2238    vector form (i.e., when operating on arguments of type VECTYPE).
2240    Narrowing operations we currently support are NOP (CONVERT) and
2241    FIX_TRUNC. This function checks if these operations are supported by
2242    the target platform directly via vector tree-codes.
2245    - CODE1 is the code of a vector operation to be used when
2246    vectorizing the operation, if available. */
2249 supportable_narrowing_operation (enum tree_code code,
2250    const_tree stmt, const_tree vectype,
2251    enum tree_code *code1)
2253   enum machine_mode vec_mode;
2254   enum insn_code icode1;
   /* NOTE(review): this chunk is an elided extraction; intermediate lines
      (switch header, returns, braces) are missing from the view.  */
   /* The scalar result type of STMT determines the narrowed vector type
      that the target must be able to produce.  */
2256   tree expr = GIMPLE_STMT_OPERAND (stmt, 1);
2257   tree type = TREE_TYPE (expr);
2258   tree narrow_vectype = get_vectype_for_scalar_type (type);
   /* Map the scalar tree code to the corresponding vector "pack" code:
      NOP/CONVERT -> VEC_PACK_TRUNC, FIX_TRUNC -> VEC_PACK_FIX_TRUNC.  */
2265       c1 = VEC_PACK_TRUNC_EXPR;
2268     case FIX_TRUNC_EXPR:
2269       c1 = VEC_PACK_FIX_TRUNC_EXPR;
   /* float -> narrower float (FLOAT_EXPR direction) is rejected here.  */
2273       /* ??? Not yet implemented due to missing VEC_PACK_FLOAT_EXPR
2274          tree code and optabs used for computing the operation. */
   /* For FIX_TRUNC the optab is keyed on the scalar output type, because
      the signedness of the conversion comes from the result operand.  */
2281   if (code == FIX_TRUNC_EXPR)
2282     /* The signedness is determined from output operand. */
2283     optab1 = optab_for_tree_code (c1, type);
2285     optab1 = optab_for_tree_code (c1, vectype);
   /* Verify the target actually provides an insn for this vector mode
      and that its result mode matches the narrowed vector type.  */
2290   vec_mode = TYPE_MODE (vectype);
2291   if ((icode1 = optab_handler (optab1, vec_mode)->insn_code) == CODE_FOR_nothing
2292       || insn_data[icode1].operand[0].mode != TYPE_MODE (narrow_vectype))
2300 /* Function reduction_code_for_scalar_code
2303 CODE - tree_code of a reduction operations.
2306 REDUC_CODE - the corresponding tree-code to be used to reduce the
2307 vector of partial results into a single scalar result (which
2308 will also reside in a vector).
2310 Return TRUE if a corresponding REDUC_CODE was found, FALSE otherwise. */
2313 reduction_code_for_scalar_code (enum tree_code code,
2314 enum tree_code *reduc_code)
2319 *reduc_code = REDUC_MAX_EXPR;
2323 *reduc_code = REDUC_MIN_EXPR;
2327 *reduc_code = REDUC_PLUS_EXPR;
2336 /* Function vect_is_simple_reduction
2338    Detect a cross-iteration def-use cycle that represents a simple
2339    reduction computation. We look for the following pattern:
2344    a2 = operation (a3, a1)
2347    1. operation is commutative and associative and it is safe to
2348    change the order of the computation.
2349    2. no uses for a2 in the loop (a2 is used out of the loop)
2350    3. no uses of a1 in the loop besides the reduction operation.
2352    Condition 1 is tested here.
2353    Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized. */
2356 vect_is_simple_reduction (loop_vec_info loop_info, tree phi)
   /* NOTE(review): elided extraction — braces, returns and some statements
      between the numbered lines are missing from this view.  */
2358   struct loop *loop = (bb_for_stmt (phi))->loop_father;
2359 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2360 edge latch_e = loop_latch_edge (loop);
   /* The value flowing around the cycle arrives at PHI via the latch edge.  */
2361 tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
2362 tree def_stmt, def1, def2;
2363 enum tree_code code;
2365 tree operation, op1, op2;
2369 imm_use_iterator imm_iter;
2370 use_operand_p use_p;
   /* PHI must be in the loop being vectorized, or in an inner loop of it.  */
2372 gcc_assert (loop == vect_loop || flow_loop_nested_p (vect_loop, loop));
2374 name = PHI_RESULT (phi);
   /* Condition 2 (partial check here): the PHI result must not be used by
      another relevant (non-pattern) statement inside the loop.  */
2376 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2378 tree use_stmt = USE_STMT (use_p);
2379 if (flow_bb_inside_loop_p (loop, bb_for_stmt (use_stmt))
2380 && vinfo_for_stmt (use_stmt)
2381 && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
2385 if (vect_print_dump_info (REPORT_DETAILS))
2386 fprintf (vect_dump, "reduction used in loop.");
   /* The latch value must be an SSA name so we can chase its definition.  */
2391 if (TREE_CODE (loop_arg) != SSA_NAME)
2393 if (vect_print_dump_info (REPORT_DETAILS))
2395 fprintf (vect_dump, "reduction: not ssa_name: ");
2396 print_generic_expr (vect_dump, loop_arg, TDF_SLIM);
2401 def_stmt = SSA_NAME_DEF_STMT (loop_arg);
2404 if (vect_print_dump_info (REPORT_DETAILS))
2405 fprintf (vect_dump, "reduction: no def_stmt.");
   /* The reduction statement itself must be a plain assignment.  */
2409 if (TREE_CODE (def_stmt) != GIMPLE_MODIFY_STMT)
2411 if (vect_print_dump_info (REPORT_DETAILS))
2412 print_generic_expr (vect_dump, def_stmt, TDF_SLIM);
2416 name = GIMPLE_STMT_OPERAND (def_stmt, 0);
   /* Same in-loop-use restriction for the reduction statement's result.  */
2418 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2420 tree use_stmt = USE_STMT (use_p);
2421 if (flow_bb_inside_loop_p (loop, bb_for_stmt (use_stmt))
2422 && vinfo_for_stmt (use_stmt)
2423 && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
2427 if (vect_print_dump_info (REPORT_DETAILS))
2428 fprintf (vect_dump, "reduction used in loop.");
   /* Condition 1: the operation must be reorderable.  */
2433 operation = GIMPLE_STMT_OPERAND (def_stmt, 1);
2434 code = TREE_CODE (operation);
2435 if (!commutative_tree_code (code) || !associative_tree_code (code))
2437 if (vect_print_dump_info (REPORT_DETAILS))
2439 fprintf (vect_dump, "reduction: not commutative/associative: ");
2440 print_generic_expr (vect_dump, operation, TDF_SLIM);
   /* Only binary operations are handled as reductions.  */
2445 op_type = TREE_OPERAND_LENGTH (operation);
2446 if (op_type != binary_op)
2448 if (vect_print_dump_info (REPORT_DETAILS))
2450 fprintf (vect_dump, "reduction: not binary operation: ");
2451 print_generic_expr (vect_dump, operation, TDF_SLIM);
2456 op1 = TREE_OPERAND (operation, 0);
2457 op2 = TREE_OPERAND (operation, 1);
2458 if (TREE_CODE (op1) != SSA_NAME || TREE_CODE (op2) != SSA_NAME)
2460 if (vect_print_dump_info (REPORT_DETAILS))
2462 fprintf (vect_dump, "reduction: uses not ssa_names: ");
2463 print_generic_expr (vect_dump, operation, TDF_SLIM);
2468 /* Check that it's ok to change the order of the computation. */
   /* Require a single type (up to main variant) across result and both
      operands — mixed-type reductions are not supported here.  */
2469 type = TREE_TYPE (operation);
2470 if (TYPE_MAIN_VARIANT (type) != TYPE_MAIN_VARIANT (TREE_TYPE (op1))
2471 || TYPE_MAIN_VARIANT (type) != TYPE_MAIN_VARIANT (TREE_TYPE (op2)))
2473 if (vect_print_dump_info (REPORT_DETAILS))
2475 fprintf (vect_dump, "reduction: multiple types: operation type: ");
2476 print_generic_expr (vect_dump, type, TDF_SLIM);
2477 fprintf (vect_dump, ", operands types: ");
2478 print_generic_expr (vect_dump, TREE_TYPE (op1), TDF_SLIM);
2479 fprintf (vect_dump, ",");
2480 print_generic_expr (vect_dump, TREE_TYPE (op2), TDF_SLIM);
2485 /* Generally, when vectorizing a reduction we change the order of the
2486 computation. This may change the behavior of the program in some
2487 cases, so we need to check that this is ok. One exception is when
2488 vectorizing an outer-loop: the inner-loop is executed sequentially,
2489 and therefore vectorizing reductions in the inner-loop during
2490 outer-loop vectorization is safe. */
2492 /* CHECKME: check for !flag_finite_math_only too? */
   /* FP reduction is only reorderable with -fassociative-math (or in the
      nested-loop case described above).  */
2493 if (SCALAR_FLOAT_TYPE_P (type) && !flag_associative_math
2494 && !nested_in_vect_loop_p (vect_loop, def_stmt))
2496 /* Changing the order of operations changes the semantics. */
2497 if (vect_print_dump_info (REPORT_DETAILS))
2499 fprintf (vect_dump, "reduction: unsafe fp math optimization: ");
2500 print_generic_expr (vect_dump, operation, TDF_SLIM);
   /* Trapping integer overflow (-ftrapv) makes reordering observable.  */
2504 else if (INTEGRAL_TYPE_P (type) && TYPE_OVERFLOW_TRAPS (type)
2505 && !nested_in_vect_loop_p (vect_loop, def_stmt))
2507 /* Changing the order of operations changes the semantics. */
2508 if (vect_print_dump_info (REPORT_DETAILS))
2510 fprintf (vect_dump, "reduction: unsafe int math optimization: ");
2511 print_generic_expr (vect_dump, operation, TDF_SLIM);
   /* Saturating fixed-point arithmetic is not associative either.  */
2515 else if (SAT_FIXED_POINT_TYPE_P (type))
2517 /* Changing the order of operations changes the semantics. */
2518 if (vect_print_dump_info (REPORT_DETAILS))
2520 fprintf (vect_dump, "reduction: unsafe fixed-point math optimization: ");
2521 print_generic_expr (vect_dump, operation, TDF_SLIM);
2526 /* reduction is safe. we're dealing with one of the following:
2527 1) integer arithmetic and no trapv
2528 2) floating point arithmetic, and special flags permit this optimization.
2530 def1 = SSA_NAME_DEF_STMT (op1);
2531 def2 = SSA_NAME_DEF_STMT (op2);
2532 if (!def1 || !def2 || IS_EMPTY_STMT (def1) || IS_EMPTY_STMT (def2))
2534 if (vect_print_dump_info (REPORT_DETAILS))
2536 fprintf (vect_dump, "reduction: no defs for operands: ");
2537 print_generic_expr (vect_dump, operation, TDF_SLIM);
2543 /* Check that one def is the reduction def, defined by PHI,
2544 the other def is either defined in the loop ("vect_loop_def"),
2545 or it's an induction (defined by a loop-header phi-node). */
   /* First arm: PHI feeds operand 2 (condition on def2 elided in this
      extraction — TODO confirm), def1 is the in-loop/induction def.  */
2548 && flow_bb_inside_loop_p (loop, bb_for_stmt (def1))
2549 && (TREE_CODE (def1) == GIMPLE_MODIFY_STMT
2550 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1)) == vect_induction_def
2551 || (TREE_CODE (def1) == PHI_NODE
2552 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1)) == vect_loop_def
2553 && !is_loop_header_bb_p (bb_for_stmt (def1)))))
2555 if (vect_print_dump_info (REPORT_DETAILS))
2557 fprintf (vect_dump, "detected reduction:");
2558 print_generic_expr (vect_dump, operation, TDF_SLIM);
   /* Second arm: PHI feeds operand 1 instead — same shape of checks on
      def2, followed by an operand swap to canonicalize.  */
2562 else if (def1 == phi
2563 && flow_bb_inside_loop_p (loop, bb_for_stmt (def2))
2564 && (TREE_CODE (def2) == GIMPLE_MODIFY_STMT
2565 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2)) == vect_induction_def
2566 || (TREE_CODE (def2) == PHI_NODE
2567 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2)) == vect_loop_def
2568 && !is_loop_header_bb_p (bb_for_stmt (def2)))))
2570 /* Swap operands (just for simplicity - so that the rest of the code
2571 can assume that the reduction variable is always the last (second)
2573 if (vect_print_dump_info (REPORT_DETAILS))
2575 fprintf (vect_dump, "detected reduction: need to swap operands:");
2576 print_generic_expr (vect_dump, operation, TDF_SLIM);
2578 swap_tree_operands (def_stmt, &TREE_OPERAND (operation, 0),
2579 &TREE_OPERAND (operation, 1));
   /* Neither arm matched: not a recognizable reduction cycle.  */
2584 if (vect_print_dump_info (REPORT_DETAILS))
2586 fprintf (vect_dump, "reduction: unknown pattern.");
2587 print_generic_expr (vect_dump, operation, TDF_SLIM);
2594 /* Function vect_is_simple_iv_evolution.
2596    FORNOW: A simple evolution of an induction variable in the loop is
2597    considered a polynomial evolution with constant step. */
2600 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
   /* NOTE(review): the rest of the signature (step output parameter) and
      several return statements are elided in this extraction.  */
   /* Extract the per-iteration change of ACCESS_FN in loop LOOP_NB.  */
2605 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
2607 /* When there is no evolution in this loop, the evolution function
2609 if (evolution_part == NULL_TREE)
2612 /* When the evolution is a polynomial of degree >= 2
2613 the evolution function is not "simple". */
2614 if (tree_is_chrec (evolution_part))
   /* Degree-1 chrec: constant (per-loop) step, plus the initial value at
      loop entry.  Unshare the initial condition before handing it out.  */
2617 step_expr = evolution_part;
2618 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
2620 if (vect_print_dump_info (REPORT_DETAILS))
2622 fprintf (vect_dump, "step: ");
2623 print_generic_expr (vect_dump, step_expr, TDF_SLIM);
2624 fprintf (vect_dump, ", init: ");
2625 print_generic_expr (vect_dump, init_expr, TDF_SLIM);
   /* Only compile-time-constant integer steps qualify as "simple".  */
2631 if (TREE_CODE (step_expr) != INTEGER_CST)
2633 if (vect_print_dump_info (REPORT_DETAILS))
2634 fprintf (vect_dump, "step unknown.");
2642 /* Function vectorize_loops.
2644    Entry Point to loop vectorization phase. */
2647 vectorize_loops (void)
   /* NOTE(review): elided extraction — loop-body braces, `continue`
      statements and some declarations are missing from this view.  */
2650 unsigned int num_vectorized_loops = 0;
2651 unsigned int vect_loops_num;
2655 vect_loops_num = number_of_loops ();
2657 /* Bail out if there are no loops. */
   /* number_of_loops counts the function (root) "loop" too, hence <= 1
      means no real loops — presumably; verify against loop.h.  */
2658 if (vect_loops_num <= 1)
2661 /* Fix the verbosity level if not defined explicitly by the user. */
2662 vect_set_dump_settings ();
2664 /* Allocate the bitmap that records which virtual variables
2665 need to be renamed. */
2666 vect_memsyms_to_rename = BITMAP_ALLOC (NULL);
2668 /* ----------- Analyze loops. ----------- */
2670 /* If some loop was duplicated, it gets bigger number
2671 than all previously defined loops. This fact allows us to run
2672 only over initial loops skipping newly generated ones. */
2673 FOR_EACH_LOOP (li, loop, 0)
2675 loop_vec_info loop_vinfo;
   /* Analyze each loop; stash the result in loop->aux so the cleanup
      pass below can find and destroy it.  */
2677 vect_loop_location = find_loop_location (loop);
2678 loop_vinfo = vect_analyze_loop (loop);
2679 loop->aux = loop_vinfo;
2681 if (!loop_vinfo || !LOOP_VINFO_VECTORIZABLE_P (loop_vinfo))
   /* Analysis succeeded: perform the actual transformation.  */
2684 vect_transform_loop (loop_vinfo);
2685 num_vectorized_loops++;
2687 vect_loop_location = UNKNOWN_LOC;
   /* Report statistics depending on the requested dump verbosity.  */
2689 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOOPS)
2690 || (vect_print_dump_info (REPORT_VECTORIZED_LOOPS)
2691 && num_vectorized_loops > 0))
2692 fprintf (vect_dump, "vectorized %u loops in function.\n",
2693 num_vectorized_loops);
2695 /* ----------- Finalize. ----------- */
2697 BITMAP_FREE (vect_memsyms_to_rename);
   /* Release the per-loop analysis data allocated above (index 0 is the
      function root, so start at 1).  */
2699 for (i = 1; i < vect_loops_num; i++)
2701 loop_vec_info loop_vinfo;
2703 loop = get_loop (i);
2706 loop_vinfo = loop->aux;
2707 destroy_loop_vec_info (loop_vinfo, true);
   /* Ask for CFG cleanup only if something was actually transformed.  */
2711 return num_vectorized_loops > 0 ? TODO_cleanup_cfg : 0;
2714 /* Increase alignment of global arrays to improve vectorization potential.
2716    - Consider also structs that have an array field.
2717    - Use ipa analysis to prune arrays that can't be vectorized?
2718    This should involve global alignment analysis and in the future also
2722 increase_alignment (void)
   /* NOTE(review): elided extraction — the for-condition line, `continue`
      statements and braces are missing from this view.  */
2724 struct varpool_node *vnode;
2726 /* Increase the alignment of all global arrays for vectorization. */
   /* Walk every global variable known to the varpool.  */
2727 for (vnode = varpool_nodes_queue;
2729 vnode = vnode->next_needed)
2731 tree vectype, decl = vnode->decl;
2732 unsigned int alignment;
   /* Only global array declarations are candidates.  */
2734 if (TREE_CODE (TREE_TYPE (decl)) != ARRAY_TYPE)
   /* Derive the vector type from the array's element type; its alignment
      is what vectorized accesses would want.  */
2736 vectype = get_vectype_for_scalar_type (TREE_TYPE (TREE_TYPE (decl)));
2739 alignment = TYPE_ALIGN (vectype);
   /* Skip decls that are already sufficiently aligned.  */
2740 if (DECL_ALIGN (decl) >= alignment)
2743 if (vect_can_force_dr_alignment_p (decl, alignment))
   /* Mark as user-aligned so later passes don't shrink it again.  */
2745 DECL_ALIGN (decl) = TYPE_ALIGN (vectype);
2746 DECL_USER_ALIGN (decl) = 1;
2749 fprintf (dump_file, "Increasing alignment of decl: ");
2750 print_generic_expr (dump_file, decl, TDF_SLIM);
2758 gate_increase_alignment (void)
2760 return flag_section_anchors && flag_tree_vectorize;
2763 struct tree_opt_pass pass_ipa_increase_alignment =
2765 "increase_alignment", /* name */
2766 gate_increase_alignment, /* gate */
2767 increase_alignment, /* execute */
2770 0, /* static_pass_number */
2772 0, /* properties_required */
2773 0, /* properties_provided */
2774 0, /* properties_destroyed */
2775 0, /* todo_flags_start */
2776 0, /* todo_flags_finish */