2 Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008 Free Software
4 Contributed by Dorit Naishlos <dorit@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 /* Loop Vectorization Pass.
24 This pass tries to vectorize loops. This first implementation focuses on
25 simple inner-most loops, with no conditional control flow, and a set of
26 simple operations which vector form can be expressed using existing
27 tree codes (PLUS, MULT etc).
29 For example, the vectorizer transforms the following simple loop:
31 short a[N]; short b[N]; short c[N]; int i;
37 as if it was manually vectorized by rewriting the source code into:
39 typedef int __attribute__((mode(V8HI))) v8hi;
40 short a[N]; short b[N]; short c[N]; int i;
41 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
44 for (i=0; i<N/8; i++){
51 The main entry to this pass is vectorize_loops(), in which
52 the vectorizer applies a set of analyses on a given set of loops,
53 followed by the actual vectorization transformation for the loops that
54 had successfully passed the analysis phase.
56 Throughout this pass we make a distinction between two types of
57 data: scalars (which are represented by SSA_NAMES), and memory references
58 ("data-refs"). These two types of data require different handling both
59 during analysis and transformation. The types of data-refs that the
60 vectorizer currently supports are ARRAY_REFS which base is an array DECL
61 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
62 accesses are required to have a simple (consecutive) access pattern.
66 The driver for the analysis phase is vect_analyze_loop_nest().
67 It applies a set of analyses, some of which rely on the scalar evolution
68 analyzer (scev) developed by Sebastian Pop.
70 During the analysis phase the vectorizer records some information
71 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
72 loop, as well as general information about the loop as a whole, which is
73 recorded in a "loop_vec_info" struct attached to each loop.
77 The loop transformation phase scans all the stmts in the loop, and
78 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
79 the loop that needs to be vectorized. It insert the vector code sequence
80 just before the scalar stmt S, and records a pointer to the vector code
81 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
82 attached to S). This pointer will be used for the vectorization of following
83 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
84 otherwise, we rely on dead code elimination for removing it.
86 For example, say stmt S1 was vectorized into stmt VS1:
89 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
92 To vectorize stmt S2, the vectorizer first finds the stmt that defines
93 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
94 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
95 resulting sequence would be:
98 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
100 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
102 Operands that are not SSA_NAMEs, are data-refs that appear in
103 load/store operations (like 'x[i]' in S1), and are handled differently.
107 Currently the only target specific information that is used is the
108 size of the vector (in bytes) - "UNITS_PER_SIMD_WORD". Targets that can
109 support different sizes of vectors, for now will need to specify one value
110 for "UNITS_PER_SIMD_WORD". More flexibility will be added in the future.
112 Since we only vectorize operations which vector form can be
113 expressed using existing tree codes, to verify that an operation is
114 supported, the vectorizer checks the relevant optab at the relevant
115 machine_mode (e.g, optab_handler (add_optab, V8HImode)->insn_code). If
116 the value found is CODE_FOR_nothing, then there's no target support, and
117 we can't vectorize the stmt.
119 For additional information on this project see:
120 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
125 #include "coretypes.h"
131 #include "basic-block.h"
132 #include "diagnostic.h"
133 #include "tree-flow.h"
134 #include "tree-dump.h"
137 #include "cfglayout.h"
143 #include "tree-chrec.h"
144 #include "tree-data-ref.h"
145 #include "tree-scalar-evolution.h"
147 #include "tree-vectorizer.h"
148 #include "tree-pass.h"
150 /*************************************************************************
151 General Vectorization Utilities
152 *************************************************************************/
154 /* vect_dump will be set to stderr or dump_file if exist. */
157 /* vect_verbosity_level set to an invalid value
158 to mark that it's uninitialized. */
159 enum verbosity_levels vect_verbosity_level = MAX_VERBOSITY_LEVEL;
162 static LOC vect_loop_location;
164 /* Bitmap of virtual variables to be renamed. */
165 bitmap vect_memsyms_to_rename;
167 /*************************************************************************
168 Simple Loop Peeling Utilities
170 Utilities to support loop peeling for vectorization purposes.
171 *************************************************************************/
174 /* Renames the use *OP_P. */
177 rename_use_op (use_operand_p op_p)
181 if (TREE_CODE (USE_FROM_PTR (op_p)) != SSA_NAME)
184 new_name = get_current_def (USE_FROM_PTR (op_p));
186 /* Something defined outside of the loop. */
190 /* An ordinary ssa name defined in the loop. */
192 SET_USE (op_p, new_name);
196 /* Renames the variables in basic block BB. */
199 rename_variables_in_bb (basic_block bb)
202 block_stmt_iterator bsi;
208 struct loop *loop = bb->loop_father;
210 for (bsi = bsi_start (bb); !bsi_end_p (bsi); bsi_next (&bsi))
212 stmt = bsi_stmt (bsi);
213 FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_ALL_USES)
214 rename_use_op (use_p);
217 FOR_EACH_EDGE (e, ei, bb->succs)
219 if (!flow_bb_inside_loop_p (loop, e->dest))
221 for (phi = phi_nodes (e->dest); phi; phi = PHI_CHAIN (phi))
222 rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (phi, e));
227 /* Renames variables in new generated LOOP. */
230 rename_variables_in_loop (struct loop *loop)
235 bbs = get_loop_body (loop);
237 for (i = 0; i < loop->num_nodes; i++)
238 rename_variables_in_bb (bbs[i]);
244 /* Update the PHI nodes of NEW_LOOP.
246 NEW_LOOP is a duplicate of ORIG_LOOP.
247 AFTER indicates whether NEW_LOOP executes before or after ORIG_LOOP:
248 AFTER is true if NEW_LOOP executes after ORIG_LOOP, and false if it
249 executes before it. */
252 slpeel_update_phis_for_duplicate_loop (struct loop *orig_loop,
253 struct loop *new_loop, bool after)
256 tree phi_new, phi_orig;
258 edge orig_loop_latch = loop_latch_edge (orig_loop);
259 edge orig_entry_e = loop_preheader_edge (orig_loop);
260 edge new_loop_exit_e = single_exit (new_loop);
261 edge new_loop_entry_e = loop_preheader_edge (new_loop);
262 edge entry_arg_e = (after ? orig_loop_latch : orig_entry_e);
265 step 1. For each loop-header-phi:
266 Add the first phi argument for the phi in NEW_LOOP
267 (the one associated with the entry of NEW_LOOP)
269 step 2. For each loop-header-phi:
270 Add the second phi argument for the phi in NEW_LOOP
271 (the one associated with the latch of NEW_LOOP)
273 step 3. Update the phis in the successor block of NEW_LOOP.
275 case 1: NEW_LOOP was placed before ORIG_LOOP:
276 The successor block of NEW_LOOP is the header of ORIG_LOOP.
277 Updating the phis in the successor block can therefore be done
278 along with the scanning of the loop header phis, because the
279 header blocks of ORIG_LOOP and NEW_LOOP have exactly the same
280 phi nodes, organized in the same order.
282 case 2: NEW_LOOP was placed after ORIG_LOOP:
283 The successor block of NEW_LOOP is the original exit block of
284 ORIG_LOOP - the phis to be updated are the loop-closed-ssa phis.
285 We postpone updating these phis to a later stage (when
286 loop guards are added).
290 /* Scan the phis in the headers of the old and new loops
291 (they are organized in exactly the same order). */
293 for (phi_new = phi_nodes (new_loop->header),
294 phi_orig = phi_nodes (orig_loop->header);
296 phi_new = PHI_CHAIN (phi_new), phi_orig = PHI_CHAIN (phi_orig))
299 def = PHI_ARG_DEF_FROM_EDGE (phi_orig, entry_arg_e);
300 add_phi_arg (phi_new, def, new_loop_entry_e);
303 def = PHI_ARG_DEF_FROM_EDGE (phi_orig, orig_loop_latch);
304 if (TREE_CODE (def) != SSA_NAME)
307 new_ssa_name = get_current_def (def);
310 /* This only happens if there are no definitions
311 inside the loop. use the phi_result in this case. */
312 new_ssa_name = PHI_RESULT (phi_new);
315 /* An ordinary ssa name defined in the loop. */
316 add_phi_arg (phi_new, new_ssa_name, loop_latch_edge (new_loop));
318 /* step 3 (case 1). */
321 gcc_assert (new_loop_exit_e == orig_entry_e);
322 SET_PHI_ARG_DEF (phi_orig,
323 new_loop_exit_e->dest_idx,
330 /* Update PHI nodes for a guard of the LOOP.
333 - LOOP, GUARD_EDGE: LOOP is a loop for which we added guard code that
334 controls whether LOOP is to be executed. GUARD_EDGE is the edge that
335 originates from the guard-bb, skips LOOP and reaches the (unique) exit
336 bb of LOOP. This loop-exit-bb is an empty bb with one successor.
337 We denote this bb NEW_MERGE_BB because before the guard code was added
338 it had a single predecessor (the LOOP header), and now it became a merge
339 point of two paths - the path that ends with the LOOP exit-edge, and
340 the path that ends with GUARD_EDGE.
341 - NEW_EXIT_BB: New basic block that is added by this function between LOOP
342 and NEW_MERGE_BB. It is used to place loop-closed-ssa-form exit-phis.
344 ===> The CFG before the guard-code was added:
347 if (exit_loop) goto update_bb
348 else goto LOOP_header_bb
351 ==> The CFG after the guard-code was added:
353 if (LOOP_guard_condition) goto new_merge_bb
354 else goto LOOP_header_bb
357 if (exit_loop_condition) goto new_merge_bb
358 else goto LOOP_header_bb
363 ==> The CFG after this function:
365 if (LOOP_guard_condition) goto new_merge_bb
366 else goto LOOP_header_bb
369 if (exit_loop_condition) goto new_exit_bb
370 else goto LOOP_header_bb
377 1. creates and updates the relevant phi nodes to account for the new
378 incoming edge (GUARD_EDGE) into NEW_MERGE_BB. This involves:
379 1.1. Create phi nodes at NEW_MERGE_BB.
380 1.2. Update the phi nodes at the successor of NEW_MERGE_BB (denoted
381 UPDATE_BB). UPDATE_BB was the exit-bb of LOOP before NEW_MERGE_BB
382 2. preserves loop-closed-ssa-form by creating the required phi nodes
383 at the exit of LOOP (i.e, in NEW_EXIT_BB).
385 There are two flavors to this function:
387 slpeel_update_phi_nodes_for_guard1:
388 Here the guard controls whether we enter or skip LOOP, where LOOP is a
389 prolog_loop (loop1 below), and the new phis created in NEW_MERGE_BB are
390 for variables that have phis in the loop header.
392 slpeel_update_phi_nodes_for_guard2:
393 Here the guard controls whether we enter or skip LOOP, where LOOP is an
394 epilog_loop (loop2 below), and the new phis created in NEW_MERGE_BB are
395 for variables that have phis in the loop exit.
397 I.E., the overall structure is:
400 guard1 (goto loop1/merge1_bb)
403 guard2 (goto merge1_bb/merge2_bb)
410 slpeel_update_phi_nodes_for_guard1 takes care of creating phis in
411 loop1_exit_bb and merge1_bb. These are entry phis (phis for the vars
412 that have phis in loop1->header).
414 slpeel_update_phi_nodes_for_guard2 takes care of creating phis in
415 loop2_exit_bb and merge2_bb. These are exit phis (phis for the vars
416 that have phis in next_bb). It also adds some of these phis to
419 slpeel_update_phi_nodes_for_guard1 is always called before
420 slpeel_update_phi_nodes_for_guard2. They are both needed in order
421 to create correct data-flow and loop-closed-ssa-form.
423 Generally slpeel_update_phi_nodes_for_guard1 creates phis for variables
424 that change between iterations of a loop (and therefore have a phi-node
425 at the loop entry), whereas slpeel_update_phi_nodes_for_guard2 creates
426 phis for variables that are used out of the loop (and therefore have
427 loop-closed exit phis). Some variables may be both updated between
428 iterations and used after the loop. This is why in loop1_exit_bb we
429 may need both entry_phis (created by slpeel_update_phi_nodes_for_guard1)
430 and exit phis (created by slpeel_update_phi_nodes_for_guard2).
432 - IS_NEW_LOOP: if IS_NEW_LOOP is true, then LOOP is a newly created copy of
433 an original loop. i.e., we have:
436 guard_bb (goto LOOP/new_merge)
442 If IS_NEW_LOOP is false, then LOOP is an original loop, in which case we
446 guard_bb (goto LOOP/new_merge)
452 The SSA names defined in the original loop have a current
453 reaching definition that that records the corresponding new
454 ssa-name used in the new duplicated loop copy.
457 /* Function slpeel_update_phi_nodes_for_guard1
460 - GUARD_EDGE, LOOP, IS_NEW_LOOP, NEW_EXIT_BB - as explained above.
461 - DEFS - a bitmap of ssa names to mark new names for which we recorded
464 In the context of the overall structure, we have:
467 guard1 (goto loop1/merge1_bb)
470 guard2 (goto merge1_bb/merge2_bb)
477 For each name updated between loop iterations (i.e - for each name that has
478 an entry (loop-header) phi in LOOP) we create a new phi in:
479 1. merge1_bb (to account for the edge from guard1)
480 2. loop1_exit_bb (an exit-phi to keep LOOP in loop-closed form)
484 slpeel_update_phi_nodes_for_guard1 (edge guard_edge, struct loop *loop,
485 bool is_new_loop, basic_block *new_exit_bb,
488 tree orig_phi, new_phi;
489 tree update_phi, update_phi2;
490 tree guard_arg, loop_arg;
491 basic_block new_merge_bb = guard_edge->dest;
492 edge e = EDGE_SUCC (new_merge_bb, 0);
493 basic_block update_bb = e->dest;
494 basic_block orig_bb = loop->header;
496 tree current_new_name;
499 /* Create new bb between loop and new_merge_bb. */
500 *new_exit_bb = split_edge (single_exit (loop));
502 new_exit_e = EDGE_SUCC (*new_exit_bb, 0);
504 for (orig_phi = phi_nodes (orig_bb), update_phi = phi_nodes (update_bb);
505 orig_phi && update_phi;
506 orig_phi = PHI_CHAIN (orig_phi), update_phi = PHI_CHAIN (update_phi))
508 /* Virtual phi; Mark it for renaming. We actually want to call
509 mar_sym_for_renaming, but since all ssa renaming datastructures
510 are going to be freed before we get to call ssa_update, we just
511 record this name for now in a bitmap, and will mark it for
513 name = PHI_RESULT (orig_phi);
514 if (!is_gimple_reg (SSA_NAME_VAR (name)))
515 bitmap_set_bit (vect_memsyms_to_rename, DECL_UID (SSA_NAME_VAR (name)));
517 /** 1. Handle new-merge-point phis **/
519 /* 1.1. Generate new phi node in NEW_MERGE_BB: */
520 new_phi = create_phi_node (SSA_NAME_VAR (PHI_RESULT (orig_phi)),
523 /* 1.2. NEW_MERGE_BB has two incoming edges: GUARD_EDGE and the exit-edge
524 of LOOP. Set the two phi args in NEW_PHI for these edges: */
525 loop_arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, EDGE_SUCC (loop->latch, 0));
526 guard_arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, loop_preheader_edge (loop));
528 add_phi_arg (new_phi, loop_arg, new_exit_e);
529 add_phi_arg (new_phi, guard_arg, guard_edge);
531 /* 1.3. Update phi in successor block. */
532 gcc_assert (PHI_ARG_DEF_FROM_EDGE (update_phi, e) == loop_arg
533 || PHI_ARG_DEF_FROM_EDGE (update_phi, e) == guard_arg);
534 SET_PHI_ARG_DEF (update_phi, e->dest_idx, PHI_RESULT (new_phi));
535 update_phi2 = new_phi;
538 /** 2. Handle loop-closed-ssa-form phis **/
540 if (!is_gimple_reg (PHI_RESULT (orig_phi)))
543 /* 2.1. Generate new phi node in NEW_EXIT_BB: */
544 new_phi = create_phi_node (SSA_NAME_VAR (PHI_RESULT (orig_phi)),
547 /* 2.2. NEW_EXIT_BB has one incoming edge: the exit-edge of the loop. */
548 add_phi_arg (new_phi, loop_arg, single_exit (loop));
550 /* 2.3. Update phi in successor of NEW_EXIT_BB: */
551 gcc_assert (PHI_ARG_DEF_FROM_EDGE (update_phi2, new_exit_e) == loop_arg);
552 SET_PHI_ARG_DEF (update_phi2, new_exit_e->dest_idx, PHI_RESULT (new_phi));
554 /* 2.4. Record the newly created name with set_current_def.
555 We want to find a name such that
556 name = get_current_def (orig_loop_name)
557 and to set its current definition as follows:
558 set_current_def (name, new_phi_name)
560 If LOOP is a new loop then loop_arg is already the name we're
561 looking for. If LOOP is the original loop, then loop_arg is
562 the orig_loop_name and the relevant name is recorded in its
563 current reaching definition. */
565 current_new_name = loop_arg;
568 current_new_name = get_current_def (loop_arg);
569 /* current_def is not available only if the variable does not
570 change inside the loop, in which case we also don't care
571 about recording a current_def for it because we won't be
572 trying to create loop-exit-phis for it. */
573 if (!current_new_name)
576 gcc_assert (get_current_def (current_new_name) == NULL_TREE);
578 set_current_def (current_new_name, PHI_RESULT (new_phi));
579 bitmap_set_bit (*defs, SSA_NAME_VERSION (current_new_name));
582 set_phi_nodes (new_merge_bb, phi_reverse (phi_nodes (new_merge_bb)));
586 /* Function slpeel_update_phi_nodes_for_guard2
589 - GUARD_EDGE, LOOP, IS_NEW_LOOP, NEW_EXIT_BB - as explained above.
591 In the context of the overall structure, we have:
594 guard1 (goto loop1/merge1_bb)
597 guard2 (goto merge1_bb/merge2_bb)
604 For each name used out side the loop (i.e - for each name that has an exit
605 phi in next_bb) we create a new phi in:
606 1. merge2_bb (to account for the edge from guard_bb)
607 2. loop2_exit_bb (an exit-phi to keep LOOP in loop-closed form)
608 3. guard2 bb (an exit phi to keep the preceding loop in loop-closed form),
609 if needed (if it wasn't handled by slpeel_update_phis_nodes_for_phi1).
613 slpeel_update_phi_nodes_for_guard2 (edge guard_edge, struct loop *loop,
614 bool is_new_loop, basic_block *new_exit_bb)
616 tree orig_phi, new_phi;
617 tree update_phi, update_phi2;
618 tree guard_arg, loop_arg;
619 basic_block new_merge_bb = guard_edge->dest;
620 edge e = EDGE_SUCC (new_merge_bb, 0);
621 basic_block update_bb = e->dest;
623 tree orig_def, orig_def_new_name;
624 tree new_name, new_name2;
627 /* Create new bb between loop and new_merge_bb. */
628 *new_exit_bb = split_edge (single_exit (loop));
630 new_exit_e = EDGE_SUCC (*new_exit_bb, 0);
632 for (update_phi = phi_nodes (update_bb); update_phi;
633 update_phi = PHI_CHAIN (update_phi))
635 orig_phi = update_phi;
636 orig_def = PHI_ARG_DEF_FROM_EDGE (orig_phi, e);
637 /* This loop-closed-phi actually doesn't represent a use
638 out of the loop - the phi arg is a constant. */
639 if (TREE_CODE (orig_def) != SSA_NAME)
641 orig_def_new_name = get_current_def (orig_def);
644 /** 1. Handle new-merge-point phis **/
646 /* 1.1. Generate new phi node in NEW_MERGE_BB: */
647 new_phi = create_phi_node (SSA_NAME_VAR (PHI_RESULT (orig_phi)),
650 /* 1.2. NEW_MERGE_BB has two incoming edges: GUARD_EDGE and the exit-edge
651 of LOOP. Set the two PHI args in NEW_PHI for these edges: */
653 new_name2 = NULL_TREE;
654 if (orig_def_new_name)
656 new_name = orig_def_new_name;
657 /* Some variables have both loop-entry-phis and loop-exit-phis.
658 Such variables were given yet newer names by phis placed in
659 guard_bb by slpeel_update_phi_nodes_for_guard1. I.e:
660 new_name2 = get_current_def (get_current_def (orig_name)). */
661 new_name2 = get_current_def (new_name);
666 guard_arg = orig_def;
671 guard_arg = new_name;
675 guard_arg = new_name2;
677 add_phi_arg (new_phi, loop_arg, new_exit_e);
678 add_phi_arg (new_phi, guard_arg, guard_edge);
680 /* 1.3. Update phi in successor block. */
681 gcc_assert (PHI_ARG_DEF_FROM_EDGE (update_phi, e) == orig_def);
682 SET_PHI_ARG_DEF (update_phi, e->dest_idx, PHI_RESULT (new_phi));
683 update_phi2 = new_phi;
686 /** 2. Handle loop-closed-ssa-form phis **/
688 /* 2.1. Generate new phi node in NEW_EXIT_BB: */
689 new_phi = create_phi_node (SSA_NAME_VAR (PHI_RESULT (orig_phi)),
692 /* 2.2. NEW_EXIT_BB has one incoming edge: the exit-edge of the loop. */
693 add_phi_arg (new_phi, loop_arg, single_exit (loop));
695 /* 2.3. Update phi in successor of NEW_EXIT_BB: */
696 gcc_assert (PHI_ARG_DEF_FROM_EDGE (update_phi2, new_exit_e) == loop_arg);
697 SET_PHI_ARG_DEF (update_phi2, new_exit_e->dest_idx, PHI_RESULT (new_phi));
700 /** 3. Handle loop-closed-ssa-form phis for first loop **/
702 /* 3.1. Find the relevant names that need an exit-phi in
703 GUARD_BB, i.e. names for which
704 slpeel_update_phi_nodes_for_guard1 had not already created a
705 phi node. This is the case for names that are used outside
706 the loop (and therefore need an exit phi) but are not updated
707 across loop iterations (and therefore don't have a
710 slpeel_update_phi_nodes_for_guard1 is responsible for
711 creating loop-exit phis in GUARD_BB for names that have a
712 loop-header-phi. When such a phi is created we also record
713 the new name in its current definition. If this new name
714 exists, then guard_arg was set to this new name (see 1.2
715 above). Therefore, if guard_arg is not this new name, this
716 is an indication that an exit-phi in GUARD_BB was not yet
717 created, so we take care of it here. */
718 if (guard_arg == new_name2)
722 /* 3.2. Generate new phi node in GUARD_BB: */
723 new_phi = create_phi_node (SSA_NAME_VAR (PHI_RESULT (orig_phi)),
726 /* 3.3. GUARD_BB has one incoming edge: */
727 gcc_assert (EDGE_COUNT (guard_edge->src->preds) == 1);
728 add_phi_arg (new_phi, arg, EDGE_PRED (guard_edge->src, 0));
730 /* 3.4. Update phi in successor of GUARD_BB: */
731 gcc_assert (PHI_ARG_DEF_FROM_EDGE (update_phi2, guard_edge)
733 SET_PHI_ARG_DEF (update_phi2, guard_edge->dest_idx, PHI_RESULT (new_phi));
736 set_phi_nodes (new_merge_bb, phi_reverse (phi_nodes (new_merge_bb)));
740 /* Make the LOOP iterate NITERS times. This is done by adding a new IV
741 that starts at zero, increases by one and its limit is NITERS.
743 Assumption: the exit-condition of LOOP is the last stmt in the loop. */
746 slpeel_make_loop_iterate_ntimes (struct loop *loop, tree niters)
748 tree indx_before_incr, indx_after_incr, cond_stmt, cond;
750 edge exit_edge = single_exit (loop);
751 block_stmt_iterator loop_cond_bsi;
752 block_stmt_iterator incr_bsi;
754 tree init = build_int_cst (TREE_TYPE (niters), 0);
755 tree step = build_int_cst (TREE_TYPE (niters), 1);
758 orig_cond = get_loop_exit_condition (loop);
759 gcc_assert (orig_cond);
760 loop_cond_bsi = bsi_for_stmt (orig_cond);
762 standard_iv_increment_position (loop, &incr_bsi, &insert_after);
763 create_iv (init, step, NULL_TREE, loop,
764 &incr_bsi, insert_after, &indx_before_incr, &indx_after_incr);
766 if (exit_edge->flags & EDGE_TRUE_VALUE) /* 'then' edge exits the loop. */
767 cond = build2 (GE_EXPR, boolean_type_node, indx_after_incr, niters);
768 else /* 'then' edge loops back. */
769 cond = build2 (LT_EXPR, boolean_type_node, indx_after_incr, niters);
771 cond_stmt = build3 (COND_EXPR, TREE_TYPE (orig_cond), cond,
772 NULL_TREE, NULL_TREE);
773 bsi_insert_before (&loop_cond_bsi, cond_stmt, BSI_SAME_STMT);
775 /* Remove old loop exit test: */
776 bsi_remove (&loop_cond_bsi, true);
778 loop_loc = find_loop_location (loop);
779 if (dump_file && (dump_flags & TDF_DETAILS))
781 if (loop_loc != UNKNOWN_LOC)
782 fprintf (dump_file, "\nloop at %s:%d: ",
783 LOC_FILE (loop_loc), LOC_LINE (loop_loc));
784 print_generic_expr (dump_file, cond_stmt, TDF_SLIM);
787 loop->nb_iterations = niters;
791 /* Given LOOP this function generates a new copy of it and puts it
792 on E which is either the entry or exit of LOOP. */
795 slpeel_tree_duplicate_loop_to_edge_cfg (struct loop *loop, edge e)
797 struct loop *new_loop;
798 basic_block *new_bbs, *bbs;
801 basic_block exit_dest;
805 at_exit = (e == single_exit (loop));
806 if (!at_exit && e != loop_preheader_edge (loop))
809 bbs = get_loop_body (loop);
811 /* Check whether duplication is possible. */
812 if (!can_copy_bbs_p (bbs, loop->num_nodes))
818 /* Generate new loop structure. */
819 new_loop = duplicate_loop (loop, loop_outer (loop));
826 exit_dest = single_exit (loop)->dest;
827 was_imm_dom = (get_immediate_dominator (CDI_DOMINATORS,
828 exit_dest) == loop->header ?
831 new_bbs = XNEWVEC (basic_block, loop->num_nodes);
833 exit = single_exit (loop);
834 copy_bbs (bbs, loop->num_nodes, new_bbs,
835 &exit, 1, &new_exit, NULL,
838 /* Duplicating phi args at exit bbs as coming
839 also from exit of duplicated loop. */
840 for (phi = phi_nodes (exit_dest); phi; phi = PHI_CHAIN (phi))
842 phi_arg = PHI_ARG_DEF_FROM_EDGE (phi, single_exit (loop));
845 edge new_loop_exit_edge;
847 if (EDGE_SUCC (new_loop->header, 0)->dest == new_loop->latch)
848 new_loop_exit_edge = EDGE_SUCC (new_loop->header, 1);
850 new_loop_exit_edge = EDGE_SUCC (new_loop->header, 0);
852 add_phi_arg (phi, phi_arg, new_loop_exit_edge);
856 if (at_exit) /* Add the loop copy at exit. */
858 redirect_edge_and_branch_force (e, new_loop->header);
859 PENDING_STMT (e) = NULL;
860 set_immediate_dominator (CDI_DOMINATORS, new_loop->header, e->src);
862 set_immediate_dominator (CDI_DOMINATORS, exit_dest, new_loop->header);
864 else /* Add the copy at entry. */
867 edge entry_e = loop_preheader_edge (loop);
868 basic_block preheader = entry_e->src;
870 if (!flow_bb_inside_loop_p (new_loop,
871 EDGE_SUCC (new_loop->header, 0)->dest))
872 new_exit_e = EDGE_SUCC (new_loop->header, 0);
874 new_exit_e = EDGE_SUCC (new_loop->header, 1);
876 redirect_edge_and_branch_force (new_exit_e, loop->header);
877 PENDING_STMT (new_exit_e) = NULL;
878 set_immediate_dominator (CDI_DOMINATORS, loop->header,
881 /* We have to add phi args to the loop->header here as coming
882 from new_exit_e edge. */
883 for (phi = phi_nodes (loop->header); phi; phi = PHI_CHAIN (phi))
885 phi_arg = PHI_ARG_DEF_FROM_EDGE (phi, entry_e);
887 add_phi_arg (phi, phi_arg, new_exit_e);
890 redirect_edge_and_branch_force (entry_e, new_loop->header);
891 PENDING_STMT (entry_e) = NULL;
892 set_immediate_dominator (CDI_DOMINATORS, new_loop->header, preheader);
902 /* Given the condition statement COND, put it as the last statement
903 of GUARD_BB; EXIT_BB is the basic block to skip the loop;
904 Assumes that this is the single exit of the guarded loop.
905 Returns the skip edge. */
908 slpeel_add_loop_guard (basic_block guard_bb, tree cond, basic_block exit_bb,
911 block_stmt_iterator bsi;
914 tree gimplify_stmt_list;
916 enter_e = EDGE_SUCC (guard_bb, 0);
917 enter_e->flags &= ~EDGE_FALLTHRU;
918 enter_e->flags |= EDGE_FALSE_VALUE;
919 bsi = bsi_last (guard_bb);
922 force_gimple_operand (cond, &gimplify_stmt_list, true,
924 cond_stmt = build3 (COND_EXPR, void_type_node, cond,
925 NULL_TREE, NULL_TREE);
926 if (gimplify_stmt_list)
927 bsi_insert_after (&bsi, gimplify_stmt_list, BSI_NEW_STMT);
929 bsi = bsi_last (guard_bb);
930 bsi_insert_after (&bsi, cond_stmt, BSI_NEW_STMT);
932 /* Add new edge to connect guard block to the merge/loop-exit block. */
933 new_e = make_edge (guard_bb, exit_bb, EDGE_TRUE_VALUE);
934 set_immediate_dominator (CDI_DOMINATORS, exit_bb, dom_bb);
939 /* This function verifies that the following restrictions apply to LOOP:
941 (2) it consists of exactly 2 basic blocks - header, and an empty latch.
942 (3) it is single entry, single exit
943 (4) its exit condition is the last stmt in the header
944 (5) E is the entry/exit edge of LOOP.
948 slpeel_can_duplicate_loop_p (const struct loop *loop, const_edge e)
950 edge exit_e = single_exit (loop);
951 edge entry_e = loop_preheader_edge (loop);
952 tree orig_cond = get_loop_exit_condition (loop);
953 block_stmt_iterator loop_exit_bsi = bsi_last (exit_e->src);
955 if (need_ssa_update_p ())
959 /* All loops have an outer scope; the only case loop->outer is NULL is for
960 the function itself. */
961 || !loop_outer (loop)
962 || loop->num_nodes != 2
963 || !empty_block_p (loop->latch)
964 || !single_exit (loop)
965 /* Verify that new loop exit condition can be trivially modified. */
966 || (!orig_cond || orig_cond != bsi_stmt (loop_exit_bsi))
967 || (e != exit_e && e != entry_e))
973 #ifdef ENABLE_CHECKING
975 slpeel_verify_cfg_after_peeling (struct loop *first_loop,
976 struct loop *second_loop)
978 basic_block loop1_exit_bb = single_exit (first_loop)->dest;
979 basic_block loop2_entry_bb = loop_preheader_edge (second_loop)->src;
980 basic_block loop1_entry_bb = loop_preheader_edge (first_loop)->src;
982 /* A guard that controls whether the second_loop is to be executed or skipped
983 is placed in first_loop->exit. first_loop->exit therefore has two
984 successors - one is the preheader of second_loop, and the other is a bb
987 gcc_assert (EDGE_COUNT (loop1_exit_bb->succs) == 2);
989 /* 1. Verify that one of the successors of first_loop->exit is the preheader
992 /* The preheader of new_loop is expected to have two predecessors:
993 first_loop->exit and the block that precedes first_loop. */
995 gcc_assert (EDGE_COUNT (loop2_entry_bb->preds) == 2
996 && ((EDGE_PRED (loop2_entry_bb, 0)->src == loop1_exit_bb
997 && EDGE_PRED (loop2_entry_bb, 1)->src == loop1_entry_bb)
998 || (EDGE_PRED (loop2_entry_bb, 1)->src == loop1_exit_bb
999 && EDGE_PRED (loop2_entry_bb, 0)->src == loop1_entry_bb)));
1001 /* Verify that the other successor of first_loop->exit is after the
1007 /* If the run time cost model check determines that vectorization is
1008 not profitable and hence scalar loop should be generated then set
1009 FIRST_NITERS to prologue peeled iterations. This will allow all the
1010 iterations to be executed in the prologue peeled scalar loop. */
/* Emit, before the prologue loop, the run-time cost-model guard
   "if (scalar_loop_iters <= th)" and, on its then-arm, an adjusted
   iteration count; a PHI in BB_BEFORE_FIRST_LOOP merges the adjusted
   count with the original FIRST_NITERS.
   NOTE(review): this listing is sampled -- declarations and several
   statements between the numbered lines are missing from view.  */
1013 set_prologue_iterations (basic_block bb_before_first_loop,
1019 basic_block cond_bb, then_bb;
1020 tree var, prologue_after_cost_adjust_name, stmt;
1021 block_stmt_iterator bsi;
1023 edge e_true, e_false, e_fallthru;
1025 tree gimplify_stmt_list;
1026 tree cost_pre_condition = NULL_TREE;
/* The loop's iteration count as it was before any peeling transformations.  */
1027 tree scalar_loop_iters =
1028 unshare_expr (LOOP_VINFO_NITERS_UNCHANGED (loop_vec_info_for_loop (loop)));
/* Split the single preheader edge twice: COND_BB will hold the cost
   check, THEN_BB the adjusted-count computation.  */
1030 e = single_pred_edge (bb_before_first_loop);
1031 cond_bb = split_edge(e);
1033 e = single_pred_edge (bb_before_first_loop);
1034 then_bb = split_edge(e);
1035 set_immediate_dominator (CDI_DOMINATORS, then_bb, cond_bb);
1037 e_false = make_single_succ_edge (cond_bb, bb_before_first_loop,
1039 set_immediate_dominator (CDI_DOMINATORS, bb_before_first_loop, cond_bb);
/* Re-flag the surviving edge out of COND_BB as the true arm of the guard.  */
1041 e_true = EDGE_PRED (then_bb, 0);
1042 e_true->flags &= ~EDGE_FALLTHRU;
1043 e_true->flags |= EDGE_TRUE_VALUE;
1045 e_fallthru = EDGE_SUCC (then_bb, 0);
/* cost_pre_condition = (scalar_loop_iters <= th), gimplified in place.  */
1047 cost_pre_condition =
1048 build2 (LE_EXPR, boolean_type_node, scalar_loop_iters,
1049 build_int_cst (TREE_TYPE (scalar_loop_iters), th));
1050 cost_pre_condition =
1051 force_gimple_operand (cost_pre_condition, &gimplify_stmt_list,
1053 cond_stmt = build3 (COND_EXPR, void_type_node, cost_pre_condition,
1054 NULL_TREE, NULL_TREE);
/* Any stmts produced by gimplification go ahead of the COND_EXPR.  */
1056 bsi = bsi_last (cond_bb);
1057 if (gimplify_stmt_list)
1058 bsi_insert_after (&bsi, gimplify_stmt_list, BSI_NEW_STMT);
1060 bsi = bsi_last (cond_bb);
1061 bsi_insert_after (&bsi, cond_stmt, BSI_NEW_STMT);
/* On the then-arm, materialize the unchanged scalar iteration count into
   a fresh temporary.  */
1063 var = create_tmp_var (TREE_TYPE (scalar_loop_iters),
1064 "prologue_after_cost_adjust");
1065 add_referenced_var (var);
1066 prologue_after_cost_adjust_name =
1067 force_gimple_operand (scalar_loop_iters, &stmt, false, var);
1069 bsi = bsi_last (then_bb);
1071 bsi_insert_after (&bsi, stmt, BSI_NEW_STMT);
/* Merge the two incoming counts; the PHI result becomes the new
   FIRST_NITERS.  */
1073 newphi = create_phi_node (var, bb_before_first_loop);
1074 add_phi_arg (newphi, prologue_after_cost_adjust_name, e_fallthru);
1075 add_phi_arg (newphi, first_niters, e_false);
1077 first_niters = PHI_RESULT (newphi);
1081 /* Function slpeel_tree_peel_loop_to_edge.
1083 Peel the first (last) iterations of LOOP into a new prolog (epilog) loop
1084 that is placed on the entry (exit) edge E of LOOP. After this transformation
1085 we have two loops one after the other - first-loop iterates FIRST_NITERS
1086 times, and second-loop iterates the remainder NITERS - FIRST_NITERS times.
1087 If the cost model indicates that it is profitable to emit a scalar
1088 loop instead of the vector one, then the prolog (epilog) loop will iterate
1089 for the entire unchanged scalar iterations of the loop.
1092 - LOOP: the loop to be peeled.
1093 - E: the exit or entry edge of LOOP.
1094 If it is the entry edge, we peel the first iterations of LOOP. In this
1095 case first-loop is LOOP, and second-loop is the newly created loop.
1096 If it is the exit edge, we peel the last iterations of LOOP. In this
1097 case, first-loop is the newly created loop, and second-loop is LOOP.
1098 - NITERS: the number of iterations that LOOP iterates.
1099 - FIRST_NITERS: the number of iterations that the first-loop should iterate.
1100 - UPDATE_FIRST_LOOP_COUNT: specified whether this function is responsible
1101 for updating the loop bound of the first-loop to FIRST_NITERS. If it
1102 is false, the caller of this function may want to take care of this
1103 (this can be useful if we don't want new stmts added to first-loop).
1104 - TH: cost model profitability threshold of iterations for vectorization.
1105 - CHECK_PROFITABILITY: specify whether cost model check has not occurred
1106 during versioning and hence needs to occur during
1107 prologue generation or whether cost model check
1108 has not occurred during prologue generation and hence
1109 needs to occur during epilogue generation.
1113 The function returns a pointer to the new loop-copy, or NULL if it failed
1114 to perform the transformation.
1116 The function generates two if-then-else guards: one before the first loop,
1117 and the other before the second loop:
1119 if (FIRST_NITERS == 0) then skip the first loop,
1120 and go directly to the second loop.
1121 The second guard is:
1122 if (FIRST_NITERS == NITERS) then skip the second loop.
1124 FORNOW only simple loops are supported (see slpeel_can_duplicate_loop_p).
1125 FORNOW the resulting code will not be in loop-closed-ssa form.
/* NOTE(review): the listing below is sampled; braces, some declarations
   and intermediate statements are missing from view.  Code left as-is.  */
1129 slpeel_tree_peel_loop_to_edge (struct loop *loop,
1130 edge e, tree first_niters,
1131 tree niters, bool update_first_loop_count,
1132 unsigned int th, bool check_profitability)
1134 struct loop *new_loop = NULL, *first_loop, *second_loop;
1136 tree pre_condition = NULL_TREE;
1138 basic_block bb_before_second_loop, bb_after_second_loop;
1139 basic_block bb_before_first_loop;
1140 basic_block bb_between_loops;
1141 basic_block new_exit_bb;
1142 edge exit_e = single_exit (loop);
1144 tree cost_pre_condition = NULL_TREE;
/* Bail out early if LOOP does not have the simple form peeling supports.  */
1146 if (!slpeel_can_duplicate_loop_p (loop, e))
1149 /* We have to initialize cfg_hooks. Then, when calling
1150 cfg_hooks->split_edge, the function tree_split_edge
1151 is actually called and, when calling cfg_hooks->duplicate_block,
1152 the function tree_duplicate_bb is called. */
1153 tree_register_cfg_hooks ();
1156 /* 1. Generate a copy of LOOP and put it on E (E is the entry/exit of LOOP).
1157 Resulting CFG would be:
/* Duplicate failure is reported (with source location when known) and
   aborts the transformation.  */
1170 if (!(new_loop = slpeel_tree_duplicate_loop_to_edge_cfg (loop, e)))
1172 loop_loc = find_loop_location (loop);
1173 if (dump_file && (dump_flags & TDF_DETAILS))
1175 if (loop_loc != UNKNOWN_LOC)
1176 fprintf (dump_file, "\n%s:%d: note: ",
1177 LOC_FILE (loop_loc), LOC_LINE (loop_loc));
1178 fprintf (dump_file, "tree_duplicate_loop_to_edge_cfg failed.\n");
1185 /* NEW_LOOP was placed after LOOP. */
1187 second_loop = new_loop;
1191 /* NEW_LOOP was placed before LOOP. */
1192 first_loop = new_loop;
/* Record SSA names needing update, fix the copied loop's PHIs, and give
   the copy fresh SSA names.  */
1196 definitions = ssa_names_to_replace ();
1197 slpeel_update_phis_for_duplicate_loop (loop, new_loop, e == exit_e);
1198 rename_variables_in_loop (new_loop);
1201 /* 2. Add the guard code in one of the following ways:
1203 2.a Add the guard that controls whether the first loop is executed.
1204 This occurs when this function is invoked for prologue or epilogue
1205 generation and when the cost model check can be done at compile time.
1207 Resulting CFG would be:
1209 bb_before_first_loop:
1210 if (FIRST_NITERS == 0) GOTO bb_before_second_loop
1217 bb_before_second_loop:
1225 2.b Add the cost model check that allows the prologue
1226 to iterate for the entire unchanged scalar
1227 iterations of the loop in the event that the cost
1228 model indicates that the scalar loop is more
1229 profitable than the vector one. This occurs when
1230 this function is invoked for prologue generation
1231 and the cost model check needs to be done at run
1234 Resulting CFG after prologue peeling would be:
1236 if (scalar_loop_iterations <= th)
1237 FIRST_NITERS = scalar_loop_iterations
1239 bb_before_first_loop:
1240 if (FIRST_NITERS == 0) GOTO bb_before_second_loop
1247 bb_before_second_loop:
1255 2.c Add the cost model check that allows the epilogue
1256 to iterate for the entire unchanged scalar
1257 iterations of the loop in the event that the cost
1258 model indicates that the scalar loop is more
1259 profitable than the vector one. This occurs when
1260 this function is invoked for epilogue generation
1261 and the cost model check needs to be done at run
1264 Resulting CFG after prologue peeling would be:
1266 bb_before_first_loop:
1267 if ((scalar_loop_iterations <= th)
1269 FIRST_NITERS == 0) GOTO bb_before_second_loop
1276 bb_before_second_loop:
1285 bb_before_first_loop = split_edge (loop_preheader_edge (first_loop));
1286 bb_before_second_loop = split_edge (single_exit (first_loop));
1288 /* Epilogue peeling. */
1289 if (!update_first_loop_count)
/* Guard condition: skip the first loop when FIRST_NITERS <= 0.  */
1292 fold_build2 (LE_EXPR, boolean_type_node, first_niters,
1293 build_int_cst (TREE_TYPE (first_niters), 0));
1294 if (check_profitability)
/* Case 2.c: OR the run-time cost check into the first guard.  */
1296 tree scalar_loop_iters
1297 = unshare_expr (LOOP_VINFO_NITERS_UNCHANGED
1298 (loop_vec_info_for_loop (loop)));
1299 cost_pre_condition =
1300 build2 (LE_EXPR, boolean_type_node, scalar_loop_iters,
1301 build_int_cst (TREE_TYPE (scalar_loop_iters), th));
1303 pre_condition = fold_build2 (TRUTH_OR_EXPR, boolean_type_node,
1304 cost_pre_condition, pre_condition);
1308 /* Prologue peeling. */
/* Case 2.b: emit the cost check separately, before the first guard.  */
1311 if (check_profitability)
1312 set_prologue_iterations (bb_before_first_loop, first_niters,
1316 fold_build2 (LE_EXPR, boolean_type_node, first_niters,
1317 build_int_cst (TREE_TYPE (first_niters), 0));
1320 skip_e = slpeel_add_loop_guard (bb_before_first_loop, pre_condition,
1321 bb_before_second_loop, bb_before_first_loop);
1322 slpeel_update_phi_nodes_for_guard1 (skip_e, first_loop,
1323 first_loop == new_loop,
1324 &new_exit_bb, &definitions);
1327 /* 3. Add the guard that controls whether the second loop is executed.
1328 Resulting CFG would be:
1330 bb_before_first_loop:
1331 if (FIRST_NITERS == 0) GOTO bb_before_second_loop (skip first loop)
1339 if (FIRST_NITERS == NITERS) GOTO bb_after_second_loop (skip second loop)
1340 GOTO bb_before_second_loop
1342 bb_before_second_loop:
1348 bb_after_second_loop:
1353 bb_between_loops = new_exit_bb;
1354 bb_after_second_loop = split_edge (single_exit (second_loop));
1357 fold_build2 (EQ_EXPR, boolean_type_node, first_niters, niters);
1358 skip_e = slpeel_add_loop_guard (bb_between_loops, pre_condition,
1359 bb_after_second_loop, bb_before_first_loop);
1360 slpeel_update_phi_nodes_for_guard2 (skip_e, second_loop,
1361 second_loop == new_loop, &new_exit_bb);
1363 /* 4. Make first-loop iterate FIRST_NITERS times, if requested.
1365 if (update_first_loop_count)
1366 slpeel_make_loop_iterate_ntimes (first_loop, first_niters);
1368 BITMAP_FREE (definitions);
1369 delete_update_ssa ();
1374 /* Function vect_get_loop_location.
1376 Extract the location of the loop in the source code.
1377 If the loop is not well formed for vectorization, an estimated
1378 location is calculated.
1379 Return the loop location if succeed and NULL if not. */
1382 find_loop_location (struct loop *loop)
1384 tree node = NULL_TREE;
1386 block_stmt_iterator si;
/* Preferred source: the loop's exit condition, when it carries a usable
   location (file and line both present).  */
1391 node = get_loop_exit_condition (loop);
1393 if (node && CAN_HAVE_LOCATION_P (node) && EXPR_HAS_LOCATION (node)
1394 && EXPR_FILENAME (node) && EXPR_LINENO (node))
1395 return EXPR_LOC (node);
1397 /* If we got here the loop is probably not "well formed",
1398 try to estimate the loop location */
/* Fallback: first stmt with a location in the scanned block (BB setup is
   not visible in this view).  */
1405 for (si = bsi_start (bb); !bsi_end_p (si); bsi_next (&si))
1407 node = bsi_stmt (si);
1408 if (node && CAN_HAVE_LOCATION_P (node) && EXPR_HAS_LOCATION (node))
1409 return EXPR_LOC (node);
1416 /*************************************************************************
1417 Vectorization Debug Information.
1418 *************************************************************************/
1420 /* Function vect_set_verbosity_level.
1422 Called from toplev.c upon detection of the
1423 -ftree-vectorizer-verbose=N option. */
1426 vect_set_verbosity_level (const char *val)
/* Clamp the requested level: values >= MAX_VERBOSITY_LEVEL are reduced
   to the highest valid level.  (Parsing of VAL into VL is not visible
   in this view.)  */
1431 if (vl < MAX_VERBOSITY_LEVEL)
1432 vect_verbosity_level = vl;
1434 vect_verbosity_level = MAX_VERBOSITY_LEVEL - 1;
1438 /* Function vect_set_dump_settings.
1440 Fix the verbosity level of the vectorizer if the
1441 requested level was not set explicitly using the flag
1442 -ftree-vectorizer-verbose=N.
1443 Decide where to print the debugging information (dump_file/stderr).
1444 If the user defined the verbosity level, but there is no dump file,
1445 print to stderr, otherwise print to the dump file. */
1448 vect_set_dump_settings (void)
1450 vect_dump = dump_file;
1452 /* Check if the verbosity level was defined by the user: */
1453 if (vect_verbosity_level != MAX_VERBOSITY_LEVEL)
1455 /* If there is no dump file, print to stderr. */
1461 /* User didn't specify verbosity level: */
/* Derive a default verbosity from the dump flags, most detailed first.  */
1462 if (dump_file && (dump_flags & TDF_DETAILS))
1463 vect_verbosity_level = REPORT_DETAILS;
1464 else if (dump_file && (dump_flags & TDF_STATS))
1465 vect_verbosity_level = REPORT_UNVECTORIZED_LOOPS;
1467 vect_verbosity_level = REPORT_NONE;
/* Anything above REPORT_NONE must have somewhere to print.  */
1469 gcc_assert (dump_file || vect_verbosity_level == REPORT_NONE);
1473 /* Function debug_loop_details.
1475 For vectorization debug dumps. */
1478 vect_print_dump_info (enum verbosity_levels vl)
/* Suppress output when VL exceeds the configured verbosity or when there
   is no current function / dump stream.  */
1480 if (vl > vect_verbosity_level)
1483 if (!current_function_decl || !vect_dump)
/* Print a "file:line: note: " header -- from the function decl when the
   loop location is unknown, otherwise from the recorded loop location.  */
1486 if (vect_loop_location == UNKNOWN_LOC)
1487 fprintf (vect_dump, "\n%s:%d: note: ",
1488 DECL_SOURCE_FILE (current_function_decl),
1489 DECL_SOURCE_LINE (current_function_decl));
1491 fprintf (vect_dump, "\n%s:%d: note: ",
1492 LOC_FILE (vect_loop_location), LOC_LINE (vect_loop_location));
1498 /*************************************************************************
1499 Vectorization Utilities.
1500 *************************************************************************/
1502 /* Function new_stmt_vec_info.
1504 Create and initialize a new stmt_vec_info struct for STMT. */
1507 new_stmt_vec_info (tree stmt, loop_vec_info loop_vinfo)
/* Zero-allocate, then set every field to its explicit default.  */
1510 res = (stmt_vec_info) xcalloc (1, sizeof (struct _stmt_vec_info));
1512 STMT_VINFO_TYPE (res) = undef_vec_info_type;
1513 STMT_VINFO_STMT (res) = stmt;
1514 STMT_VINFO_LOOP_VINFO (res) = loop_vinfo;
1515 STMT_VINFO_RELEVANT (res) = 0;
1516 STMT_VINFO_LIVE_P (res) = false;
1517 STMT_VINFO_VECTYPE (res) = NULL;
1518 STMT_VINFO_VEC_STMT (res) = NULL;
1519 STMT_VINFO_IN_PATTERN_P (res) = false;
1520 STMT_VINFO_RELATED_STMT (res) = NULL;
1521 STMT_VINFO_DATA_REF (res) = NULL;
1523 STMT_VINFO_DR_BASE_ADDRESS (res) = NULL;
1524 STMT_VINFO_DR_OFFSET (res) = NULL;
1525 STMT_VINFO_DR_INIT (res) = NULL;
1526 STMT_VINFO_DR_STEP (res) = NULL;
1527 STMT_VINFO_DR_ALIGNED_TO (res) = NULL;
/* PHIs in a loop header may be inductions/reductions -- their def type
   is resolved later; everything else is an ordinary loop def.  */
1529 if (TREE_CODE (stmt) == PHI_NODE && is_loop_header_bb_p (bb_for_stmt (stmt)))
1530 STMT_VINFO_DEF_TYPE (res) = vect_unknown_def_type;
1532 STMT_VINFO_DEF_TYPE (res) = vect_loop_def;
1533 STMT_VINFO_SAME_ALIGN_REFS (res) = VEC_alloc (dr_p, heap, 5);
1534 STMT_VINFO_INSIDE_OF_LOOP_COST (res) = 0;
1535 STMT_VINFO_OUTSIDE_OF_LOOP_COST (res) = 0;
1536 STMT_SLP_TYPE (res) = 0;
/* Interleaving-group (DR_GROUP) bookkeeping starts empty.  */
1537 DR_GROUP_FIRST_DR (res) = NULL_TREE;
1538 DR_GROUP_NEXT_DR (res) = NULL_TREE;
1539 DR_GROUP_SIZE (res) = 0;
1540 DR_GROUP_STORE_COUNT (res) = 0;
1541 DR_GROUP_GAP (res) = 0;
1542 DR_GROUP_SAME_DR_STMT (res) = NULL_TREE;
1543 DR_GROUP_READ_WRITE_DEPENDENCE (res) = false;
1549 /* Free stmt vectorization related info. */
1552 free_stmt_vec_info (tree stmt)
1554 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
/* Release the same-alignment vector, then detach the info from the
   stmt annotation.  (The free of STMT_INFO itself is not visible here.)  */
1559 VEC_free (dr_p, heap, STMT_VINFO_SAME_ALIGN_REFS (stmt_info));
1561 set_stmt_info (stmt_ann (stmt), NULL);
1565 /* Function bb_in_loop_p
1567 Used as predicate for dfs order traversal of the loop bbs. */
1570 bb_in_loop_p (const_basic_block bb, const void *data)
/* DATA is the loop being traversed; true iff BB belongs to it.  */
1572 const struct loop *const loop = (const struct loop *)data;
1573 if (flow_bb_inside_loop_p (loop, bb))
1579 /* Function new_loop_vec_info.
1581 Create and initialize a new loop_vec_info struct for LOOP, as well as
1582 stmt_vec_info structs for all the stmts in LOOP. */
1585 new_loop_vec_info (struct loop *loop)
1589 block_stmt_iterator si;
1590 unsigned int i, nbbs;
1592 res = (loop_vec_info) xcalloc (1, sizeof (struct _loop_vec_info));
1593 LOOP_VINFO_LOOP (res) = loop;
1595 bbs = get_loop_body (loop);
1597 /* Create/Update stmt_info for all stmts in the loop. */
1598 for (i = 0; i < loop->num_nodes; i++)
1600 basic_block bb = bbs[i];
1603 /* BBs in a nested inner-loop will have been already processed (because
1604 we will have called vect_analyze_loop_form for any nested inner-loop).
1605 Therefore, for stmts in an inner-loop we just want to update the
1606 STMT_VINFO_LOOP_VINFO field of their stmt_info to point to the new
1607 loop_info of the outer-loop we are currently considering to vectorize
1608 (instead of the loop_info of the inner-loop).
1609 For stmts in other BBs we need to create a stmt_info from scratch. */
1610 if (bb->loop_father != loop)
1612 /* Inner-loop bb. */
1613 gcc_assert (loop->inner && bb->loop_father == loop->inner);
1614 for (phi = phi_nodes (bb); phi; phi = PHI_CHAIN (phi))
1616 stmt_vec_info stmt_info = vinfo_for_stmt (phi);
1617 loop_vec_info inner_loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1618 gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
1619 STMT_VINFO_LOOP_VINFO (stmt_info) = res;
1621 for (si = bsi_start (bb); !bsi_end_p (si); bsi_next (&si))
1623 tree stmt = bsi_stmt (si);
1624 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1625 loop_vec_info inner_loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1626 gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
1627 STMT_VINFO_LOOP_VINFO (stmt_info) = res;
1632 /* bb in current nest. */
1633 for (phi = phi_nodes (bb); phi; phi = PHI_CHAIN (phi))
1635 stmt_ann_t ann = get_stmt_ann (phi);
1636 set_stmt_info (ann, new_stmt_vec_info (phi, res));
1639 for (si = bsi_start (bb); !bsi_end_p (si); bsi_next (&si))
1641 tree stmt = bsi_stmt (si);
1642 stmt_ann_t ann = stmt_ann (stmt);
1643 set_stmt_info (ann, new_stmt_vec_info (stmt, res));
1648 /* CHECKME: We want to visit all BBs before their successors (except for
1649 latch blocks, for which this assertion wouldn't hold). In the simple
1650 case of the loop forms we allow, a dfs order of the BBs would the same
1651 as reversed postorder traversal, so we are safe. */
/* Re-enumerate the BBs in DFS order into a fresh array that LOOP_VINFO
   will own.  (The first get_loop_body array's disposal is not visible.)  */
1654 bbs = XCNEWVEC (basic_block, loop->num_nodes);
1655 nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
1656 bbs, loop->num_nodes, loop);
1657 gcc_assert (nbbs == loop->num_nodes);
/* Initialize all remaining loop_vec_info fields to explicit defaults.  */
1659 LOOP_VINFO_BBS (res) = bbs;
1660 LOOP_VINFO_NITERS (res) = NULL;
1661 LOOP_VINFO_NITERS_UNCHANGED (res) = NULL;
1662 LOOP_VINFO_COST_MODEL_MIN_ITERS (res) = 0;
1663 LOOP_VINFO_VECTORIZABLE_P (res) = 0;
1664 LOOP_PEELING_FOR_ALIGNMENT (res) = 0;
1665 LOOP_VINFO_VECT_FACTOR (res) = 0;
1666 LOOP_VINFO_DATAREFS (res) = VEC_alloc (data_reference_p, heap, 10);
1667 LOOP_VINFO_DDRS (res) = VEC_alloc (ddr_p, heap, 10 * 10);
1668 LOOP_VINFO_UNALIGNED_DR (res) = NULL;
1669 LOOP_VINFO_MAY_MISALIGN_STMTS (res) =
1670 VEC_alloc (tree, heap, PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS));
1671 LOOP_VINFO_MAY_ALIAS_DDRS (res) =
1672 VEC_alloc (ddr_p, heap, PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
1673 LOOP_VINFO_STRIDED_STORES (res) = VEC_alloc (tree, heap, 10);
1674 LOOP_VINFO_SLP_INSTANCES (res) = VEC_alloc (slp_instance, heap, 10);
1675 LOOP_VINFO_SLP_UNROLLING_FACTOR (res) = 1;
1681 /* Function destroy_loop_vec_info.
1683 Free LOOP_VINFO struct, as well as all the stmt_vec_info structs of all the
1684 stmts in the loop. */
1687 destroy_loop_vec_info (loop_vec_info loop_vinfo, bool clean_stmts)
1692 block_stmt_iterator si;
1694 VEC (slp_instance, heap) *slp_instances;
1695 slp_instance instance;
1700 loop = LOOP_VINFO_LOOP (loop_vinfo);
1702 bbs = LOOP_VINFO_BBS (loop_vinfo);
1703 nbbs = loop->num_nodes;
/* First free sequence -- presumably the early-exit path taken when
   CLEAN_STMTS is false (the guarding condition is not visible in this
   view; note the same frees reappear below for the full path).  */
1707 free (LOOP_VINFO_BBS (loop_vinfo));
1708 free_data_refs (LOOP_VINFO_DATAREFS (loop_vinfo));
1709 free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
1710 VEC_free (tree, heap, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));
/* Full path: release per-stmt info in every BB of the loop.  */
1717 for (j = 0; j < nbbs; j++)
1719 basic_block bb = bbs[j];
1722 for (phi = phi_nodes (bb); phi; phi = PHI_CHAIN (phi))
1723 free_stmt_vec_info (phi);
1725 for (si = bsi_start (bb); !bsi_end_p (si); )
1727 tree stmt = bsi_stmt (si);
1728 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1732 /* Check if this is a "pattern stmt" (introduced by the
1733 vectorizer during the pattern recognition pass). */
1734 bool remove_stmt_p = false;
1735 tree orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
1738 stmt_vec_info orig_stmt_info = vinfo_for_stmt (orig_stmt);
1740 && STMT_VINFO_IN_PATTERN_P (orig_stmt_info))
1741 remove_stmt_p = true;
1744 /* Free stmt_vec_info. */
1745 free_stmt_vec_info (stmt);
1747 /* Remove dead "pattern stmts". */
1749 bsi_remove (&si, true);
/* Release all loop-level containers.  */
1755 free (LOOP_VINFO_BBS (loop_vinfo));
1756 free_data_refs (LOOP_VINFO_DATAREFS (loop_vinfo));
1757 free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
1758 VEC_free (tree, heap, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));
1759 VEC_free (ddr_p, heap, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo));
1760 slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
1761 for (j = 0; VEC_iterate (slp_instance, slp_instances, j, instance); j++)
1762 vect_free_slp_tree (SLP_INSTANCE_TREE (instance));
1763 VEC_free (slp_instance, heap, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
1764 VEC_free (tree, heap, LOOP_VINFO_STRIDED_STORES (loop_vinfo));
1771 /* Function vect_force_dr_alignment_p.
1773 Returns whether the alignment of a DECL can be forced to be aligned
1774 on ALIGNMENT bit boundary. */
1777 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
/* Only plain variables whose layout we still control qualify: not
   external, not already emitted to the assembly file.  */
1779 if (TREE_CODE (decl) != VAR_DECL)
1782 if (DECL_EXTERNAL (decl))
1785 if (TREE_ASM_WRITTEN (decl))
/* Static storage is limited by the object-file's maximum alignment;
   automatic storage by the stack boundary.  */
1788 if (TREE_STATIC (decl))
1789 return (alignment <= MAX_OFILE_ALIGNMENT);
1791 /* This used to be PREFERRED_STACK_BOUNDARY, however, that is not 100%
1792 correct until someone implements forced stack alignment. */
1793 return (alignment <= STACK_BOUNDARY);
1797 /* Function get_vectype_for_scalar_type.
1799 Returns the vector type corresponding to SCALAR_TYPE as supported
1803 get_vectype_for_scalar_type (tree scalar_type)
1805 enum machine_mode inner_mode = TYPE_MODE (scalar_type);
1806 int nbytes = GET_MODE_SIZE (inner_mode);
/* Reject zero-sized elements and elements at least as wide as the whole
   SIMD word -- no multi-element vector would fit.  */
1810 if (nbytes == 0 || nbytes >= UNITS_PER_SIMD_WORD (inner_mode))
1813 /* FORNOW: Only a single vector size per mode (UNITS_PER_SIMD_WORD)
/* Number of elements = SIMD word size / element size.  */
1815 nunits = UNITS_PER_SIMD_WORD (inner_mode) / nbytes;
1817 vectype = build_vector_type (scalar_type, nunits);
1818 if (vect_print_dump_info (REPORT_DETAILS))
1820 fprintf (vect_dump, "get vectype with %d units of type ", nunits);
1821 print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
1827 if (vect_print_dump_info (REPORT_DETAILS))
1829 fprintf (vect_dump, "vectype: ");
1830 print_generic_expr (vect_dump, vectype, TDF_SLIM);
/* The built type is only usable if the target gives it a vector (or at
   least integral) machine mode.  */
1833 if (!VECTOR_MODE_P (TYPE_MODE (vectype))
1834 && !INTEGRAL_MODE_P (TYPE_MODE (vectype)))
1836 if (vect_print_dump_info (REPORT_DETAILS))
1837 fprintf (vect_dump, "mode not supported by target.");
1845 /* Function vect_supportable_dr_alignment
1847 Return whether the data reference DR is supported with respect to its
1850 enum dr_alignment_support
1851 vect_supportable_dr_alignment (struct data_reference *dr)
1853 tree stmt = DR_STMT (dr);
1854 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1855 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1856 enum machine_mode mode = (int) TYPE_MODE (vectype);
1857 struct loop *vect_loop = LOOP_VINFO_LOOP (STMT_VINFO_LOOP_VINFO (stmt_info));
1858 bool nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
1859 bool invariant_in_outerloop = false;
/* Aligned accesses need no special support.  */
1861 if (aligned_access_p (dr))
1864 if (nested_in_vect_loop)
/* In outer-loop vectorization, a zero outer-loop step means the address
   is invariant across outer-loop iterations.  */
1866 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
1867 invariant_in_outerloop =
1868 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
1871 /* Possibly unaligned access. */
1873 /* We can choose between using the implicit realignment scheme (generating
1874 a misaligned_move stmt) and the explicit realignment scheme (generating
1875 aligned loads with a REALIGN_LOAD). There are two variants to the explicit
1876 realignment scheme: optimized, and unoptimized.
1877 We can optimize the realignment only if the step between consecutive
1878 vector loads is equal to the vector size. Since the vector memory
1879 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
1880 is guaranteed that the misalignment amount remains the same throughout the
1881 execution of the vectorized loop. Therefore, we can create the
1882 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
1883 at the loop preheader.
1885 However, in the case of outer-loop vectorization, when vectorizing a
1886 memory access in the inner-loop nested within the LOOP that is now being
1887 vectorized, while it is guaranteed that the misalignment of the
1888 vectorized memory access will remain the same in different outer-loop
1889 iterations, it is *not* guaranteed that is will remain the same throughout
1890 the execution of the inner-loop. This is because the inner-loop advances
1891 with the original scalar step (and not in steps of VS). If the inner-loop
1892 step happens to be a multiple of VS, then the misalignment remains fixed
1893 and we can use the optimized realignment scheme. For example:
1899 When vectorizing the i-loop in the above example, the step between
1900 consecutive vector loads is 1, and so the misalignment does not remain
1901 fixed across the execution of the inner-loop, and the realignment cannot
1902 be optimized (as illustrated in the following pseudo vectorized loop):
1904 for (i=0; i<N; i+=4)
1905 for (j=0; j<M; j++){
1906 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
1907 // when j is {0,1,2,3,4,5,6,7,...} respectively.
1908 // (assuming that we start from an aligned address).
1911 We therefore have to use the unoptimized realignment scheme:
1913 for (i=0; i<N; i+=4)
1914 for (j=k; j<M; j+=4)
1915 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
1916 // that the misalignment of the initial address is
1919 The loop can then be vectorized as follows:
1921 for (k=0; k<4; k++){
1922 rt = get_realignment_token (&vp[k]);
1923 for (i=0; i<N; i+=4){
1925 for (j=k; j<M; j+=4){
1927 va = REALIGN_LOAD <v1,v2,rt>;
/* Only loads can use the explicit realignment scheme.  */
1934 if (DR_IS_READ (dr))
1936 if (optab_handler (vec_realign_load_optab, mode)->insn_code !=
1938 && (!targetm.vectorize.builtin_mask_for_load
1939 || targetm.vectorize.builtin_mask_for_load ()))
1941 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
/* In a nested loop with a step different from the vector size, the
   misalignment varies -- fall back to the unoptimized scheme.  */
1942 if (nested_in_vect_loop
1943 && (TREE_INT_CST_LOW (DR_STEP (dr))
1944 != GET_MODE_SIZE (TYPE_MODE (vectype))))
1945 return dr_explicit_realign;
1947 return dr_explicit_realign_optimized;
/* Otherwise try the implicit scheme: a misaligned-move instruction.  */
1950 if (optab_handler (movmisalign_optab, mode)->insn_code !=
1952 /* Can't software pipeline the loads, but can at least do them. */
1953 return dr_unaligned_supported;
1957 return dr_unaligned_unsupported;
1961 /* Function vect_is_simple_use.
1964 LOOP - the loop that is being vectorized.
1965 OPERAND - operand of a stmt in LOOP.
1966 DEF - the defining stmt in case OPERAND is an SSA_NAME.
1968 Returns whether a stmt with OPERAND can be vectorized.
1969 Supportable operands are constants, loop invariants, and operands that are
1970 defined by the current iteration of the loop. Unsupportable operands are
1971 those that are defined by a previous iteration of the loop (as is the case
1972 in reduction/induction computations). */
1975 vect_is_simple_use (tree operand, loop_vec_info loop_vinfo, tree *def_stmt,
1976 tree *def, enum vect_def_type *dt)
1979 stmt_vec_info stmt_vinfo;
1980 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1982 *def_stmt = NULL_TREE;
1985 if (vect_print_dump_info (REPORT_DETAILS))
1987 fprintf (vect_dump, "vect_is_simple_use: operand ");
1988 print_generic_expr (vect_dump, operand, TDF_SLIM);
/* Literal constants and other gimple invariants need no def stmt.  */
1991 if (TREE_CODE (operand) == INTEGER_CST || TREE_CODE (operand) == REAL_CST)
1993 *dt = vect_constant_def;
1996 if (is_gimple_min_invariant (operand))
1999 *dt = vect_invariant_def;
/* Look through PAREN_EXPR (a non-associatable copy) to the real operand.  */
2003 if (TREE_CODE (operand) == PAREN_EXPR)
2005 if (vect_print_dump_info (REPORT_DETAILS))
2006 fprintf (vect_dump, "non-associatable copy.");
2007 operand = TREE_OPERAND (operand, 0);
2009 if (TREE_CODE (operand) != SSA_NAME)
2011 if (vect_print_dump_info (REPORT_DETAILS))
2012 fprintf (vect_dump, "not ssa-name.");
2016 *def_stmt = SSA_NAME_DEF_STMT (operand);
2017 if (*def_stmt == NULL_TREE )
2019 if (vect_print_dump_info (REPORT_DETAILS))
2020 fprintf (vect_dump, "no def_stmt.");
2024 if (vect_print_dump_info (REPORT_DETAILS))
2026 fprintf (vect_dump, "def_stmt: ");
2027 print_generic_expr (vect_dump, *def_stmt, TDF_SLIM);
2030 /* empty stmt is expected only in case of a function argument.
2031 (Otherwise - we expect a phi_node or a GIMPLE_MODIFY_STMT). */
2032 if (IS_EMPTY_STMT (*def_stmt))
2034 tree arg = TREE_OPERAND (*def_stmt, 0);
2035 if (is_gimple_min_invariant (arg))
2038 *dt = vect_invariant_def;
2042 if (vect_print_dump_info (REPORT_DETAILS))
2043 fprintf (vect_dump, "Unexpected empty stmt.");
/* A def outside the loop is invariant; inside, classify from its
   stmt_vec_info.  */
2047 bb = bb_for_stmt (*def_stmt);
2048 if (!flow_bb_inside_loop_p (loop, bb))
2049 *dt = vect_invariant_def;
2052 stmt_vinfo = vinfo_for_stmt (*def_stmt);
2053 *dt = STMT_VINFO_DEF_TYPE (stmt_vinfo);
2056 if (*dt == vect_unknown_def_type)
2058 if (vect_print_dump_info (REPORT_DETAILS))
2059 fprintf (vect_dump, "Unsupported pattern.");
2063 if (vect_print_dump_info (REPORT_DETAILS))
2064 fprintf (vect_dump, "type of def: %d.",*dt);
/* Extract *DEF from the defining stmt: PHI result or modify-stmt LHS.  */
2066 switch (TREE_CODE (*def_stmt))
2069 *def = PHI_RESULT (*def_stmt);
2072 case GIMPLE_MODIFY_STMT:
2073 *def = GIMPLE_STMT_OPERAND (*def_stmt, 0);
2077 if (vect_print_dump_info (REPORT_DETAILS))
2078 fprintf (vect_dump, "unsupported defining stmt: ");
2086 /* Function supportable_widening_operation
2088 Check whether an operation represented by the code CODE is a
2089 widening operation that is supported by the target platform in
2090 vector form (i.e., when operating on arguments of type VECTYPE).
2092 Widening operations we currently support are NOP (CONVERT), FLOAT
2093 and WIDEN_MULT. This function checks if these operations are supported
2094 by the target platform either directly (via vector tree-codes), or via
2098 - CODE1 and CODE2 are codes of vector operations to be used when
2099 vectorizing the operation, if available.
2100 - DECL1 and DECL2 are decls of target builtin functions to be used
2101 when vectorizing the operation, if available. In this case,
2102 CODE1 and CODE2 are CALL_EXPR. */
2105 supportable_widening_operation (enum tree_code code, tree stmt, tree vectype,
2106 tree *decl1, tree *decl2,
2107 enum tree_code *code1, enum tree_code *code2)
2109 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2110 loop_vec_info loop_info = STMT_VINFO_LOOP_VINFO (stmt_info);
2111 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2113 enum machine_mode vec_mode;
2114 enum insn_code icode1, icode2;
2115 optab optab1, optab2;
2116 tree expr = GIMPLE_STMT_OPERAND (stmt, 1);
2117 tree type = TREE_TYPE (expr);
2118 tree wide_vectype = get_vectype_for_scalar_type (type);
2119 enum tree_code c1, c2;
2121 /* The result of a vectorized widening operation usually requires two vectors
2122 (because the widened results do not fit int one vector). The generated
2123 vector results would normally be expected to be generated in the same
2124 order as in the original scalar computation, i.e. if 8 results are
2125 generated in each vector iteration, they are to be organized as follows:
2126 vect1: [res1,res2,res3,res4], vect2: [res5,res6,res7,res8].
2128 However, in the special case that the result of the widening operation is
2129 used in a reduction computation only, the order doesn't matter (because
2130 when vectorizing a reduction we change the order of the computation).
2131 Some targets can take advantage of this and generate more efficient code.
2132 For example, targets like Altivec, that support widen_mult using a sequence
2133 of {mult_even,mult_odd} generate the following vectors:
2134 vect1: [res1,res3,res5,res7], vect2: [res2,res4,res6,res8].
2136 When vectorizing outer-loops, we execute the inner-loop sequentially
2137 (each vectorized inner-loop iteration contributes to VF outer-loop
2138 iterations in parallel). We therefore don't allow to change the order
2139 of the computation in the inner-loop during outer-loop vectorization. */
/* Unordered variant: only for reduction uses, and never in outer-loop
   vectorization (see comment above).  */
2141 if (STMT_VINFO_RELEVANT (stmt_info) == vect_used_by_reduction
2142 && !nested_in_vect_loop_p (vect_loop, stmt))
2148 && code == WIDEN_MULT_EXPR
2149 && targetm.vectorize.builtin_mul_widen_even
2150 && targetm.vectorize.builtin_mul_widen_even (vectype)
2151 && targetm.vectorize.builtin_mul_widen_odd
2152 && targetm.vectorize.builtin_mul_widen_odd (vectype)
2154 if (vect_print_dump_info (REPORT_DETAILS))
2155 fprintf (vect_dump, "Unordered widening operation detected.");
/* Report the target builtins; CALL_EXPR tells the caller to emit calls.  */
2157 *code1 = *code2 = CALL_EXPR;
2158 *decl1 = targetm.vectorize.builtin_mul_widen_even (vectype);
2159 *decl2 = targetm.vectorize.builtin_mul_widen_odd (vectype);
/* Ordered variant: pick the HI/LO tree-code pair; on big-endian targets
   HI produces the first half, on little-endian LO does.  */
2165 case WIDEN_MULT_EXPR:
2166 if (BYTES_BIG_ENDIAN)
2168 c1 = VEC_WIDEN_MULT_HI_EXPR;
2169 c2 = VEC_WIDEN_MULT_LO_EXPR;
2173 c2 = VEC_WIDEN_MULT_HI_EXPR;
2174 c1 = VEC_WIDEN_MULT_LO_EXPR;
2179 if (BYTES_BIG_ENDIAN)
2181 c1 = VEC_UNPACK_HI_EXPR;
2182 c2 = VEC_UNPACK_LO_EXPR;
2186 c2 = VEC_UNPACK_HI_EXPR;
2187 c1 = VEC_UNPACK_LO_EXPR;
2192 if (BYTES_BIG_ENDIAN)
2194 c1 = VEC_UNPACK_FLOAT_HI_EXPR;
2195 c2 = VEC_UNPACK_FLOAT_LO_EXPR;
2199 c2 = VEC_UNPACK_FLOAT_HI_EXPR;
2200 c1 = VEC_UNPACK_FLOAT_LO_EXPR;
2204 case FIX_TRUNC_EXPR:
2205 /* ??? Not yet implemented due to missing VEC_UNPACK_FIX_TRUNC_HI_EXPR/
2206 VEC_UNPACK_FIX_TRUNC_LO_EXPR tree codes and optabs used for
2207 computing the operation. */
2214 if (code == FIX_TRUNC_EXPR)
2216 /* The signedness is determined from output operand. */
2217 optab1 = optab_for_tree_code (c1, type, optab_default);
2218 optab2 = optab_for_tree_code (c2, type, optab_default);
2222 optab1 = optab_for_tree_code (c1, vectype, optab_default);
2223 optab2 = optab_for_tree_code (c2, vectype, optab_default);
2226 if (!optab1 || !optab2)
/* Both insns must exist and produce the wide vector mode.  */
2229 vec_mode = TYPE_MODE (vectype);
2230 if ((icode1 = optab_handler (optab1, vec_mode)->insn_code) == CODE_FOR_nothing
2231 || insn_data[icode1].operand[0].mode != TYPE_MODE (wide_vectype)
2232 || (icode2 = optab_handler (optab2, vec_mode)->insn_code)
2234 || insn_data[icode2].operand[0].mode != TYPE_MODE (wide_vectype))
2243 /* Function supportable_narrowing_operation
2245 Check whether an operation represented by the code CODE is a
2246 narrowing operation that is supported by the target platform in
2247 vector form (i.e., when operating on arguments of type VECTYPE).
2249 Narrowing operations we currently support are NOP (CONVERT) and
2250 FIX_TRUNC. This function checks if these operations are supported by
2251 the target platform directly via vector tree-codes.
2254 - CODE1 is the code of a vector operation to be used when
2255 vectorizing the operation, if available. */
2258 supportable_narrowing_operation (enum tree_code code,
2259 const_tree stmt, const_tree vectype,
2260 enum tree_code *code1)
2262 enum machine_mode vec_mode;
2263 enum insn_code icode1;
2265 tree expr = GIMPLE_STMT_OPERAND (stmt, 1);
2266 tree type = TREE_TYPE (expr);
2267 tree narrow_vectype = get_vectype_for_scalar_type (type);
/* Map the scalar narrowing code to its vector pack tree-code.  */
2273 c1 = VEC_PACK_TRUNC_EXPR;
2276 case FIX_TRUNC_EXPR:
2277 c1 = VEC_PACK_FIX_TRUNC_EXPR;
2281 /* ??? Not yet implemented due to missing VEC_PACK_FLOAT_EXPR
2282 tree code and optabs used for computing the operation. */
2289 if (code == FIX_TRUNC_EXPR)
2290 /* The signedness is determined from output operand. */
2291 optab1 = optab_for_tree_code (c1, type, optab_default);
2293 optab1 = optab_for_tree_code (c1, vectype, optab_default);
/* The insn must exist and produce the narrow vector mode.  */
2298 vec_mode = TYPE_MODE (vectype);
2299 if ((icode1 = optab_handler (optab1, vec_mode)->insn_code) == CODE_FOR_nothing
2300 || insn_data[icode1].operand[0].mode != TYPE_MODE (narrow_vectype))
2308 /* Function reduction_code_for_scalar_code
2311 CODE - tree_code of a reduction operations.
2314 REDUC_CODE - the corresponding tree-code to be used to reduce the
2315 vector of partial results into a single scalar result (which
2316 will also reside in a vector).
2318 Return TRUE if a corresponding REDUC_CODE was found, FALSE otherwise. */
/* NOTE(review): the switch on CODE and its case labels/returns were
   sampled out of this extraction; only the three assignments remain.
   Presumably the cases are MAX_EXPR, MIN_EXPR and PLUS_EXPR
   respectively -- confirm against the full file.  */
2321 reduction_code_for_scalar_code (enum tree_code code,
2322 enum tree_code *reduc_code)
2327 *reduc_code = REDUC_MAX_EXPR;
2331 *reduc_code = REDUC_MIN_EXPR;
2335 *reduc_code = REDUC_PLUS_EXPR;
2344 /* Function vect_is_simple_reduction
2346 Detect a cross-iteration def-use cycle that represents a simple
2347 reduction computation. We look for the following pattern:
2352 a2 = operation (a3, a1)
2355 1. operation is commutative and associative and it is safe to
2356 change the order of the computation.
2357 2. no uses for a2 in the loop (a2 is used out of the loop)
2358 3. no uses of a1 in the loop besides the reduction operation.
2360 Condition 1 is tested here.
2361 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized. */
/* NOTE(review): lossy extraction -- braces, 'return' statements and
   several declarations (name, op_type, type, ...) are missing from this
   view.  Annotations below describe only what the visible lines show.  */
2364 vect_is_simple_reduction (loop_vec_info loop_info, tree phi)
/* LOOP is the loop containing PHI; VECT_LOOP is the loop being
   vectorized (may be an outer loop enclosing LOOP).  */
2366 struct loop *loop = (bb_for_stmt (phi))->loop_father;
2367 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
/* The value flowing into PHI around the back-edge -- the candidate
   reduction update (a2 in the pattern above).  */
2368 edge latch_e = loop_latch_edge (loop);
2369 tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
2370 tree def_stmt, def1, def2;
2371 enum tree_code code;
2373 tree operation, op1, op2;
2377 imm_use_iterator imm_iter;
2378 use_operand_p use_p;
2380 gcc_assert (loop == vect_loop || flow_loop_nested_p (vect_loop, loop));
/* Condition 3: the PHI result must not be used inside the loop other
   than by the reduction statement (pattern stmts excepted).  */
2382 name = PHI_RESULT (phi);
2384 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2386 tree use_stmt = USE_STMT (use_p);
2387 if (flow_bb_inside_loop_p (loop, bb_for_stmt (use_stmt))
2388 && vinfo_for_stmt (use_stmt)
2389 && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
2393 if (vect_print_dump_info (REPORT_DETAILS))
2394 fprintf (vect_dump, "reduction used in loop.");
/* The latch value must be an SSA name defined by a plain assignment.  */
2399 if (TREE_CODE (loop_arg) != SSA_NAME)
2401 if (vect_print_dump_info (REPORT_DETAILS))
2403 fprintf (vect_dump, "reduction: not ssa_name: ");
2404 print_generic_expr (vect_dump, loop_arg, TDF_SLIM);
2409 def_stmt = SSA_NAME_DEF_STMT (loop_arg);
2412 if (vect_print_dump_info (REPORT_DETAILS))
2413 fprintf (vect_dump, "reduction: no def_stmt.");
2417 if (TREE_CODE (def_stmt) != GIMPLE_MODIFY_STMT)
2419 if (vect_print_dump_info (REPORT_DETAILS))
2420 print_generic_expr (vect_dump, def_stmt, TDF_SLIM);
/* Condition 2: the reduction result (LHS of DEF_STMT) must not be used
   inside the loop either.  */
2424 name = GIMPLE_STMT_OPERAND (def_stmt, 0);
2426 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2428 tree use_stmt = USE_STMT (use_p);
2429 if (flow_bb_inside_loop_p (loop, bb_for_stmt (use_stmt))
2430 && vinfo_for_stmt (use_stmt)
2431 && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
2435 if (vect_print_dump_info (REPORT_DETAILS))
2436 fprintf (vect_dump, "reduction used in loop.");
/* Condition 1: the operation must be commutative and associative so
   that the order of partial sums may be changed.  */
2441 operation = GIMPLE_STMT_OPERAND (def_stmt, 1);
2442 code = TREE_CODE (operation);
2443 if (!commutative_tree_code (code) || !associative_tree_code (code))
2445 if (vect_print_dump_info (REPORT_DETAILS))
2447 fprintf (vect_dump, "reduction: not commutative/associative: ");
2448 print_generic_expr (vect_dump, operation, TDF_SLIM);
/* Only binary operations are handled.  */
2453 op_type = TREE_OPERAND_LENGTH (operation);
2454 if (op_type != binary_op)
2456 if (vect_print_dump_info (REPORT_DETAILS))
2458 fprintf (vect_dump, "reduction: not binary operation: ");
2459 print_generic_expr (vect_dump, operation, TDF_SLIM);
/* Both operands must be SSA names so we can chase their defs.  */
2464 op1 = TREE_OPERAND (operation, 0);
2465 op2 = TREE_OPERAND (operation, 1);
2466 if (TREE_CODE (op1) != SSA_NAME || TREE_CODE (op2) != SSA_NAME)
2468 if (vect_print_dump_info (REPORT_DETAILS))
2470 fprintf (vect_dump, "reduction: uses not ssa_names: ");
2471 print_generic_expr (vect_dump, operation, TDF_SLIM);
2476 /* Check that it's ok to change the order of the computation. */
2477 type = TREE_TYPE (operation);
2478 if (TYPE_MAIN_VARIANT (type) != TYPE_MAIN_VARIANT (TREE_TYPE (op1))
2479 || TYPE_MAIN_VARIANT (type) != TYPE_MAIN_VARIANT (TREE_TYPE (op2)))
2481 if (vect_print_dump_info (REPORT_DETAILS))
2483 fprintf (vect_dump, "reduction: multiple types: operation type: ");
2484 print_generic_expr (vect_dump, type, TDF_SLIM)
2485 fprintf (vect_dump, ", operands types: ");
2486 print_generic_expr (vect_dump, TREE_TYPE (op1), TDF_SLIM);
2487 fprintf (vect_dump, ",");
2488 print_generic_expr (vect_dump, TREE_TYPE (op2), TDF_SLIM);
2493 /* Generally, when vectorizing a reduction we change the order of the
2494 computation. This may change the behavior of the program in some
2495 cases, so we need to check that this is ok. One exception is when
2496 vectorizing an outer-loop: the inner-loop is executed sequentially,
2497 and therefore vectorizing reductions in the inner-loop during
2498 outer-loop vectorization is safe. */
2500 /* CHECKME: check for !flag_finite_math_only too? */
/* FP reductions require -fassociative-math (unless nested in an
   outer-loop vectorization, which keeps inner execution sequential).  */
2501 if (SCALAR_FLOAT_TYPE_P (type) && !flag_associative_math
2502 && !nested_in_vect_loop_p (vect_loop, def_stmt))
2504 /* Changing the order of operations changes the semantics. */
2505 if (vect_print_dump_info (REPORT_DETAILS))
2507 fprintf (vect_dump, "reduction: unsafe fp math optimization: ");
2508 print_generic_expr (vect_dump, operation, TDF_SLIM);
/* Trapping integer overflow (-ftrapv) likewise makes reordering
   observable.  */
2512 else if (INTEGRAL_TYPE_P (type) && TYPE_OVERFLOW_TRAPS (type)
2513 && !nested_in_vect_loop_p (vect_loop, def_stmt))
2515 /* Changing the order of operations changes the semantics. */
2516 if (vect_print_dump_info (REPORT_DETAILS))
2518 fprintf (vect_dump, "reduction: unsafe int math optimization: ");
2519 print_generic_expr (vect_dump, operation, TDF_SLIM);
/* Saturating fixed-point arithmetic is never reorderable.  */
2523 else if (SAT_FIXED_POINT_TYPE_P (type))
2525 /* Changing the order of operations changes the semantics. */
2526 if (vect_print_dump_info (REPORT_DETAILS))
2528 fprintf (vect_dump, "reduction: unsafe fixed-point math optimization: ");
2529 print_generic_expr (vect_dump, operation, TDF_SLIM);
2534 /* reduction is safe. we're dealing with one of the following:
2535 1) integer arithmetic and no trapv
2536 2) floating point arithmetic, and special flags permit this optimization.
2538 def1 = SSA_NAME_DEF_STMT (op1);
2539 def2 = SSA_NAME_DEF_STMT (op2);
2540 if (!def1 || !def2 || IS_EMPTY_STMT (def1) || IS_EMPTY_STMT (def2))
2542 if (vect_print_dump_info (REPORT_DETAILS))
2544 fprintf (vect_dump, "reduction: no defs for operands: ");
2545 print_generic_expr (vect_dump, operation, TDF_SLIM);
2551 /* Check that one def is the reduction def, defined by PHI,
2552 the other def is either defined in the loop ("vect_loop_def"),
2553 or it's an induction (defined by a loop-header phi-node). */
/* NOTE(review): the condition's first line (presumably 'if (def2 == phi')
   was sampled out before line 2556.  */
2556 && flow_bb_inside_loop_p (loop, bb_for_stmt (def1))
2557 && (TREE_CODE (def1) == GIMPLE_MODIFY_STMT
2558 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1)) == vect_induction_def
2559 || (TREE_CODE (def1) == PHI_NODE
2560 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1)) == vect_loop_def
2561 && !is_loop_header_bb_p (bb_for_stmt (def1)))))
2563 if (vect_print_dump_info (REPORT_DETAILS))
2565 fprintf (vect_dump, "detected reduction:");
2566 print_generic_expr (vect_dump, operation, TDF_SLIM);
/* Mirror case: PHI defines the first operand; swap so the reduction
   variable is always the second operand.  */
2570 else if (def1 == phi
2571 && flow_bb_inside_loop_p (loop, bb_for_stmt (def2))
2572 && (TREE_CODE (def2) == GIMPLE_MODIFY_STMT
2573 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2)) == vect_induction_def
2574 || (TREE_CODE (def2) == PHI_NODE
2575 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2)) == vect_loop_def
2576 && !is_loop_header_bb_p (bb_for_stmt (def2)))))
2578 /* Swap operands (just for simplicity - so that the rest of the code
2579 can assume that the reduction variable is always the last (second)
2581 if (vect_print_dump_info (REPORT_DETAILS))
2583 fprintf (vect_dump, "detected reduction: need to swap operands:");
2584 print_generic_expr (vect_dump, operation, TDF_SLIM);
2586 swap_tree_operands (def_stmt, &TREE_OPERAND (operation, 0),
2587 &TREE_OPERAND (operation, 1));
/* Neither shape matched: not a recognizable reduction cycle.  */
2592 if (vect_print_dump_info (REPORT_DETAILS))
2594 fprintf (vect_dump, "reduction: unknown pattern.");
2595 print_generic_expr (vect_dump, operation, TDF_SLIM);
2602 /* Function vect_is_simple_iv_evolution.
2604 FORNOW: A simple evolution of an induction variables in the loop is
2605 considered a polynomial evolution with constant step. */
/* NOTE(review): lossy extraction -- the trailing parameter(s) of the
   signature (after INIT), local declarations of step_expr/init_expr,
   the stores through the out-parameters and the returns are missing
   from this view.  */
2608 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
/* The per-iteration increment of ACCESS_FN within loop LOOP_NB.  */
2613 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
2615 /* When there is no evolution in this loop, the evolution function
2617 if (evolution_part == NULL_TREE)
2620 /* When the evolution is a polynomial of degree >= 2
2621 the evolution function is not "simple". */
2622 if (tree_is_chrec (evolution_part))
2625 step_expr = evolution_part;
2626 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
2628 if (vect_print_dump_info (REPORT_DETAILS))
2630 fprintf (vect_dump, "step: ");
2631 print_generic_expr (vect_dump, step_expr, TDF_SLIM);
2632 fprintf (vect_dump, ", init: ");
2633 print_generic_expr (vect_dump, init_expr, TDF_SLIM);
/* Only constant integer steps qualify as "simple".  */
2639 if (TREE_CODE (step_expr) != INTEGER_CST)
2641 if (vect_print_dump_info (REPORT_DETAILS))
2642 fprintf (vect_dump, "step unknown.");
2650 /* Function vectorize_loops.
2652 Entry Point to loop vectorization phase. */
/* NOTE(review): lossy extraction -- the return type, declarations of
   i/li/loop, braces and some early-exit/continue lines are missing
   from this view.  */
2655 vectorize_loops (void)
2658 unsigned int num_vectorized_loops = 0;
2659 unsigned int vect_loops_num;
2663 vect_loops_num = number_of_loops ();
2665 /* Bail out if there are no loops. */
/* number_of_loops() counts the dummy outermost "loop 0", hence <= 1.  */
2666 if (vect_loops_num <= 1)
2669 /* Fix the verbosity level if not defined explicitly by the user. */
2670 vect_set_dump_settings ();
2672 /* Allocate the bitmap that records which virtual variables that
2673 need to be renamed. */
2674 vect_memsyms_to_rename = BITMAP_ALLOC (NULL);
2676 /* ----------- Analyze loops. ----------- */
2678 /* If some loop was duplicated, it gets bigger number
2679 than all previously defined loops. This fact allows us to run
2680 only over initial loops skipping newly generated ones. */
2681 FOR_EACH_LOOP (li, loop, 0)
2683 loop_vec_info loop_vinfo;
2685 vect_loop_location = find_loop_location (loop);
/* Analyze; stash the result in loop->aux so the finalize pass below
   can find and destroy it.  */
2686 loop_vinfo = vect_analyze_loop (loop);
2687 loop->aux = loop_vinfo;
2689 if (!loop_vinfo || !LOOP_VINFO_VECTORIZABLE_P (loop_vinfo))
2692 vect_transform_loop (loop_vinfo);
2693 num_vectorized_loops++;
2695 vect_loop_location = UNKNOWN_LOC;
2697 statistics_counter_event (cfun, "Vectorized loops", num_vectorized_loops);
2698 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOOPS)
2699 || (vect_print_dump_info (REPORT_VECTORIZED_LOOPS)
2700 && num_vectorized_loops > 0))
2701 fprintf (vect_dump, "vectorized %u loops in function.\n",
2702 num_vectorized_loops);
2704 /* ----------- Finalize. ----------- */
2706 BITMAP_FREE (vect_memsyms_to_rename);
/* Free the per-loop analysis data stored in loop->aux (skip loop 0).  */
2708 for (i = 1; i < vect_loops_num; i++)
2710 loop_vec_info loop_vinfo;
2712 loop = get_loop (i);
2715 loop_vinfo = (loop_vec_info) loop->aux;
2716 destroy_loop_vec_info (loop_vinfo, true);
/* Request CFG cleanup only if something was actually transformed.  */
2720 return num_vectorized_loops > 0 ? TODO_cleanup_cfg : 0;
2723 /* Increase alignment of global arrays to improve vectorization potential.
2725 - Consider also structs that have an array field.
2726 - Use ipa analysis to prune arrays that can't be vectorized?
2727 This should involve global alignment analysis and in the future also
/* NOTE(review): lossy extraction -- the return type, braces, the loop
   condition at line ~2737, NULL-checks and 'continue' statements, and
   the dump_file guard are missing from this view.  */
2731 increase_alignment (void)
2733 struct varpool_node *vnode;
2735 /* Increase the alignment of all global arrays for vectorization. */
2736 for (vnode = varpool_nodes_queue;
2738 vnode = vnode->next_needed)
2740 tree vectype, decl = vnode->decl;
2741 unsigned int alignment;
/* Only plain global arrays are considered (see TODO above re structs).  */
2743 if (TREE_CODE (TREE_TYPE (decl)) != ARRAY_TYPE)
/* Vector type for the array's element type; determines the target
   alignment.  */
2745 vectype = get_vectype_for_scalar_type (TREE_TYPE (TREE_TYPE (decl)));
2748 alignment = TYPE_ALIGN (vectype);
/* Already sufficiently aligned -- nothing to do.  */
2749 if (DECL_ALIGN (decl) >= alignment)
2752 if (vect_can_force_dr_alignment_p (decl, alignment))
2754 DECL_ALIGN (decl) = TYPE_ALIGN (vectype);
/* Mark as user-aligned so later passes don't shrink it again.  */
2755 DECL_USER_ALIGN (decl) = 1;
2758 fprintf (dump_file, "Increasing alignment of decl: ");
2759 print_generic_expr (dump_file, decl, TDF_SLIM);
/* Gate for the IPA increase-alignment pass: run only when section
   anchors and tree vectorization are both enabled.  (Return type and
   braces were sampled out of this extraction.)  */
2767 gate_increase_alignment (void)
2769 return flag_section_anchors && flag_tree_vectorize;
2772 struct simple_ipa_opt_pass pass_ipa_increase_alignment =
2776 "increase_alignment", /* name */
2777 gate_increase_alignment, /* gate */
2778 increase_alignment, /* execute */
2781 0, /* static_pass_number */
2783 0, /* properties_required */
2784 0, /* properties_provided */
2785 0, /* properties_destroyed */
2786 0, /* todo_flags_start */
2787 0 /* todo_flags_finish */