gcc/tree-vectorizer.c

   1 /* Loop Vectorization
   2    Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008 Free Software
   3    Foundation, Inc.
   4    Contributed by Dorit Naishlos <dorit@il.ibm.com>
   5
   6 This file is part of GCC.
   7
   8 GCC is free software; you can redistribute it and/or modify it under
   9 the terms of the GNU General Public License as published by the Free
  10 Software Foundation; either version 3, or (at your option) any later
  11 version.
  12
  13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  16 for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with GCC; see the file COPYING3.  If not see
  20 <http://www.gnu.org/licenses/>.  */
  21
  22 /* Loop Vectorization Pass.
  23
  24    This pass tries to vectorize loops. This first implementation focuses on
  25    simple inner-most loops, with no conditional control flow, and a set of
  26    simple operations which vector form can be expressed using existing
  27    tree codes (PLUS, MULT etc).
  28
  29    For example, the vectorizer transforms the following simple loop:
  30
  31         short a[N]; short b[N]; short c[N]; int i;
  32
  33         for (i=0; i<N; i++){
  34           a[i] = b[i] + c[i];
  35         }
  36
  37    as if it was manually vectorized by rewriting the source code into:
  38
  39         typedef int __attribute__((mode(V8HI))) v8hi;
  40         short a[N];  short b[N]; short c[N];   int i;
  41         v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
  42         v8hi va, vb, vc;
  43
  44         for (i=0; i<N/8; i++){
  45           vb = pb[i];
  46           vc = pc[i];
  47           va = vb + vc;
  48           pa[i] = va;
  49         }
  50
  51         The main entry to this pass is vectorize_loops(), in which
  52    the vectorizer applies a set of analyses on a given set of loops,
  53    followed by the actual vectorization transformation for the loops that
  54    had successfully passed the analysis phase.
  55
  56         Throughout this pass we make a distinction between two types of
  57    data: scalars (which are represented by SSA_NAMES), and memory references
  58    ("data-refs"). These two types of data require different handling both
  59    during analysis and transformation. The types of data-refs that the
  60    vectorizer currently supports are ARRAY_REFS which base is an array DECL
  61    (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
  62    accesses are required to have a  simple (consecutive) access pattern.
  63
  64    Analysis phase:
  65    ===============
  66         The driver for the analysis phase is vect_analyze_loop_nest().
  67    It applies a set of analyses, some of which rely on the scalar evolution
  68    analyzer (scev) developed by Sebastian Pop.
  69
  70         During the analysis phase the vectorizer records some information
  71    per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
  72    loop, as well as general information about the loop as a whole, which is
  73    recorded in a "loop_vec_info" struct attached to each loop.
  74
  75    Transformation phase:
  76    =====================
  77         The loop transformation phase scans all the stmts in the loop, and
  78    creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
  79    the loop that needs to be vectorized. It insert the vector code sequence
  80    just before the scalar stmt S, and records a pointer to the vector code
  81    in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
  82    attached to S). This pointer will be used for the vectorization of following
  83    stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
  84    otherwise, we rely on dead code elimination for removing it.
  85
  86         For example, say stmt S1 was vectorized into stmt VS1:
  87
  88    VS1: vb = px[i];
  89    S1:  b = x[i];    STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
  90    S2:  a = b;
  91
  92    To vectorize stmt S2, the vectorizer first finds the stmt that defines
  93    the operand 'b' (S1), and gets the relevant vector def 'vb' from the
  94    vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
  95    resulting sequence would be:
  96
  97    VS1: vb = px[i];
  98    S1:  b = x[i];       STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
  99    VS2: va = vb;
 100    S2:  a = b;          STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
 101
 102         Operands that are not SSA_NAMEs, are data-refs that appear in
 103    load/store operations (like 'x[i]' in S1), and are handled differently.
 104
 105    Target modeling:
 106    =================
 107         Currently the only target specific information that is used is the
 108    size of the vector (in bytes) - "UNITS_PER_SIMD_WORD". Targets that can
 109    support different sizes of vectors, for now will need to specify one value
 110    for "UNITS_PER_SIMD_WORD". More flexibility will be added in the future.
 111
 112         Since we only vectorize operations which vector form can be
 113    expressed using existing tree codes, to verify that an operation is
 114    supported, the vectorizer checks the relevant optab at the relevant
 115    machine_mode (e.g, optab_handler (add_optab, V8HImode)->insn_code). If
 116    the value found is CODE_FOR_nothing, then there's no target support, and
 117    we can't vectorize the stmt.
 118
 119    For additional information on this project see:
 120    http://gcc.gnu.org/projects/tree-ssa/vectorization.html
 121 */
 122
 123 #include "config.h"
 124 #include "system.h"
 125 #include "coretypes.h"
 126 #include "tm.h"
 127 #include "ggc.h"
 128 #include "tree.h"
 129 #include "target.h"
 130 #include "rtl.h"
 131 #include "basic-block.h"
 132 #include "diagnostic.h"
 133 #include "tree-flow.h"
 134 #include "tree-dump.h"
 135 #include "timevar.h"
 136 #include "cfgloop.h"
 137 #include "cfglayout.h"
 138 #include "expr.h"
 139 #include "recog.h"
 140 #include "optabs.h"
 141 #include "params.h"
 142 #include "toplev.h"
 143 #include "tree-chrec.h"
 144 #include "tree-data-ref.h"
 145 #include "tree-scalar-evolution.h"
 146 #include "input.h"
 147 #include "hashtab.h"
 148 #include "tree-vectorizer.h"
 149 #include "tree-pass.h"
 150 #include "langhooks.h"
 151
 152 /*************************************************************************
 153   General Vectorization Utilities
 154  *************************************************************************/
 155
 156 /* vect_dump will be set to stderr or dump_file if exist.  */
 157 FILE *vect_dump;
 158
 159 /* vect_verbosity_level set to an invalid value
 160    to mark that it's uninitialized.  */
 161 enum verbosity_levels vect_verbosity_level = MAX_VERBOSITY_LEVEL;
 162
 163 /* Loop location.  */
 164 static LOC vect_loop_location;
 165
 166 /* Bitmap of virtual variables to be renamed.  */
 167 bitmap vect_memsyms_to_rename;
 168
 169 /* Vector mapping GIMPLE stmt to stmt_vec_info. */
 170 VEC(vec_void_p,heap) *stmt_vec_info_vec;
 171
 172 \f
 173 /*************************************************************************
 174   Simple Loop Peeling Utilities
 175
 176   Utilities to support loop peeling for vectorization purposes.
 177  *************************************************************************/
 178
 179
 180 /* Renames the use *OP_P.  */
 181
 182 static void
 183 rename_use_op (use_operand_p op_p)
 184 {
 185   tree new_name;
 186
 187   if (TREE_CODE (USE_FROM_PTR (op_p)) != SSA_NAME)
 188     return;
 189
 190   new_name = get_current_def (USE_FROM_PTR (op_p));
 191
 192   /* Something defined outside of the loop.  */
 193   if (!new_name)
 194     return;
 195
 196   /* An ordinary ssa name defined in the loop.  */
 197
 198   SET_USE (op_p, new_name);
 199 }
 200
 201
 202 /* Renames the variables in basic block BB.  */
 203
 204 static void
 205 rename_variables_in_bb (basic_block bb)
 206 {
 207   gimple_stmt_iterator gsi;
 208   gimple stmt;
 209   use_operand_p use_p;
 210   ssa_op_iter iter;
 211   edge e;
 212   edge_iterator ei;
 213   struct loop *loop = bb->loop_father;
 214
 215   for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
 216     {
 217       stmt = gsi_stmt (gsi);
 218       FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_ALL_USES)
 219         rename_use_op (use_p);
 220     }
 221
 222   FOR_EACH_EDGE (e, ei, bb->succs)
 223     {
 224       if (!flow_bb_inside_loop_p (loop, e->dest))
 225         continue;
 226       for (gsi = gsi_start_phis (e->dest); !gsi_end_p (gsi); gsi_next (&gsi))
 227         rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi_stmt (gsi), e));
 228     }
 229 }
 230
 231
 232 /* Renames variables in new generated LOOP.  */
 233
 234 void
 235 rename_variables_in_loop (struct loop *loop)
 236 {
 237   unsigned i;
 238   basic_block *bbs;
 239
 240   bbs = get_loop_body (loop);
 241
 242   for (i = 0; i < loop->num_nodes; i++)
 243     rename_variables_in_bb (bbs[i]);
 244
 245   free (bbs);
 246 }
 247
 248
 249 /* Update the PHI nodes of NEW_LOOP.
 250
 251    NEW_LOOP is a duplicate of ORIG_LOOP.
 252    AFTER indicates whether NEW_LOOP executes before or after ORIG_LOOP:
 253    AFTER is true if NEW_LOOP executes after ORIG_LOOP, and false if it
 254    executes before it.  */
 255
 256 static void
 257 slpeel_update_phis_for_duplicate_loop (struct loop *orig_loop,
 258                                        struct loop *new_loop, bool after)
 259 {
 260   tree new_ssa_name;
 261   gimple phi_new, phi_orig;
 262   tree def;
 263   edge orig_loop_latch = loop_latch_edge (orig_loop);
 264   edge orig_entry_e = loop_preheader_edge (orig_loop);
 265   edge new_loop_exit_e = single_exit (new_loop);
 266   edge new_loop_entry_e = loop_preheader_edge (new_loop);
 267   edge entry_arg_e = (after ? orig_loop_latch : orig_entry_e);
 268   gimple_stmt_iterator gsi_new, gsi_orig;
 269
 270   /*
 271      step 1. For each loop-header-phi:
 272              Add the first phi argument for the phi in NEW_LOOP
 273             (the one associated with the entry of NEW_LOOP)
 274
 275      step 2. For each loop-header-phi:
 276              Add the second phi argument for the phi in NEW_LOOP
 277             (the one associated with the latch of NEW_LOOP)
 278
 279      step 3. Update the phis in the successor block of NEW_LOOP.
 280
 281         case 1: NEW_LOOP was placed before ORIG_LOOP:
 282                 The successor block of NEW_LOOP is the header of ORIG_LOOP.
 283                 Updating the phis in the successor block can therefore be done
 284                 along with the scanning of the loop header phis, because the
 285                 header blocks of ORIG_LOOP and NEW_LOOP have exactly the same
 286                 phi nodes, organized in the same order.
 287
 288         case 2: NEW_LOOP was placed after ORIG_LOOP:
 289                 The successor block of NEW_LOOP is the original exit block of
 290                 ORIG_LOOP - the phis to be updated are the loop-closed-ssa phis.
 291                 We postpone updating these phis to a later stage (when
 292                 loop guards are added).
 293    */
 294
 295
 296   /* Scan the phis in the headers of the old and new loops
 297      (they are organized in exactly the same order).  */
 298
 299   for (gsi_new = gsi_start_phis (new_loop->header),
 300        gsi_orig = gsi_start_phis (orig_loop->header);
 301        !gsi_end_p (gsi_new) && !gsi_end_p (gsi_orig);
 302        gsi_next (&gsi_new), gsi_next (&gsi_orig))
 303     {
 304       phi_new = gsi_stmt (gsi_new);
 305       phi_orig = gsi_stmt (gsi_orig);
 306
 307       /* step 1.  */
 308       def = PHI_ARG_DEF_FROM_EDGE (phi_orig, entry_arg_e);
 309       add_phi_arg (phi_new, def, new_loop_entry_e);
 310
 311       /* step 2.  */
 312       def = PHI_ARG_DEF_FROM_EDGE (phi_orig, orig_loop_latch);
 313       if (TREE_CODE (def) != SSA_NAME)
 314         continue;
 315
 316       new_ssa_name = get_current_def (def);
 317       if (!new_ssa_name)
 318         {
 319           /* This only happens if there are no definitions
 320              inside the loop. use the phi_result in this case.  */
 321           new_ssa_name = PHI_RESULT (phi_new);
 322         }
 323
 324       /* An ordinary ssa name defined in the loop.  */
 325       add_phi_arg (phi_new, new_ssa_name, loop_latch_edge (new_loop));
 326
 327       /* step 3 (case 1).  */
 328       if (!after)
 329         {
 330           gcc_assert (new_loop_exit_e == orig_entry_e);
 331           SET_PHI_ARG_DEF (phi_orig,
 332                            new_loop_exit_e->dest_idx,
 333                            new_ssa_name);
 334         }
 335     }
 336 }
 337
 338
 339 /* Update PHI nodes for a guard of the LOOP.
 340
 341    Input:
 342    - LOOP, GUARD_EDGE: LOOP is a loop for which we added guard code that
 343         controls whether LOOP is to be executed.  GUARD_EDGE is the edge that
 344         originates from the guard-bb, skips LOOP and reaches the (unique) exit
 345         bb of LOOP.  This loop-exit-bb is an empty bb with one successor.
 346         We denote this bb NEW_MERGE_BB because before the guard code was added
 347         it had a single predecessor (the LOOP header), and now it became a merge
 348         point of two paths - the path that ends with the LOOP exit-edge, and
 349         the path that ends with GUARD_EDGE.
 350    - NEW_EXIT_BB: New basic block that is added by this function between LOOP
 351         and NEW_MERGE_BB. It is used to place loop-closed-ssa-form exit-phis.
 352
 353    ===> The CFG before the guard-code was added:
 354         LOOP_header_bb:
 355           loop_body
 356           if (exit_loop) goto update_bb
 357           else           goto LOOP_header_bb
 358         update_bb:
 359
 360    ==> The CFG after the guard-code was added:
 361         guard_bb:
 362           if (LOOP_guard_condition) goto new_merge_bb
 363           else                      goto LOOP_header_bb
 364         LOOP_header_bb:
 365           loop_body
 366           if (exit_loop_condition) goto new_merge_bb
 367           else                     goto LOOP_header_bb
 368         new_merge_bb:
 369           goto update_bb
 370         update_bb:
 371
 372    ==> The CFG after this function:
 373         guard_bb:
 374           if (LOOP_guard_condition) goto new_merge_bb
 375           else                      goto LOOP_header_bb
 376         LOOP_header_bb:
 377           loop_body
 378           if (exit_loop_condition) goto new_exit_bb
 379           else                     goto LOOP_header_bb
 380         new_exit_bb:
 381         new_merge_bb:
 382           goto update_bb
 383         update_bb:
 384
 385    This function:
 386    1. creates and updates the relevant phi nodes to account for the new
 387       incoming edge (GUARD_EDGE) into NEW_MERGE_BB. This involves:
 388       1.1. Create phi nodes at NEW_MERGE_BB.
 389       1.2. Update the phi nodes at the successor of NEW_MERGE_BB (denoted
 390            UPDATE_BB).  UPDATE_BB was the exit-bb of LOOP before NEW_MERGE_BB
 391    2. preserves loop-closed-ssa-form by creating the required phi nodes
 392       at the exit of LOOP (i.e, in NEW_EXIT_BB).
 393
 394    There are two flavors to this function:
 395
 396    slpeel_update_phi_nodes_for_guard1:
 397      Here the guard controls whether we enter or skip LOOP, where LOOP is a
 398      prolog_loop (loop1 below), and the new phis created in NEW_MERGE_BB are
 399      for variables that have phis in the loop header.
 400
 401    slpeel_update_phi_nodes_for_guard2:
 402      Here the guard controls whether we enter or skip LOOP, where LOOP is an
 403      epilog_loop (loop2 below), and the new phis created in NEW_MERGE_BB are
 404      for variables that have phis in the loop exit.
 405
 406    I.E., the overall structure is:
 407
 408         loop1_preheader_bb:
 409                 guard1 (goto loop1/merge1_bb)
 410         loop1
 411         loop1_exit_bb:
 412                 guard2 (goto merge1_bb/merge2_bb)
 413         merge1_bb
 414         loop2
 415         loop2_exit_bb
 416         merge2_bb
 417         next_bb
 418
 419    slpeel_update_phi_nodes_for_guard1 takes care of creating phis in
 420    loop1_exit_bb and merge1_bb. These are entry phis (phis for the vars
 421    that have phis in loop1->header).
 422
 423    slpeel_update_phi_nodes_for_guard2 takes care of creating phis in
 424    loop2_exit_bb and merge2_bb. These are exit phis (phis for the vars
 425    that have phis in next_bb). It also adds some of these phis to
 426    loop1_exit_bb.
 427
 428    slpeel_update_phi_nodes_for_guard1 is always called before
 429    slpeel_update_phi_nodes_for_guard2. They are both needed in order
 430    to create correct data-flow and loop-closed-ssa-form.
 431
 432    Generally slpeel_update_phi_nodes_for_guard1 creates phis for variables
 433    that change between iterations of a loop (and therefore have a phi-node
 434    at the loop entry), whereas slpeel_update_phi_nodes_for_guard2 creates
 435    phis for variables that are used out of the loop (and therefore have
 436    loop-closed exit phis). Some variables may be both updated between
 437    iterations and used after the loop. This is why in loop1_exit_bb we
 438    may need both entry_phis (created by slpeel_update_phi_nodes_for_guard1)
 439    and exit phis (created by slpeel_update_phi_nodes_for_guard2).
 440
 441    - IS_NEW_LOOP: if IS_NEW_LOOP is true, then LOOP is a newly created copy of
 442      an original loop. i.e., we have:
 443
 444            orig_loop
 445            guard_bb (goto LOOP/new_merge)
 446            new_loop <-- LOOP
 447            new_exit
 448            new_merge
 449            next_bb
 450
 451      If IS_NEW_LOOP is false, then LOOP is an original loop, in which case we
 452      have:
 453
 454            new_loop
 455            guard_bb (goto LOOP/new_merge)
 456            orig_loop <-- LOOP
 457            new_exit
 458            new_merge
 459            next_bb
 460
 461      The SSA names defined in the original loop have a current
 462      reaching definition that that records the corresponding new
 463      ssa-name used in the new duplicated loop copy.
 464   */
 465
 466 /* Function slpeel_update_phi_nodes_for_guard1
 467
 468    Input:
 469    - GUARD_EDGE, LOOP, IS_NEW_LOOP, NEW_EXIT_BB - as explained above.
 470    - DEFS - a bitmap of ssa names to mark new names for which we recorded
 471             information.
 472
 473    In the context of the overall structure, we have:
 474
 475         loop1_preheader_bb:
 476                 guard1 (goto loop1/merge1_bb)
 477 LOOP->  loop1
 478         loop1_exit_bb:
 479                 guard2 (goto merge1_bb/merge2_bb)
 480         merge1_bb
 481         loop2
 482         loop2_exit_bb
 483         merge2_bb
 484         next_bb
 485
 486    For each name updated between loop iterations (i.e - for each name that has
 487    an entry (loop-header) phi in LOOP) we create a new phi in:
 488    1. merge1_bb (to account for the edge from guard1)
 489    2. loop1_exit_bb (an exit-phi to keep LOOP in loop-closed form)
 490 */
 491
 492 static void
 493 slpeel_update_phi_nodes_for_guard1 (edge guard_edge, struct loop *loop,
 494                                     bool is_new_loop, basic_block *new_exit_bb,
 495                                     bitmap *defs)
 496 {
 497   gimple orig_phi, new_phi;
 498   gimple update_phi, update_phi2;
 499   tree guard_arg, loop_arg;
 500   basic_block new_merge_bb = guard_edge->dest;
 501   edge e = EDGE_SUCC (new_merge_bb, 0);
 502   basic_block update_bb = e->dest;
 503   basic_block orig_bb = loop->header;
 504   edge new_exit_e;
 505   tree current_new_name;
 506   tree name;
 507   gimple_stmt_iterator gsi_orig, gsi_update;
 508
 509   /* Create new bb between loop and new_merge_bb.  */
 510   *new_exit_bb = split_edge (single_exit (loop));
 511
 512   new_exit_e = EDGE_SUCC (*new_exit_bb, 0);
 513
 514   for (gsi_orig = gsi_start_phis (orig_bb),
 515        gsi_update = gsi_start_phis (update_bb);
 516        !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_update);
 517        gsi_next (&gsi_orig), gsi_next (&gsi_update))
 518     {
 519       orig_phi = gsi_stmt (gsi_orig);
 520       update_phi = gsi_stmt (gsi_update);
 521
 522       /* Virtual phi; Mark it for renaming. We actually want to call
 523          mar_sym_for_renaming, but since all ssa renaming datastructures
 524          are going to be freed before we get to call ssa_update, we just
 525          record this name for now in a bitmap, and will mark it for
 526          renaming later.  */
 527       name = PHI_RESULT (orig_phi);
 528       if (!is_gimple_reg (SSA_NAME_VAR (name)))
 529         bitmap_set_bit (vect_memsyms_to_rename, DECL_UID (SSA_NAME_VAR (name)));
 530
 531       /** 1. Handle new-merge-point phis  **/
 532
 533       /* 1.1. Generate new phi node in NEW_MERGE_BB:  */
 534       new_phi = create_phi_node (SSA_NAME_VAR (PHI_RESULT (orig_phi)),
 535                                  new_merge_bb);
 536
 537       /* 1.2. NEW_MERGE_BB has two incoming edges: GUARD_EDGE and the exit-edge
 538             of LOOP. Set the two phi args in NEW_PHI for these edges:  */
 539       loop_arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, EDGE_SUCC (loop->latch, 0));
 540       guard_arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, loop_preheader_edge (loop));
 541
 542       add_phi_arg (new_phi, loop_arg, new_exit_e);
 543       add_phi_arg (new_phi, guard_arg, guard_edge);
 544
 545       /* 1.3. Update phi in successor block.  */
 546       gcc_assert (PHI_ARG_DEF_FROM_EDGE (update_phi, e) == loop_arg
 547                   || PHI_ARG_DEF_FROM_EDGE (update_phi, e) == guard_arg);
 548       SET_PHI_ARG_DEF (update_phi, e->dest_idx, PHI_RESULT (new_phi));
 549       update_phi2 = new_phi;
 550
 551
 552       /** 2. Handle loop-closed-ssa-form phis  **/
 553
 554       if (!is_gimple_reg (PHI_RESULT (orig_phi)))
 555         continue;
 556
 557       /* 2.1. Generate new phi node in NEW_EXIT_BB:  */
 558       new_phi = create_phi_node (SSA_NAME_VAR (PHI_RESULT (orig_phi)),
 559                                  *new_exit_bb);
 560
 561       /* 2.2. NEW_EXIT_BB has one incoming edge: the exit-edge of the loop.  */
 562       add_phi_arg (new_phi, loop_arg, single_exit (loop));
 563
 564       /* 2.3. Update phi in successor of NEW_EXIT_BB:  */
 565       gcc_assert (PHI_ARG_DEF_FROM_EDGE (update_phi2, new_exit_e) == loop_arg);
 566       SET_PHI_ARG_DEF (update_phi2, new_exit_e->dest_idx, PHI_RESULT (new_phi));
 567
 568       /* 2.4. Record the newly created name with set_current_def.
 569          We want to find a name such that
 570                 name = get_current_def (orig_loop_name)
 571          and to set its current definition as follows:
 572                 set_current_def (name, new_phi_name)
 573
 574          If LOOP is a new loop then loop_arg is already the name we're
 575          looking for. If LOOP is the original loop, then loop_arg is
 576          the orig_loop_name and the relevant name is recorded in its
 577          current reaching definition.  */
 578       if (is_new_loop)
 579         current_new_name = loop_arg;
 580       else
 581         {
 582           current_new_name = get_current_def (loop_arg);
 583           /* current_def is not available only if the variable does not
 584              change inside the loop, in which case we also don't care
 585              about recording a current_def for it because we won't be
 586              trying to create loop-exit-phis for it.  */
 587           if (!current_new_name)
 588             continue;
 589         }
 590       gcc_assert (get_current_def (current_new_name) == NULL_TREE);
 591
 592       set_current_def (current_new_name, PHI_RESULT (new_phi));
 593       bitmap_set_bit (*defs, SSA_NAME_VERSION (current_new_name));
 594     }
 595 }
 596
 597
 598 /* Function slpeel_update_phi_nodes_for_guard2
 599
 600    Input:
 601    - GUARD_EDGE, LOOP, IS_NEW_LOOP, NEW_EXIT_BB - as explained above.
 602
 603    In the context of the overall structure, we have:
 604
 605         loop1_preheader_bb:
 606                 guard1 (goto loop1/merge1_bb)
 607         loop1
 608         loop1_exit_bb:
 609                 guard2 (goto merge1_bb/merge2_bb)
 610         merge1_bb
 611 LOOP->  loop2
 612         loop2_exit_bb
 613         merge2_bb
 614         next_bb
 615
 616    For each name used out side the loop (i.e - for each name that has an exit
 617    phi in next_bb) we create a new phi in:
 618    1. merge2_bb (to account for the edge from guard_bb)
 619    2. loop2_exit_bb (an exit-phi to keep LOOP in loop-closed form)
 620    3. guard2 bb (an exit phi to keep the preceding loop in loop-closed form),
 621       if needed (if it wasn't handled by slpeel_update_phis_nodes_for_phi1).
 622 */
 623
 624 static void
 625 slpeel_update_phi_nodes_for_guard2 (edge guard_edge, struct loop *loop,
 626                                     bool is_new_loop, basic_block *new_exit_bb)
 627 {
 628   gimple orig_phi, new_phi;
 629   gimple update_phi, update_phi2;
 630   tree guard_arg, loop_arg;
 631   basic_block new_merge_bb = guard_edge->dest;
 632   edge e = EDGE_SUCC (new_merge_bb, 0);
 633   basic_block update_bb = e->dest;
 634   edge new_exit_e;
 635   tree orig_def, orig_def_new_name;
 636   tree new_name, new_name2;
 637   tree arg;
 638   gimple_stmt_iterator gsi;
 639
 640   /* Create new bb between loop and new_merge_bb.  */
 641   *new_exit_bb = split_edge (single_exit (loop));
 642
 643   new_exit_e = EDGE_SUCC (*new_exit_bb, 0);
 644
 645   for (gsi = gsi_start_phis (update_bb); !gsi_end_p (gsi); gsi_next (&gsi))
 646     {
 647       update_phi = gsi_stmt (gsi);
 648       orig_phi = update_phi;
 649       orig_def = PHI_ARG_DEF_FROM_EDGE (orig_phi, e);
 650       /* This loop-closed-phi actually doesn't represent a use
 651          out of the loop - the phi arg is a constant.  */
 652       if (TREE_CODE (orig_def) != SSA_NAME)
 653         continue;
 654       orig_def_new_name = get_current_def (orig_def);
 655       arg = NULL_TREE;
 656
 657       /** 1. Handle new-merge-point phis  **/
 658
 659       /* 1.1. Generate new phi node in NEW_MERGE_BB:  */
 660       new_phi = create_phi_node (SSA_NAME_VAR (PHI_RESULT (orig_phi)),
 661                                  new_merge_bb);
 662
 663       /* 1.2. NEW_MERGE_BB has two incoming edges: GUARD_EDGE and the exit-edge
 664             of LOOP. Set the two PHI args in NEW_PHI for these edges:  */
 665       new_name = orig_def;
 666       new_name2 = NULL_TREE;
 667       if (orig_def_new_name)
 668         {
 669           new_name = orig_def_new_name;
 670           /* Some variables have both loop-entry-phis and loop-exit-phis.
 671              Such variables were given yet newer names by phis placed in
 672              guard_bb by slpeel_update_phi_nodes_for_guard1. I.e:
 673              new_name2 = get_current_def (get_current_def (orig_name)).  */
 674           new_name2 = get_current_def (new_name);
 675         }
 676
 677       if (is_new_loop)
 678         {
 679           guard_arg = orig_def;
 680           loop_arg = new_name;
 681         }
 682       else
 683         {
 684           guard_arg = new_name;
 685           loop_arg = orig_def;
 686         }
 687       if (new_name2)
 688         guard_arg = new_name2;
 689
 690       add_phi_arg (new_phi, loop_arg, new_exit_e);
 691       add_phi_arg (new_phi, guard_arg, guard_edge);
 692
 693       /* 1.3. Update phi in successor block.  */
 694       gcc_assert (PHI_ARG_DEF_FROM_EDGE (update_phi, e) == orig_def);
 695       SET_PHI_ARG_DEF (update_phi, e->dest_idx, PHI_RESULT (new_phi));
 696       update_phi2 = new_phi;
 697
 698
 699       /** 2. Handle loop-closed-ssa-form phis  **/
 700
 701       /* 2.1. Generate new phi node in NEW_EXIT_BB:  */
 702       new_phi = create_phi_node (SSA_NAME_VAR (PHI_RESULT (orig_phi)),
 703                                  *new_exit_bb);
 704
 705       /* 2.2. NEW_EXIT_BB has one incoming edge: the exit-edge of the loop.  */
 706       add_phi_arg (new_phi, loop_arg, single_exit (loop));
 707
 708       /* 2.3. Update phi in successor of NEW_EXIT_BB:  */
 709       gcc_assert (PHI_ARG_DEF_FROM_EDGE (update_phi2, new_exit_e) == loop_arg);
 710       SET_PHI_ARG_DEF (update_phi2, new_exit_e->dest_idx, PHI_RESULT (new_phi));
 711
 712
 713       /** 3. Handle loop-closed-ssa-form phis for first loop  **/
 714
 715       /* 3.1. Find the relevant names that need an exit-phi in
 716          GUARD_BB, i.e. names for which
 717          slpeel_update_phi_nodes_for_guard1 had not already created a
 718          phi node. This is the case for names that are used outside
 719          the loop (and therefore need an exit phi) but are not updated
 720          across loop iterations (and therefore don't have a
 721          loop-header-phi).
 722
 723          slpeel_update_phi_nodes_for_guard1 is responsible for
 724          creating loop-exit phis in GUARD_BB for names that have a
 725          loop-header-phi.  When such a phi is created we also record
 726          the new name in its current definition.  If this new name
 727          exists, then guard_arg was set to this new name (see 1.2
 728          above).  Therefore, if guard_arg is not this new name, this
 729          is an indication that an exit-phi in GUARD_BB was not yet
 730          created, so we take care of it here.  */
 731       if (guard_arg == new_name2)
 732         continue;
 733       arg = guard_arg;
 734
 735       /* 3.2. Generate new phi node in GUARD_BB:  */
 736       new_phi = create_phi_node (SSA_NAME_VAR (PHI_RESULT (orig_phi)),
 737                                  guard_edge->src);
 738
 739       /* 3.3. GUARD_BB has one incoming edge:  */
 740       gcc_assert (EDGE_COUNT (guard_edge->src->preds) == 1);
 741       add_phi_arg (new_phi, arg, EDGE_PRED (guard_edge->src, 0));
 742
 743       /* 3.4. Update phi in successor of GUARD_BB:  */
 744       gcc_assert (PHI_ARG_DEF_FROM_EDGE (update_phi2, guard_edge)
 745                                                                 == guard_arg);
 746       SET_PHI_ARG_DEF (update_phi2, guard_edge->dest_idx, PHI_RESULT (new_phi));
 747     }
 748 }
 749
 750
 751 /* Make the LOOP iterate NITERS times. This is done by adding a new IV
 752    that starts at zero, increases by one and its limit is NITERS.
 753
 754    Assumption: the exit-condition of LOOP is the last stmt in the loop.  */
 755
 756 void
 757 slpeel_make_loop_iterate_ntimes (struct loop *loop, tree niters)
 758 {
 759   tree indx_before_incr, indx_after_incr;
 760   gimple cond_stmt;
 761   gimple orig_cond;
 762   edge exit_edge = single_exit (loop);
 763   gimple_stmt_iterator loop_cond_gsi;
 764   gimple_stmt_iterator incr_gsi;
 765   bool insert_after;
 766   tree init = build_int_cst (TREE_TYPE (niters), 0);
 767   tree step = build_int_cst (TREE_TYPE (niters), 1);
 768   LOC loop_loc;
 769   enum tree_code code;
 770
 771   orig_cond = get_loop_exit_condition (loop);
 772   gcc_assert (orig_cond);
 773   loop_cond_gsi = gsi_for_stmt (orig_cond);
 774
 775   standard_iv_increment_position (loop, &incr_gsi, &insert_after);
 776   create_iv (init, step, NULL_TREE, loop,
 777              &incr_gsi, insert_after, &indx_before_incr, &indx_after_incr);
 778
 779   indx_after_incr = force_gimple_operand_gsi (&loop_cond_gsi, indx_after_incr,
 780                                               true, NULL_TREE, true,
 781                                               GSI_SAME_STMT);
 782   niters = force_gimple_operand_gsi (&loop_cond_gsi, niters, true, NULL_TREE,
 783                                      true, GSI_SAME_STMT);
 784
 785   code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GE_EXPR : LT_EXPR;
 786   cond_stmt = gimple_build_cond (code, indx_after_incr, niters, NULL_TREE,
 787                                  NULL_TREE);
 788
 789   gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT);
 790
 791   /* Remove old loop exit test:  */
 792   gsi_remove (&loop_cond_gsi, true);
 793
 794   loop_loc = find_loop_location (loop);
 795   if (dump_file && (dump_flags & TDF_DETAILS))
 796     {
 797       if (loop_loc != UNKNOWN_LOC)
 798         fprintf (dump_file, "\nloop at %s:%d: ",
 799                  LOC_FILE (loop_loc), LOC_LINE (loop_loc));
 800       print_gimple_stmt (dump_file, cond_stmt, 0, TDF_SLIM);
 801     }
 802
 803   loop->nb_iterations = niters;
 804 }
 805
 806
 807 /* Given LOOP this function generates a new copy of it and puts it
 808    on E which is either the entry or exit of LOOP.  */
 809
 810 struct loop *
 811 slpeel_tree_duplicate_loop_to_edge_cfg (struct loop *loop, edge e)
 812 {
 813   struct loop *new_loop;
 814   basic_block *new_bbs, *bbs;
 815   bool at_exit;
 816   bool was_imm_dom;
 817   basic_block exit_dest;
 818   gimple phi;
 819   tree phi_arg;
 820   edge exit, new_exit;
 821   gimple_stmt_iterator gsi;
 822
 823   at_exit = (e == single_exit (loop));
 824   if (!at_exit && e != loop_preheader_edge (loop))
 825     return NULL;
 826
 827   bbs = get_loop_body (loop);
 828
 829   /* Check whether duplication is possible.  */
 830   if (!can_copy_bbs_p (bbs, loop->num_nodes))
 831     {
 832       free (bbs);
 833       return NULL;
 834     }
 835
 836   /* Generate new loop structure.  */
 837   new_loop = duplicate_loop (loop, loop_outer (loop));
 838   if (!new_loop)
 839     {
 840       free (bbs);
 841       return NULL;
 842     }
 843
 844   exit_dest = single_exit (loop)->dest;
 845   was_imm_dom = (get_immediate_dominator (CDI_DOMINATORS,
 846                                           exit_dest) == loop->header ?
 847                  true : false);
 848
 849   new_bbs = XNEWVEC (basic_block, loop->num_nodes);
 850
 851   exit = single_exit (loop);
 852   copy_bbs (bbs, loop->num_nodes, new_bbs,
 853             &exit, 1, &new_exit, NULL,
 854             e->src);
 855
 856   /* Duplicating phi args at exit bbs as coming
 857      also from exit of duplicated loop.  */
 858   for (gsi = gsi_start_phis (exit_dest); !gsi_end_p (gsi); gsi_next (&gsi))
 859     {
 860       phi = gsi_stmt (gsi);
 861       phi_arg = PHI_ARG_DEF_FROM_EDGE (phi, single_exit (loop));
 862       if (phi_arg)
 863         {
 864           edge new_loop_exit_edge;
 865
 866           if (EDGE_SUCC (new_loop->header, 0)->dest == new_loop->latch)
 867             new_loop_exit_edge = EDGE_SUCC (new_loop->header, 1);
 868           else
 869             new_loop_exit_edge = EDGE_SUCC (new_loop->header, 0);
 870
 871           add_phi_arg (phi, phi_arg, new_loop_exit_edge);
 872         }
 873     }
 874
 875   if (at_exit) /* Add the loop copy at exit.  */
 876     {
 877       redirect_edge_and_branch_force (e, new_loop->header);
 878       PENDING_STMT (e) = NULL;
 879       set_immediate_dominator (CDI_DOMINATORS, new_loop->header, e->src);
 880       if (was_imm_dom)
 881         set_immediate_dominator (CDI_DOMINATORS, exit_dest, new_loop->header);
 882     }
 883   else /* Add the copy at entry.  */
 884     {
 885       edge new_exit_e;
 886       edge entry_e = loop_preheader_edge (loop);
 887       basic_block preheader = entry_e->src;
 888
 889       if (!flow_bb_inside_loop_p (new_loop,
 890                                   EDGE_SUCC (new_loop->header, 0)->dest))
 891         new_exit_e = EDGE_SUCC (new_loop->header, 0);
 892       else
 893         new_exit_e = EDGE_SUCC (new_loop->header, 1);
 894
 895       redirect_edge_and_branch_force (new_exit_e, loop->header);
 896       PENDING_STMT (new_exit_e) = NULL;
 897       set_immediate_dominator (CDI_DOMINATORS, loop->header,
 898                                new_exit_e->src);
 899
 900       /* We have to add phi args to the loop->header here as coming
 901          from new_exit_e edge.  */
 902       for (gsi = gsi_start_phis (loop->header);
 903            !gsi_end_p (gsi);
 904            gsi_next (&gsi))
 905         {
 906           phi = gsi_stmt (gsi);
 907           phi_arg = PHI_ARG_DEF_FROM_EDGE (phi, entry_e);
 908           if (phi_arg)
 909             add_phi_arg (phi, phi_arg, new_exit_e);
 910         }
 911
 912       redirect_edge_and_branch_force (entry_e, new_loop->header);
 913       PENDING_STMT (entry_e) = NULL;
 914       set_immediate_dominator (CDI_DOMINATORS, new_loop->header, preheader);
 915     }
 916
 917   free (new_bbs);
 918   free (bbs);
 919
 920   return new_loop;
 921 }
 922
 923
 924 /* Given the condition statement COND, put it as the last statement
 925    of GUARD_BB; EXIT_BB is the basic block to skip the loop;
 926    Assumes that this is the single exit of the guarded loop.
 927    Returns the skip edge.  */
 928
 929 static edge
 930 slpeel_add_loop_guard (basic_block guard_bb, tree cond, basic_block exit_bb,
 931                        basic_block dom_bb)
 932 {
 933   gimple_stmt_iterator gsi;
 934   edge new_e, enter_e;
 935   gimple cond_stmt;
 936   gimple_seq gimplify_stmt_list = NULL;
 937
 938   enter_e = EDGE_SUCC (guard_bb, 0);
 939   enter_e->flags &= ~EDGE_FALLTHRU;
 940   enter_e->flags |= EDGE_FALSE_VALUE;
 941   gsi = gsi_last_bb (guard_bb);
 942
 943   cond =
 944     force_gimple_operand (cond, &gimplify_stmt_list, true,
 945                           NULL_TREE);
 946   cond_stmt = gimple_build_cond (NE_EXPR, cond, integer_zero_node,
 947                                  NULL_TREE, NULL_TREE);
 948   if (gimplify_stmt_list)
 949     gsi_insert_seq_after (&gsi, gimplify_stmt_list, GSI_NEW_STMT);
 950
 951   gsi = gsi_last_bb (guard_bb);
 952   gsi_insert_after (&gsi, cond_stmt, GSI_NEW_STMT);
 953
 954   /* Add new edge to connect guard block to the merge/loop-exit block.  */
 955   new_e = make_edge (guard_bb, exit_bb, EDGE_TRUE_VALUE);
 956   set_immediate_dominator (CDI_DOMINATORS, exit_bb, dom_bb);
 957   return new_e;
 958 }
 959
 960
 961 /* This function verifies that the following restrictions apply to LOOP:
 962    (1) it is innermost
 963    (2) it consists of exactly 2 basic blocks - header, and an empty latch.
 964    (3) it is single entry, single exit
 965    (4) its exit condition is the last stmt in the header
 966    (5) E is the entry/exit edge of LOOP.
 967  */
 968
 969 bool
 970 slpeel_can_duplicate_loop_p (const struct loop *loop, const_edge e)
 971 {
 972   edge exit_e = single_exit (loop);
 973   edge entry_e = loop_preheader_edge (loop);
 974   gimple orig_cond = get_loop_exit_condition (loop);
 975   gimple_stmt_iterator loop_exit_gsi = gsi_last_bb (exit_e->src);
 976
 977   if (need_ssa_update_p ())
 978     return false;
 979
 980   if (loop->inner
 981       /* All loops have an outer scope; the only case loop->outer is NULL is for
 982          the function itself.  */
 983       || !loop_outer (loop)
 984       || loop->num_nodes != 2
 985       || !empty_block_p (loop->latch)
 986       || !single_exit (loop)
 987       /* Verify that new loop exit condition can be trivially modified.  */
 988       || (!orig_cond || orig_cond != gsi_stmt (loop_exit_gsi))
 989       || (e != exit_e && e != entry_e))
 990     return false;
 991
 992   return true;
 993 }
 994
 995 #ifdef ENABLE_CHECKING
 996 void
 997 slpeel_verify_cfg_after_peeling (struct loop *first_loop,
 998                                  struct loop *second_loop)
 999 {
1000   basic_block loop1_exit_bb = single_exit (first_loop)->dest;
1001   basic_block loop2_entry_bb = loop_preheader_edge (second_loop)->src;
1002   basic_block loop1_entry_bb = loop_preheader_edge (first_loop)->src;
1003
1004   /* A guard that controls whether the second_loop is to be executed or skipped
1005      is placed in first_loop->exit.  first_loop->exit therefore has two
1006      successors - one is the preheader of second_loop, and the other is a bb
1007      after second_loop.
1008    */
1009   gcc_assert (EDGE_COUNT (loop1_exit_bb->succs) == 2);
1010
1011   /* 1. Verify that one of the successors of first_loop->exit is the preheader
1012         of second_loop.  */
1013
1014   /* The preheader of new_loop is expected to have two predecessors:
1015      first_loop->exit and the block that precedes first_loop.  */
1016
1017   gcc_assert (EDGE_COUNT (loop2_entry_bb->preds) == 2
1018               && ((EDGE_PRED (loop2_entry_bb, 0)->src == loop1_exit_bb
1019                    && EDGE_PRED (loop2_entry_bb, 1)->src == loop1_entry_bb)
1020                || (EDGE_PRED (loop2_entry_bb, 1)->src ==  loop1_exit_bb
1021                    && EDGE_PRED (loop2_entry_bb, 0)->src == loop1_entry_bb)));
1022
1023   /* Verify that the other successor of first_loop->exit is after the
1024      second_loop.  */
1025   /* TODO */
1026 }
1027 #endif
1028
1029 /* If the run time cost model check determines that vectorization is
1030    not profitable and hence scalar loop should be generated then set
1031    FIRST_NITERS to prologue peeled iterations. This will allow all the
1032    iterations to be executed in the prologue peeled scalar loop.  */
1033
1034 void
1035 set_prologue_iterations (basic_block bb_before_first_loop,
1036                          tree first_niters,
1037                          struct loop *loop,
1038                          unsigned int th)
1039 {
1040   edge e;
1041   basic_block cond_bb, then_bb;
1042   tree var, prologue_after_cost_adjust_name;
1043   gimple_stmt_iterator gsi;
1044   gimple newphi;
1045   edge e_true, e_false, e_fallthru;
1046   gimple cond_stmt;
1047   gimple_seq gimplify_stmt_list = NULL, stmts = NULL;
1048   tree cost_pre_condition = NULL_TREE;
1049   tree scalar_loop_iters =
1050     unshare_expr (LOOP_VINFO_NITERS_UNCHANGED (loop_vec_info_for_loop (loop)));
1051
1052   e = single_pred_edge (bb_before_first_loop);
1053   cond_bb = split_edge(e);
1054
1055   e = single_pred_edge (bb_before_first_loop);
1056   then_bb = split_edge(e);
1057   set_immediate_dominator (CDI_DOMINATORS, then_bb, cond_bb);
1058
1059   e_false = make_single_succ_edge (cond_bb, bb_before_first_loop,
1060                                    EDGE_FALSE_VALUE);
1061   set_immediate_dominator (CDI_DOMINATORS, bb_before_first_loop, cond_bb);
1062
1063   e_true = EDGE_PRED (then_bb, 0);
1064   e_true->flags &= ~EDGE_FALLTHRU;
1065   e_true->flags |= EDGE_TRUE_VALUE;
1066
1067   e_fallthru = EDGE_SUCC (then_bb, 0);
1068
1069   cost_pre_condition =
1070     build2 (LE_EXPR, boolean_type_node, scalar_loop_iters,
1071             build_int_cst (TREE_TYPE (scalar_loop_iters), th));
1072   cost_pre_condition =
1073     force_gimple_operand (cost_pre_condition, &gimplify_stmt_list,
1074                           true, NULL_TREE);
1075   cond_stmt = gimple_build_cond (NE_EXPR, cost_pre_condition,
1076                                  integer_zero_node, NULL_TREE, NULL_TREE);
1077
1078   gsi = gsi_last_bb (cond_bb);
1079   if (gimplify_stmt_list)
1080     gsi_insert_seq_after (&gsi, gimplify_stmt_list, GSI_NEW_STMT);
1081
1082   gsi = gsi_last_bb (cond_bb);
1083   gsi_insert_after (&gsi, cond_stmt, GSI_NEW_STMT);
1084
1085   var = create_tmp_var (TREE_TYPE (scalar_loop_iters),
1086                         "prologue_after_cost_adjust");
1087   add_referenced_var (var);
1088   prologue_after_cost_adjust_name =
1089     force_gimple_operand (scalar_loop_iters, &stmts, false, var);
1090
1091   gsi = gsi_last_bb (then_bb);
1092   if (stmts)
1093     gsi_insert_seq_after (&gsi, stmts, GSI_NEW_STMT);
1094
1095   newphi = create_phi_node (var, bb_before_first_loop);
1096   add_phi_arg (newphi, prologue_after_cost_adjust_name, e_fallthru);
1097   add_phi_arg (newphi, first_niters, e_false);
1098
1099   first_niters = PHI_RESULT (newphi);
1100 }
1101
1102
1103 /* Function slpeel_tree_peel_loop_to_edge.
1104
1105    Peel the first (last) iterations of LOOP into a new prolog (epilog) loop
1106    that is placed on the entry (exit) edge E of LOOP. After this transformation
1107    we have two loops one after the other - first-loop iterates FIRST_NITERS
1108    times, and second-loop iterates the remainder NITERS - FIRST_NITERS times.
1109    If the cost model indicates that it is profitable to emit a scalar
1110    loop instead of the vector one, then the prolog (epilog) loop will iterate
1111    for the entire unchanged scalar iterations of the loop.
1112
1113    Input:
1114    - LOOP: the loop to be peeled.
1115    - E: the exit or entry edge of LOOP.
1116         If it is the entry edge, we peel the first iterations of LOOP. In this
1117         case first-loop is LOOP, and second-loop is the newly created loop.
1118         If it is the exit edge, we peel the last iterations of LOOP. In this
1119         case, first-loop is the newly created loop, and second-loop is LOOP.
1120    - NITERS: the number of iterations that LOOP iterates.
1121    - FIRST_NITERS: the number of iterations that the first-loop should iterate.
1122    - UPDATE_FIRST_LOOP_COUNT:  specified whether this function is responsible
1123         for updating the loop bound of the first-loop to FIRST_NITERS.  If it
1124         is false, the caller of this function may want to take care of this
1125         (this can be useful if we don't want new stmts added to first-loop).
1126    - TH: cost model profitability threshold of iterations for vectorization.
1127    - CHECK_PROFITABILITY: specify whether cost model check has not occurred
1128                           during versioning and hence needs to occur during
1129                           prologue generation or whether cost model check
1130                           has not occurred during prologue generation and hence
1131                           needs to occur during epilogue generation.
1132
1133
1134    Output:
1135    The function returns a pointer to the new loop-copy, or NULL if it failed
1136    to perform the transformation.
1137
1138    The function generates two if-then-else guards: one before the first loop,
1139    and the other before the second loop:
1140    The first guard is:
1141      if (FIRST_NITERS == 0) then skip the first loop,
1142      and go directly to the second loop.
1143    The second guard is:
1144      if (FIRST_NITERS == NITERS) then skip the second loop.
1145
1146    FORNOW only simple loops are supported (see slpeel_can_duplicate_loop_p).
1147    FORNOW the resulting code will not be in loop-closed-ssa form.
1148 */
1149
1150 struct loop*
1151 slpeel_tree_peel_loop_to_edge (struct loop *loop,
1152                                edge e, tree first_niters,
1153                                tree niters, bool update_first_loop_count,
1154                                unsigned int th, bool check_profitability)
1155 {
1156   struct loop *new_loop = NULL, *first_loop, *second_loop;
1157   edge skip_e;
1158   tree pre_condition = NULL_TREE;
1159   bitmap definitions;
1160   basic_block bb_before_second_loop, bb_after_second_loop;
1161   basic_block bb_before_first_loop;
1162   basic_block bb_between_loops;
1163   basic_block new_exit_bb;
1164   edge exit_e = single_exit (loop);
1165   LOC loop_loc;
1166   tree cost_pre_condition = NULL_TREE;
1167
1168   if (!slpeel_can_duplicate_loop_p (loop, e))
1169     return NULL;
1170
1171   /* We have to initialize cfg_hooks. Then, when calling
1172    cfg_hooks->split_edge, the function tree_split_edge
1173    is actually called and, when calling cfg_hooks->duplicate_block,
1174    the function tree_duplicate_bb is called.  */
1175   gimple_register_cfg_hooks ();
1176
1177
1178   /* 1. Generate a copy of LOOP and put it on E (E is the entry/exit of LOOP).
1179         Resulting CFG would be:
1180
1181         first_loop:
1182         do {
1183         } while ...
1184
1185         second_loop:
1186         do {
1187         } while ...
1188
1189         orig_exit_bb:
1190    */
1191
1192   if (!(new_loop = slpeel_tree_duplicate_loop_to_edge_cfg (loop, e)))
1193     {
1194       loop_loc = find_loop_location (loop);
1195       if (dump_file && (dump_flags & TDF_DETAILS))
1196         {
1197           if (loop_loc != UNKNOWN_LOC)
1198             fprintf (dump_file, "\n%s:%d: note: ",
1199                      LOC_FILE (loop_loc), LOC_LINE (loop_loc));
1200           fprintf (dump_file, "tree_duplicate_loop_to_edge_cfg failed.\n");
1201         }
1202       return NULL;
1203     }
1204
1205   if (e == exit_e)
1206     {
1207       /* NEW_LOOP was placed after LOOP.  */
1208       first_loop = loop;
1209       second_loop = new_loop;
1210     }
1211   else
1212     {
1213       /* NEW_LOOP was placed before LOOP.  */
1214       first_loop = new_loop;
1215       second_loop = loop;
1216     }
1217
1218   definitions = ssa_names_to_replace ();
1219   slpeel_update_phis_for_duplicate_loop (loop, new_loop, e == exit_e);
1220   rename_variables_in_loop (new_loop);
1221
1222
1223   /* 2.  Add the guard code in one of the following ways:
1224
1225      2.a Add the guard that controls whether the first loop is executed.
1226          This occurs when this function is invoked for prologue or epilogue
1227          generation and when the cost model check can be done at compile time.
1228
1229          Resulting CFG would be:
1230
1231          bb_before_first_loop:
1232          if (FIRST_NITERS == 0) GOTO bb_before_second_loop
1233                                 GOTO first-loop
1234
1235          first_loop:
1236          do {
1237          } while ...
1238
1239          bb_before_second_loop:
1240
1241          second_loop:
1242          do {
1243          } while ...
1244
1245          orig_exit_bb:
1246
1247      2.b Add the cost model check that allows the prologue
1248          to iterate for the entire unchanged scalar
1249          iterations of the loop in the event that the cost
1250          model indicates that the scalar loop is more
1251          profitable than the vector one. This occurs when
1252          this function is invoked for prologue generation
1253          and the cost model check needs to be done at run
1254          time.
1255
1256          Resulting CFG after prologue peeling would be:
1257
1258          if (scalar_loop_iterations <= th)
1259            FIRST_NITERS = scalar_loop_iterations
1260
1261          bb_before_first_loop:
1262          if (FIRST_NITERS == 0) GOTO bb_before_second_loop
1263                                 GOTO first-loop
1264
1265          first_loop:
1266          do {
1267          } while ...
1268
1269          bb_before_second_loop:
1270
1271          second_loop:
1272          do {
1273          } while ...
1274
1275          orig_exit_bb:
1276
1277      2.c Add the cost model check that allows the epilogue
1278          to iterate for the entire unchanged scalar
1279          iterations of the loop in the event that the cost
1280          model indicates that the scalar loop is more
1281          profitable than the vector one. This occurs when
1282          this function is invoked for epilogue generation
1283          and the cost model check needs to be done at run
1284          time.
1285
1286          Resulting CFG after prologue peeling would be:
1287
1288          bb_before_first_loop:
1289          if ((scalar_loop_iterations <= th)
1290              ||
1291              FIRST_NITERS == 0) GOTO bb_before_second_loop
1292                                 GOTO first-loop
1293
1294          first_loop:
1295          do {
1296          } while ...
1297
1298          bb_before_second_loop:
1299
1300          second_loop:
1301          do {
1302          } while ...
1303
1304          orig_exit_bb:
1305   */
1306
1307   bb_before_first_loop = split_edge (loop_preheader_edge (first_loop));
1308   bb_before_second_loop = split_edge (single_exit (first_loop));
1309
1310   /* Epilogue peeling.  */
1311   if (!update_first_loop_count)
1312     {
1313       pre_condition =
1314         fold_build2 (LE_EXPR, boolean_type_node, first_niters,
1315                      build_int_cst (TREE_TYPE (first_niters), 0));
1316       if (check_profitability)
1317         {
1318           tree scalar_loop_iters
1319             = unshare_expr (LOOP_VINFO_NITERS_UNCHANGED
1320                                         (loop_vec_info_for_loop (loop)));
1321           cost_pre_condition =
1322             build2 (LE_EXPR, boolean_type_node, scalar_loop_iters,
1323                     build_int_cst (TREE_TYPE (scalar_loop_iters), th));
1324
1325           pre_condition = fold_build2 (TRUTH_OR_EXPR, boolean_type_node,
1326                                        cost_pre_condition, pre_condition);
1327         }
1328     }
1329
1330   /* Prologue peeling.  */
1331   else
1332     {
1333       if (check_profitability)
1334         set_prologue_iterations (bb_before_first_loop, first_niters,
1335                                  loop, th);
1336
1337       pre_condition =
1338         fold_build2 (LE_EXPR, boolean_type_node, first_niters,
1339                      build_int_cst (TREE_TYPE (first_niters), 0));
1340     }
1341
1342   skip_e = slpeel_add_loop_guard (bb_before_first_loop, pre_condition,
1343                                   bb_before_second_loop, bb_before_first_loop);
1344   slpeel_update_phi_nodes_for_guard1 (skip_e, first_loop,
1345                                       first_loop == new_loop,
1346                                       &new_exit_bb, &definitions);
1347
1348
1349   /* 3. Add the guard that controls whether the second loop is executed.
1350         Resulting CFG would be:
1351
1352         bb_before_first_loop:
1353         if (FIRST_NITERS == 0) GOTO bb_before_second_loop (skip first loop)
1354                                GOTO first-loop
1355
1356         first_loop:
1357         do {
1358         } while ...
1359
1360         bb_between_loops:
1361         if (FIRST_NITERS == NITERS) GOTO bb_after_second_loop (skip second loop)
1362                                     GOTO bb_before_second_loop
1363
1364         bb_before_second_loop:
1365
1366         second_loop:
1367         do {
1368         } while ...
1369
1370         bb_after_second_loop:
1371
1372         orig_exit_bb:
1373    */
1374
1375   bb_between_loops = new_exit_bb;
1376   bb_after_second_loop = split_edge (single_exit (second_loop));
1377
1378   pre_condition =
1379         fold_build2 (EQ_EXPR, boolean_type_node, first_niters, niters);
1380   skip_e = slpeel_add_loop_guard (bb_between_loops, pre_condition,
1381                                   bb_after_second_loop, bb_before_first_loop);
1382   slpeel_update_phi_nodes_for_guard2 (skip_e, second_loop,
1383                                      second_loop == new_loop, &new_exit_bb);
1384
1385   /* 4. Make first-loop iterate FIRST_NITERS times, if requested.
1386    */
1387   if (update_first_loop_count)
1388     slpeel_make_loop_iterate_ntimes (first_loop, first_niters);
1389
1390   BITMAP_FREE (definitions);
1391   delete_update_ssa ();
1392
1393   return new_loop;
1394 }
1395
1396 /* Function vect_get_loop_location.
1397
1398    Extract the location of the loop in the source code.
1399    If the loop is not well formed for vectorization, an estimated
1400    location is calculated.
1401    Return the loop location if succeed and NULL if not.  */
1402
1403 LOC
1404 find_loop_location (struct loop *loop)
1405 {
1406   gimple stmt = NULL;
1407   basic_block bb;
1408   gimple_stmt_iterator si;
1409
1410   if (!loop)
1411     return UNKNOWN_LOC;
1412
1413   stmt = get_loop_exit_condition (loop);
1414
1415   if (stmt && gimple_location (stmt) != UNKNOWN_LOC)
1416     return gimple_location (stmt);
1417
1418   /* If we got here the loop is probably not "well formed",
1419      try to estimate the loop location */
1420
1421   if (!loop->header)
1422     return UNKNOWN_LOC;
1423
1424   bb = loop->header;
1425
1426   for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1427     {
1428       stmt = gsi_stmt (si);
1429       if (gimple_location (stmt) != UNKNOWN_LOC)
1430         return gimple_location (stmt);
1431     }
1432
1433   return UNKNOWN_LOC;
1434 }
1435
1436
1437 /*************************************************************************
1438   Vectorization Debug Information.
1439  *************************************************************************/
1440
1441 /* Function vect_set_verbosity_level.
1442
1443    Called from toplev.c upon detection of the
1444    -ftree-vectorizer-verbose=N option.  */
1445
1446 void
1447 vect_set_verbosity_level (const char *val)
1448 {
1449    unsigned int vl;
1450
1451    vl = atoi (val);
1452    if (vl < MAX_VERBOSITY_LEVEL)
1453      vect_verbosity_level = vl;
1454    else
1455      vect_verbosity_level = MAX_VERBOSITY_LEVEL - 1;
1456 }
1457
1458
1459 /* Function vect_set_dump_settings.
1460
1461    Fix the verbosity level of the vectorizer if the
1462    requested level was not set explicitly using the flag
1463    -ftree-vectorizer-verbose=N.
1464    Decide where to print the debugging information (dump_file/stderr).
1465    If the user defined the verbosity level, but there is no dump file,
1466    print to stderr, otherwise print to the dump file.  */
1467
1468 static void
1469 vect_set_dump_settings (void)
1470 {
1471   vect_dump = dump_file;
1472
1473   /* Check if the verbosity level was defined by the user:  */
1474   if (vect_verbosity_level != MAX_VERBOSITY_LEVEL)
1475     {
1476       /* If there is no dump file, print to stderr.  */
1477       if (!dump_file)
1478         vect_dump = stderr;
1479       return;
1480     }
1481
1482   /* User didn't specify verbosity level:  */
1483   if (dump_file && (dump_flags & TDF_DETAILS))
1484     vect_verbosity_level = REPORT_DETAILS;
1485   else if (dump_file && (dump_flags & TDF_STATS))
1486     vect_verbosity_level = REPORT_UNVECTORIZED_LOOPS;
1487   else
1488     vect_verbosity_level = REPORT_NONE;
1489
1490   gcc_assert (dump_file || vect_verbosity_level == REPORT_NONE);
1491 }
1492
1493
1494 /* Function debug_loop_details.
1495
1496    For vectorization debug dumps.  */
1497
1498 bool
1499 vect_print_dump_info (enum verbosity_levels vl)
1500 {
1501   if (vl > vect_verbosity_level)
1502     return false;
1503
1504   if (!current_function_decl || !vect_dump)
1505     return false;
1506
1507   if (vect_loop_location == UNKNOWN_LOC)
1508     fprintf (vect_dump, "\n%s:%d: note: ",
1509              DECL_SOURCE_FILE (current_function_decl),
1510              DECL_SOURCE_LINE (current_function_decl));
1511   else
1512     fprintf (vect_dump, "\n%s:%d: note: ",
1513              LOC_FILE (vect_loop_location), LOC_LINE (vect_loop_location));
1514
1515   return true;
1516 }
1517
1518
1519 /*************************************************************************
1520   Vectorization Utilities.
1521  *************************************************************************/
1522
1523 /* Function new_stmt_vec_info.
1524
1525    Create and initialize a new stmt_vec_info struct for STMT.  */
1526
1527 stmt_vec_info
1528 new_stmt_vec_info (gimple stmt, loop_vec_info loop_vinfo)
1529 {
1530   stmt_vec_info res;
1531   res = (stmt_vec_info) xcalloc (1, sizeof (struct _stmt_vec_info));
1532
1533   STMT_VINFO_TYPE (res) = undef_vec_info_type;
1534   STMT_VINFO_STMT (res) = stmt;
1535   STMT_VINFO_LOOP_VINFO (res) = loop_vinfo;
1536   STMT_VINFO_RELEVANT (res) = 0;
1537   STMT_VINFO_LIVE_P (res) = false;
1538   STMT_VINFO_VECTYPE (res) = NULL;
1539   STMT_VINFO_VEC_STMT (res) = NULL;
1540   STMT_VINFO_IN_PATTERN_P (res) = false;
1541   STMT_VINFO_RELATED_STMT (res) = NULL;
1542   STMT_VINFO_DATA_REF (res) = NULL;
1543
1544   STMT_VINFO_DR_BASE_ADDRESS (res) = NULL;
1545   STMT_VINFO_DR_OFFSET (res) = NULL;
1546   STMT_VINFO_DR_INIT (res) = NULL;
1547   STMT_VINFO_DR_STEP (res) = NULL;
1548   STMT_VINFO_DR_ALIGNED_TO (res) = NULL;
1549
1550   if (gimple_code (stmt) == GIMPLE_PHI
1551       && is_loop_header_bb_p (gimple_bb (stmt)))
1552     STMT_VINFO_DEF_TYPE (res) = vect_unknown_def_type;
1553   else
1554     STMT_VINFO_DEF_TYPE (res) = vect_loop_def;
1555   STMT_VINFO_SAME_ALIGN_REFS (res) = VEC_alloc (dr_p, heap, 5);
1556   STMT_VINFO_INSIDE_OF_LOOP_COST (res) = 0;
1557   STMT_VINFO_OUTSIDE_OF_LOOP_COST (res) = 0;
1558   STMT_SLP_TYPE (res) = 0;
1559   DR_GROUP_FIRST_DR (res) = NULL;
1560   DR_GROUP_NEXT_DR (res) = NULL;
1561   DR_GROUP_SIZE (res) = 0;
1562   DR_GROUP_STORE_COUNT (res) = 0;
1563   DR_GROUP_GAP (res) = 0;
1564   DR_GROUP_SAME_DR_STMT (res) = NULL;
1565   DR_GROUP_READ_WRITE_DEPENDENCE (res) = false;
1566
1567   return res;
1568 }
1569
1570 /* Create a hash table for stmt_vec_info. */
1571
1572 void
1573 init_stmt_vec_info_vec (void)
1574 {
1575   gcc_assert (!stmt_vec_info_vec);
1576   stmt_vec_info_vec = VEC_alloc (vec_void_p, heap, 50);
1577 }
1578
1579 /* Free hash table for stmt_vec_info. */
1580
1581 void
1582 free_stmt_vec_info_vec (void)
1583 {
1584   gcc_assert (stmt_vec_info_vec);
1585   VEC_free (vec_void_p, heap, stmt_vec_info_vec);
1586 }
1587
1588 /* Free stmt vectorization related info.  */
1589
1590 void
1591 free_stmt_vec_info (gimple stmt)
1592 {
1593   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1594
1595   if (!stmt_info)
1596     return;
1597
1598   VEC_free (dr_p, heap, STMT_VINFO_SAME_ALIGN_REFS (stmt_info));
1599   set_vinfo_for_stmt (stmt, NULL);
1600   free (stmt_info);
1601 }
1602
1603
1604 /* Function bb_in_loop_p
1605
1606    Used as predicate for dfs order traversal of the loop bbs.  */
1607
1608 static bool
1609 bb_in_loop_p (const_basic_block bb, const void *data)
1610 {
1611   const struct loop *const loop = (const struct loop *)data;
1612   if (flow_bb_inside_loop_p (loop, bb))
1613     return true;
1614   return false;
1615 }
1616
1617
1618 /* Function new_loop_vec_info.
1619
1620    Create and initialize a new loop_vec_info struct for LOOP, as well as
1621    stmt_vec_info structs for all the stmts in LOOP.  */
1622
1623 loop_vec_info
1624 new_loop_vec_info (struct loop *loop)
1625 {
1626   loop_vec_info res;
1627   basic_block *bbs;
1628   gimple_stmt_iterator si;
1629   unsigned int i, nbbs;
1630
1631   res = (loop_vec_info) xcalloc (1, sizeof (struct _loop_vec_info));
1632   LOOP_VINFO_LOOP (res) = loop;
1633
1634   bbs = get_loop_body (loop);
1635
1636   /* Create/Update stmt_info for all stmts in the loop.  */
1637   for (i = 0; i < loop->num_nodes; i++)
1638     {
1639       basic_block bb = bbs[i];
1640
1641       /* BBs in a nested inner-loop will have been already processed (because
1642          we will have called vect_analyze_loop_form for any nested inner-loop).
1643          Therefore, for stmts in an inner-loop we just want to update the
1644          STMT_VINFO_LOOP_VINFO field of their stmt_info to point to the new
1645          loop_info of the outer-loop we are currently considering to vectorize
1646          (instead of the loop_info of the inner-loop).
1647          For stmts in other BBs we need to create a stmt_info from scratch.  */
1648       if (bb->loop_father != loop)
1649         {
1650           /* Inner-loop bb.  */
1651           gcc_assert (loop->inner && bb->loop_father == loop->inner);
1652           for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1653             {
1654               gimple phi = gsi_stmt (si);
1655               stmt_vec_info stmt_info = vinfo_for_stmt (phi);
1656               loop_vec_info inner_loop_vinfo =
1657                 STMT_VINFO_LOOP_VINFO (stmt_info);
1658               gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
1659               STMT_VINFO_LOOP_VINFO (stmt_info) = res;
1660             }
1661           for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1662            {
1663               gimple stmt = gsi_stmt (si);
1664               stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1665               loop_vec_info inner_loop_vinfo =
1666                  STMT_VINFO_LOOP_VINFO (stmt_info);
1667               gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
1668               STMT_VINFO_LOOP_VINFO (stmt_info) = res;
1669            }
1670         }
1671       else
1672         {
1673           /* bb in current nest.  */
1674           for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1675             {
1676               gimple phi = gsi_stmt (si);
1677               gimple_set_uid (phi, 0);
1678               set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, res));
1679             }
1680
1681           for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1682             {
1683               gimple stmt = gsi_stmt (si);
1684               gimple_set_uid (stmt, 0);
1685               set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, res));
1686             }
1687         }
1688     }
1689
1690   /* CHECKME: We want to visit all BBs before their successors (except for
1691      latch blocks, for which this assertion wouldn't hold).  In the simple
1692      case of the loop forms we allow, a dfs order of the BBs would the same
1693      as reversed postorder traversal, so we are safe.  */
1694
1695    free (bbs);
1696    bbs = XCNEWVEC (basic_block, loop->num_nodes);
1697    nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
1698                               bbs, loop->num_nodes, loop);
1699    gcc_assert (nbbs == loop->num_nodes);
1700
1701   LOOP_VINFO_BBS (res) = bbs;
1702   LOOP_VINFO_NITERS (res) = NULL;
1703   LOOP_VINFO_NITERS_UNCHANGED (res) = NULL;
1704   LOOP_VINFO_COST_MODEL_MIN_ITERS (res) = 0;
1705   LOOP_VINFO_VECTORIZABLE_P (res) = 0;
1706   LOOP_PEELING_FOR_ALIGNMENT (res) = 0;
1707   LOOP_VINFO_VECT_FACTOR (res) = 0;
1708   LOOP_VINFO_DATAREFS (res) = VEC_alloc (data_reference_p, heap, 10);
1709   LOOP_VINFO_DDRS (res) = VEC_alloc (ddr_p, heap, 10 * 10);
1710   LOOP_VINFO_UNALIGNED_DR (res) = NULL;
1711   LOOP_VINFO_MAY_MISALIGN_STMTS (res) =
1712     VEC_alloc (gimple, heap,
1713                PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS));
1714   LOOP_VINFO_MAY_ALIAS_DDRS (res) =
1715     VEC_alloc (ddr_p, heap,
1716                PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
1717   LOOP_VINFO_STRIDED_STORES (res) = VEC_alloc (gimple, heap, 10);
1718   LOOP_VINFO_SLP_INSTANCES (res) = VEC_alloc (slp_instance, heap, 10);
1719   LOOP_VINFO_SLP_UNROLLING_FACTOR (res) = 1;
1720
1721   return res;
1722 }
1723
1724
1725 /* Function destroy_loop_vec_info.
1726
1727    Free LOOP_VINFO struct, as well as all the stmt_vec_info structs of all the
1728    stmts in the loop.  */
1729
1730 void
1731 destroy_loop_vec_info (loop_vec_info loop_vinfo, bool clean_stmts)
1732 {
1733   struct loop *loop;
1734   basic_block *bbs;
1735   int nbbs;
1736   gimple_stmt_iterator si;
1737   int j;
1738   VEC (slp_instance, heap) *slp_instances;
1739   slp_instance instance;
1740
1741   if (!loop_vinfo)
1742     return;
1743
1744   loop = LOOP_VINFO_LOOP (loop_vinfo);
1745
1746   bbs = LOOP_VINFO_BBS (loop_vinfo);
1747   nbbs = loop->num_nodes;
1748
1749   if (!clean_stmts)
1750     {
1751       free (LOOP_VINFO_BBS (loop_vinfo));
1752       free_data_refs (LOOP_VINFO_DATAREFS (loop_vinfo));
1753       free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
1754       VEC_free (gimple, heap, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));
1755
1756       free (loop_vinfo);
1757       loop->aux = NULL;
1758       return;
1759     }
1760
1761   for (j = 0; j < nbbs; j++)
1762     {
1763       basic_block bb = bbs[j];
1764
1765       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1766         free_stmt_vec_info (gsi_stmt (si));
1767
1768       for (si = gsi_start_bb (bb); !gsi_end_p (si); )
1769         {
1770           gimple stmt = gsi_stmt (si);
1771           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1772
1773           if (stmt_info)
1774             {
1775               /* Check if this is a "pattern stmt" (introduced by the
1776                  vectorizer during the pattern recognition pass).  */
1777               bool remove_stmt_p = false;
1778               gimple orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
1779               if (orig_stmt)
1780                 {
1781                   stmt_vec_info orig_stmt_info = vinfo_for_stmt (orig_stmt);
1782                   if (orig_stmt_info
1783                       && STMT_VINFO_IN_PATTERN_P (orig_stmt_info))
1784                     remove_stmt_p = true;
1785                 }
1786
1787               /* Free stmt_vec_info.  */
1788               free_stmt_vec_info (stmt);
1789
1790               /* Remove dead "pattern stmts".  */
1791               if (remove_stmt_p)
1792                 gsi_remove (&si, true);
1793             }
1794           gsi_next (&si);
1795         }
1796     }
1797
1798   free (LOOP_VINFO_BBS (loop_vinfo));
1799   free_data_refs (LOOP_VINFO_DATAREFS (loop_vinfo));
1800   free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
1801   VEC_free (gimple, heap, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));
1802   VEC_free (ddr_p, heap, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo));
1803   slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
1804   for (j = 0; VEC_iterate (slp_instance, slp_instances, j, instance); j++)
1805     vect_free_slp_tree (SLP_INSTANCE_TREE (instance));
1806   VEC_free (slp_instance, heap, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
1807   VEC_free (gimple, heap, LOOP_VINFO_STRIDED_STORES (loop_vinfo));
1808
1809   free (loop_vinfo);
1810   loop->aux = NULL;
1811 }
1812
1813
1814 /* Function vect_force_dr_alignment_p.
1815
1816    Returns whether the alignment of a DECL can be forced to be aligned
1817    on ALIGNMENT bit boundary.  */
1818
1819 bool
1820 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
1821 {
1822   if (TREE_CODE (decl) != VAR_DECL)
1823     return false;
1824
1825   if (DECL_EXTERNAL (decl))
1826     return false;
1827
1828   if (TREE_ASM_WRITTEN (decl))
1829     return false;
1830
1831   if (TREE_STATIC (decl))
1832     return (alignment <= MAX_OFILE_ALIGNMENT);
1833   else
1834     return (alignment <= MAX_STACK_ALIGNMENT);
1835 }
1836
1837
1838 /* Function get_vectype_for_scalar_type.
1839
1840    Returns the vector type corresponding to SCALAR_TYPE as supported
1841    by the target.  */
1842
1843 tree
1844 get_vectype_for_scalar_type (tree scalar_type)
1845 {
1846   enum machine_mode inner_mode = TYPE_MODE (scalar_type);
1847   int nbytes = GET_MODE_SIZE (inner_mode);
1848   int nunits;
1849   tree vectype;
1850
1851   if (nbytes == 0 || nbytes >= UNITS_PER_SIMD_WORD (inner_mode))
1852     return NULL_TREE;
1853
1854   /* FORNOW: Only a single vector size per mode (UNITS_PER_SIMD_WORD)
1855      is expected.  */
1856   nunits = UNITS_PER_SIMD_WORD (inner_mode) / nbytes;
1857
1858   vectype = build_vector_type (scalar_type, nunits);
1859   if (vect_print_dump_info (REPORT_DETAILS))
1860     {
1861       fprintf (vect_dump, "get vectype with %d units of type ", nunits);
1862       print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
1863     }
1864
1865   if (!vectype)
1866     return NULL_TREE;
1867
1868   if (vect_print_dump_info (REPORT_DETAILS))
1869     {
1870       fprintf (vect_dump, "vectype: ");
1871       print_generic_expr (vect_dump, vectype, TDF_SLIM);
1872     }
1873
1874   if (!VECTOR_MODE_P (TYPE_MODE (vectype))
1875       && !INTEGRAL_MODE_P (TYPE_MODE (vectype)))
1876     {
1877       if (vect_print_dump_info (REPORT_DETAILS))
1878         fprintf (vect_dump, "mode not supported by target.");
1879       return NULL_TREE;
1880     }
1881
1882   return vectype;
1883 }
1884
1885
1886 /* Function vect_supportable_dr_alignment
1887
1888    Return whether the data reference DR is supported with respect to its
1889    alignment.  */
1890
1891 enum dr_alignment_support
1892 vect_supportable_dr_alignment (struct data_reference *dr)
1893 {
1894   gimple stmt = DR_STMT (dr);
1895   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1896   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1897   enum machine_mode mode = (int) TYPE_MODE (vectype);
1898   struct loop *vect_loop = LOOP_VINFO_LOOP (STMT_VINFO_LOOP_VINFO (stmt_info));
1899   bool nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
1900   bool invariant_in_outerloop = false;
1901
1902   if (aligned_access_p (dr))
1903     return dr_aligned;
1904
1905   if (nested_in_vect_loop)
1906     {
1907       tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
1908       invariant_in_outerloop =
1909         (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
1910     }
1911
1912   /* Possibly unaligned access.  */
1913
1914   /* We can choose between using the implicit realignment scheme (generating
1915      a misaligned_move stmt) and the explicit realignment scheme (generating
1916      aligned loads with a REALIGN_LOAD). There are two variants to the explicit
1917      realignment scheme: optimized, and unoptimized.
1918      We can optimize the realignment only if the step between consecutive
1919      vector loads is equal to the vector size.  Since the vector memory
1920      accesses advance in steps of VS (Vector Size) in the vectorized loop, it
1921      is guaranteed that the misalignment amount remains the same throughout the
1922      execution of the vectorized loop.  Therefore, we can create the
1923      "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
1924      at the loop preheader.
1925
1926      However, in the case of outer-loop vectorization, when vectorizing a
1927      memory access in the inner-loop nested within the LOOP that is now being
1928      vectorized, while it is guaranteed that the misalignment of the
1929      vectorized memory access will remain the same in different outer-loop
1930      iterations, it is *not* guaranteed that is will remain the same throughout
1931      the execution of the inner-loop.  This is because the inner-loop advances
1932      with the original scalar step (and not in steps of VS).  If the inner-loop
1933      step happens to be a multiple of VS, then the misalignment remains fixed
1934      and we can use the optimized realignment scheme.  For example:
1935
1936       for (i=0; i<N; i++)
1937         for (j=0; j<M; j++)
1938           s += a[i+j];
1939
1940      When vectorizing the i-loop in the above example, the step between
1941      consecutive vector loads is 1, and so the misalignment does not remain
1942      fixed across the execution of the inner-loop, and the realignment cannot
1943      be optimized (as illustrated in the following pseudo vectorized loop):
1944
1945       for (i=0; i<N; i+=4)
1946         for (j=0; j<M; j++){
1947           vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
1948                          // when j is {0,1,2,3,4,5,6,7,...} respectively.
1949                          // (assuming that we start from an aligned address).
1950           }
1951
1952      We therefore have to use the unoptimized realignment scheme:
1953
1954       for (i=0; i<N; i+=4)
1955           for (j=k; j<M; j+=4)
1956           vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
1957                            // that the misalignment of the initial address is
1958                            // 0).
1959
1960      The loop can then be vectorized as follows:
1961
1962       for (k=0; k<4; k++){
1963         rt = get_realignment_token (&vp[k]);
1964         for (i=0; i<N; i+=4){
1965           v1 = vp[i+k];
1966           for (j=k; j<M; j+=4){
1967             v2 = vp[i+j+VS-1];
1968             va = REALIGN_LOAD <v1,v2,rt>;
1969             vs += va;
1970             v1 = v2;
1971           }
1972         }
1973     } */
1974
1975   if (DR_IS_READ (dr))
1976     {
1977       if (optab_handler (vec_realign_load_optab, mode)->insn_code !=
1978                                                              CODE_FOR_nothing
1979           && (!targetm.vectorize.builtin_mask_for_load
1980               || targetm.vectorize.builtin_mask_for_load ()))
1981         {
1982           tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1983           if (nested_in_vect_loop
1984               && (TREE_INT_CST_LOW (DR_STEP (dr))
1985                   != GET_MODE_SIZE (TYPE_MODE (vectype))))
1986             return dr_explicit_realign;
1987           else
1988             return dr_explicit_realign_optimized;
1989         }
1990
1991       if (optab_handler (movmisalign_optab, mode)->insn_code !=
1992                                                              CODE_FOR_nothing)
1993         /* Can't software pipeline the loads, but can at least do them.  */
1994         return dr_unaligned_supported;
1995     }
1996
1997   /* Unsupported.  */
1998   return dr_unaligned_unsupported;
1999 }
2000
2001
2002 /* Function vect_is_simple_use.
2003
2004    Input:
2005    LOOP - the loop that is being vectorized.
2006    OPERAND - operand of a stmt in LOOP.
2007    DEF - the defining stmt in case OPERAND is an SSA_NAME.
2008
2009    Returns whether a stmt with OPERAND can be vectorized.
2010    Supportable operands are constants, loop invariants, and operands that are
2011    defined by the current iteration of the loop. Unsupportable operands are
2012    those that are defined by a previous iteration of the loop (as is the case
2013    in reduction/induction computations).  */
2014
2015 bool
2016 vect_is_simple_use (tree operand, loop_vec_info loop_vinfo, gimple *def_stmt,
2017                     tree *def, enum vect_def_type *dt)
2018 {
2019   basic_block bb;
2020   stmt_vec_info stmt_vinfo;
2021   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2022
2023   *def_stmt = NULL;
2024   *def = NULL_TREE;
2025
2026   if (vect_print_dump_info (REPORT_DETAILS))
2027     {
2028       fprintf (vect_dump, "vect_is_simple_use: operand ");
2029       print_generic_expr (vect_dump, operand, TDF_SLIM);
2030     }
2031
2032   if (TREE_CODE (operand) == INTEGER_CST || TREE_CODE (operand) == REAL_CST)
2033     {
2034       *dt = vect_constant_def;
2035       return true;
2036     }
2037   if (is_gimple_min_invariant (operand))
2038    {
2039       *def = operand;
2040       *dt = vect_invariant_def;
2041       return true;
2042    }
2043
2044   if (TREE_CODE (operand) == PAREN_EXPR)
2045     {
2046       if (vect_print_dump_info (REPORT_DETAILS))
2047         fprintf (vect_dump, "non-associatable copy.");
2048       operand = TREE_OPERAND (operand, 0);
2049     }
2050   if (TREE_CODE (operand) != SSA_NAME)
2051     {
2052       if (vect_print_dump_info (REPORT_DETAILS))
2053         fprintf (vect_dump, "not ssa-name.");
2054       return false;
2055     }
2056
2057   *def_stmt = SSA_NAME_DEF_STMT (operand);
2058   if (*def_stmt == NULL)
2059     {
2060       if (vect_print_dump_info (REPORT_DETAILS))
2061         fprintf (vect_dump, "no def_stmt.");
2062       return false;
2063     }
2064
2065   if (vect_print_dump_info (REPORT_DETAILS))
2066     {
2067       fprintf (vect_dump, "def_stmt: ");
2068       print_gimple_stmt (vect_dump, *def_stmt, 0, TDF_SLIM);
2069     }
2070
2071   /* empty stmt is expected only in case of a function argument.
2072      (Otherwise - we expect a phi_node or a GIMPLE_ASSIGN).  */
2073   if (gimple_nop_p (*def_stmt))
2074     {
2075       *def = operand;
2076       *dt = vect_invariant_def;
2077       return true;
2078     }
2079
2080   bb = gimple_bb (*def_stmt);
2081   if (!flow_bb_inside_loop_p (loop, bb))
2082     *dt = vect_invariant_def;
2083   else
2084     {
2085       stmt_vinfo = vinfo_for_stmt (*def_stmt);
2086       *dt = STMT_VINFO_DEF_TYPE (stmt_vinfo);
2087     }
2088
2089   if (*dt == vect_unknown_def_type)
2090     {
2091       if (vect_print_dump_info (REPORT_DETAILS))
2092         fprintf (vect_dump, "Unsupported pattern.");
2093       return false;
2094     }
2095
2096   if (vect_print_dump_info (REPORT_DETAILS))
2097     fprintf (vect_dump, "type of def: %d.",*dt);
2098
2099   switch (gimple_code (*def_stmt))
2100     {
2101     case GIMPLE_PHI:
2102       *def = gimple_phi_result (*def_stmt);
2103       break;
2104
2105     case GIMPLE_ASSIGN:
2106       *def = gimple_assign_lhs (*def_stmt);
2107       break;
2108
2109     case GIMPLE_CALL:
2110       *def = gimple_call_lhs (*def_stmt);
2111       if (*def != NULL)
2112         break;
2113       /* FALLTHRU */
2114     default:
2115       if (vect_print_dump_info (REPORT_DETAILS))
2116         fprintf (vect_dump, "unsupported defining stmt: ");
2117       return false;
2118     }
2119
2120   return true;
2121 }
2122
2123
2124 /* Function supportable_widening_operation
2125
2126    Check whether an operation represented by the code CODE is a
2127    widening operation that is supported by the target platform in
2128    vector form (i.e., when operating on arguments of type VECTYPE).
2129
2130    Widening operations we currently support are NOP (CONVERT), FLOAT
2131    and WIDEN_MULT.  This function checks if these operations are supported
2132    by the target platform either directly (via vector tree-codes), or via
2133    target builtins.
2134
2135    Output:
2136    - CODE1 and CODE2 are codes of vector operations to be used when
2137    vectorizing the operation, if available.
2138    - DECL1 and DECL2 are decls of target builtin functions to be used
2139    when vectorizing the operation, if available. In this case,
2140    CODE1 and CODE2 are CALL_EXPR.
2141    - MULTI_STEP_CVT determines the number of required intermediate steps in
2142    case of multi-step conversion (like char->short->int - in that case
2143    MULTI_STEP_CVT will be 1).
2144    - INTERM_TYPES contains the intermediate type required to perform the
2145    widening operation (short in the above example).  */
2146
2147 bool
2148 supportable_widening_operation (enum tree_code code, gimple stmt, tree vectype,
2149                                 tree *decl1, tree *decl2,
2150                                 enum tree_code *code1, enum tree_code *code2,
2151                                 int *multi_step_cvt,
2152                                 VEC (tree, heap) **interm_types)
2153 {
2154   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2155   loop_vec_info loop_info = STMT_VINFO_LOOP_VINFO (stmt_info);
2156   struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2157   bool ordered_p;
2158   enum machine_mode vec_mode;
2159   enum insn_code icode1 = 0, icode2 = 0;
2160   optab optab1, optab2;
2161   tree type = gimple_expr_type (stmt);
2162   tree wide_vectype = get_vectype_for_scalar_type (type);
2163   enum tree_code c1, c2;
2164
2165   /* The result of a vectorized widening operation usually requires two vectors
2166      (because the widened results do not fit int one vector). The generated
2167      vector results would normally be expected to be generated in the same
2168      order as in the original scalar computation, i.e. if 8 results are
2169      generated in each vector iteration, they are to be organized as follows:
2170         vect1: [res1,res2,res3,res4], vect2: [res5,res6,res7,res8].
2171
2172      However, in the special case that the result of the widening operation is
2173      used in a reduction computation only, the order doesn't matter (because
2174      when vectorizing a reduction we change the order of the computation).
2175      Some targets can take advantage of this and generate more efficient code.
2176      For example, targets like Altivec, that support widen_mult using a sequence
2177      of {mult_even,mult_odd} generate the following vectors:
2178         vect1: [res1,res3,res5,res7], vect2: [res2,res4,res6,res8].
2179
2180      When vectorizing outer-loops, we execute the inner-loop sequentially
2181      (each vectorized inner-loop iteration contributes to VF outer-loop
2182      iterations in parallel). We therefore don't allow to change the order
2183      of the computation in the inner-loop during outer-loop vectorization.  */
2184
2185    if (STMT_VINFO_RELEVANT (stmt_info) == vect_used_by_reduction
2186        && !nested_in_vect_loop_p (vect_loop, stmt))
2187      ordered_p = false;
2188    else
2189      ordered_p = true;
2190
2191   if (!ordered_p
2192       && code == WIDEN_MULT_EXPR
2193       && targetm.vectorize.builtin_mul_widen_even
2194       && targetm.vectorize.builtin_mul_widen_even (vectype)
2195       && targetm.vectorize.builtin_mul_widen_odd
2196       && targetm.vectorize.builtin_mul_widen_odd (vectype))
2197     {
2198       if (vect_print_dump_info (REPORT_DETAILS))
2199         fprintf (vect_dump, "Unordered widening operation detected.");
2200
2201       *code1 = *code2 = CALL_EXPR;
2202       *decl1 = targetm.vectorize.builtin_mul_widen_even (vectype);
2203       *decl2 = targetm.vectorize.builtin_mul_widen_odd (vectype);
2204       return true;
2205     }
2206
2207   switch (code)
2208     {
2209     case WIDEN_MULT_EXPR:
2210       if (BYTES_BIG_ENDIAN)
2211         {
2212           c1 = VEC_WIDEN_MULT_HI_EXPR;
2213           c2 = VEC_WIDEN_MULT_LO_EXPR;
2214         }
2215       else
2216         {
2217           c2 = VEC_WIDEN_MULT_HI_EXPR;
2218           c1 = VEC_WIDEN_MULT_LO_EXPR;
2219         }
2220       break;
2221
2222     CASE_CONVERT:
2223       if (BYTES_BIG_ENDIAN)
2224         {
2225           c1 = VEC_UNPACK_HI_EXPR;
2226           c2 = VEC_UNPACK_LO_EXPR;
2227         }
2228       else
2229         {
2230           c2 = VEC_UNPACK_HI_EXPR;
2231           c1 = VEC_UNPACK_LO_EXPR;
2232         }
2233       break;
2234
2235     case FLOAT_EXPR:
2236       if (BYTES_BIG_ENDIAN)
2237         {
2238           c1 = VEC_UNPACK_FLOAT_HI_EXPR;
2239           c2 = VEC_UNPACK_FLOAT_LO_EXPR;
2240         }
2241       else
2242         {
2243           c2 = VEC_UNPACK_FLOAT_HI_EXPR;
2244           c1 = VEC_UNPACK_FLOAT_LO_EXPR;
2245         }
2246       break;
2247
2248     case FIX_TRUNC_EXPR:
2249       /* ??? Not yet implemented due to missing VEC_UNPACK_FIX_TRUNC_HI_EXPR/
2250          VEC_UNPACK_FIX_TRUNC_LO_EXPR tree codes and optabs used for
2251          computing the operation.  */
2252       return false;
2253
2254     default:
2255       gcc_unreachable ();
2256     }
2257
2258   if (code == FIX_TRUNC_EXPR)
2259     {
2260       /* The signedness is determined from output operand.  */
2261       optab1 = optab_for_tree_code (c1, type, optab_default);
2262       optab2 = optab_for_tree_code (c2, type, optab_default);
2263     }
2264   else
2265     {
2266       optab1 = optab_for_tree_code (c1, vectype, optab_default);
2267       optab2 = optab_for_tree_code (c2, vectype, optab_default);
2268     }
2269
2270   if (!optab1 || !optab2)
2271     return false;
2272
2273   vec_mode = TYPE_MODE (vectype);
2274   if ((icode1 = optab_handler (optab1, vec_mode)->insn_code) == CODE_FOR_nothing
2275        || (icode2 = optab_handler (optab2, vec_mode)->insn_code)
2276                                                        == CODE_FOR_nothing)
2277     return false;
2278
2279   /* Check if it's a multi-step conversion that can be done using intermediate
2280      types.  */
2281   if (insn_data[icode1].operand[0].mode != TYPE_MODE (wide_vectype)
2282        || insn_data[icode2].operand[0].mode != TYPE_MODE (wide_vectype))
2283     {
2284       int i;
2285       tree prev_type = vectype, intermediate_type;
2286       enum machine_mode intermediate_mode, prev_mode = vec_mode;
2287       optab optab3, optab4;
2288
2289       if (!CONVERT_EXPR_CODE_P (code))
2290         return false;
2291
2292       *code1 = c1;
2293       *code2 = c2;
2294
2295       /* We assume here that there will not be more than MAX_INTERM_CVT_STEPS
2296          intermediate  steps in promotion sequence. We try MAX_INTERM_CVT_STEPS
2297          to get to NARROW_VECTYPE, and fail if we do not.  */
2298       *interm_types = VEC_alloc (tree, heap, MAX_INTERM_CVT_STEPS);
2299       for (i = 0; i < 3; i++)
2300         {
2301           intermediate_mode = insn_data[icode1].operand[0].mode;
2302           intermediate_type = lang_hooks.types.type_for_mode (intermediate_mode,
2303                                                      TYPE_UNSIGNED (prev_type));
2304           optab3 = optab_for_tree_code (c1, intermediate_type, optab_default);
2305           optab4 = optab_for_tree_code (c2, intermediate_type, optab_default);
2306
2307           if (!optab3 || !optab4
2308               || (icode1 = optab1->handlers[(int) prev_mode].insn_code)
2309                                                         == CODE_FOR_nothing
2310               || insn_data[icode1].operand[0].mode != intermediate_mode
2311               || (icode2 = optab2->handlers[(int) prev_mode].insn_code)
2312                                                         == CODE_FOR_nothing
2313               || insn_data[icode2].operand[0].mode != intermediate_mode
2314               || (icode1 = optab3->handlers[(int) intermediate_mode].insn_code)
2315                                                         == CODE_FOR_nothing
2316               || (icode2 = optab4->handlers[(int) intermediate_mode].insn_code)
2317                                                         == CODE_FOR_nothing)
2318             return false;
2319
2320           VEC_quick_push (tree, *interm_types, intermediate_type);
2321           (*multi_step_cvt)++;
2322
2323           if (insn_data[icode1].operand[0].mode == TYPE_MODE (wide_vectype)
2324               && insn_data[icode2].operand[0].mode == TYPE_MODE (wide_vectype))
2325             return true;
2326
2327           prev_type = intermediate_type;
2328           prev_mode = intermediate_mode;
2329         }
2330
2331        return false;
2332     }
2333
2334   *code1 = c1;
2335   *code2 = c2;
2336   return true;
2337 }
2338
2339
2340 /* Function supportable_narrowing_operation
2341
2342    Check whether an operation represented by the code CODE is a
2343    narrowing operation that is supported by the target platform in
2344    vector form (i.e., when operating on arguments of type VECTYPE).
2345
2346    Narrowing operations we currently support are NOP (CONVERT) and
2347    FIX_TRUNC. This function checks if these operations are supported by
2348    the target platform directly via vector tree-codes.
2349
2350    Output:
2351    - CODE1 is the code of a vector operation to be used when
2352    vectorizing the operation, if available.
2353    - MULTI_STEP_CVT determines the number of required intermediate steps in
2354    case of multi-step conversion (like int->short->char - in that case
2355    MULTI_STEP_CVT will be 1).
2356    - INTERM_TYPES contains the intermediate type required to perform the
2357    narrowing operation (short in the above example).   */
2358
2359 bool
2360 supportable_narrowing_operation (enum tree_code code,
2361                                  const_gimple stmt, tree vectype,
2362                                  enum tree_code *code1, int *multi_step_cvt,
2363                                  VEC (tree, heap) **interm_types)
2364 {
2365   enum machine_mode vec_mode;
2366   enum insn_code icode1;
2367   optab optab1, interm_optab;
2368   tree type = gimple_expr_type (stmt);
2369   tree narrow_vectype = get_vectype_for_scalar_type (type);
2370   enum tree_code c1;
2371   tree intermediate_type, prev_type;
2372   int i;
2373
2374   switch (code)
2375     {
2376     CASE_CONVERT:
2377       c1 = VEC_PACK_TRUNC_EXPR;
2378       break;
2379
2380     case FIX_TRUNC_EXPR:
2381       c1 = VEC_PACK_FIX_TRUNC_EXPR;
2382       break;
2383
2384     case FLOAT_EXPR:
2385       /* ??? Not yet implemented due to missing VEC_PACK_FLOAT_EXPR
2386          tree code and optabs used for computing the operation.  */
2387       return false;
2388
2389     default:
2390       gcc_unreachable ();
2391     }
2392
2393   if (code == FIX_TRUNC_EXPR)
2394     /* The signedness is determined from output operand.  */
2395     optab1 = optab_for_tree_code (c1, type, optab_default);
2396   else
2397     optab1 = optab_for_tree_code (c1, vectype, optab_default);
2398
2399   if (!optab1)
2400     return false;
2401
2402   vec_mode = TYPE_MODE (vectype);
2403   if ((icode1 = optab_handler (optab1, vec_mode)->insn_code)
2404        == CODE_FOR_nothing)
2405     return false;
2406
2407   /* Check if it's a multi-step conversion that can be done using intermediate
2408      types.  */
2409   if (insn_data[icode1].operand[0].mode != TYPE_MODE (narrow_vectype))
2410     {
2411       enum machine_mode intermediate_mode, prev_mode = vec_mode;
2412
2413       *code1 = c1;
2414       prev_type = vectype;
2415       /* We assume here that there will not be more than MAX_INTERM_CVT_STEPS
2416          intermediate  steps in promotion sequence. We try MAX_INTERM_CVT_STEPS
2417          to get to NARROW_VECTYPE, and fail if we do not.  */
2418       *interm_types = VEC_alloc (tree, heap, MAX_INTERM_CVT_STEPS);
2419       for (i = 0; i < 3; i++)
2420         {
2421           intermediate_mode = insn_data[icode1].operand[0].mode;
2422           intermediate_type = lang_hooks.types.type_for_mode (intermediate_mode,
2423                                                      TYPE_UNSIGNED (prev_type));
2424           interm_optab = optab_for_tree_code (c1, intermediate_type,
2425                                               optab_default);
2426           if (!interm_optab
2427               || (icode1 = optab1->handlers[(int) prev_mode].insn_code)
2428                                                         == CODE_FOR_nothing
2429               || insn_data[icode1].operand[0].mode != intermediate_mode
2430               || (icode1
2431                   = interm_optab->handlers[(int) intermediate_mode].insn_code)
2432                  == CODE_FOR_nothing)
2433             return false;
2434
2435           VEC_quick_push (tree, *interm_types, intermediate_type);
2436           (*multi_step_cvt)++;
2437
2438           if (insn_data[icode1].operand[0].mode == TYPE_MODE (narrow_vectype))
2439             return true;
2440
2441           prev_type = intermediate_type;
2442           prev_mode = intermediate_mode;
2443         }
2444
2445       return false;
2446     }
2447
2448   *code1 = c1;
2449   return true;
2450 }
2451
2452
2453 /* Function reduction_code_for_scalar_code
2454
2455    Input:
2456    CODE - tree_code of a reduction operations.
2457
2458    Output:
2459    REDUC_CODE - the corresponding tree-code to be used to reduce the
2460       vector of partial results into a single scalar result (which
2461       will also reside in a vector).
2462
2463    Return TRUE if a corresponding REDUC_CODE was found, FALSE otherwise.  */
2464
2465 bool
2466 reduction_code_for_scalar_code (enum tree_code code,
2467                                 enum tree_code *reduc_code)
2468 {
2469   switch (code)
2470   {
2471   case MAX_EXPR:
2472     *reduc_code = REDUC_MAX_EXPR;
2473     return true;
2474
2475   case MIN_EXPR:
2476     *reduc_code = REDUC_MIN_EXPR;
2477     return true;
2478
2479   case PLUS_EXPR:
2480     *reduc_code = REDUC_PLUS_EXPR;
2481     return true;
2482
2483   default:
2484     return false;
2485   }
2486 }
2487
2488 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
2489    STMT is printed with a message MSG. */
2490
2491 static void
2492 report_vect_op (gimple stmt, const char *msg)
2493 {
2494   fprintf (vect_dump, "%s", msg);
2495   print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
2496 }
2497
2498 /* Function vect_is_simple_reduction
2499
2500    Detect a cross-iteration def-use cycle that represents a simple
2501    reduction computation. We look for the following pattern:
2502
2503    loop_header:
2504      a1 = phi < a0, a2 >
2505      a3 = ...
2506      a2 = operation (a3, a1)
2507
2508    such that:
2509    1. operation is commutative and associative and it is safe to
2510       change the order of the computation.
2511    2. no uses for a2 in the loop (a2 is used out of the loop)
2512    3. no uses of a1 in the loop besides the reduction operation.
2513
2514    Condition 1 is tested here.
2515    Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.  */
2516
2517 gimple
2518 vect_is_simple_reduction (loop_vec_info loop_info, gimple phi)
2519 {
2520   struct loop *loop = (gimple_bb (phi))->loop_father;
2521   struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2522   edge latch_e = loop_latch_edge (loop);
2523   tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
2524   gimple def_stmt, def1, def2;
2525   enum tree_code code;
2526   tree op1, op2;
2527   tree type;
2528   int nloop_uses;
2529   tree name;
2530   imm_use_iterator imm_iter;
2531   use_operand_p use_p;
2532
2533   gcc_assert (loop == vect_loop || flow_loop_nested_p (vect_loop, loop));
2534
2535   name = PHI_RESULT (phi);
2536   nloop_uses = 0;
2537   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2538     {
2539       gimple use_stmt = USE_STMT (use_p);
2540       if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
2541           && vinfo_for_stmt (use_stmt)
2542           && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
2543         nloop_uses++;
2544       if (nloop_uses > 1)
2545         {
2546           if (vect_print_dump_info (REPORT_DETAILS))
2547             fprintf (vect_dump, "reduction used in loop.");
2548           return NULL;
2549         }
2550     }
2551
2552   if (TREE_CODE (loop_arg) != SSA_NAME)
2553     {
2554       if (vect_print_dump_info (REPORT_DETAILS))
2555         {
2556           fprintf (vect_dump, "reduction: not ssa_name: ");
2557           print_generic_expr (vect_dump, loop_arg, TDF_SLIM);
2558         }
2559       return NULL;
2560     }
2561
2562   def_stmt = SSA_NAME_DEF_STMT (loop_arg);
2563   if (!def_stmt)
2564     {
2565       if (vect_print_dump_info (REPORT_DETAILS))
2566         fprintf (vect_dump, "reduction: no def_stmt.");
2567       return NULL;
2568     }
2569
2570   if (!is_gimple_assign (def_stmt))
2571     {
2572       if (vect_print_dump_info (REPORT_DETAILS))
2573         print_gimple_stmt (vect_dump, def_stmt, 0, TDF_SLIM);
2574       return NULL;
2575     }
2576
2577   name = gimple_assign_lhs (def_stmt);
2578   nloop_uses = 0;
2579   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2580     {
2581       gimple use_stmt = USE_STMT (use_p);
2582       if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
2583           && vinfo_for_stmt (use_stmt)
2584           && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
2585         nloop_uses++;
2586       if (nloop_uses > 1)
2587         {
2588           if (vect_print_dump_info (REPORT_DETAILS))
2589             fprintf (vect_dump, "reduction used in loop.");
2590           return NULL;
2591         }
2592     }
2593
2594   code = gimple_assign_rhs_code (def_stmt);
2595
2596   if (!commutative_tree_code (code) || !associative_tree_code (code))
2597     {
2598       if (vect_print_dump_info (REPORT_DETAILS))
2599         report_vect_op (def_stmt, "reduction: not commutative/associative: ");
2600       return NULL;
2601     }
2602
2603   if (get_gimple_rhs_class (code) != GIMPLE_BINARY_RHS)
2604     {
2605       if (vect_print_dump_info (REPORT_DETAILS))
2606         report_vect_op (def_stmt, "reduction: not binary operation: ");
2607       return NULL;
2608     }
2609
2610   op1 = gimple_assign_rhs1 (def_stmt);
2611   op2 = gimple_assign_rhs2 (def_stmt);
2612   if (TREE_CODE (op1) != SSA_NAME || TREE_CODE (op2) != SSA_NAME)
2613     {
2614       if (vect_print_dump_info (REPORT_DETAILS))
2615         report_vect_op (def_stmt, "reduction: uses not ssa_names: ");
2616       return NULL;
2617     }
2618
2619   /* Check that it's ok to change the order of the computation.  */
2620   type = TREE_TYPE (gimple_assign_lhs (def_stmt));
2621   if (TYPE_MAIN_VARIANT (type) != TYPE_MAIN_VARIANT (TREE_TYPE (op1))
2622       || TYPE_MAIN_VARIANT (type) != TYPE_MAIN_VARIANT (TREE_TYPE (op2)))
2623     {
2624       if (vect_print_dump_info (REPORT_DETAILS))
2625         {
2626           fprintf (vect_dump, "reduction: multiple types: operation type: ");
2627           print_generic_expr (vect_dump, type, TDF_SLIM);
2628           fprintf (vect_dump, ", operands types: ");
2629           print_generic_expr (vect_dump, TREE_TYPE (op1), TDF_SLIM);
2630           fprintf (vect_dump, ",");
2631           print_generic_expr (vect_dump, TREE_TYPE (op2), TDF_SLIM);
2632         }
2633       return NULL;
2634     }
2635
2636   /* Generally, when vectorizing a reduction we change the order of the
2637      computation.  This may change the behavior of the program in some
2638      cases, so we need to check that this is ok.  One exception is when
2639      vectorizing an outer-loop: the inner-loop is executed sequentially,
2640      and therefore vectorizing reductions in the inner-loop during
2641      outer-loop vectorization is safe.  */
2642
2643   /* CHECKME: check for !flag_finite_math_only too?  */
2644   if (SCALAR_FLOAT_TYPE_P (type) && !flag_associative_math
2645       && !nested_in_vect_loop_p (vect_loop, def_stmt))
2646     {
2647       /* Changing the order of operations changes the semantics.  */
2648       if (vect_print_dump_info (REPORT_DETAILS))
2649         report_vect_op (def_stmt, "reduction: unsafe fp math optimization: ");
2650       return NULL;
2651     }
2652   else if (INTEGRAL_TYPE_P (type) && TYPE_OVERFLOW_TRAPS (type)
2653            && !nested_in_vect_loop_p (vect_loop, def_stmt))
2654     {
2655       /* Changing the order of operations changes the semantics.  */
2656       if (vect_print_dump_info (REPORT_DETAILS))
2657         report_vect_op (def_stmt, "reduction: unsafe int math optimization: ");
2658       return NULL;
2659     }
2660   else if (SAT_FIXED_POINT_TYPE_P (type))
2661     {
2662       /* Changing the order of operations changes the semantics.  */
2663       if (vect_print_dump_info (REPORT_DETAILS))
2664         report_vect_op (def_stmt,
2665                         "reduction: unsafe fixed-point math optimization: ");
2666       return NULL;
2667     }
2668
2669   /* reduction is safe. we're dealing with one of the following:
2670      1) integer arithmetic and no trapv
2671      2) floating point arithmetic, and special flags permit this optimization.
2672    */
2673   def1 = SSA_NAME_DEF_STMT (op1);
2674   def2 = SSA_NAME_DEF_STMT (op2);
2675   if (!def1 || !def2 || gimple_nop_p (def1) || gimple_nop_p (def2))
2676     {
2677       if (vect_print_dump_info (REPORT_DETAILS))
2678         report_vect_op (def_stmt, "reduction: no defs for operands: ");
2679       return NULL;
2680     }
2681
2682
2683   /* Check that one def is the reduction def, defined by PHI,
2684      the other def is either defined in the loop ("vect_loop_def"),
2685      or it's an induction (defined by a loop-header phi-node).  */
2686
2687   if (def2 == phi
2688       && flow_bb_inside_loop_p (loop, gimple_bb (def1))
2689       && (is_gimple_assign (def1)
2690           || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1)) == vect_induction_def
2691           || (gimple_code (def1) == GIMPLE_PHI
2692               && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1)) == vect_loop_def
2693               && !is_loop_header_bb_p (gimple_bb (def1)))))
2694     {
2695       if (vect_print_dump_info (REPORT_DETAILS))
2696         report_vect_op (def_stmt, "detected reduction:");
2697       return def_stmt;
2698     }
2699   else if (def1 == phi
2700            && flow_bb_inside_loop_p (loop, gimple_bb (def2))
2701            && (is_gimple_assign (def2)
2702                || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2)) == vect_induction_def
2703                || (gimple_code (def2) == GIMPLE_PHI
2704                    && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2)) == vect_loop_def
2705                    && !is_loop_header_bb_p (gimple_bb (def2)))))
2706     {
2707       /* Swap operands (just for simplicity - so that the rest of the code
2708          can assume that the reduction variable is always the last (second)
2709          argument).  */
2710       if (vect_print_dump_info (REPORT_DETAILS))
2711         report_vect_op (def_stmt ,
2712                         "detected reduction: need to swap operands:");
2713       swap_tree_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
2714                           gimple_assign_rhs2_ptr (def_stmt));
2715       return def_stmt;
2716     }
2717   else
2718     {
2719       if (vect_print_dump_info (REPORT_DETAILS))
2720         report_vect_op (def_stmt, "reduction: unknown pattern.");
2721       return NULL;
2722     }
2723 }
2724
2725
2726 /* Function vect_is_simple_iv_evolution.
2727
2728    FORNOW: A simple evolution of an induction variables in the loop is
2729    considered a polynomial evolution with constant step.  */
2730
2731 bool
2732 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
2733                              tree * step)
2734 {
2735   tree init_expr;
2736   tree step_expr;
2737   tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
2738
2739   /* When there is no evolution in this loop, the evolution function
2740      is not "simple".  */
2741   if (evolution_part == NULL_TREE)
2742     return false;
2743
2744   /* When the evolution is a polynomial of degree >= 2
2745      the evolution function is not "simple".  */
2746   if (tree_is_chrec (evolution_part))
2747     return false;
2748
2749   step_expr = evolution_part;
2750   init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
2751
2752   if (vect_print_dump_info (REPORT_DETAILS))
2753     {
2754       fprintf (vect_dump, "step: ");
2755       print_generic_expr (vect_dump, step_expr, TDF_SLIM);
2756       fprintf (vect_dump, ",  init: ");
2757       print_generic_expr (vect_dump, init_expr, TDF_SLIM);
2758     }
2759
2760   *init = init_expr;
2761   *step = step_expr;
2762
2763   if (TREE_CODE (step_expr) != INTEGER_CST)
2764     {
2765       if (vect_print_dump_info (REPORT_DETAILS))
2766         fprintf (vect_dump, "step unknown.");
2767       return false;
2768     }
2769
2770   return true;
2771 }
2772
2773
2774 /* Function vectorize_loops.
2775
2776    Entry Point to loop vectorization phase.  */
2777
2778 unsigned
2779 vectorize_loops (void)
2780 {
2781   unsigned int i;
2782   unsigned int num_vectorized_loops = 0;
2783   unsigned int vect_loops_num;
2784   loop_iterator li;
2785   struct loop *loop;
2786
2787   vect_loops_num = number_of_loops ();
2788
2789   /* Bail out if there are no loops.  */
2790   if (vect_loops_num <= 1)
2791     return 0;
2792
2793   /* Fix the verbosity level if not defined explicitly by the user.  */
2794   vect_set_dump_settings ();
2795
2796   /* Allocate the bitmap that records which virtual variables that
2797      need to be renamed.  */
2798   vect_memsyms_to_rename = BITMAP_ALLOC (NULL);
2799
2800   init_stmt_vec_info_vec ();
2801
2802   /*  ----------- Analyze loops. -----------  */
2803
2804   /* If some loop was duplicated, it gets bigger number
2805      than all previously defined loops. This fact allows us to run
2806      only over initial loops skipping newly generated ones.  */
2807   FOR_EACH_LOOP (li, loop, 0)
2808     {
2809       loop_vec_info loop_vinfo;
2810
2811       vect_loop_location = find_loop_location (loop);
2812       loop_vinfo = vect_analyze_loop (loop);
2813       loop->aux = loop_vinfo;
2814
2815       if (!loop_vinfo || !LOOP_VINFO_VECTORIZABLE_P (loop_vinfo))
2816         continue;
2817
2818       vect_transform_loop (loop_vinfo);
2819       num_vectorized_loops++;
2820     }
2821   vect_loop_location = UNKNOWN_LOC;
2822
2823   statistics_counter_event (cfun, "Vectorized loops", num_vectorized_loops);
2824   if (vect_print_dump_info (REPORT_UNVECTORIZED_LOOPS)
2825       || (vect_print_dump_info (REPORT_VECTORIZED_LOOPS)
2826           && num_vectorized_loops > 0))
2827     fprintf (vect_dump, "vectorized %u loops in function.\n",
2828              num_vectorized_loops);
2829
2830   /*  ----------- Finalize. -----------  */
2831
2832   BITMAP_FREE (vect_memsyms_to_rename);
2833
2834   for (i = 1; i < vect_loops_num; i++)
2835     {
2836       loop_vec_info loop_vinfo;
2837
2838       loop = get_loop (i);
2839       if (!loop)
2840         continue;
2841       loop_vinfo = (loop_vec_info) loop->aux;
2842       destroy_loop_vec_info (loop_vinfo, true);
2843       loop->aux = NULL;
2844     }
2845
2846   free_stmt_vec_info_vec ();
2847
2848   return num_vectorized_loops > 0 ? TODO_cleanup_cfg : 0;
2849 }
2850
2851 /* Increase alignment of global arrays to improve vectorization potential.
2852    TODO:
2853    - Consider also structs that have an array field.
2854    - Use ipa analysis to prune arrays that can't be vectorized?
2855      This should involve global alignment analysis and in the future also
2856      array padding.  */
2857
2858 static unsigned int
2859 increase_alignment (void)
2860 {
2861   struct varpool_node *vnode;
2862
2863   /* Increase the alignment of all global arrays for vectorization.  */
2864   for (vnode = varpool_nodes_queue;
2865        vnode;
2866        vnode = vnode->next_needed)
2867     {
2868       tree vectype, decl = vnode->decl;
2869       unsigned int alignment;
2870
2871       if (TREE_CODE (TREE_TYPE (decl)) != ARRAY_TYPE)
2872         continue;
2873       vectype = get_vectype_for_scalar_type (TREE_TYPE (TREE_TYPE (decl)));
2874       if (!vectype)
2875         continue;
2876       alignment = TYPE_ALIGN (vectype);
2877       if (DECL_ALIGN (decl) >= alignment)
2878         continue;
2879
2880       if (vect_can_force_dr_alignment_p (decl, alignment))
2881         {
2882           DECL_ALIGN (decl) = TYPE_ALIGN (vectype);
2883           DECL_USER_ALIGN (decl) = 1;
2884           if (dump_file)
2885             {
2886               fprintf (dump_file, "Increasing alignment of decl: ");
2887               print_generic_expr (dump_file, decl, TDF_SLIM);
2888             }
2889         }
2890     }
2891   return 0;
2892 }
2893
2894 static bool
2895 gate_increase_alignment (void)
2896 {
2897   return flag_section_anchors && flag_tree_vectorize;
2898 }
2899
2900 struct simple_ipa_opt_pass pass_ipa_increase_alignment =
2901 {
2902  {
2903   SIMPLE_IPA_PASS,
2904   "increase_alignment",                 /* name */
2905   gate_increase_alignment,              /* gate */
2906   increase_alignment,                   /* execute */
2907   NULL,                                 /* sub */
2908   NULL,                                 /* next */
2909   0,                                    /* static_pass_number */
2910   0,                                    /* tv_id */
2911   0,                                    /* properties_required */
2912   0,                                    /* properties_provided */
2913   0,                                    /* properties_destroyed */
2914   0,                                    /* todo_flags_start */
2915   0                                     /* todo_flags_finish */
2916  }
2917 };