gcc/tree-vect-loop.c

   1 /* Loop Vectorization
   2    Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010
   3    Free Software Foundation, Inc.
   4    Contributed by Dorit Naishlos <dorit@il.ibm.com> and
   5    Ira Rosen <irar@il.ibm.com>
   6
   7 This file is part of GCC.
   8
   9 GCC is free software; you can redistribute it and/or modify it under
  10 the terms of the GNU General Public License as published by the Free
  11 Software Foundation; either version 3, or (at your option) any later
  12 version.
  13
  14 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  15 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  16 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  17 for more details.
  18
  19 You should have received a copy of the GNU General Public License
  20 along with GCC; see the file COPYING3.  If not see
  21 <http://www.gnu.org/licenses/>.  */
  22
  23 #include "config.h"
  24 #include "system.h"
  25 #include "coretypes.h"
  26 #include "tm.h"
  27 #include "ggc.h"
  28 #include "tree.h"
  29 #include "basic-block.h"
  30 #include "tree-pretty-print.h"
  31 #include "gimple-pretty-print.h"
  32 #include "tree-flow.h"
  33 #include "tree-dump.h"
  34 #include "cfgloop.h"
  35 #include "cfglayout.h"
  36 #include "expr.h"
  37 #include "recog.h"
  38 #include "optabs.h"
  39 #include "params.h"
  40 #include "toplev.h"
  41 #include "tree-chrec.h"
  42 #include "tree-scalar-evolution.h"
  43 #include "tree-vectorizer.h"
  44 #include "target.h"
  45
  46 /* Loop Vectorization Pass.
  47
  48    This pass tries to vectorize loops.
  49
  50    For example, the vectorizer transforms the following simple loop:
  51
  52         short a[N]; short b[N]; short c[N]; int i;
  53
  54         for (i=0; i<N; i++){
  55           a[i] = b[i] + c[i];
  56         }
  57
  58    as if it was manually vectorized by rewriting the source code into:
  59
  60         typedef int __attribute__((mode(V8HI))) v8hi;
  61         short a[N];  short b[N]; short c[N];   int i;
  62         v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
  63         v8hi va, vb, vc;
  64
  65         for (i=0; i<N/8; i++){
  66           vb = pb[i];
  67           vc = pc[i];
  68           va = vb + vc;
  69           pa[i] = va;
  70         }
  71
  72         The main entry to this pass is vectorize_loops(), in which
  73    the vectorizer applies a set of analyses on a given set of loops,
  74    followed by the actual vectorization transformation for the loops that
  75    had successfully passed the analysis phase.
  76         Throughout this pass we make a distinction between two types of
  77    data: scalars (which are represented by SSA_NAMES), and memory references
  78    ("data-refs"). These two types of data require different handling both
  79    during analysis and transformation. The types of data-refs that the
  80    vectorizer currently supports are ARRAY_REFS which base is an array DECL
  81    (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
  82    accesses are required to have a simple (consecutive) access pattern.
  83
  84    Analysis phase:
  85    ===============
  86         The driver for the analysis phase is vect_analyze_loop().
  87    It applies a set of analyses, some of which rely on the scalar evolution
  88    analyzer (scev) developed by Sebastian Pop.
  89
  90         During the analysis phase the vectorizer records some information
  91    per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
  92    loop, as well as general information about the loop as a whole, which is
  93    recorded in a "loop_vec_info" struct attached to each loop.
  94
  95    Transformation phase:
  96    =====================
  97         The loop transformation phase scans all the stmts in the loop, and
  98    creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
  99    the loop that needs to be vectorized. It inserts the vector code sequence
 100    just before the scalar stmt S, and records a pointer to the vector code
 101    in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
 102    attached to S). This pointer will be used for the vectorization of following
 103    stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
 104    otherwise, we rely on dead code elimination for removing it.
 105
 106         For example, say stmt S1 was vectorized into stmt VS1:
 107
 108    VS1: vb = px[i];
 109    S1:  b = x[i];    STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
 110    S2:  a = b;
 111
 112    To vectorize stmt S2, the vectorizer first finds the stmt that defines
 113    the operand 'b' (S1), and gets the relevant vector def 'vb' from the
 114    vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
 115    resulting sequence would be:
 116
 117    VS1: vb = px[i];
 118    S1:  b = x[i];       STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
 119    VS2: va = vb;
 120    S2:  a = b;          STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
 121
 122         Operands that are not SSA_NAMEs, are data-refs that appear in
 123    load/store operations (like 'x[i]' in S1), and are handled differently.
 124
 125    Target modeling:
 126    =================
 127         Currently the only target specific information that is used is the
 128    size of the vector (in bytes) - "UNITS_PER_SIMD_WORD". Targets that can
 129    support different sizes of vectors, for now will need to specify one value
 130    for "UNITS_PER_SIMD_WORD". More flexibility will be added in the future.
 131
 132         Since we only vectorize operations which vector form can be
 133    expressed using existing tree codes, to verify that an operation is
 134    supported, the vectorizer checks the relevant optab at the relevant
 135    machine_mode (e.g, optab_handler (add_optab, V8HImode)). If
 136    the value found is CODE_FOR_nothing, then there's no target support, and
 137    we can't vectorize the stmt.
 138
 139    For additional information on this project see:
 140    http://gcc.gnu.org/projects/tree-ssa/vectorization.html
 141 */
 142
 143 /* Function vect_determine_vectorization_factor
 144
 145    Determine the vectorization factor (VF). VF is the number of data elements
 146    that are operated upon in parallel in a single iteration of the vectorized
 147    loop. For example, when vectorizing a loop that operates on 4byte elements,
 148    on a target with vector size (VS) 16byte, the VF is set to 4, since 4
 149    elements can fit in a single vector register.
 150
 151    We currently support vectorization of loops in which all types operated upon
 152    are of the same size. Therefore this function currently sets VF according to
 153    the size of the types operated upon, and fails if there are multiple sizes
 154    in the loop.
 155
 156    VF is also the factor by which the loop iterations are strip-mined, e.g.:
 157    original loop:
 158         for (i=0; i<N; i++){
 159           a[i] = b[i] + c[i];
 160         }
 161
 162    vectorized loop:
 163         for (i=0; i<N; i+=VF){
 164           a[i:VF] = b[i:VF] + c[i:VF];
 165         }
 166 */
 167
 168 static bool
 169 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
 170 {
 171   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 172   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
 173   int nbbs = loop->num_nodes;
 174   gimple_stmt_iterator si;
 175   unsigned int vectorization_factor = 0;
 176   tree scalar_type;
 177   gimple phi;
 178   tree vectype;
 179   unsigned int nunits;
 180   stmt_vec_info stmt_info;
 181   int i;
 182   HOST_WIDE_INT dummy;
 183
 184   if (vect_print_dump_info (REPORT_DETAILS))
 185     fprintf (vect_dump, "=== vect_determine_vectorization_factor ===");
 186
 187   for (i = 0; i < nbbs; i++)
 188     {
 189       basic_block bb = bbs[i];
 190
 191       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
 192         {
 193           phi = gsi_stmt (si);
 194           stmt_info = vinfo_for_stmt (phi);
 195           if (vect_print_dump_info (REPORT_DETAILS))
 196             {
 197               fprintf (vect_dump, "==> examining phi: ");
 198               print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
 199             }
 200
 201           gcc_assert (stmt_info);
 202
 203           if (STMT_VINFO_RELEVANT_P (stmt_info))
 204             {
 205               gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
 206               scalar_type = TREE_TYPE (PHI_RESULT (phi));
 207
 208               if (vect_print_dump_info (REPORT_DETAILS))
 209                 {
 210                   fprintf (vect_dump, "get vectype for scalar type:  ");
 211                   print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
 212                 }
 213
 214               vectype = get_vectype_for_scalar_type (scalar_type);
 215               if (!vectype)
 216                 {
 217                   if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
 218                     {
 219                       fprintf (vect_dump,
 220                                "not vectorized: unsupported data-type ");
 221                       print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
 222                     }
 223                   return false;
 224                 }
 225               STMT_VINFO_VECTYPE (stmt_info) = vectype;
 226
 227               if (vect_print_dump_info (REPORT_DETAILS))
 228                 {
 229                   fprintf (vect_dump, "vectype: ");
 230                   print_generic_expr (vect_dump, vectype, TDF_SLIM);
 231                 }
 232
 233               nunits = TYPE_VECTOR_SUBPARTS (vectype);
 234               if (vect_print_dump_info (REPORT_DETAILS))
 235                 fprintf (vect_dump, "nunits = %d", nunits);
 236
 237               if (!vectorization_factor
 238                   || (nunits > vectorization_factor))
 239                 vectorization_factor = nunits;
 240             }
 241         }
 242
 243       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
 244         {
 245           tree vf_vectype;
 246           gimple stmt = gsi_stmt (si);
 247           stmt_info = vinfo_for_stmt (stmt);
 248
 249           if (vect_print_dump_info (REPORT_DETAILS))
 250             {
 251               fprintf (vect_dump, "==> examining statement: ");
 252               print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
 253             }
 254
 255           gcc_assert (stmt_info);
 256
 257           /* skip stmts which do not need to be vectorized.  */
 258           if (!STMT_VINFO_RELEVANT_P (stmt_info)
 259               && !STMT_VINFO_LIVE_P (stmt_info))
 260             {
 261               if (vect_print_dump_info (REPORT_DETAILS))
 262                 fprintf (vect_dump, "skip.");
 263               continue;
 264             }
 265
 266           if (gimple_get_lhs (stmt) == NULL_TREE)
 267             {
 268               if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
 269                 {
 270                   fprintf (vect_dump, "not vectorized: irregular stmt.");
 271                   print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
 272                 }
 273               return false;
 274             }
 275
 276           if (VECTOR_MODE_P (TYPE_MODE (gimple_expr_type (stmt))))
 277             {
 278               if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
 279                 {
 280                   fprintf (vect_dump, "not vectorized: vector stmt in loop:");
 281                   print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
 282                 }
 283               return false;
 284             }
 285
 286           if (STMT_VINFO_VECTYPE (stmt_info))
 287             {
 288               /* The only case when a vectype had been already set is for stmts
 289                  that contain a dataref, or for "pattern-stmts" (stmts generated
 290                  by the vectorizer to represent/replace a certain idiom).  */
 291               gcc_assert (STMT_VINFO_DATA_REF (stmt_info)
 292                           || is_pattern_stmt_p (stmt_info));
 293               vectype = STMT_VINFO_VECTYPE (stmt_info);
 294             }
 295           else
 296             {
 297               gcc_assert (!STMT_VINFO_DATA_REF (stmt_info)
 298                           && !is_pattern_stmt_p (stmt_info));
 299
 300               scalar_type = TREE_TYPE (gimple_get_lhs (stmt));
 301               if (vect_print_dump_info (REPORT_DETAILS))
 302                 {
 303                   fprintf (vect_dump, "get vectype for scalar type:  ");
 304                   print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
 305                 }
 306               vectype = get_vectype_for_scalar_type (scalar_type);
 307               if (!vectype)
 308                 {
 309                   if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
 310                     {
 311                       fprintf (vect_dump,
 312                                "not vectorized: unsupported data-type ");
 313                       print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
 314                     }
 315                   return false;
 316                 }
 317
 318               STMT_VINFO_VECTYPE (stmt_info) = vectype;
 319             }
 320
 321           /* The vectorization factor is according to the smallest
 322              scalar type (or the largest vector size, but we only
 323              support one vector size per loop).  */
 324           scalar_type = vect_get_smallest_scalar_type (stmt, &dummy,
 325                                                        &dummy);
 326           if (vect_print_dump_info (REPORT_DETAILS))
 327             {
 328               fprintf (vect_dump, "get vectype for scalar type:  ");
 329               print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
 330             }
 331           vf_vectype = get_vectype_for_scalar_type (scalar_type);
 332           if (!vf_vectype)
 333             {
 334               if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
 335                 {
 336                   fprintf (vect_dump,
 337                            "not vectorized: unsupported data-type ");
 338                   print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
 339                 }
 340               return false;
 341             }
 342
 343           if ((GET_MODE_SIZE (TYPE_MODE (vectype))
 344                != GET_MODE_SIZE (TYPE_MODE (vf_vectype))))
 345             {
 346               if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
 347                 {
 348                   fprintf (vect_dump,
 349                            "not vectorized: different sized vector "
 350                            "types in statement, ");
 351                   print_generic_expr (vect_dump, vectype, TDF_SLIM);
 352                   fprintf (vect_dump, " and ");
 353                   print_generic_expr (vect_dump, vf_vectype, TDF_SLIM);
 354                 }
 355               return false;
 356             }
 357
 358           if (vect_print_dump_info (REPORT_DETAILS))
 359             {
 360               fprintf (vect_dump, "vectype: ");
 361               print_generic_expr (vect_dump, vf_vectype, TDF_SLIM);
 362             }
 363
 364           nunits = TYPE_VECTOR_SUBPARTS (vf_vectype);
 365           if (vect_print_dump_info (REPORT_DETAILS))
 366             fprintf (vect_dump, "nunits = %d", nunits);
 367
 368           if (!vectorization_factor
 369               || (nunits > vectorization_factor))
 370             vectorization_factor = nunits;
 371         }
 372     }
 373
 374   /* TODO: Analyze cost. Decide if worth while to vectorize.  */
 375   if (vect_print_dump_info (REPORT_DETAILS))
 376     fprintf (vect_dump, "vectorization factor = %d", vectorization_factor);
 377   if (vectorization_factor <= 1)
 378     {
 379       if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
 380         fprintf (vect_dump, "not vectorized: unsupported data-type");
 381       return false;
 382     }
 383   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
 384
 385   return true;
 386 }
 387
 388
 389 /* Function vect_is_simple_iv_evolution.
 390
 391    FORNOW: A simple evolution of an induction variables in the loop is
 392    considered a polynomial evolution with constant step.  */
 393
 394 static bool
 395 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
 396                              tree * step)
 397 {
 398   tree init_expr;
 399   tree step_expr;
 400   tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
 401
 402   /* When there is no evolution in this loop, the evolution function
 403      is not "simple".  */
 404   if (evolution_part == NULL_TREE)
 405     return false;
 406
 407   /* When the evolution is a polynomial of degree >= 2
 408      the evolution function is not "simple".  */
 409   if (tree_is_chrec (evolution_part))
 410     return false;
 411
 412   step_expr = evolution_part;
 413   init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
 414
 415   if (vect_print_dump_info (REPORT_DETAILS))
 416     {
 417       fprintf (vect_dump, "step: ");
 418       print_generic_expr (vect_dump, step_expr, TDF_SLIM);
 419       fprintf (vect_dump, ",  init: ");
 420       print_generic_expr (vect_dump, init_expr, TDF_SLIM);
 421     }
 422
 423   *init = init_expr;
 424   *step = step_expr;
 425
 426   if (TREE_CODE (step_expr) != INTEGER_CST)
 427     {
 428       if (vect_print_dump_info (REPORT_DETAILS))
 429         fprintf (vect_dump, "step unknown.");
 430       return false;
 431     }
 432
 433   return true;
 434 }
 435
 436 /* Function vect_analyze_scalar_cycles_1.
 437
 438    Examine the cross iteration def-use cycles of scalar variables
 439    in LOOP. LOOP_VINFO represents the loop that is now being
 440    considered for vectorization (can be LOOP, or an outer-loop
 441    enclosing LOOP).  */
 442
 443 static void
 444 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
 445 {
 446   basic_block bb = loop->header;
 447   tree dumy;
 448   VEC(gimple,heap) *worklist = VEC_alloc (gimple, heap, 64);
 449   gimple_stmt_iterator gsi;
 450   bool double_reduc;
 451
 452   if (vect_print_dump_info (REPORT_DETAILS))
 453     fprintf (vect_dump, "=== vect_analyze_scalar_cycles ===");
 454
 455   /* First - identify all inductions. Reduction detection assumes that all the
 456      inductions have been identified, therefore, this order must not be
 457      changed.  */
 458   for (gsi = gsi_start_phis  (bb); !gsi_end_p (gsi); gsi_next (&gsi))
 459     {
 460       gimple phi = gsi_stmt (gsi);
 461       tree access_fn = NULL;
 462       tree def = PHI_RESULT (phi);
 463       stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
 464
 465       if (vect_print_dump_info (REPORT_DETAILS))
 466         {
 467           fprintf (vect_dump, "Analyze phi: ");
 468           print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
 469         }
 470
 471       /* Skip virtual phi's. The data dependences that are associated with
 472          virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.  */
 473       if (!is_gimple_reg (SSA_NAME_VAR (def)))
 474         continue;
 475
 476       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
 477
 478       /* Analyze the evolution function.  */
 479       access_fn = analyze_scalar_evolution (loop, def);
 480       if (access_fn && vect_print_dump_info (REPORT_DETAILS))
 481         {
 482           fprintf (vect_dump, "Access function of PHI: ");
 483           print_generic_expr (vect_dump, access_fn, TDF_SLIM);
 484         }
 485
 486       if (!access_fn
 487           || !vect_is_simple_iv_evolution (loop->num, access_fn, &dumy, &dumy))
 488         {
 489           VEC_safe_push (gimple, heap, worklist, phi);
 490           continue;
 491         }
 492
 493       if (vect_print_dump_info (REPORT_DETAILS))
 494         fprintf (vect_dump, "Detected induction.");
 495       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
 496     }
 497
 498
 499   /* Second - identify all reductions and nested cycles.  */
 500   while (VEC_length (gimple, worklist) > 0)
 501     {
 502       gimple phi = VEC_pop (gimple, worklist);
 503       tree def = PHI_RESULT (phi);
 504       stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
 505       gimple reduc_stmt;
 506       bool nested_cycle;
 507
 508       if (vect_print_dump_info (REPORT_DETAILS))
 509         {
 510           fprintf (vect_dump, "Analyze phi: ");
 511           print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
 512         }
 513
 514       gcc_assert (is_gimple_reg (SSA_NAME_VAR (def)));
 515       gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
 516
 517       nested_cycle = (loop != LOOP_VINFO_LOOP (loop_vinfo));
 518       reduc_stmt = vect_force_simple_reduction (loop_vinfo, phi, !nested_cycle,
 519                                                 &double_reduc);
 520       if (reduc_stmt)
 521         {
 522           if (double_reduc)
 523             {
 524               if (vect_print_dump_info (REPORT_DETAILS))
 525                 fprintf (vect_dump, "Detected double reduction.");
 526
 527               STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
 528               STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 529                                                     vect_double_reduction_def;
 530             }
 531           else
 532             {
 533               if (nested_cycle)
 534                 {
 535                   if (vect_print_dump_info (REPORT_DETAILS))
 536                     fprintf (vect_dump, "Detected vectorizable nested cycle.");
 537
 538                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
 539                   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 540                                                              vect_nested_cycle;
 541                 }
 542               else
 543                 {
 544                   if (vect_print_dump_info (REPORT_DETAILS))
 545                     fprintf (vect_dump, "Detected reduction.");
 546
 547                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
 548                   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 549                                                            vect_reduction_def;
 550                   /* Store the reduction cycles for possible vectorization in
 551                      loop-aware SLP.  */
 552                   VEC_safe_push (gimple, heap,
 553                                  LOOP_VINFO_REDUCTIONS (loop_vinfo),
 554                                  reduc_stmt);
 555                 }
 556             }
 557         }
 558       else
 559         if (vect_print_dump_info (REPORT_DETAILS))
 560           fprintf (vect_dump, "Unknown def-use cycle pattern.");
 561     }
 562
 563   VEC_free (gimple, heap, worklist);
 564 }
 565
 566
 567 /* Function vect_analyze_scalar_cycles.
 568
 569    Examine the cross iteration def-use cycles of scalar variables, by
 570    analyzing the loop-header PHIs of scalar variables; Classify each
 571    cycle as one of the following: invariant, induction, reduction, unknown.
 572    We do that for the loop represented by LOOP_VINFO, and also to its
 573    inner-loop, if exists.
 574    Examples for scalar cycles:
 575
 576    Example1: reduction:
 577
 578               loop1:
 579               for (i=0; i<N; i++)
 580                  sum += a[i];
 581
 582    Example2: induction:
 583
 584               loop2:
 585               for (i=0; i<N; i++)
 586                  a[i] = i;  */
 587
 588 static void
 589 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
 590 {
 591   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 592
 593   vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
 594
 595   /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
 596      Reductions in such inner-loop therefore have different properties than
 597      the reductions in the nest that gets vectorized:
 598      1. When vectorized, they are executed in the same order as in the original
 599         scalar loop, so we can't change the order of computation when
 600         vectorizing them.
 601      2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
 602         current checks are too strict.  */
 603
 604   if (loop->inner)
 605     vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
 606 }
 607
 608 /* Function vect_get_loop_niters.
 609
 610    Determine how many iterations the loop is executed.
 611    If an expression that represents the number of iterations
 612    can be constructed, place it in NUMBER_OF_ITERATIONS.
 613    Return the loop exit condition.  */
 614
 615 static gimple
 616 vect_get_loop_niters (struct loop *loop, tree *number_of_iterations)
 617 {
 618   tree niters;
 619
 620   if (vect_print_dump_info (REPORT_DETAILS))
 621     fprintf (vect_dump, "=== get_loop_niters ===");
 622
 623   niters = number_of_exit_cond_executions (loop);
 624
 625   if (niters != NULL_TREE
 626       && niters != chrec_dont_know)
 627     {
 628       *number_of_iterations = niters;
 629
 630       if (vect_print_dump_info (REPORT_DETAILS))
 631         {
 632           fprintf (vect_dump, "==> get_loop_niters:" );
 633           print_generic_expr (vect_dump, *number_of_iterations, TDF_SLIM);
 634         }
 635     }
 636
 637   return get_loop_exit_condition (loop);
 638 }
 639
 640
 641 /* Function bb_in_loop_p
 642
 643    Used as predicate for dfs order traversal of the loop bbs.  */
 644
 645 static bool
 646 bb_in_loop_p (const_basic_block bb, const void *data)
 647 {
 648   const struct loop *const loop = (const struct loop *)data;
 649   if (flow_bb_inside_loop_p (loop, bb))
 650     return true;
 651   return false;
 652 }
 653
 654
 655 /* Function new_loop_vec_info.
 656
 657    Create and initialize a new loop_vec_info struct for LOOP, as well as
 658    stmt_vec_info structs for all the stmts in LOOP.  */
 659
 660 static loop_vec_info
 661 new_loop_vec_info (struct loop *loop)
 662 {
 663   loop_vec_info res;
 664   basic_block *bbs;
 665   gimple_stmt_iterator si;
 666   unsigned int i, nbbs;
 667
 668   res = (loop_vec_info) xcalloc (1, sizeof (struct _loop_vec_info));
 669   LOOP_VINFO_LOOP (res) = loop;
 670
 671   bbs = get_loop_body (loop);
 672
 673   /* Create/Update stmt_info for all stmts in the loop.  */
 674   for (i = 0; i < loop->num_nodes; i++)
 675     {
 676       basic_block bb = bbs[i];
 677
 678       /* BBs in a nested inner-loop will have been already processed (because
 679          we will have called vect_analyze_loop_form for any nested inner-loop).
 680          Therefore, for stmts in an inner-loop we just want to update the
 681          STMT_VINFO_LOOP_VINFO field of their stmt_info to point to the new
 682          loop_info of the outer-loop we are currently considering to vectorize
 683          (instead of the loop_info of the inner-loop).
 684          For stmts in other BBs we need to create a stmt_info from scratch.  */
 685       if (bb->loop_father != loop)
 686         {
 687           /* Inner-loop bb.  */
 688           gcc_assert (loop->inner && bb->loop_father == loop->inner);
 689           for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
 690             {
 691               gimple phi = gsi_stmt (si);
 692               stmt_vec_info stmt_info = vinfo_for_stmt (phi);
 693               loop_vec_info inner_loop_vinfo =
 694                 STMT_VINFO_LOOP_VINFO (stmt_info);
 695               gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
 696               STMT_VINFO_LOOP_VINFO (stmt_info) = res;
 697             }
 698           for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
 699            {
 700               gimple stmt = gsi_stmt (si);
 701               stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
 702               loop_vec_info inner_loop_vinfo =
 703                  STMT_VINFO_LOOP_VINFO (stmt_info);
 704               gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
 705               STMT_VINFO_LOOP_VINFO (stmt_info) = res;
 706            }
 707         }
 708       else
 709         {
 710           /* bb in current nest.  */
 711           for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
 712             {
 713               gimple phi = gsi_stmt (si);
 714               gimple_set_uid (phi, 0);
 715               set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, res, NULL));
 716             }
 717
 718           for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
 719             {
 720               gimple stmt = gsi_stmt (si);
 721               gimple_set_uid (stmt, 0);
 722               set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, res, NULL));
 723             }
 724         }
 725     }
 726
 727   /* CHECKME: We want to visit all BBs before their successors (except for
 728      latch blocks, for which this assertion wouldn't hold).  In the simple
 729      case of the loop forms we allow, a dfs order of the BBs would the same
 730      as reversed postorder traversal, so we are safe.  */
 731
 732    free (bbs);
 733    bbs = XCNEWVEC (basic_block, loop->num_nodes);
 734    nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
 735                               bbs, loop->num_nodes, loop);
 736    gcc_assert (nbbs == loop->num_nodes);
 737
 738   LOOP_VINFO_BBS (res) = bbs;
 739   LOOP_VINFO_NITERS (res) = NULL;
 740   LOOP_VINFO_NITERS_UNCHANGED (res) = NULL;
 741   LOOP_VINFO_COST_MODEL_MIN_ITERS (res) = 0;
 742   LOOP_VINFO_VECTORIZABLE_P (res) = 0;
 743   LOOP_PEELING_FOR_ALIGNMENT (res) = 0;
 744   LOOP_VINFO_VECT_FACTOR (res) = 0;
 745   LOOP_VINFO_DATAREFS (res) = VEC_alloc (data_reference_p, heap, 10);
 746   LOOP_VINFO_DDRS (res) = VEC_alloc (ddr_p, heap, 10 * 10);
 747   LOOP_VINFO_UNALIGNED_DR (res) = NULL;
 748   LOOP_VINFO_MAY_MISALIGN_STMTS (res) =
 749     VEC_alloc (gimple, heap,
 750                PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS));
 751   LOOP_VINFO_MAY_ALIAS_DDRS (res) =
 752     VEC_alloc (ddr_p, heap,
 753                PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
 754   LOOP_VINFO_STRIDED_STORES (res) = VEC_alloc (gimple, heap, 10);
 755   LOOP_VINFO_REDUCTIONS (res) = VEC_alloc (gimple, heap, 10);
 756   LOOP_VINFO_SLP_INSTANCES (res) = VEC_alloc (slp_instance, heap, 10);
 757   LOOP_VINFO_SLP_UNROLLING_FACTOR (res) = 1;
 758   LOOP_VINFO_PEELING_HTAB (res) = NULL;
 759
 760   return res;
 761 }
 762
 763
 764 /* Function destroy_loop_vec_info.
 765
 766    Free LOOP_VINFO struct, as well as all the stmt_vec_info structs of all the
 767    stmts in the loop.  */
 768
 769 void
 770 destroy_loop_vec_info (loop_vec_info loop_vinfo, bool clean_stmts)
 771 {
 772   struct loop *loop;
 773   basic_block *bbs;
 774   int nbbs;
 775   gimple_stmt_iterator si;
 776   int j;
 777   VEC (slp_instance, heap) *slp_instances;
 778   slp_instance instance;
 779
 780   if (!loop_vinfo)
 781     return;
 782
 783   loop = LOOP_VINFO_LOOP (loop_vinfo);
 784
 785   bbs = LOOP_VINFO_BBS (loop_vinfo);
 786   nbbs = loop->num_nodes;
 787
 788   if (!clean_stmts)
 789     {
 790       free (LOOP_VINFO_BBS (loop_vinfo));
 791       free_data_refs (LOOP_VINFO_DATAREFS (loop_vinfo));
 792       free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
 793       VEC_free (gimple, heap, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));
 794
 795       free (loop_vinfo);
 796       loop->aux = NULL;
 797       return;
 798     }
 799
 800   for (j = 0; j < nbbs; j++)
 801     {
 802       basic_block bb = bbs[j];
 803       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
 804         free_stmt_vec_info (gsi_stmt (si));
 805
 806       for (si = gsi_start_bb (bb); !gsi_end_p (si); )
 807         {
 808           gimple stmt = gsi_stmt (si);
 809           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
 810
 811           if (stmt_info)
 812             {
 813               /* Check if this is a "pattern stmt" (introduced by the
 814                  vectorizer during the pattern recognition pass).  */
 815               bool remove_stmt_p = false;
 816               gimple orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
 817               if (orig_stmt)
 818                 {
 819                   stmt_vec_info orig_stmt_info = vinfo_for_stmt (orig_stmt);
 820                   if (orig_stmt_info
 821                       && STMT_VINFO_IN_PATTERN_P (orig_stmt_info))
 822                     remove_stmt_p = true;
 823                 }
 824
 825               /* Free stmt_vec_info.  */
 826               free_stmt_vec_info (stmt);
 827
 828               /* Remove dead "pattern stmts".  */
 829               if (remove_stmt_p)
 830                 gsi_remove (&si, true);
 831             }
 832           gsi_next (&si);
 833         }
 834     }
 835
 836   free (LOOP_VINFO_BBS (loop_vinfo));
 837   free_data_refs (LOOP_VINFO_DATAREFS (loop_vinfo));
 838   free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
 839   VEC_free (gimple, heap, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));
 840   VEC_free (ddr_p, heap, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo));
 841   slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
 842   for (j = 0; VEC_iterate (slp_instance, slp_instances, j, instance); j++)
 843     vect_free_slp_instance (instance);
 844
 845   VEC_free (slp_instance, heap, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
 846   VEC_free (gimple, heap, LOOP_VINFO_STRIDED_STORES (loop_vinfo));
 847   VEC_free (gimple, heap, LOOP_VINFO_REDUCTIONS (loop_vinfo));
 848
 849   if (LOOP_VINFO_PEELING_HTAB (loop_vinfo))
 850     htab_delete (LOOP_VINFO_PEELING_HTAB (loop_vinfo));
 851
 852   free (loop_vinfo);
 853   loop->aux = NULL;
 854 }
 855
 856
 857 /* Function vect_analyze_loop_1.
 858
 859    Apply a set of analyses on LOOP, and create a loop_vec_info struct
 860    for it. The different analyses will record information in the
 861    loop_vec_info struct.  This is a subset of the analyses applied in
 862    vect_analyze_loop, to be applied on an inner-loop nested in the loop
 863    that is now considered for (outer-loop) vectorization.  */
 864
 865 static loop_vec_info
 866 vect_analyze_loop_1 (struct loop *loop)
 867 {
 868   loop_vec_info loop_vinfo;
 869
 870   if (vect_print_dump_info (REPORT_DETAILS))
 871     fprintf (vect_dump, "===== analyze_loop_nest_1 =====");
 872
 873   /* Check the CFG characteristics of the loop (nesting, entry/exit, etc.  */
 874
 875   loop_vinfo = vect_analyze_loop_form (loop);
 876   if (!loop_vinfo)
 877     {
 878       if (vect_print_dump_info (REPORT_DETAILS))
 879         fprintf (vect_dump, "bad inner-loop form.");
 880       return NULL;
 881     }
 882
 883   return loop_vinfo;
 884 }
 885
 886
 887 /* Function vect_analyze_loop_form.
 888
 889    Verify that certain CFG restrictions hold, including:
 890    - the loop has a pre-header
 891    - the loop has a single entry and exit
 892    - the loop exit condition is simple enough, and the number of iterations
 893      can be analyzed (a countable loop).  */
 894
 895 loop_vec_info
 896 vect_analyze_loop_form (struct loop *loop)
 897 {
 898   loop_vec_info loop_vinfo;
 899   gimple loop_cond;
 900   tree number_of_iterations = NULL;
 901   loop_vec_info inner_loop_vinfo = NULL;
 902
 903   if (vect_print_dump_info (REPORT_DETAILS))
 904     fprintf (vect_dump, "=== vect_analyze_loop_form ===");
 905
 906   /* Different restrictions apply when we are considering an inner-most loop,
 907      vs. an outer (nested) loop.
 908      (FORNOW. May want to relax some of these restrictions in the future).  */
 909
 910   if (!loop->inner)
 911     {
 912       /* Inner-most loop.  We currently require that the number of BBs is
 913          exactly 2 (the header and latch).  Vectorizable inner-most loops
 914          look like this:
 915
 916                         (pre-header)
 917                            |
 918                           header <--------+
 919                            | |            |
 920                            | +--> latch --+
 921                            |
 922                         (exit-bb)  */
 923
 924       if (loop->num_nodes != 2)
 925         {
 926           if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
 927             fprintf (vect_dump, "not vectorized: control flow in loop.");
 928           return NULL;
 929         }
 930
 931       if (empty_block_p (loop->header))
 932     {
 933           if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
 934             fprintf (vect_dump, "not vectorized: empty loop.");
 935       return NULL;
 936     }
 937     }
 938   else
 939     {
 940       struct loop *innerloop = loop->inner;
 941       edge entryedge;
 942
 943       /* Nested loop. We currently require that the loop is doubly-nested,
 944          contains a single inner loop, and the number of BBs is exactly 5.
 945          Vectorizable outer-loops look like this:
 946
 947                         (pre-header)
 948                            |
 949                           header <---+
 950                            |         |
 951                           inner-loop |
 952                            |         |
 953                           tail ------+
 954                            |
 955                         (exit-bb)
 956
 957          The inner-loop has the properties expected of inner-most loops
 958          as described above.  */
 959
 960       if ((loop->inner)->inner || (loop->inner)->next)
 961         {
 962           if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
 963             fprintf (vect_dump, "not vectorized: multiple nested loops.");
 964           return NULL;
 965         }
 966
 967       /* Analyze the inner-loop.  */
 968       inner_loop_vinfo = vect_analyze_loop_1 (loop->inner);
 969       if (!inner_loop_vinfo)
 970         {
 971           if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
 972             fprintf (vect_dump, "not vectorized: Bad inner loop.");
 973           return NULL;
 974         }
 975
 976       if (!expr_invariant_in_loop_p (loop,
 977                                         LOOP_VINFO_NITERS (inner_loop_vinfo)))
 978         {
 979           if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
 980             fprintf (vect_dump,
 981                      "not vectorized: inner-loop count not invariant.");
 982           destroy_loop_vec_info (inner_loop_vinfo, true);
 983           return NULL;
 984         }
 985
 986       if (loop->num_nodes != 5)
 987         {
 988           if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
 989             fprintf (vect_dump, "not vectorized: control flow in loop.");
 990           destroy_loop_vec_info (inner_loop_vinfo, true);
 991           return NULL;
 992         }
 993
 994       gcc_assert (EDGE_COUNT (innerloop->header->preds) == 2);
 995       entryedge = EDGE_PRED (innerloop->header, 0);
 996       if (EDGE_PRED (innerloop->header, 0)->src == innerloop->latch)
 997         entryedge = EDGE_PRED (innerloop->header, 1);
 998
 999       if (entryedge->src != loop->header
1000           || !single_exit (innerloop)
1001           || single_exit (innerloop)->dest !=  EDGE_PRED (loop->latch, 0)->src)
1002         {
1003           if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1004             fprintf (vect_dump, "not vectorized: unsupported outerloop form.");
1005           destroy_loop_vec_info (inner_loop_vinfo, true);
1006           return NULL;
1007         }
1008
1009       if (vect_print_dump_info (REPORT_DETAILS))
1010         fprintf (vect_dump, "Considering outer-loop vectorization.");
1011     }
1012
1013   if (!single_exit (loop)
1014       || EDGE_COUNT (loop->header->preds) != 2)
1015     {
1016       if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1017         {
1018           if (!single_exit (loop))
1019             fprintf (vect_dump, "not vectorized: multiple exits.");
1020           else if (EDGE_COUNT (loop->header->preds) != 2)
1021             fprintf (vect_dump, "not vectorized: too many incoming edges.");
1022         }
1023       if (inner_loop_vinfo)
1024         destroy_loop_vec_info (inner_loop_vinfo, true);
1025       return NULL;
1026     }
1027
1028   /* We assume that the loop exit condition is at the end of the loop. i.e,
1029      that the loop is represented as a do-while (with a proper if-guard
1030      before the loop if needed), where the loop header contains all the
1031      executable statements, and the latch is empty.  */
1032   if (!empty_block_p (loop->latch)
1033         || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1034     {
1035       if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1036         fprintf (vect_dump, "not vectorized: unexpected loop form.");
1037       if (inner_loop_vinfo)
1038         destroy_loop_vec_info (inner_loop_vinfo, true);
1039       return NULL;
1040     }
1041
1042   /* Make sure there exists a single-predecessor exit bb:  */
1043   if (!single_pred_p (single_exit (loop)->dest))
1044     {
1045       edge e = single_exit (loop);
1046       if (!(e->flags & EDGE_ABNORMAL))
1047         {
1048           split_loop_exit_edge (e);
1049           if (vect_print_dump_info (REPORT_DETAILS))
1050             fprintf (vect_dump, "split exit edge.");
1051         }
1052       else
1053         {
1054           if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1055             fprintf (vect_dump, "not vectorized: abnormal loop exit edge.");
1056           if (inner_loop_vinfo)
1057             destroy_loop_vec_info (inner_loop_vinfo, true);
1058           return NULL;
1059         }
1060     }
1061
1062   loop_cond = vect_get_loop_niters (loop, &number_of_iterations);
1063   if (!loop_cond)
1064     {
1065       if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1066         fprintf (vect_dump, "not vectorized: complicated exit condition.");
1067       if (inner_loop_vinfo)
1068         destroy_loop_vec_info (inner_loop_vinfo, true);
1069       return NULL;
1070     }
1071
1072   if (!number_of_iterations)
1073     {
1074       if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1075         fprintf (vect_dump,
1076                  "not vectorized: number of iterations cannot be computed.");
1077       if (inner_loop_vinfo)
1078         destroy_loop_vec_info (inner_loop_vinfo, true);
1079       return NULL;
1080     }
1081
1082   if (chrec_contains_undetermined (number_of_iterations))
1083     {
1084       if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1085         fprintf (vect_dump, "Infinite number of iterations.");
1086       if (inner_loop_vinfo)
1087         destroy_loop_vec_info (inner_loop_vinfo, true);
1088       return NULL;
1089     }
1090
1091   if (!NITERS_KNOWN_P (number_of_iterations))
1092     {
1093       if (vect_print_dump_info (REPORT_DETAILS))
1094         {
1095           fprintf (vect_dump, "Symbolic number of iterations is ");
1096           print_generic_expr (vect_dump, number_of_iterations, TDF_DETAILS);
1097         }
1098     }
1099   else if (TREE_INT_CST_LOW (number_of_iterations) == 0)
1100     {
1101       if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1102         fprintf (vect_dump, "not vectorized: number of iterations = 0.");
1103       if (inner_loop_vinfo)
1104         destroy_loop_vec_info (inner_loop_vinfo, false);
1105       return NULL;
1106     }
1107
1108   loop_vinfo = new_loop_vec_info (loop);
1109   LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1110   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1111
1112   STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond)) = loop_exit_ctrl_vec_info_type;
1113
1114   /* CHECKME: May want to keep it around it in the future.  */
1115   if (inner_loop_vinfo)
1116     destroy_loop_vec_info (inner_loop_vinfo, false);
1117
1118   gcc_assert (!loop->aux);
1119   loop->aux = loop_vinfo;
1120   return loop_vinfo;
1121 }
1122
1123
1124 /* Get cost by calling cost target builtin.  */
1125
1126 static inline
1127 int vect_get_cost (enum vect_cost_for_stmt type_of_cost)
1128 {
1129   tree dummy_type = NULL;
1130   int dummy = 0;
1131
1132   return targetm.vectorize.builtin_vectorization_cost (type_of_cost,
1133                                                        dummy_type, dummy);
1134 }
1135
1136
1137 /* Function vect_analyze_loop_operations.
1138
1139    Scan the loop stmts and make sure they are all vectorizable.  */
1140
1141 static bool
1142 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1143 {
1144   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1145   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1146   int nbbs = loop->num_nodes;
1147   gimple_stmt_iterator si;
1148   unsigned int vectorization_factor = 0;
1149   int i;
1150   gimple phi;
1151   stmt_vec_info stmt_info;
1152   bool need_to_vectorize = false;
1153   int min_profitable_iters;
1154   int min_scalar_loop_bound;
1155   unsigned int th;
1156   bool only_slp_in_loop = true, ok;
1157
1158   if (vect_print_dump_info (REPORT_DETAILS))
1159     fprintf (vect_dump, "=== vect_analyze_loop_operations ===");
1160
1161   gcc_assert (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
1162   vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1163
1164   for (i = 0; i < nbbs; i++)
1165     {
1166       basic_block bb = bbs[i];
1167
1168       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1169         {
1170           phi = gsi_stmt (si);
1171           ok = true;
1172
1173           stmt_info = vinfo_for_stmt (phi);
1174           if (vect_print_dump_info (REPORT_DETAILS))
1175             {
1176               fprintf (vect_dump, "examining phi: ");
1177               print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
1178             }
1179
1180           if (! is_loop_header_bb_p (bb))
1181             {
1182               /* inner-loop loop-closed exit phi in outer-loop vectorization
1183                  (i.e. a phi in the tail of the outer-loop).
1184                  FORNOW: we currently don't support the case that these phis
1185                  are not used in the outerloop (unless it is double reduction,
1186                  i.e., this phi is vect_reduction_def), cause this case
1187                  requires to actually do something here.  */
1188               if ((!STMT_VINFO_RELEVANT_P (stmt_info)
1189                    || STMT_VINFO_LIVE_P (stmt_info))
1190                   && STMT_VINFO_DEF_TYPE (stmt_info)
1191                      != vect_double_reduction_def)
1192                 {
1193                   if (vect_print_dump_info (REPORT_DETAILS))
1194                     fprintf (vect_dump,
1195                              "Unsupported loop-closed phi in outer-loop.");
1196                   return false;
1197                 }
1198               continue;
1199             }
1200
1201           gcc_assert (stmt_info);
1202
1203           if (STMT_VINFO_LIVE_P (stmt_info))
1204             {
1205               /* FORNOW: not yet supported.  */
1206               if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1207                 fprintf (vect_dump, "not vectorized: value used after loop.");
1208               return false;
1209             }
1210
1211           if (STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1212               && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1213             {
1214               /* A scalar-dependence cycle that we don't support.  */
1215               if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1216                 fprintf (vect_dump, "not vectorized: scalar dependence cycle.");
1217               return false;
1218             }
1219
1220           if (STMT_VINFO_RELEVANT_P (stmt_info))
1221             {
1222               need_to_vectorize = true;
1223               if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
1224                 ok = vectorizable_induction (phi, NULL, NULL);
1225             }
1226
1227           if (!ok)
1228             {
1229               if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1230                 {
1231                   fprintf (vect_dump,
1232                            "not vectorized: relevant phi not supported: ");
1233                   print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
1234                 }
1235               return false;
1236             }
1237         }
1238
1239       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1240         {
1241           gimple stmt = gsi_stmt (si);
1242           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1243
1244           gcc_assert (stmt_info);
1245
1246           if (!vect_analyze_stmt (stmt, &need_to_vectorize, NULL))
1247             return false;
1248
1249           if ((STMT_VINFO_RELEVANT_P (stmt_info)
1250                || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1251               && !PURE_SLP_STMT (stmt_info))
1252             /* STMT needs both SLP and loop-based vectorization.  */
1253             only_slp_in_loop = false;
1254         }
1255     } /* bbs */
1256
1257   /* All operations in the loop are either irrelevant (deal with loop
1258      control, or dead), or only used outside the loop and can be moved
1259      out of the loop (e.g. invariants, inductions).  The loop can be
1260      optimized away by scalar optimizations.  We're better off not
1261      touching this loop.  */
1262   if (!need_to_vectorize)
1263     {
1264       if (vect_print_dump_info (REPORT_DETAILS))
1265         fprintf (vect_dump,
1266                  "All the computation can be taken out of the loop.");
1267       if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1268         fprintf (vect_dump,
1269                  "not vectorized: redundant loop. no profit to vectorize.");
1270       return false;
1271     }
1272
1273   /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1274      vectorization factor of the loop is the unrolling factor required by the
1275      SLP instances.  If that unrolling factor is 1, we say, that we perform
1276      pure SLP on loop - cross iteration parallelism is not exploited.  */
1277   if (only_slp_in_loop)
1278     vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1279   else
1280     vectorization_factor = least_common_multiple (vectorization_factor,
1281                                 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1282
1283   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1284
1285   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1286       && vect_print_dump_info (REPORT_DETAILS))
1287     fprintf (vect_dump,
1288         "vectorization_factor = %d, niters = " HOST_WIDE_INT_PRINT_DEC,
1289         vectorization_factor, LOOP_VINFO_INT_NITERS (loop_vinfo));
1290
1291   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1292       && (LOOP_VINFO_INT_NITERS (loop_vinfo) < vectorization_factor))
1293     {
1294       if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1295         fprintf (vect_dump, "not vectorized: iteration count too small.");
1296       if (vect_print_dump_info (REPORT_DETAILS))
1297         fprintf (vect_dump,"not vectorized: iteration count smaller than "
1298                  "vectorization factor.");
1299       return false;
1300     }
1301
1302   /* Analyze cost. Decide if worth while to vectorize.  */
1303
1304   /* Once VF is set, SLP costs should be updated since the number of created
1305      vector stmts depends on VF.  */
1306   vect_update_slp_costs_according_to_vf (loop_vinfo);
1307
1308   min_profitable_iters = vect_estimate_min_profitable_iters (loop_vinfo);
1309   LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo) = min_profitable_iters;
1310
1311   if (min_profitable_iters < 0)
1312     {
1313       if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1314         fprintf (vect_dump, "not vectorized: vectorization not profitable.");
1315       if (vect_print_dump_info (REPORT_DETAILS))
1316         fprintf (vect_dump, "not vectorized: vector version will never be "
1317                  "profitable.");
1318       return false;
1319     }
1320
1321   min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
1322                             * vectorization_factor) - 1);
1323
1324   /* Use the cost model only if it is more conservative than user specified
1325      threshold.  */
1326
1327   th = (unsigned) min_scalar_loop_bound;
1328   if (min_profitable_iters
1329       && (!min_scalar_loop_bound
1330           || min_profitable_iters > min_scalar_loop_bound))
1331     th = (unsigned) min_profitable_iters;
1332
1333   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1334       && LOOP_VINFO_INT_NITERS (loop_vinfo) <= th)
1335     {
1336       if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1337         fprintf (vect_dump, "not vectorized: vectorization not "
1338                  "profitable.");
1339       if (vect_print_dump_info (REPORT_DETAILS))
1340         fprintf (vect_dump, "not vectorized: iteration count smaller than "
1341                  "user specified loop bound parameter or minimum "
1342                  "profitable iterations (whichever is more conservative).");
1343       return false;
1344     }
1345
1346   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1347       || LOOP_VINFO_INT_NITERS (loop_vinfo) % vectorization_factor != 0
1348       || LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
1349     {
1350       if (vect_print_dump_info (REPORT_DETAILS))
1351         fprintf (vect_dump, "epilog loop required.");
1352       if (!vect_can_advance_ivs_p (loop_vinfo))
1353         {
1354           if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1355             fprintf (vect_dump,
1356                      "not vectorized: can't create epilog loop 1.");
1357           return false;
1358         }
1359       if (!slpeel_can_duplicate_loop_p (loop, single_exit (loop)))
1360         {
1361           if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1362             fprintf (vect_dump,
1363                      "not vectorized: can't create epilog loop 2.");
1364           return false;
1365         }
1366     }
1367
1368   return true;
1369 }
1370
1371
1372 /* Function vect_analyze_loop.
1373
1374    Apply a set of analyses on LOOP, and create a loop_vec_info struct
1375    for it. The different analyses will record information in the
1376    loop_vec_info struct.  */
1377 loop_vec_info
1378 vect_analyze_loop (struct loop *loop)
1379 {
1380   bool ok;
1381   loop_vec_info loop_vinfo;
1382   int max_vf = MAX_VECTORIZATION_FACTOR;
1383   int min_vf = 2;
1384
1385   if (vect_print_dump_info (REPORT_DETAILS))
1386     fprintf (vect_dump, "===== analyze_loop_nest =====");
1387
1388   if (loop_outer (loop)
1389       && loop_vec_info_for_loop (loop_outer (loop))
1390       && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
1391     {
1392       if (vect_print_dump_info (REPORT_DETAILS))
1393         fprintf (vect_dump, "outer-loop already vectorized.");
1394       return NULL;
1395     }
1396
1397   /* Check the CFG characteristics of the loop (nesting, entry/exit, etc.  */
1398
1399   loop_vinfo = vect_analyze_loop_form (loop);
1400   if (!loop_vinfo)
1401     {
1402       if (vect_print_dump_info (REPORT_DETAILS))
1403         fprintf (vect_dump, "bad loop form.");
1404       return NULL;
1405     }
1406
1407   /* Find all data references in the loop (which correspond to vdefs/vuses)
1408      and analyze their evolution in the loop.  Also adjust the minimal
1409      vectorization factor according to the loads and stores.
1410
1411      FORNOW: Handle only simple, array references, which
1412      alignment can be forced, and aligned pointer-references.  */
1413
1414   ok = vect_analyze_data_refs (loop_vinfo, NULL, &min_vf);
1415   if (!ok)
1416     {
1417       if (vect_print_dump_info (REPORT_DETAILS))
1418         fprintf (vect_dump, "bad data references.");
1419       destroy_loop_vec_info (loop_vinfo, true);
1420       return NULL;
1421     }
1422
1423   /* Classify all cross-iteration scalar data-flow cycles.
1424      Cross-iteration cycles caused by virtual phis are analyzed separately.  */
1425
1426   vect_analyze_scalar_cycles (loop_vinfo);
1427
1428   vect_pattern_recog (loop_vinfo);
1429
1430   /* Data-flow analysis to detect stmts that do not need to be vectorized.  */
1431
1432   ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
1433   if (!ok)
1434     {
1435       if (vect_print_dump_info (REPORT_DETAILS))
1436         fprintf (vect_dump, "unexpected pattern.");
1437       destroy_loop_vec_info (loop_vinfo, true);
1438       return NULL;
1439     }
1440
1441   /* Analyze data dependences between the data-refs in the loop
1442      and adjust the maximum vectorization factor according to
1443      the dependences.
1444      FORNOW: fail at the first data dependence that we encounter.  */
1445
1446   ok = vect_analyze_data_ref_dependences (loop_vinfo, NULL, &max_vf);
1447   if (!ok
1448       || max_vf < min_vf)
1449     {
1450       if (vect_print_dump_info (REPORT_DETAILS))
1451         fprintf (vect_dump, "bad data dependence.");
1452       destroy_loop_vec_info (loop_vinfo, true);
1453       return NULL;
1454     }
1455
1456   ok = vect_determine_vectorization_factor (loop_vinfo);
1457   if (!ok)
1458     {
1459       if (vect_print_dump_info (REPORT_DETAILS))
1460         fprintf (vect_dump, "can't determine vectorization factor.");
1461       destroy_loop_vec_info (loop_vinfo, true);
1462       return NULL;
1463     }
1464   if (max_vf < LOOP_VINFO_VECT_FACTOR (loop_vinfo))
1465     {
1466       if (vect_print_dump_info (REPORT_DETAILS))
1467         fprintf (vect_dump, "bad data dependence.");
1468       destroy_loop_vec_info (loop_vinfo, true);
1469       return NULL;
1470     }
1471
1472   /* Analyze the alignment of the data-refs in the loop.
1473      Fail if a data reference is found that cannot be vectorized.  */
1474
1475   ok = vect_analyze_data_refs_alignment (loop_vinfo, NULL);
1476   if (!ok)
1477     {
1478       if (vect_print_dump_info (REPORT_DETAILS))
1479         fprintf (vect_dump, "bad data alignment.");
1480       destroy_loop_vec_info (loop_vinfo, true);
1481       return NULL;
1482     }
1483
1484   /* Analyze the access patterns of the data-refs in the loop (consecutive,
1485      complex, etc.). FORNOW: Only handle consecutive access pattern.  */
1486
1487   ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
1488   if (!ok)
1489     {
1490       if (vect_print_dump_info (REPORT_DETAILS))
1491         fprintf (vect_dump, "bad data access.");
1492       destroy_loop_vec_info (loop_vinfo, true);
1493       return NULL;
1494     }
1495
1496   /* Prune the list of ddrs to be tested at run-time by versioning for alias.
1497      It is important to call pruning after vect_analyze_data_ref_accesses,
1498      since we use grouping information gathered by interleaving analysis.  */
1499   ok = vect_prune_runtime_alias_test_list (loop_vinfo);
1500   if (!ok)
1501     {
1502       if (vect_print_dump_info (REPORT_DETAILS))
1503         fprintf (vect_dump, "too long list of versioning for alias "
1504                             "run-time tests.");
1505       destroy_loop_vec_info (loop_vinfo, true);
1506       return NULL;
1507     }
1508
1509   /* This pass will decide on using loop versioning and/or loop peeling in
1510      order to enhance the alignment of data references in the loop.  */
1511
1512   ok = vect_enhance_data_refs_alignment (loop_vinfo);
1513   if (!ok)
1514     {
1515       if (vect_print_dump_info (REPORT_DETAILS))
1516         fprintf (vect_dump, "bad data alignment.");
1517       destroy_loop_vec_info (loop_vinfo, true);
1518       return NULL;
1519     }
1520
1521   /* Check the SLP opportunities in the loop, analyze and build SLP trees.  */
1522   ok = vect_analyze_slp (loop_vinfo, NULL);
1523   if (ok)
1524     {
1525       /* Decide which possible SLP instances to SLP.  */
1526       vect_make_slp_decision (loop_vinfo);
1527
1528       /* Find stmts that need to be both vectorized and SLPed.  */
1529       vect_detect_hybrid_slp (loop_vinfo);
1530     }
1531
1532   /* Scan all the operations in the loop and make sure they are
1533      vectorizable.  */
1534
1535   ok = vect_analyze_loop_operations (loop_vinfo);
1536   if (!ok)
1537     {
1538       if (vect_print_dump_info (REPORT_DETAILS))
1539         fprintf (vect_dump, "bad operation or unsupported loop bound.");
1540       destroy_loop_vec_info (loop_vinfo, true);
1541       return NULL;
1542     }
1543
1544   LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
1545
1546   return loop_vinfo;
1547 }
1548
1549
1550 /* Function reduction_code_for_scalar_code
1551
1552    Input:
1553    CODE - tree_code of a reduction operations.
1554
1555    Output:
1556    REDUC_CODE - the corresponding tree-code to be used to reduce the
1557       vector of partial results into a single scalar result (which
1558       will also reside in a vector) or ERROR_MARK if the operation is
1559       a supported reduction operation, but does not have such tree-code.
1560
1561    Return FALSE if CODE currently cannot be vectorized as reduction.  */
1562
1563 static bool
1564 reduction_code_for_scalar_code (enum tree_code code,
1565                                 enum tree_code *reduc_code)
1566 {
1567   switch (code)
1568     {
1569       case MAX_EXPR:
1570         *reduc_code = REDUC_MAX_EXPR;
1571         return true;
1572
1573       case MIN_EXPR:
1574         *reduc_code = REDUC_MIN_EXPR;
1575         return true;
1576
1577       case PLUS_EXPR:
1578         *reduc_code = REDUC_PLUS_EXPR;
1579         return true;
1580
1581       case MULT_EXPR:
1582       case MINUS_EXPR:
1583       case BIT_IOR_EXPR:
1584       case BIT_XOR_EXPR:
1585       case BIT_AND_EXPR:
1586         *reduc_code = ERROR_MARK;
1587         return true;
1588
1589       default:
1590        return false;
1591     }
1592 }
1593
1594
1595 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
1596    STMT is printed with a message MSG. */
1597
1598 static void
1599 report_vect_op (gimple stmt, const char *msg)
1600 {
1601   fprintf (vect_dump, "%s", msg);
1602   print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
1603 }
1604
1605
1606 /* Function vect_is_simple_reduction_1
1607
1608    (1) Detect a cross-iteration def-use cycle that represents a simple
1609    reduction computation. We look for the following pattern:
1610
1611    loop_header:
1612      a1 = phi < a0, a2 >
1613      a3 = ...
1614      a2 = operation (a3, a1)
1615
1616    such that:
1617    1. operation is commutative and associative and it is safe to
1618       change the order of the computation (if CHECK_REDUCTION is true)
1619    2. no uses for a2 in the loop (a2 is used out of the loop)
1620    3. no uses of a1 in the loop besides the reduction operation.
1621
1622    Condition 1 is tested here.
1623    Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
1624
1625    (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
1626    nested cycles, if CHECK_REDUCTION is false.
1627
1628    (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
1629    reductions:
1630
1631      a1 = phi < a0, a2 >
1632      inner loop (def of a3)
1633      a2 = phi < a3 >
1634
1635    If MODIFY is true it tries also to rework the code in-place to enable
1636    detection of more reduction patterns.  For the time being we rewrite
1637    "res -= RHS" into "rhs += -RHS" when it seems worthwhile.
1638 */
1639
1640 static gimple
1641 vect_is_simple_reduction_1 (loop_vec_info loop_info, gimple phi,
1642                             bool check_reduction, bool *double_reduc,
1643                             bool modify)
1644 {
1645   struct loop *loop = (gimple_bb (phi))->loop_father;
1646   struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
1647   edge latch_e = loop_latch_edge (loop);
1648   tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
1649   gimple def_stmt, def1 = NULL, def2 = NULL;
1650   enum tree_code orig_code, code;
1651   tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
1652   tree type;
1653   int nloop_uses;
1654   tree name;
1655   imm_use_iterator imm_iter;
1656   use_operand_p use_p;
1657   bool phi_def;
1658
1659   *double_reduc = false;
1660
1661   /* If CHECK_REDUCTION is true, we assume inner-most loop vectorization,
1662      otherwise, we assume outer loop vectorization.  */
1663   gcc_assert ((check_reduction && loop == vect_loop)
1664               || (!check_reduction && flow_loop_nested_p (vect_loop, loop)));
1665
1666   name = PHI_RESULT (phi);
1667   nloop_uses = 0;
1668   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
1669     {
1670       gimple use_stmt = USE_STMT (use_p);
1671       if (is_gimple_debug (use_stmt))
1672         continue;
1673       if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
1674           && vinfo_for_stmt (use_stmt)
1675           && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
1676         nloop_uses++;
1677       if (nloop_uses > 1)
1678         {
1679           if (vect_print_dump_info (REPORT_DETAILS))
1680             fprintf (vect_dump, "reduction used in loop.");
1681           return NULL;
1682         }
1683     }
1684
1685   if (TREE_CODE (loop_arg) != SSA_NAME)
1686     {
1687       if (vect_print_dump_info (REPORT_DETAILS))
1688         {
1689           fprintf (vect_dump, "reduction: not ssa_name: ");
1690           print_generic_expr (vect_dump, loop_arg, TDF_SLIM);
1691         }
1692       return NULL;
1693     }
1694
1695   def_stmt = SSA_NAME_DEF_STMT (loop_arg);
1696   if (!def_stmt)
1697     {
1698       if (vect_print_dump_info (REPORT_DETAILS))
1699         fprintf (vect_dump, "reduction: no def_stmt.");
1700       return NULL;
1701     }
1702
1703   if (!is_gimple_assign (def_stmt) && gimple_code (def_stmt) != GIMPLE_PHI)
1704     {
1705       if (vect_print_dump_info (REPORT_DETAILS))
1706         print_gimple_stmt (vect_dump, def_stmt, 0, TDF_SLIM);
1707       return NULL;
1708     }
1709
1710   if (is_gimple_assign (def_stmt))
1711     {
1712       name = gimple_assign_lhs (def_stmt);
1713       phi_def = false;
1714     }
1715   else
1716     {
1717       name = PHI_RESULT (def_stmt);
1718       phi_def = true;
1719     }
1720
1721   nloop_uses = 0;
1722   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
1723     {
1724       gimple use_stmt = USE_STMT (use_p);
1725       if (is_gimple_debug (use_stmt))
1726         continue;
1727       if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
1728           && vinfo_for_stmt (use_stmt)
1729           && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
1730         nloop_uses++;
1731       if (nloop_uses > 1)
1732         {
1733           if (vect_print_dump_info (REPORT_DETAILS))
1734             fprintf (vect_dump, "reduction used in loop.");
1735           return NULL;
1736         }
1737     }
1738
1739   /* If DEF_STMT is a phi node itself, we expect it to have a single argument
1740      defined in the inner loop.  */
1741   if (phi_def)
1742     {
1743       op1 = PHI_ARG_DEF (def_stmt, 0);
1744
1745       if (gimple_phi_num_args (def_stmt) != 1
1746           || TREE_CODE (op1) != SSA_NAME)
1747         {
1748           if (vect_print_dump_info (REPORT_DETAILS))
1749             fprintf (vect_dump, "unsupported phi node definition.");
1750
1751           return NULL;
1752         }
1753
1754       def1 = SSA_NAME_DEF_STMT (op1);
1755       if (flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
1756           && loop->inner
1757           && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
1758           && is_gimple_assign (def1))
1759         {
1760           if (vect_print_dump_info (REPORT_DETAILS))
1761             report_vect_op (def_stmt, "detected double reduction: ");
1762
1763           *double_reduc = true;
1764           return def_stmt;
1765         }
1766
1767       return NULL;
1768     }
1769
1770   code = orig_code = gimple_assign_rhs_code (def_stmt);
1771
1772   /* We can handle "res -= x[i]", which is non-associative by
1773      simply rewriting this into "res += -x[i]".  Avoid changing
1774      gimple instruction for the first simple tests and only do this
1775      if we're allowed to change code at all.  */
1776   if (code == MINUS_EXPR && modify)
1777     code = PLUS_EXPR;
1778
1779   if (check_reduction
1780       && (!commutative_tree_code (code) || !associative_tree_code (code)))
1781     {
1782       if (vect_print_dump_info (REPORT_DETAILS))
1783         report_vect_op (def_stmt, "reduction: not commutative/associative: ");
1784       return NULL;
1785     }
1786
1787   if (get_gimple_rhs_class (code) != GIMPLE_BINARY_RHS)
1788     {
1789       if (code != COND_EXPR)
1790         {
1791           if (vect_print_dump_info (REPORT_DETAILS))
1792             report_vect_op (def_stmt, "reduction: not binary operation: ");
1793
1794           return NULL;
1795         }
1796
1797       op3 = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
1798       if (COMPARISON_CLASS_P (op3))
1799         {
1800           op4 = TREE_OPERAND (op3, 1);
1801           op3 = TREE_OPERAND (op3, 0);
1802         }
1803
1804       op1 = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 1);
1805       op2 = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 2);
1806
1807       if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
1808         {
1809           if (vect_print_dump_info (REPORT_DETAILS))
1810             report_vect_op (def_stmt, "reduction: uses not ssa_names: ");
1811
1812           return NULL;
1813         }
1814     }
1815   else
1816     {
1817       op1 = gimple_assign_rhs1 (def_stmt);
1818       op2 = gimple_assign_rhs2 (def_stmt);
1819
1820       if (TREE_CODE (op1) != SSA_NAME || TREE_CODE (op2) != SSA_NAME)
1821         {
1822           if (vect_print_dump_info (REPORT_DETAILS))
1823             report_vect_op (def_stmt, "reduction: uses not ssa_names: ");
1824
1825           return NULL;
1826         }
1827    }
1828
1829   type = TREE_TYPE (gimple_assign_lhs (def_stmt));
1830   if ((TREE_CODE (op1) == SSA_NAME
1831        && !types_compatible_p (type,TREE_TYPE (op1)))
1832       || (TREE_CODE (op2) == SSA_NAME
1833           && !types_compatible_p (type, TREE_TYPE (op2)))
1834       || (op3 && TREE_CODE (op3) == SSA_NAME
1835           && !types_compatible_p (type, TREE_TYPE (op3)))
1836       || (op4 && TREE_CODE (op4) == SSA_NAME
1837           && !types_compatible_p (type, TREE_TYPE (op4))))
1838     {
1839       if (vect_print_dump_info (REPORT_DETAILS))
1840         {
1841           fprintf (vect_dump, "reduction: multiple types: operation type: ");
1842           print_generic_expr (vect_dump, type, TDF_SLIM);
1843           fprintf (vect_dump, ", operands types: ");
1844           print_generic_expr (vect_dump, TREE_TYPE (op1), TDF_SLIM);
1845           fprintf (vect_dump, ",");
1846           print_generic_expr (vect_dump, TREE_TYPE (op2), TDF_SLIM);
1847           if (op3)
1848             {
1849               fprintf (vect_dump, ",");
1850               print_generic_expr (vect_dump, TREE_TYPE (op3), TDF_SLIM);
1851             }
1852
1853           if (op4)
1854             {
1855               fprintf (vect_dump, ",");
1856               print_generic_expr (vect_dump, TREE_TYPE (op4), TDF_SLIM);
1857             }
1858         }
1859
1860       return NULL;
1861     }
1862
1863   /* Check that it's ok to change the order of the computation.
1864      Generally, when vectorizing a reduction we change the order of the
1865      computation.  This may change the behavior of the program in some
1866      cases, so we need to check that this is ok.  One exception is when
1867      vectorizing an outer-loop: the inner-loop is executed sequentially,
1868      and therefore vectorizing reductions in the inner-loop during
1869      outer-loop vectorization is safe.  */
1870
1871   /* CHECKME: check for !flag_finite_math_only too?  */
1872   if (SCALAR_FLOAT_TYPE_P (type) && !flag_associative_math
1873       && check_reduction)
1874     {
1875       /* Changing the order of operations changes the semantics.  */
1876       if (vect_print_dump_info (REPORT_DETAILS))
1877         report_vect_op (def_stmt, "reduction: unsafe fp math optimization: ");
1878       return NULL;
1879     }
1880   else if (INTEGRAL_TYPE_P (type) && TYPE_OVERFLOW_TRAPS (type)
1881            && check_reduction)
1882     {
1883       /* Changing the order of operations changes the semantics.  */
1884       if (vect_print_dump_info (REPORT_DETAILS))
1885         report_vect_op (def_stmt, "reduction: unsafe int math optimization: ");
1886       return NULL;
1887     }
1888   else if (SAT_FIXED_POINT_TYPE_P (type) && check_reduction)
1889     {
1890       /* Changing the order of operations changes the semantics.  */
1891       if (vect_print_dump_info (REPORT_DETAILS))
1892         report_vect_op (def_stmt,
1893                         "reduction: unsafe fixed-point math optimization: ");
1894       return NULL;
1895     }
1896
1897   /* If we detected "res -= x[i]" earlier, rewrite it into
1898      "res += -x[i]" now.  If this turns out to be useless reassoc
1899      will clean it up again.  */
1900   if (orig_code == MINUS_EXPR)
1901     {
1902       tree rhs = gimple_assign_rhs2 (def_stmt);
1903       tree negrhs = make_ssa_name (SSA_NAME_VAR (rhs), NULL);
1904       gimple negate_stmt = gimple_build_assign_with_ops (NEGATE_EXPR, negrhs,
1905                                                          rhs, NULL);
1906       gimple_stmt_iterator gsi = gsi_for_stmt (def_stmt);
1907       set_vinfo_for_stmt (negate_stmt, new_stmt_vec_info (negate_stmt,
1908                                                           loop_info, NULL));
1909       gsi_insert_before (&gsi, negate_stmt, GSI_NEW_STMT);
1910       gimple_assign_set_rhs2 (def_stmt, negrhs);
1911       gimple_assign_set_rhs_code (def_stmt, PLUS_EXPR);
1912       update_stmt (def_stmt);
1913     }
1914
1915   /* Reduction is safe. We're dealing with one of the following:
1916      1) integer arithmetic and no trapv
1917      2) floating point arithmetic, and special flags permit this optimization
1918      3) nested cycle (i.e., outer loop vectorization).  */
1919   if (TREE_CODE (op1) == SSA_NAME)
1920     def1 = SSA_NAME_DEF_STMT (op1);
1921
1922   if (TREE_CODE (op2) == SSA_NAME)
1923     def2 = SSA_NAME_DEF_STMT (op2);
1924
1925   if (code != COND_EXPR
1926       && (!def1 || !def2 || gimple_nop_p (def1) || gimple_nop_p (def2)))
1927     {
1928       if (vect_print_dump_info (REPORT_DETAILS))
1929         report_vect_op (def_stmt, "reduction: no defs for operands: ");
1930       return NULL;
1931     }
1932
1933   /* Check that one def is the reduction def, defined by PHI,
1934      the other def is either defined in the loop ("vect_internal_def"),
1935      or it's an induction (defined by a loop-header phi-node).  */
1936
1937   if (def2 && def2 == phi
1938       && (code == COND_EXPR
1939           || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1))
1940               && (is_gimple_assign (def1)
1941                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
1942                       == vect_induction_def
1943                   || (gimple_code (def1) == GIMPLE_PHI
1944                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
1945                           == vect_internal_def
1946                       && !is_loop_header_bb_p (gimple_bb (def1)))))))
1947     {
1948       if (vect_print_dump_info (REPORT_DETAILS))
1949         report_vect_op (def_stmt, "detected reduction: ");
1950       return def_stmt;
1951     }
1952   else if (def1 && def1 == phi
1953            && (code == COND_EXPR
1954                || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2))
1955                    && (is_gimple_assign (def2)
1956                        || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
1957                            == vect_induction_def
1958                        || (gimple_code (def2) == GIMPLE_PHI
1959                            && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
1960                                == vect_internal_def
1961                            && !is_loop_header_bb_p (gimple_bb (def2)))))))
1962     {
1963       if (check_reduction)
1964         {
1965           /* Swap operands (just for simplicity - so that the rest of the code
1966              can assume that the reduction variable is always the last (second)
1967              argument).  */
1968           if (vect_print_dump_info (REPORT_DETAILS))
1969             report_vect_op (def_stmt,
1970                             "detected reduction: need to swap operands: ");
1971
1972           swap_tree_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
1973                               gimple_assign_rhs2_ptr (def_stmt));
1974         }
1975       else
1976         {
1977           if (vect_print_dump_info (REPORT_DETAILS))
1978             report_vect_op (def_stmt, "detected reduction: ");
1979         }
1980
1981       return def_stmt;
1982     }
1983   else
1984     {
1985       if (vect_print_dump_info (REPORT_DETAILS))
1986         report_vect_op (def_stmt, "reduction: unknown pattern: ");
1987
1988       return NULL;
1989     }
1990 }
1991
1992 /* Wrapper around vect_is_simple_reduction_1, that won't modify code
1993    in-place.  Arguments as there.  */
1994
1995 static gimple
1996 vect_is_simple_reduction (loop_vec_info loop_info, gimple phi,
1997                           bool check_reduction, bool *double_reduc)
1998 {
1999   return vect_is_simple_reduction_1 (loop_info, phi, check_reduction,
2000                                      double_reduc, false);
2001 }
2002
2003 /* Wrapper around vect_is_simple_reduction_1, which will modify code
2004    in-place if it enables detection of more reductions.  Arguments
2005    as there.  */
2006
2007 gimple
2008 vect_force_simple_reduction (loop_vec_info loop_info, gimple phi,
2009                           bool check_reduction, bool *double_reduc)
2010 {
2011   return vect_is_simple_reduction_1 (loop_info, phi, check_reduction,
2012                                      double_reduc, true);
2013 }
2014
2015 /* Calculate the cost of one scalar iteration of the loop.  */
2016 int
2017 vect_get_single_scalar_iteraion_cost (loop_vec_info loop_vinfo)
2018 {
2019   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2020   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
2021   int nbbs = loop->num_nodes, factor, scalar_single_iter_cost = 0;
2022   int innerloop_iters, i, stmt_cost;
2023
2024   /* Count statements in scalar loop. Using this as scalar cost for a single
2025      iteration for now.
2026
2027      TODO: Add outer loop support.
2028
2029      TODO: Consider assigning different costs to different scalar
2030      statements.  */
2031
2032   /* FORNOW.  */
2033   if (loop->inner)
2034     innerloop_iters = 50; /* FIXME */
2035
2036   for (i = 0; i < nbbs; i++)
2037     {
2038       gimple_stmt_iterator si;
2039       basic_block bb = bbs[i];
2040
2041       if (bb->loop_father == loop->inner)
2042         factor = innerloop_iters;
2043       else
2044         factor = 1;
2045
2046       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
2047         {
2048           gimple stmt = gsi_stmt (si);
2049
2050           if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
2051             continue;
2052
2053           if (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt)))
2054             {
2055               if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))))
2056                stmt_cost = vect_get_cost (scalar_load);
2057              else
2058                stmt_cost = vect_get_cost (scalar_store);
2059             }
2060           else
2061             stmt_cost = vect_get_cost (scalar_stmt);
2062
2063           scalar_single_iter_cost += stmt_cost * factor;
2064         }
2065     }
2066   return scalar_single_iter_cost;
2067 }
2068
2069 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times.  */
2070 int
2071 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
2072                              int *peel_iters_epilogue,
2073                              int scalar_single_iter_cost)
2074 {
2075   int peel_guard_costs = 0;
2076   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2077
2078   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2079     {
2080       *peel_iters_epilogue = vf/2;
2081       if (vect_print_dump_info (REPORT_COST))
2082         fprintf (vect_dump, "cost model: "
2083                             "epilogue peel iters set to vf/2 because "
2084                             "loop iterations are unknown .");
2085
2086       /* If peeled iterations are known but number of scalar loop
2087          iterations are unknown, count a taken branch per peeled loop.  */
2088       peel_guard_costs =  2 * vect_get_cost (cond_branch_taken);
2089     }
2090   else
2091     {
2092       int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
2093       peel_iters_prologue = niters < peel_iters_prologue ?
2094                             niters : peel_iters_prologue;
2095       *peel_iters_epilogue = (niters - peel_iters_prologue) % vf;
2096     }
2097
2098    return (peel_iters_prologue * scalar_single_iter_cost)
2099             + (*peel_iters_epilogue * scalar_single_iter_cost)
2100            + peel_guard_costs;
2101 }
2102
2103 /* Function vect_estimate_min_profitable_iters
2104
2105    Return the number of iterations required for the vector version of the
2106    loop to be profitable relative to the cost of the scalar version of the
2107    loop.
2108
2109    TODO: Take profile info into account before making vectorization
2110    decisions, if available.  */
2111
2112 int
2113 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo)
2114 {
2115   int i;
2116   int min_profitable_iters;
2117   int peel_iters_prologue;
2118   int peel_iters_epilogue;
2119   int vec_inside_cost = 0;
2120   int vec_outside_cost = 0;
2121   int scalar_single_iter_cost = 0;
2122   int scalar_outside_cost = 0;
2123   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2124   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2125   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
2126   int nbbs = loop->num_nodes;
2127   int npeel = LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo);
2128   int peel_guard_costs = 0;
2129   int innerloop_iters = 0, factor;
2130   VEC (slp_instance, heap) *slp_instances;
2131   slp_instance instance;
2132
2133   /* Cost model disabled.  */
2134   if (!flag_vect_cost_model)
2135     {
2136       if (vect_print_dump_info (REPORT_COST))
2137         fprintf (vect_dump, "cost model disabled.");
2138       return 0;
2139     }
2140
2141   /* Requires loop versioning tests to handle misalignment.  */
2142   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2143     {
2144       /*  FIXME: Make cost depend on complexity of individual check.  */
2145       vec_outside_cost +=
2146         VEC_length (gimple, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));
2147       if (vect_print_dump_info (REPORT_COST))
2148         fprintf (vect_dump, "cost model: Adding cost of checks for loop "
2149                  "versioning to treat misalignment.\n");
2150     }
2151
2152   /* Requires loop versioning with alias checks.  */
2153   if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2154     {
2155       /*  FIXME: Make cost depend on complexity of individual check.  */
2156       vec_outside_cost +=
2157         VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo));
2158       if (vect_print_dump_info (REPORT_COST))
2159         fprintf (vect_dump, "cost model: Adding cost of checks for loop "
2160                  "versioning aliasing.\n");
2161     }
2162
2163   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2164       || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2165     vec_outside_cost += vect_get_cost (cond_branch_taken);
2166
2167   /* Count statements in scalar loop.  Using this as scalar cost for a single
2168      iteration for now.
2169
2170      TODO: Add outer loop support.
2171
2172      TODO: Consider assigning different costs to different scalar
2173      statements.  */
2174
2175   /* FORNOW.  */
2176   if (loop->inner)
2177     innerloop_iters = 50; /* FIXME */
2178
2179   for (i = 0; i < nbbs; i++)
2180     {
2181       gimple_stmt_iterator si;
2182       basic_block bb = bbs[i];
2183
2184       if (bb->loop_father == loop->inner)
2185         factor = innerloop_iters;
2186       else
2187         factor = 1;
2188
2189       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
2190         {
2191           gimple stmt = gsi_stmt (si);
2192           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2193           /* Skip stmts that are not vectorized inside the loop.  */
2194           if (!STMT_VINFO_RELEVANT_P (stmt_info)
2195               && (!STMT_VINFO_LIVE_P (stmt_info)
2196                   || STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def))
2197             continue;
2198           vec_inside_cost += STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) * factor;
2199           /* FIXME: for stmts in the inner-loop in outer-loop vectorization,
2200              some of the "outside" costs are generated inside the outer-loop.  */
2201           vec_outside_cost += STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info);
2202         }
2203     }
2204
2205   scalar_single_iter_cost = vect_get_single_scalar_iteraion_cost (loop_vinfo);
2206
2207   /* Add additional cost for the peeled instructions in prologue and epilogue
2208      loop.
2209
2210      FORNOW: If we don't know the value of peel_iters for prologue or epilogue
2211      at compile-time - we assume it's vf/2 (the worst would be vf-1).
2212
2213      TODO: Build an expression that represents peel_iters for prologue and
2214      epilogue to be used in a run-time test.  */
2215
2216   if (npeel  < 0)
2217     {
2218       peel_iters_prologue = vf/2;
2219       if (vect_print_dump_info (REPORT_COST))
2220         fprintf (vect_dump, "cost model: "
2221                  "prologue peel iters set to vf/2.");
2222
2223       /* If peeling for alignment is unknown, loop bound of main loop becomes
2224          unknown.  */
2225       peel_iters_epilogue = vf/2;
2226       if (vect_print_dump_info (REPORT_COST))
2227         fprintf (vect_dump, "cost model: "
2228                  "epilogue peel iters set to vf/2 because "
2229                  "peeling for alignment is unknown .");
2230
2231       /* If peeled iterations are unknown, count a taken branch and a not taken
2232          branch per peeled loop. Even if scalar loop iterations are known,
2233          vector iterations are not known since peeled prologue iterations are
2234          not known. Hence guards remain the same.  */
2235       peel_guard_costs +=  2 * (vect_get_cost (cond_branch_taken)
2236                                 + vect_get_cost (cond_branch_not_taken));
2237       vec_outside_cost += (peel_iters_prologue * scalar_single_iter_cost)
2238                            + (peel_iters_epilogue * scalar_single_iter_cost)
2239                            + peel_guard_costs;
2240     }
2241   else
2242     {
2243       peel_iters_prologue = npeel;
2244       vec_outside_cost += vect_get_known_peeling_cost (loop_vinfo,
2245                                     peel_iters_prologue, &peel_iters_epilogue,
2246                                     scalar_single_iter_cost);
2247     }
2248
2249   /* FORNOW: The scalar outside cost is incremented in one of the
2250      following ways:
2251
2252      1. The vectorizer checks for alignment and aliasing and generates
2253      a condition that allows dynamic vectorization.  A cost model
2254      check is ANDED with the versioning condition.  Hence scalar code
2255      path now has the added cost of the versioning check.
2256
2257        if (cost > th & versioning_check)
2258          jmp to vector code
2259
2260      Hence run-time scalar is incremented by not-taken branch cost.
2261
2262      2. The vectorizer then checks if a prologue is required.  If the
2263      cost model check was not done before during versioning, it has to
2264      be done before the prologue check.
2265
2266        if (cost <= th)
2267          prologue = scalar_iters
2268        if (prologue == 0)
2269          jmp to vector code
2270        else
2271          execute prologue
2272        if (prologue == num_iters)
2273          go to exit
2274
2275      Hence the run-time scalar cost is incremented by a taken branch,
2276      plus a not-taken branch, plus a taken branch cost.
2277
2278      3. The vectorizer then checks if an epilogue is required.  If the
2279      cost model check was not done before during prologue check, it
2280      has to be done with the epilogue check.
2281
2282        if (prologue == 0)
2283          jmp to vector code
2284        else
2285          execute prologue
2286        if (prologue == num_iters)
2287          go to exit
2288        vector code:
2289          if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
2290            jmp to epilogue
2291
2292      Hence the run-time scalar cost should be incremented by 2 taken
2293      branches.
2294
2295      TODO: The back end may reorder the BBS's differently and reverse
2296      conditions/branch directions.  Change the estimates below to
2297      something more reasonable.  */
2298
2299   /* If the number of iterations is known and we do not do versioning, we can
2300      decide whether to vectorize at compile time. Hence the scalar version
2301      do not carry cost model guard costs.  */
2302   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2303       || LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2304       || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2305     {
2306       /* Cost model check occurs at versioning.  */
2307       if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2308           || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2309         scalar_outside_cost += vect_get_cost (cond_branch_not_taken);
2310       else
2311         {
2312           /* Cost model check occurs at prologue generation.  */
2313           if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2314             scalar_outside_cost += 2 * vect_get_cost (cond_branch_taken)
2315                                    + vect_get_cost (cond_branch_not_taken);
2316           /* Cost model check occurs at epilogue generation.  */
2317           else
2318             scalar_outside_cost += 2 * vect_get_cost (cond_branch_taken);
2319         }
2320     }
2321
2322   /* Add SLP costs.  */
2323   slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
2324   for (i = 0; VEC_iterate (slp_instance, slp_instances, i, instance); i++)
2325     {
2326       vec_outside_cost += SLP_INSTANCE_OUTSIDE_OF_LOOP_COST (instance);
2327       vec_inside_cost += SLP_INSTANCE_INSIDE_OF_LOOP_COST (instance);
2328     }
2329
2330   /* Calculate number of iterations required to make the vector version
2331      profitable, relative to the loop bodies only. The following condition
2332      must hold true:
2333      SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
2334      where
2335      SIC = scalar iteration cost, VIC = vector iteration cost,
2336      VOC = vector outside cost, VF = vectorization factor,
2337      PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
2338      SOC = scalar outside cost for run time cost model check.  */
2339
2340   if ((scalar_single_iter_cost * vf) > vec_inside_cost)
2341     {
2342       if (vec_outside_cost <= 0)
2343         min_profitable_iters = 1;
2344       else
2345         {
2346           min_profitable_iters = ((vec_outside_cost - scalar_outside_cost) * vf
2347                                   - vec_inside_cost * peel_iters_prologue
2348                                   - vec_inside_cost * peel_iters_epilogue)
2349                                  / ((scalar_single_iter_cost * vf)
2350                                     - vec_inside_cost);
2351
2352           if ((scalar_single_iter_cost * vf * min_profitable_iters)
2353               <= ((vec_inside_cost * min_profitable_iters)
2354                   + ((vec_outside_cost - scalar_outside_cost) * vf)))
2355             min_profitable_iters++;
2356         }
2357     }
2358   /* vector version will never be profitable.  */
2359   else
2360     {
2361       if (vect_print_dump_info (REPORT_COST))
2362         fprintf (vect_dump, "cost model: the vector iteration cost = %d "
2363                  "divided by the scalar iteration cost = %d "
2364                  "is greater or equal to the vectorization factor = %d.",
2365                  vec_inside_cost, scalar_single_iter_cost, vf);
2366       return -1;
2367     }
2368
2369   if (vect_print_dump_info (REPORT_COST))
2370     {
2371       fprintf (vect_dump, "Cost model analysis: \n");
2372       fprintf (vect_dump, "  Vector inside of loop cost: %d\n",
2373                vec_inside_cost);
2374       fprintf (vect_dump, "  Vector outside of loop cost: %d\n",
2375                vec_outside_cost);
2376       fprintf (vect_dump, "  Scalar iteration cost: %d\n",
2377                scalar_single_iter_cost);
2378       fprintf (vect_dump, "  Scalar outside cost: %d\n", scalar_outside_cost);
2379       fprintf (vect_dump, "  prologue iterations: %d\n",
2380                peel_iters_prologue);
2381       fprintf (vect_dump, "  epilogue iterations: %d\n",
2382                peel_iters_epilogue);
2383       fprintf (vect_dump, "  Calculated minimum iters for profitability: %d\n",
2384                min_profitable_iters);
2385     }
2386
2387   min_profitable_iters =
2388         min_profitable_iters < vf ? vf : min_profitable_iters;
2389
2390   /* Because the condition we create is:
2391      if (niters <= min_profitable_iters)
2392        then skip the vectorized loop.  */
2393   min_profitable_iters--;
2394
2395   if (vect_print_dump_info (REPORT_COST))
2396     fprintf (vect_dump, "  Profitability threshold = %d\n",
2397              min_profitable_iters);
2398
2399   return min_profitable_iters;
2400 }
2401
2402
2403 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
2404    functions. Design better to avoid maintenance issues.  */
2405
2406 /* Function vect_model_reduction_cost.
2407
2408    Models cost for a reduction operation, including the vector ops
2409    generated within the strip-mine loop, the initial definition before
2410    the loop, and the epilogue code that must be generated.  */
2411
2412 static bool
2413 vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
2414                            int ncopies)
2415 {
2416   int outer_cost = 0;
2417   enum tree_code code;
2418   optab optab;
2419   tree vectype;
2420   gimple stmt, orig_stmt;
2421   tree reduction_op;
2422   enum machine_mode mode;
2423   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2424   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2425
2426
2427   /* Cost of reduction op inside loop.  */
2428   STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info)
2429     += ncopies * vect_get_cost (vector_stmt);
2430
2431   stmt = STMT_VINFO_STMT (stmt_info);
2432
2433   switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
2434     {
2435     case GIMPLE_SINGLE_RHS:
2436       gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt)) == ternary_op);
2437       reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), 2);
2438       break;
2439     case GIMPLE_UNARY_RHS:
2440       reduction_op = gimple_assign_rhs1 (stmt);
2441       break;
2442     case GIMPLE_BINARY_RHS:
2443       reduction_op = gimple_assign_rhs2 (stmt);
2444       break;
2445     default:
2446       gcc_unreachable ();
2447     }
2448
2449   vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
2450   if (!vectype)
2451     {
2452       if (vect_print_dump_info (REPORT_COST))
2453         {
2454           fprintf (vect_dump, "unsupported data-type ");
2455           print_generic_expr (vect_dump, TREE_TYPE (reduction_op), TDF_SLIM);
2456         }
2457       return false;
2458    }
2459
2460   mode = TYPE_MODE (vectype);
2461   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
2462
2463   if (!orig_stmt)
2464     orig_stmt = STMT_VINFO_STMT (stmt_info);
2465
2466   code = gimple_assign_rhs_code (orig_stmt);
2467
2468   /* Add in cost for initial definition.  */
2469   outer_cost += vect_get_cost (scalar_to_vec);
2470
2471   /* Determine cost of epilogue code.
2472
2473      We have a reduction operator that will reduce the vector in one statement.
2474      Also requires scalar extract.  */
2475
2476   if (!nested_in_vect_loop_p (loop, orig_stmt))
2477     {
2478       if (reduc_code != ERROR_MARK)
2479         outer_cost += vect_get_cost (vector_stmt)
2480                       + vect_get_cost (vec_to_scalar);
2481       else
2482         {
2483           int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
2484           tree bitsize =
2485             TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
2486           int element_bitsize = tree_low_cst (bitsize, 1);
2487           int nelements = vec_size_in_bits / element_bitsize;
2488
2489           optab = optab_for_tree_code (code, vectype, optab_default);
2490
2491           /* We have a whole vector shift available.  */
2492           if (VECTOR_MODE_P (mode)
2493               && optab_handler (optab, mode) != CODE_FOR_nothing
2494               && optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
2495             /* Final reduction via vector shifts and the reduction operator. Also
2496                requires scalar extract.  */
2497             outer_cost += ((exact_log2(nelements) * 2)
2498               * vect_get_cost (vector_stmt)
2499               + vect_get_cost (vec_to_scalar));
2500           else
2501             /* Use extracts and reduction op for final reduction.  For N elements,
2502                we have N extracts and N-1 reduction ops.  */
2503             outer_cost += ((nelements + nelements - 1)
2504               * vect_get_cost (vector_stmt));
2505         }
2506     }
2507
2508   STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info) = outer_cost;
2509
2510   if (vect_print_dump_info (REPORT_COST))
2511     fprintf (vect_dump, "vect_model_reduction_cost: inside_cost = %d, "
2512              "outside_cost = %d .", STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info),
2513              STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info));
2514
2515   return true;
2516 }
2517
2518
2519 /* Function vect_model_induction_cost.
2520
2521    Models cost for induction operations.  */
2522
2523 static void
2524 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
2525 {
2526   /* loop cost for vec_loop.  */
2527   STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info)
2528     = ncopies * vect_get_cost (vector_stmt);
2529   /* prologue cost for vec_init and vec_step.  */
2530   STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info)
2531     = 2 * vect_get_cost (scalar_to_vec);
2532
2533   if (vect_print_dump_info (REPORT_COST))
2534     fprintf (vect_dump, "vect_model_induction_cost: inside_cost = %d, "
2535              "outside_cost = %d .", STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info),
2536              STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info));
2537 }
2538
2539
2540 /* Function get_initial_def_for_induction
2541
2542    Input:
2543    STMT - a stmt that performs an induction operation in the loop.
2544    IV_PHI - the initial value of the induction variable
2545
2546    Output:
2547    Return a vector variable, initialized with the first VF values of
2548    the induction variable. E.g., for an iv with IV_PHI='X' and
2549    evolution S, for a vector of 4 units, we want to return:
2550    [X, X + S, X + 2*S, X + 3*S].  */
2551
2552 static tree
2553 get_initial_def_for_induction (gimple iv_phi)
2554 {
2555   stmt_vec_info stmt_vinfo = vinfo_for_stmt (iv_phi);
2556   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
2557   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2558   tree scalar_type = TREE_TYPE (gimple_phi_result (iv_phi));
2559   tree vectype;
2560   int nunits;
2561   edge pe = loop_preheader_edge (loop);
2562   struct loop *iv_loop;
2563   basic_block new_bb;
2564   tree vec, vec_init, vec_step, t;
2565   tree access_fn;
2566   tree new_var;
2567   tree new_name;
2568   gimple init_stmt, induction_phi, new_stmt;
2569   tree induc_def, vec_def, vec_dest;
2570   tree init_expr, step_expr;
2571   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2572   int i;
2573   bool ok;
2574   int ncopies;
2575   tree expr;
2576   stmt_vec_info phi_info = vinfo_for_stmt (iv_phi);
2577   bool nested_in_vect_loop = false;
2578   gimple_seq stmts = NULL;
2579   imm_use_iterator imm_iter;
2580   use_operand_p use_p;
2581   gimple exit_phi;
2582   edge latch_e;
2583   tree loop_arg;
2584   gimple_stmt_iterator si;
2585   basic_block bb = gimple_bb (iv_phi);
2586   tree stepvectype;
2587
2588   vectype = get_vectype_for_scalar_type (scalar_type);
2589   gcc_assert (vectype);
2590   nunits = TYPE_VECTOR_SUBPARTS (vectype);
2591   ncopies = vf / nunits;
2592
2593   gcc_assert (phi_info);
2594   gcc_assert (ncopies >= 1);
2595
2596   /* Find the first insertion point in the BB.  */
2597   si = gsi_after_labels (bb);
2598
2599   if (INTEGRAL_TYPE_P (scalar_type))
2600     step_expr = build_int_cst (scalar_type, 0);
2601   else if (POINTER_TYPE_P (scalar_type))
2602     step_expr = build_int_cst (sizetype, 0);
2603   else
2604     step_expr = build_real (scalar_type, dconst0);
2605
2606   /* Is phi in an inner-loop, while vectorizing an enclosing outer-loop?  */
2607   if (nested_in_vect_loop_p (loop, iv_phi))
2608     {
2609       nested_in_vect_loop = true;
2610       iv_loop = loop->inner;
2611     }
2612   else
2613     iv_loop = loop;
2614   gcc_assert (iv_loop == (gimple_bb (iv_phi))->loop_father);
2615
2616   latch_e = loop_latch_edge (iv_loop);
2617   loop_arg = PHI_ARG_DEF_FROM_EDGE (iv_phi, latch_e);
2618
2619   access_fn = analyze_scalar_evolution (iv_loop, PHI_RESULT (iv_phi));
2620   gcc_assert (access_fn);
2621   ok = vect_is_simple_iv_evolution (iv_loop->num, access_fn,
2622                                     &init_expr, &step_expr);
2623   gcc_assert (ok);
2624   pe = loop_preheader_edge (iv_loop);
2625
2626   /* Create the vector that holds the initial_value of the induction.  */
2627   if (nested_in_vect_loop)
2628     {
2629       /* iv_loop is nested in the loop to be vectorized.  init_expr had already
2630          been created during vectorization of previous stmts; We obtain it from
2631          the STMT_VINFO_VEC_STMT of the defining stmt. */
2632       tree iv_def = PHI_ARG_DEF_FROM_EDGE (iv_phi,
2633                                            loop_preheader_edge (iv_loop));
2634       vec_init = vect_get_vec_def_for_operand (iv_def, iv_phi, NULL);
2635     }
2636   else
2637     {
2638       /* iv_loop is the loop to be vectorized. Create:
2639          vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr)  */
2640       new_var = vect_get_new_vect_var (scalar_type, vect_scalar_var, "var_");
2641       add_referenced_var (new_var);
2642
2643       new_name = force_gimple_operand (init_expr, &stmts, false, new_var);
2644       if (stmts)
2645         {
2646           new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
2647           gcc_assert (!new_bb);
2648         }
2649
2650       t = NULL_TREE;
2651       t = tree_cons (NULL_TREE, init_expr, t);
2652       for (i = 1; i < nunits; i++)
2653         {
2654           /* Create: new_name_i = new_name + step_expr  */
2655           enum tree_code code = POINTER_TYPE_P (scalar_type)
2656                                 ? POINTER_PLUS_EXPR : PLUS_EXPR;
2657           init_stmt = gimple_build_assign_with_ops (code, new_var,
2658                                                     new_name, step_expr);
2659           new_name = make_ssa_name (new_var, init_stmt);
2660           gimple_assign_set_lhs (init_stmt, new_name);
2661
2662           new_bb = gsi_insert_on_edge_immediate (pe, init_stmt);
2663           gcc_assert (!new_bb);
2664
2665           if (vect_print_dump_info (REPORT_DETAILS))
2666             {
2667               fprintf (vect_dump, "created new init_stmt: ");
2668               print_gimple_stmt (vect_dump, init_stmt, 0, TDF_SLIM);
2669             }
2670           t = tree_cons (NULL_TREE, new_name, t);
2671         }
2672       /* Create a vector from [new_name_0, new_name_1, ..., new_name_nunits-1]  */
2673       vec = build_constructor_from_list (vectype, nreverse (t));
2674       vec_init = vect_init_vector (iv_phi, vec, vectype, NULL);
2675     }
2676
2677
2678   /* Create the vector that holds the step of the induction.  */
2679   if (nested_in_vect_loop)
2680     /* iv_loop is nested in the loop to be vectorized. Generate:
2681        vec_step = [S, S, S, S]  */
2682     new_name = step_expr;
2683   else
2684     {
2685       /* iv_loop is the loop to be vectorized. Generate:
2686           vec_step = [VF*S, VF*S, VF*S, VF*S]  */
2687       expr = build_int_cst (TREE_TYPE (step_expr), vf);
2688       new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
2689                               expr, step_expr);
2690     }
2691
2692   t = NULL_TREE;
2693   for (i = 0; i < nunits; i++)
2694     t = tree_cons (NULL_TREE, unshare_expr (new_name), t);
2695   gcc_assert (CONSTANT_CLASS_P (new_name));
2696   stepvectype = get_vectype_for_scalar_type (TREE_TYPE (new_name));
2697   gcc_assert (stepvectype);
2698   vec = build_vector (stepvectype, t);
2699   vec_step = vect_init_vector (iv_phi, vec, stepvectype, NULL);
2700
2701
2702   /* Create the following def-use cycle:
2703      loop prolog:
2704          vec_init = ...
2705          vec_step = ...
2706      loop:
2707          vec_iv = PHI <vec_init, vec_loop>
2708          ...
2709          STMT
2710          ...
2711          vec_loop = vec_iv + vec_step;  */
2712
2713   /* Create the induction-phi that defines the induction-operand.  */
2714   vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
2715   add_referenced_var (vec_dest);
2716   induction_phi = create_phi_node (vec_dest, iv_loop->header);
2717   set_vinfo_for_stmt (induction_phi,
2718                       new_stmt_vec_info (induction_phi, loop_vinfo, NULL));
2719   induc_def = PHI_RESULT (induction_phi);
2720
2721   /* Create the iv update inside the loop  */
2722   new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, vec_dest,
2723                                            induc_def, vec_step);
2724   vec_def = make_ssa_name (vec_dest, new_stmt);
2725   gimple_assign_set_lhs (new_stmt, vec_def);
2726   gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
2727   set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo,
2728                                                    NULL));
2729
2730   /* Set the arguments of the phi node:  */
2731   add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
2732   add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
2733                UNKNOWN_LOCATION);
2734
2735
2736   /* In case that vectorization factor (VF) is bigger than the number
2737      of elements that we can fit in a vectype (nunits), we have to generate
2738      more than one vector stmt - i.e - we need to "unroll" the
2739      vector stmt by a factor VF/nunits.  For more details see documentation
2740      in vectorizable_operation.  */
2741
2742   if (ncopies > 1)
2743     {
2744       stmt_vec_info prev_stmt_vinfo;
2745       /* FORNOW. This restriction should be relaxed.  */
2746       gcc_assert (!nested_in_vect_loop);
2747
2748       /* Create the vector that holds the step of the induction.  */
2749       expr = build_int_cst (TREE_TYPE (step_expr), nunits);
2750       new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
2751                               expr, step_expr);
2752       t = NULL_TREE;
2753       for (i = 0; i < nunits; i++)
2754         t = tree_cons (NULL_TREE, unshare_expr (new_name), t);
2755       gcc_assert (CONSTANT_CLASS_P (new_name));
2756       vec = build_vector (stepvectype, t);
2757       vec_step = vect_init_vector (iv_phi, vec, stepvectype, NULL);
2758
2759       vec_def = induc_def;
2760       prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
2761       for (i = 1; i < ncopies; i++)
2762         {
2763           /* vec_i = vec_prev + vec_step  */
2764           new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, vec_dest,
2765                                                    vec_def, vec_step);
2766           vec_def = make_ssa_name (vec_dest, new_stmt);
2767           gimple_assign_set_lhs (new_stmt, vec_def);
2768
2769           gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
2770           set_vinfo_for_stmt (new_stmt,
2771                               new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
2772           STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
2773           prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
2774         }
2775     }
2776
2777   if (nested_in_vect_loop)
2778     {
2779       /* Find the loop-closed exit-phi of the induction, and record
2780          the final vector of induction results:  */
2781       exit_phi = NULL;
2782       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
2783         {
2784           if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (USE_STMT (use_p))))
2785             {
2786               exit_phi = USE_STMT (use_p);
2787               break;
2788             }
2789         }
2790       if (exit_phi)
2791         {
2792           stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
2793           /* FORNOW. Currently not supporting the case that an inner-loop induction
2794              is not used in the outer-loop (i.e. only outside the outer-loop).  */
2795           gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
2796                       && !STMT_VINFO_LIVE_P (stmt_vinfo));
2797
2798           STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
2799           if (vect_print_dump_info (REPORT_DETAILS))
2800             {
2801               fprintf (vect_dump, "vector of inductions after inner-loop:");
2802               print_gimple_stmt (vect_dump, new_stmt, 0, TDF_SLIM);
2803             }
2804         }
2805     }
2806
2807
2808   if (vect_print_dump_info (REPORT_DETAILS))
2809     {
2810       fprintf (vect_dump, "transform induction: created def-use cycle: ");
2811       print_gimple_stmt (vect_dump, induction_phi, 0, TDF_SLIM);
2812       fprintf (vect_dump, "\n");
2813       print_gimple_stmt (vect_dump, SSA_NAME_DEF_STMT (vec_def), 0, TDF_SLIM);
2814     }
2815
2816   STMT_VINFO_VEC_STMT (phi_info) = induction_phi;
2817   return induc_def;
2818 }
2819
2820
2821 /* Function get_initial_def_for_reduction
2822
2823    Input:
2824    STMT - a stmt that performs a reduction operation in the loop.
2825    INIT_VAL - the initial value of the reduction variable
2826
2827    Output:
2828    ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
2829         of the reduction (used for adjusting the epilog - see below).
2830    Return a vector variable, initialized according to the operation that STMT
2831         performs. This vector will be used as the initial value of the
2832         vector of partial results.
2833
2834    Option1 (adjust in epilog): Initialize the vector as follows:
2835      add/bit or/xor:    [0,0,...,0,0]
2836      mult/bit and:      [1,1,...,1,1]
2837      min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
2838    and when necessary (e.g. add/mult case) let the caller know
2839    that it needs to adjust the result by init_val.
2840
2841    Option2: Initialize the vector as follows:
2842      add/bit or/xor:    [init_val,0,0,...,0]
2843      mult/bit and:      [init_val,1,1,...,1]
2844      min/max/cond_expr: [init_val,init_val,...,init_val]
2845    and no adjustments are needed.
2846
2847    For example, for the following code:
2848
2849    s = init_val;
2850    for (i=0;i<n;i++)
2851      s = s + a[i];
2852
2853    STMT is 's = s + a[i]', and the reduction variable is 's'.
2854    For a vector of 4 units, we want to return either [0,0,0,init_val],
2855    or [0,0,0,0] and let the caller know that it needs to adjust
2856    the result at the end by 'init_val'.
2857
2858    FORNOW, we are using the 'adjust in epilog' scheme, because this way the
2859    initialization vector is simpler (same element in all entries), if
2860    ADJUSTMENT_DEF is not NULL, and Option2 otherwise.
2861
2862    A cost model should help decide between these two schemes.  */
2863
2864 tree
2865 get_initial_def_for_reduction (gimple stmt, tree init_val,
2866                                tree *adjustment_def)
2867 {
2868   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
2869   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
2870   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2871   tree scalar_type = TREE_TYPE (init_val);
2872   tree vectype = get_vectype_for_scalar_type (scalar_type);
2873   int nunits;
2874   enum tree_code code = gimple_assign_rhs_code (stmt);
2875   tree def_for_init;
2876   tree init_def;
2877   tree t = NULL_TREE;
2878   int i;
2879   bool nested_in_vect_loop = false;
2880   tree init_value;
2881   REAL_VALUE_TYPE real_init_val = dconst0;
2882   int int_init_val = 0;
2883   gimple def_stmt = NULL;
2884
2885   gcc_assert (vectype);
2886   nunits = TYPE_VECTOR_SUBPARTS (vectype);
2887
2888   gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
2889               || SCALAR_FLOAT_TYPE_P (scalar_type));
2890
2891   if (nested_in_vect_loop_p (loop, stmt))
2892     nested_in_vect_loop = true;
2893   else
2894     gcc_assert (loop == (gimple_bb (stmt))->loop_father);
2895
2896   /* In case of double reduction we only create a vector variable to be put
2897      in the reduction phi node. The actual statement creation is done in
2898      vect_create_epilog_for_reduction.  */
2899   if (adjustment_def && nested_in_vect_loop
2900       && TREE_CODE (init_val) == SSA_NAME
2901       && (def_stmt = SSA_NAME_DEF_STMT (init_val))
2902       && gimple_code (def_stmt) == GIMPLE_PHI
2903       && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2904       && vinfo_for_stmt (def_stmt)
2905       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2906           == vect_double_reduction_def)
2907     {
2908       *adjustment_def = NULL;
2909       return vect_create_destination_var (init_val, vectype);
2910     }
2911
2912   if (TREE_CONSTANT (init_val))
2913     {
2914       if (SCALAR_FLOAT_TYPE_P (scalar_type))
2915         init_value = build_real (scalar_type, TREE_REAL_CST (init_val));
2916       else
2917         init_value = build_int_cst (scalar_type, TREE_INT_CST_LOW (init_val));
2918     }
2919   else
2920     init_value = init_val;
2921
2922   switch (code)
2923     {
2924       case WIDEN_SUM_EXPR:
2925       case DOT_PROD_EXPR:
2926       case PLUS_EXPR:
2927       case MINUS_EXPR:
2928       case BIT_IOR_EXPR:
2929       case BIT_XOR_EXPR:
2930       case MULT_EXPR:
2931       case BIT_AND_EXPR:
2932         /* ADJUSMENT_DEF is NULL when called from
2933            vect_create_epilog_for_reduction to vectorize double reduction.  */
2934         if (adjustment_def)
2935           {
2936             if (nested_in_vect_loop)
2937               *adjustment_def = vect_get_vec_def_for_operand (init_val, stmt,
2938                                                               NULL);
2939             else
2940               *adjustment_def = init_val;
2941           }
2942
2943         if (code == MULT_EXPR)
2944           {
2945             real_init_val = dconst1;
2946             int_init_val = 1;
2947           }
2948
2949         if (code == BIT_AND_EXPR)
2950           int_init_val = -1;
2951
2952         if (SCALAR_FLOAT_TYPE_P (scalar_type))
2953           def_for_init = build_real (scalar_type, real_init_val);
2954         else
2955           def_for_init = build_int_cst (scalar_type, int_init_val);
2956
2957         /* Create a vector of '0' or '1' except the first element.  */
2958         for (i = nunits - 2; i >= 0; --i)
2959           t = tree_cons (NULL_TREE, def_for_init, t);
2960
2961         /* Option1: the first element is '0' or '1' as well.  */
2962         if (adjustment_def)
2963           {
2964             t = tree_cons (NULL_TREE, def_for_init, t);
2965             init_def = build_vector (vectype, t);
2966             break;
2967           }
2968
2969         /* Option2: the first element is INIT_VAL.  */
2970         t = tree_cons (NULL_TREE, init_value, t);
2971         if (TREE_CONSTANT (init_val))
2972           init_def = build_vector (vectype, t);
2973         else
2974           init_def = build_constructor_from_list (vectype, t);
2975
2976         break;
2977
2978       case MIN_EXPR:
2979       case MAX_EXPR:
2980       case COND_EXPR:
2981         if (adjustment_def)
2982           {
2983             *adjustment_def = NULL_TREE;
2984             init_def = vect_get_vec_def_for_operand (init_val, stmt, NULL);
2985             break;
2986           }
2987
2988         for (i = nunits - 1; i >= 0; --i)
2989           t = tree_cons (NULL_TREE, init_value, t);
2990
2991         if (TREE_CONSTANT (init_val))
2992           init_def = build_vector (vectype, t);
2993         else
2994           init_def = build_constructor_from_list (vectype, t);
2995
2996         break;
2997
2998       default:
2999         gcc_unreachable ();
3000     }
3001
3002   return init_def;
3003 }
3004
3005
3006 /* Function vect_create_epilog_for_reduction
3007
3008    Create code at the loop-epilog to finalize the result of a reduction
3009    computation.
3010
3011    VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector
3012      reduction statements.
3013    STMT is the scalar reduction stmt that is being vectorized.
3014    NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
3015      number of elements that we can fit in a vectype (nunits). In this case
3016      we have to generate more than one vector stmt - i.e - we need to "unroll"
3017      the vector stmt by a factor VF/nunits.  For more details see documentation
3018      in vectorizable_operation.
3019    REDUC_CODE is the tree-code for the epilog reduction.
3020    REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
3021      computation.
3022    REDUC_INDEX is the index of the operand in the right hand side of the
3023      statement that is defined by REDUCTION_PHI.
3024    DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
3025    SLP_NODE is an SLP node containing a group of reduction statements. The
3026      first one in this group is STMT.
3027
3028    This function:
3029    1. Creates the reduction def-use cycles: sets the arguments for
3030       REDUCTION_PHIS:
3031       The loop-entry argument is the vectorized initial-value of the reduction.
3032       The loop-latch argument is taken from VECT_DEFS - the vector of partial
3033       sums.
3034    2. "Reduces" each vector of partial results VECT_DEFS into a single result,
3035       by applying the operation specified by REDUC_CODE if available, or by
3036       other means (whole-vector shifts or a scalar loop).
3037       The function also creates a new phi node at the loop exit to preserve
3038       loop-closed form, as illustrated below.
3039
3040      The flow at the entry to this function:
3041
3042         loop:
3043           vec_def = phi <null, null>            # REDUCTION_PHI
3044           VECT_DEF = vector_stmt                # vectorized form of STMT
3045           s_loop = scalar_stmt                  # (scalar) STMT
3046         loop_exit:
3047           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
3048           use <s_out0>
3049           use <s_out0>
3050
3051      The above is transformed by this function into:
3052
3053         loop:
3054           vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
3055           VECT_DEF = vector_stmt                # vectorized form of STMT
3056           s_loop = scalar_stmt                  # (scalar) STMT
3057         loop_exit:
3058           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
3059           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
3060           v_out2 = reduce <v_out1>
3061           s_out3 = extract_field <v_out2, 0>
3062           s_out4 = adjust_result <s_out3>
3063           use <s_out4>
3064           use <s_out4>
3065 */
3066
3067 static void
3068 vect_create_epilog_for_reduction (VEC (tree, heap) *vect_defs, gimple stmt,
3069                                   int ncopies, enum tree_code reduc_code,
3070                                   VEC (gimple, heap) *reduction_phis,
3071                                   int reduc_index, bool double_reduc,
3072                                   slp_tree slp_node)
3073 {
3074   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3075   stmt_vec_info prev_phi_info;
3076   tree vectype;
3077   enum machine_mode mode;
3078   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3079   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
3080   basic_block exit_bb;
3081   tree scalar_dest;
3082   tree scalar_type;
3083   gimple new_phi = NULL, phi;
3084   gimple_stmt_iterator exit_gsi;
3085   tree vec_dest;
3086   tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
3087   gimple epilog_stmt = NULL;
3088   enum tree_code code = gimple_assign_rhs_code (stmt);
3089   gimple exit_phi;
3090   tree bitsize, bitpos;
3091   tree adjustment_def = NULL;
3092   tree vec_initial_def = NULL;
3093   tree reduction_op, expr, def;
3094   tree orig_name, scalar_result;
3095   imm_use_iterator imm_iter;
3096   use_operand_p use_p;
3097   bool extract_scalar_result = false;
3098   gimple use_stmt, orig_stmt, reduction_phi = NULL;
3099   bool nested_in_vect_loop = false;
3100   VEC (gimple, heap) *new_phis = NULL;
3101   enum vect_def_type dt = vect_unknown_def_type;
3102   int j, i;
3103   VEC (tree, heap) *scalar_results = NULL;
3104   unsigned int group_size = 1, k, ratio;
3105   VEC (tree, heap) *vec_initial_defs = NULL;
3106   VEC (gimple, heap) *phis;
3107
3108   if (slp_node)
3109     group_size = VEC_length (gimple, SLP_TREE_SCALAR_STMTS (slp_node));
3110
3111   if (nested_in_vect_loop_p (loop, stmt))
3112     {
3113       outer_loop = loop;
3114       loop = loop->inner;
3115       nested_in_vect_loop = true;
3116       gcc_assert (!slp_node);
3117     }
3118
3119   switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
3120     {
3121     case GIMPLE_SINGLE_RHS:
3122       gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt))
3123                                        == ternary_op);
3124       reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), reduc_index);
3125       break;
3126     case GIMPLE_UNARY_RHS:
3127       reduction_op = gimple_assign_rhs1 (stmt);
3128       break;
3129     case GIMPLE_BINARY_RHS:
3130       reduction_op = reduc_index ?
3131                      gimple_assign_rhs2 (stmt) : gimple_assign_rhs1 (stmt);
3132       break;
3133     default:
3134       gcc_unreachable ();
3135     }
3136
3137   vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
3138   gcc_assert (vectype);
3139   mode = TYPE_MODE (vectype);
3140
3141   /* 1. Create the reduction def-use cycle:
3142      Set the arguments of REDUCTION_PHIS, i.e., transform
3143
3144         loop:
3145           vec_def = phi <null, null>            # REDUCTION_PHI
3146           VECT_DEF = vector_stmt                # vectorized form of STMT
3147           ...
3148
3149      into:
3150
3151         loop:
3152           vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
3153           VECT_DEF = vector_stmt                # vectorized form of STMT
3154           ...
3155
3156      (in case of SLP, do it for all the phis). */
3157
3158   /* Get the loop-entry arguments.  */
3159   if (slp_node)
3160     vect_get_slp_defs (slp_node, &vec_initial_defs, NULL, reduc_index);
3161   else
3162     {
3163       vec_initial_defs = VEC_alloc (tree, heap, 1);
3164      /* For the case of reduction, vect_get_vec_def_for_operand returns
3165         the scalar def before the loop, that defines the initial value
3166         of the reduction variable.  */
3167       vec_initial_def = vect_get_vec_def_for_operand (reduction_op, stmt,
3168                                                       &adjustment_def);
3169       VEC_quick_push (tree, vec_initial_defs, vec_initial_def);
3170     }
3171
3172   /* Set phi nodes arguments.  */
3173   for (i = 0; VEC_iterate (gimple, reduction_phis, i, phi); i++)
3174     {
3175       tree vec_init_def = VEC_index (tree, vec_initial_defs, i);
3176       tree def = VEC_index (tree, vect_defs, i);
3177       for (j = 0; j < ncopies; j++)
3178         {
3179           /* Set the loop-entry arg of the reduction-phi.  */
3180           add_phi_arg (phi, vec_init_def, loop_preheader_edge (loop),
3181                        UNKNOWN_LOCATION);
3182
3183           /* Set the loop-latch arg for the reduction-phi.  */
3184           if (j > 0)
3185             def = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type, def);
3186
3187           add_phi_arg (phi, def, loop_latch_edge (loop), UNKNOWN_LOCATION);
3188
3189           if (vect_print_dump_info (REPORT_DETAILS))
3190             {
3191               fprintf (vect_dump, "transform reduction: created def-use"
3192                                   " cycle: ");
3193               print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
3194               fprintf (vect_dump, "\n");
3195               print_gimple_stmt (vect_dump, SSA_NAME_DEF_STMT (def), 0,
3196                                  TDF_SLIM);
3197             }
3198
3199           phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
3200         }
3201     }
3202
3203   VEC_free (tree, heap, vec_initial_defs);
3204
3205   /* 2. Create epilog code.
3206         The reduction epilog code operates across the elements of the vector
3207         of partial results computed by the vectorized loop.
3208         The reduction epilog code consists of:
3209
3210         step 1: compute the scalar result in a vector (v_out2)
3211         step 2: extract the scalar result (s_out3) from the vector (v_out2)
3212         step 3: adjust the scalar result (s_out3) if needed.
3213
3214         Step 1 can be accomplished using one the following three schemes:
3215           (scheme 1) using reduc_code, if available.
3216           (scheme 2) using whole-vector shifts, if available.
3217           (scheme 3) using a scalar loop. In this case steps 1+2 above are
3218                      combined.
3219
3220           The overall epilog code looks like this:
3221
3222           s_out0 = phi <s_loop>         # original EXIT_PHI
3223           v_out1 = phi <VECT_DEF>       # NEW_EXIT_PHI
3224           v_out2 = reduce <v_out1>              # step 1
3225           s_out3 = extract_field <v_out2, 0>    # step 2
3226           s_out4 = adjust_result <s_out3>       # step 3
3227
3228           (step 3 is optional, and steps 1 and 2 may be combined).
3229           Lastly, the uses of s_out0 are replaced by s_out4.  */
3230
3231
3232   /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
3233          v_out1 = phi <VECT_DEF>
3234          Store them in NEW_PHIS.  */
3235
3236   exit_bb = single_exit (loop)->dest;
3237   prev_phi_info = NULL;
3238   new_phis = VEC_alloc (gimple, heap, VEC_length (tree, vect_defs));
3239   for (i = 0; VEC_iterate (tree, vect_defs, i, def); i++)
3240     {
3241       for (j = 0; j < ncopies; j++)
3242         {
3243           phi = create_phi_node (SSA_NAME_VAR (def), exit_bb);
3244           set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo, NULL));
3245           if (j == 0)
3246             VEC_quick_push (gimple, new_phis, phi);
3247           else
3248             {
3249               def = vect_get_vec_def_for_stmt_copy (dt, def);
3250               STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
3251             }
3252
3253           SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
3254           prev_phi_info = vinfo_for_stmt (phi);
3255         }
3256     }
3257
3258   exit_gsi = gsi_after_labels (exit_bb);
3259
3260   /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
3261          (i.e. when reduc_code is not available) and in the final adjustment
3262          code (if needed).  Also get the original scalar reduction variable as
3263          defined in the loop.  In case STMT is a "pattern-stmt" (i.e. - it
3264          represents a reduction pattern), the tree-code and scalar-def are
3265          taken from the original stmt that the pattern-stmt (STMT) replaces.
3266          Otherwise (it is a regular reduction) - the tree-code and scalar-def
3267          are taken from STMT.  */
3268
3269   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3270   if (!orig_stmt)
3271     {
3272       /* Regular reduction  */
3273       orig_stmt = stmt;
3274     }
3275   else
3276     {
3277       /* Reduction pattern  */
3278       stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
3279       gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
3280       gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
3281     }
3282
3283   code = gimple_assign_rhs_code (orig_stmt);
3284   /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
3285      partial results are added and not subtracted.  */
3286   if (code == MINUS_EXPR)
3287     code = PLUS_EXPR;
3288
3289   scalar_dest = gimple_assign_lhs (orig_stmt);
3290   scalar_type = TREE_TYPE (scalar_dest);
3291   scalar_results = VEC_alloc (tree, heap, group_size);
3292   new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
3293   bitsize = TYPE_SIZE (scalar_type);
3294
3295   /* In case this is a reduction in an inner-loop while vectorizing an outer
3296      loop - we don't need to extract a single scalar result at the end of the
3297      inner-loop (unless it is double reduction, i.e., the use of reduction is
3298      outside the outer-loop). The final vector of partial results will be used
3299      in the vectorized outer-loop, or reduced to a scalar result at the end of
3300      the outer-loop.  */
3301   if (nested_in_vect_loop && !double_reduc)
3302     goto vect_finalize_reduction;
3303
3304   /* 2.3 Create the reduction code, using one of the three schemes described
3305          above. In SLP we simply need to extract all the elements from the
3306          vector (without reducing them), so we use scalar shifts.  */
3307   if (reduc_code != ERROR_MARK && !slp_node)
3308     {
3309       tree tmp;
3310
3311       /*** Case 1:  Create:
3312            v_out2 = reduc_expr <v_out1>  */
3313
3314       if (vect_print_dump_info (REPORT_DETAILS))
3315         fprintf (vect_dump, "Reduce using direct vector reduction.");
3316
3317       vec_dest = vect_create_destination_var (scalar_dest, vectype);
3318       new_phi = VEC_index (gimple, new_phis, 0);
3319       tmp = build1 (reduc_code, vectype,  PHI_RESULT (new_phi));
3320       epilog_stmt = gimple_build_assign (vec_dest, tmp);
3321       new_temp = make_ssa_name (vec_dest, epilog_stmt);
3322       gimple_assign_set_lhs (epilog_stmt, new_temp);
3323       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3324
3325       extract_scalar_result = true;
3326     }
3327   else
3328     {
3329       enum tree_code shift_code = ERROR_MARK;
3330       bool have_whole_vector_shift = true;
3331       int bit_offset;
3332       int element_bitsize = tree_low_cst (bitsize, 1);
3333       int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
3334       tree vec_temp;
3335
3336       if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3337         shift_code = VEC_RSHIFT_EXPR;
3338       else
3339         have_whole_vector_shift = false;
3340
3341       /* Regardless of whether we have a whole vector shift, if we're
3342          emulating the operation via tree-vect-generic, we don't want
3343          to use it.  Only the first round of the reduction is likely
3344          to still be profitable via emulation.  */
3345       /* ??? It might be better to emit a reduction tree code here, so that
3346          tree-vect-generic can expand the first round via bit tricks.  */
3347       if (!VECTOR_MODE_P (mode))
3348         have_whole_vector_shift = false;
3349       else
3350         {
3351           optab optab = optab_for_tree_code (code, vectype, optab_default);
3352           if (optab_handler (optab, mode) == CODE_FOR_nothing)
3353             have_whole_vector_shift = false;
3354         }
3355
3356       if (have_whole_vector_shift && !slp_node)
3357         {
3358           /*** Case 2: Create:
3359              for (offset = VS/2; offset >= element_size; offset/=2)
3360                 {
3361                   Create:  va' = vec_shift <va, offset>
3362                   Create:  va = vop <va, va'>
3363                 }  */
3364
3365           if (vect_print_dump_info (REPORT_DETAILS))
3366             fprintf (vect_dump, "Reduce using vector shifts");
3367
3368           vec_dest = vect_create_destination_var (scalar_dest, vectype);
3369           new_phi = VEC_index (gimple, new_phis, 0);
3370           new_temp = PHI_RESULT (new_phi);
3371           for (bit_offset = vec_size_in_bits/2;
3372                bit_offset >= element_bitsize;
3373                bit_offset /= 2)
3374             {
3375               tree bitpos = size_int (bit_offset);
3376
3377               epilog_stmt = gimple_build_assign_with_ops (shift_code,
3378                                                vec_dest, new_temp, bitpos);
3379               new_name = make_ssa_name (vec_dest, epilog_stmt);
3380               gimple_assign_set_lhs (epilog_stmt, new_name);
3381               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3382
3383               epilog_stmt = gimple_build_assign_with_ops (code, vec_dest,
3384                                                           new_name, new_temp);
3385               new_temp = make_ssa_name (vec_dest, epilog_stmt);
3386               gimple_assign_set_lhs (epilog_stmt, new_temp);
3387               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3388             }
3389
3390           extract_scalar_result = true;
3391         }
3392       else
3393         {
3394           tree rhs;
3395
3396           /*** Case 3: Create:
3397              s = extract_field <v_out2, 0>
3398              for (offset = element_size;
3399                   offset < vector_size;
3400                   offset += element_size;)
3401                {
3402                  Create:  s' = extract_field <v_out2, offset>
3403                  Create:  s = op <s, s'>  // For non SLP cases
3404                }  */
3405
3406           if (vect_print_dump_info (REPORT_DETAILS))
3407             fprintf (vect_dump, "Reduce using scalar code. ");
3408
3409           vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
3410           for (i = 0; VEC_iterate (gimple, new_phis, i, new_phi); i++)
3411             {
3412               vec_temp = PHI_RESULT (new_phi);
3413               rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
3414                             bitsize_zero_node);
3415               epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
3416               new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
3417               gimple_assign_set_lhs (epilog_stmt, new_temp);
3418               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3419
3420               /* In SLP we don't need to apply reduction operation, so we just
3421                  collect s' values in SCALAR_RESULTS.  */
3422               if (slp_node)
3423                 VEC_safe_push (tree, heap, scalar_results, new_temp);
3424
3425               for (bit_offset = element_bitsize;
3426                    bit_offset < vec_size_in_bits;
3427                    bit_offset += element_bitsize)
3428                 {
3429                   tree bitpos = bitsize_int (bit_offset);
3430                   tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
3431                                      bitsize, bitpos);
3432
3433                   epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
3434                   new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
3435                   gimple_assign_set_lhs (epilog_stmt, new_name);
3436                   gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3437
3438                   if (slp_node)
3439                     {
3440                       /* In SLP we don't need to apply reduction operation, so
3441                          we just collect s' values in SCALAR_RESULTS.  */
3442                       new_temp = new_name;
3443                       VEC_safe_push (tree, heap, scalar_results, new_name);
3444                     }
3445                   else
3446                     {
3447                       epilog_stmt = gimple_build_assign_with_ops (code,
3448                                           new_scalar_dest, new_name, new_temp);
3449                       new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
3450                       gimple_assign_set_lhs (epilog_stmt, new_temp);
3451                       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3452                     }
3453                 }
3454             }
3455
3456           /* The only case where we need to reduce scalar results in SLP, is
3457              unrolling. If the size of SCALAR_RESULTS is greater than
3458              GROUP_SIZE, we reduce them combining elements modulo
3459              GROUP_SIZE.  */
3460           if (slp_node)
3461             {
3462               tree res, first_res, new_res;
3463               gimple new_stmt;
3464
3465               /* Reduce multiple scalar results in case of SLP unrolling.  */
3466               for (j = group_size; VEC_iterate (tree, scalar_results, j, res);
3467                    j++)
3468                 {
3469                   first_res = VEC_index (tree, scalar_results, j % group_size);
3470                   new_stmt = gimple_build_assign_with_ops (code,
3471                                               new_scalar_dest, first_res, res);
3472                   new_res = make_ssa_name (new_scalar_dest, new_stmt);
3473                   gimple_assign_set_lhs (new_stmt, new_res);
3474                   gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
3475                   VEC_replace (tree, scalar_results, j % group_size, new_res);
3476                 }
3477             }
3478           else
3479             /* Not SLP - we have one scalar to keep in SCALAR_RESULTS.  */
3480             VEC_safe_push (tree, heap, scalar_results, new_temp);
3481
3482           extract_scalar_result = false;
3483         }
3484     }
3485
3486   /* 2.4  Extract the final scalar result.  Create:
3487           s_out3 = extract_field <v_out2, bitpos>  */
3488
3489   if (extract_scalar_result)
3490     {
3491       tree rhs;
3492
3493       if (vect_print_dump_info (REPORT_DETAILS))
3494         fprintf (vect_dump, "extract scalar result");
3495
3496       if (BYTES_BIG_ENDIAN)
3497         bitpos = size_binop (MULT_EXPR,
3498                              bitsize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1),
3499                              TYPE_SIZE (scalar_type));
3500       else
3501         bitpos = bitsize_zero_node;
3502
3503       rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp, bitsize, bitpos);
3504       epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
3505       new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
3506       gimple_assign_set_lhs (epilog_stmt, new_temp);
3507       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3508       VEC_safe_push (tree, heap, scalar_results, new_temp);
3509     }
3510
3511 vect_finalize_reduction:
3512
3513   /* 2.5 Adjust the final result by the initial value of the reduction
3514          variable. (When such adjustment is not needed, then
3515          'adjustment_def' is zero).  For example, if code is PLUS we create:
3516          new_temp = loop_exit_def + adjustment_def  */
3517
3518   if (adjustment_def)
3519     {
3520       gcc_assert (!slp_node);
3521       if (nested_in_vect_loop)
3522         {
3523           new_phi = VEC_index (gimple, new_phis, 0);
3524           gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
3525           expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
3526           new_dest = vect_create_destination_var (scalar_dest, vectype);
3527         }
3528       else
3529         {
3530           new_temp = VEC_index (tree, scalar_results, 0);
3531           gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
3532           expr = build2 (code, scalar_type, new_temp, adjustment_def);
3533           new_dest = vect_create_destination_var (scalar_dest, scalar_type);
3534         }
3535
3536       epilog_stmt = gimple_build_assign (new_dest, expr);
3537       new_temp = make_ssa_name (new_dest, epilog_stmt);
3538       gimple_assign_set_lhs (epilog_stmt, new_temp);
3539       SSA_NAME_DEF_STMT (new_temp) = epilog_stmt;
3540       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3541       if (nested_in_vect_loop)
3542         {
3543           set_vinfo_for_stmt (epilog_stmt,
3544                               new_stmt_vec_info (epilog_stmt, loop_vinfo,
3545                                                  NULL));
3546           STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
3547                 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
3548
3549           if (!double_reduc)
3550             VEC_quick_push (tree, scalar_results, new_temp);
3551           else
3552             VEC_replace (tree, scalar_results, 0, new_temp);
3553         }
3554       else
3555         VEC_replace (tree, scalar_results, 0, new_temp);
3556
3557       VEC_replace (gimple, new_phis, 0, epilog_stmt);
3558     }
3559
3560   /* 2.6  Handle the loop-exit phis. Replace the uses of scalar loop-exit
3561           phis with new adjusted scalar results, i.e., replace use <s_out0>
3562           with use <s_out4>.
3563
3564      Transform:
3565         loop_exit:
3566           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
3567           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
3568           v_out2 = reduce <v_out1>
3569           s_out3 = extract_field <v_out2, 0>
3570           s_out4 = adjust_result <s_out3>
3571           use <s_out0>
3572           use <s_out0>
3573
3574      into:
3575
3576         loop_exit:
3577           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
3578           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
3579           v_out2 = reduce <v_out1>
3580           s_out3 = extract_field <v_out2, 0>
3581           s_out4 = adjust_result <s_out3>
3582           use <s_out4>
3583           use <s_out4> */
3584
3585   /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
3586      case that GROUP_SIZE is greater than vectorization factor). Therefore, we
3587      need to match SCALAR_RESULTS with corresponding statements. The first
3588      (GROUP_SIZE / number of new vector stmts) scalar results correspond to
3589      the first vector stmt, etc.
3590      (RATIO is equal to (GROUP_SIZE / number of new vector stmts)).  */
3591   if (group_size > VEC_length (gimple, new_phis))
3592     {
3593       ratio = group_size / VEC_length (gimple, new_phis);
3594       gcc_assert (!(group_size % VEC_length (gimple, new_phis)));
3595     }
3596   else
3597     ratio = 1;
3598
3599   for (k = 0; k < group_size; k++)
3600     {
3601       if (k % ratio == 0)
3602         {
3603           epilog_stmt = VEC_index (gimple, new_phis, k / ratio);
3604           reduction_phi = VEC_index (gimple, reduction_phis, k / ratio);
3605         }
3606
3607       if (slp_node)
3608         {
3609           gimple current_stmt = VEC_index (gimple,
3610                                        SLP_TREE_SCALAR_STMTS (slp_node), k);
3611
3612           orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (current_stmt));
3613           /* SLP statements can't participate in patterns.  */
3614           gcc_assert (!orig_stmt);
3615           scalar_dest = gimple_assign_lhs (current_stmt);
3616         }
3617
3618       phis = VEC_alloc (gimple, heap, 3);
3619       /* Find the loop-closed-use at the loop exit of the original scalar
3620          result. (The reduction result is expected to have two immediate uses -
3621          one at the latch block, and one at the loop exit).  */
3622       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
3623         if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
3624           VEC_safe_push (gimple, heap, phis, USE_STMT (use_p));
3625
3626       /* We expect to have found an exit_phi because of loop-closed-ssa
3627          form.  */
3628       gcc_assert (!VEC_empty (gimple, phis));
3629
3630       for (i = 0; VEC_iterate (gimple, phis, i, exit_phi); i++)
3631         {
3632           if (outer_loop)
3633             {
3634               stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
3635               gimple vect_phi;
3636
3637               /* FORNOW. Currently not supporting the case that an inner-loop
3638                  reduction is not used in the outer-loop (but only outside the
3639                  outer-loop), unless it is double reduction.  */
3640               gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
3641                            && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
3642                           || double_reduc);
3643
3644               STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt;
3645               if (!double_reduc
3646                   || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
3647                       != vect_double_reduction_def)
3648                 continue;
3649
3650               /* Handle double reduction:
3651
3652                  stmt1: s1 = phi <s0, s2>  - double reduction phi (outer loop)
3653                  stmt2:   s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
3654                  stmt3:   s4 = use (s3)     - (regular) reduc stmt (inner loop)
3655                  stmt4: s2 = phi <s4>      - double reduction stmt (outer loop)
3656
3657                  At that point the regular reduction (stmt2 and stmt3) is
3658                  already vectorized, as well as the exit phi node, stmt4.
3659                  Here we vectorize the phi node of double reduction, stmt1, and
3660                  update all relevant statements.  */
3661
3662               /* Go through all the uses of s2 to find double reduction phi
3663                  node, i.e., stmt1 above.  */
3664               orig_name = PHI_RESULT (exit_phi);
3665               FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
3666                 {
3667                   stmt_vec_info use_stmt_vinfo = vinfo_for_stmt (use_stmt);
3668                   stmt_vec_info new_phi_vinfo;
3669                   tree vect_phi_init, preheader_arg, vect_phi_res, init_def;
3670                   basic_block bb = gimple_bb (use_stmt);
3671                   gimple use;
3672
3673                   /* Check that USE_STMT is really double reduction phi
3674                      node.  */
3675                   if (gimple_code (use_stmt) != GIMPLE_PHI
3676                       || gimple_phi_num_args (use_stmt) != 2
3677                       || !use_stmt_vinfo
3678                       || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
3679                           != vect_double_reduction_def
3680                       || bb->loop_father != outer_loop)
3681                     continue;
3682
3683                   /* Create vector phi node for double reduction:
3684                      vs1 = phi <vs0, vs2>
3685                      vs1 was created previously in this function by a call to
3686                        vect_get_vec_def_for_operand and is stored in
3687                        vec_initial_def;
3688                      vs2 is defined by EPILOG_STMT, the vectorized EXIT_PHI;
3689                      vs0 is created here.  */
3690
3691                   /* Create vector phi node.  */
3692                   vect_phi = create_phi_node (vec_initial_def, bb);
3693                   new_phi_vinfo = new_stmt_vec_info (vect_phi,
3694                                     loop_vec_info_for_loop (outer_loop), NULL);
3695                   set_vinfo_for_stmt (vect_phi, new_phi_vinfo);
3696
3697                   /* Create vs0 - initial def of the double reduction phi.  */
3698                   preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
3699                                              loop_preheader_edge (outer_loop));
3700                   init_def = get_initial_def_for_reduction (stmt,
3701                                                           preheader_arg, NULL);
3702                   vect_phi_init = vect_init_vector (use_stmt, init_def,
3703                                                     vectype, NULL);
3704
3705                   /* Update phi node arguments with vs0 and vs2.  */
3706                   add_phi_arg (vect_phi, vect_phi_init,
3707                                loop_preheader_edge (outer_loop),
3708                                UNKNOWN_LOCATION);
3709                   add_phi_arg (vect_phi, PHI_RESULT (epilog_stmt),
3710                                loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
3711                   if (vect_print_dump_info (REPORT_DETAILS))
3712                     {
3713                       fprintf (vect_dump, "created double reduction phi "
3714                                           "node: ");
3715                       print_gimple_stmt (vect_dump, vect_phi, 0, TDF_SLIM);
3716                     }
3717
3718                   vect_phi_res = PHI_RESULT (vect_phi);
3719
3720                   /* Replace the use, i.e., set the correct vs1 in the regular
3721                      reduction phi node. FORNOW, NCOPIES is always 1, so the
3722                      loop is redundant.  */
3723                   use = reduction_phi;
3724                   for (j = 0; j < ncopies; j++)
3725                     {
3726                       edge pr_edge = loop_preheader_edge (loop);
3727                       SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res);
3728                       use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
3729                     }
3730                 }
3731             }
3732
3733           /* Replace the uses:  */
3734           orig_name = PHI_RESULT (exit_phi);
3735           scalar_result = VEC_index (tree, scalar_results, k);
3736           FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
3737             FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
3738               SET_USE (use_p, scalar_result);
3739         }
3740
3741       VEC_free (gimple, heap, phis);
3742     }
3743
3744   VEC_free (tree, heap, scalar_results);
3745   VEC_free (gimple, heap, new_phis);
3746 }
3747
3748
3749 /* Function vectorizable_reduction.
3750
3751    Check if STMT performs a reduction operation that can be vectorized.
3752    If VEC_STMT is also passed, vectorize the STMT: create a vectorized
3753    stmt to replace it, put it in VEC_STMT, and insert it at GSI.
3754    Return FALSE if not a vectorizable STMT, TRUE otherwise.
3755
3756    This function also handles reduction idioms (patterns) that have been
3757    recognized in advance during vect_pattern_recog. In this case, STMT may be
3758    of this form:
3759      X = pattern_expr (arg0, arg1, ..., X)
3760    and it's STMT_VINFO_RELATED_STMT points to the last stmt in the original
3761    sequence that had been detected and replaced by the pattern-stmt (STMT).
3762
3763    In some cases of reduction patterns, the type of the reduction variable X is
3764    different than the type of the other arguments of STMT.
3765    In such cases, the vectype that is used when transforming STMT into a vector
3766    stmt is different than the vectype that is used to determine the
3767    vectorization factor, because it consists of a different number of elements
3768    than the actual number of elements that are being operated upon in parallel.
3769
3770    For example, consider an accumulation of shorts into an int accumulator.
3771    On some targets it's possible to vectorize this pattern operating on 8
3772    shorts at a time (hence, the vectype for purposes of determining the
3773    vectorization factor should be V8HI); on the other hand, the vectype that
3774    is used to create the vector form is actually V4SI (the type of the result).
3775
3776    Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
3777    indicates what is the actual level of parallelism (V8HI in the example), so
3778    that the right vectorization factor would be derived. This vectype
3779    corresponds to the type of arguments to the reduction stmt, and should *NOT*
3780    be used to create the vectorized stmt. The right vectype for the vectorized
3781    stmt is obtained from the type of the result X:
3782         get_vectype_for_scalar_type (TREE_TYPE (X))
3783
3784    This means that, contrary to "regular" reductions (or "regular" stmts in
3785    general), the following equation:
3786       STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
3787    does *NOT* necessarily hold for reduction patterns.  */
3788
3789 bool
3790 vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
3791                         gimple *vec_stmt, slp_tree slp_node)
3792 {
3793   tree vec_dest;
3794   tree scalar_dest;
3795   tree loop_vec_def0 = NULL_TREE, loop_vec_def1 = NULL_TREE;
3796   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3797   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
3798   tree vectype_in = NULL_TREE;
3799   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3800   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3801   enum tree_code code, orig_code, epilog_reduc_code;
3802   enum machine_mode vec_mode;
3803   int op_type;
3804   optab optab, reduc_optab;
3805   tree new_temp = NULL_TREE;
3806   tree def;
3807   gimple def_stmt;
3808   enum vect_def_type dt;
3809   gimple new_phi = NULL;
3810   tree scalar_type;
3811   bool is_simple_use;
3812   gimple orig_stmt;
3813   stmt_vec_info orig_stmt_info;
3814   tree expr = NULL_TREE;
3815   int i;
3816   int ncopies;
3817   int epilog_copies;
3818   stmt_vec_info prev_stmt_info, prev_phi_info;
3819   bool single_defuse_cycle = false;
3820   tree reduc_def = NULL_TREE;
3821   gimple new_stmt = NULL;
3822   int j;
3823   tree ops[3];
3824   bool nested_cycle = false, found_nested_cycle_def = false;
3825   gimple reduc_def_stmt = NULL;
3826   /* The default is that the reduction variable is the last in statement.  */
3827   int reduc_index = 2;
3828   bool double_reduc = false, dummy;
3829   basic_block def_bb;
3830   struct loop * def_stmt_loop, *outer_loop = NULL;
3831   tree def_arg;
3832   gimple def_arg_stmt;
3833   VEC (tree, heap) *vec_oprnds0 = NULL, *vec_oprnds1 = NULL, *vect_defs = NULL;
3834   VEC (gimple, heap) *phis = NULL;
3835   int vec_num;
3836   tree def0, def1;
3837
3838   if (nested_in_vect_loop_p (loop, stmt))
3839     {
3840       outer_loop = loop;
3841       loop = loop->inner;
3842       nested_cycle = true;
3843     }
3844
3845   /* 1. Is vectorizable reduction?  */
3846   /* Not supportable if the reduction variable is used in the loop.  */
3847   if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer)
3848     return false;
3849
3850   /* Reductions that are not used even in an enclosing outer-loop,
3851      are expected to be "live" (used out of the loop).  */
3852   if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
3853       && !STMT_VINFO_LIVE_P (stmt_info))
3854     return false;
3855
3856   /* Make sure it was already recognized as a reduction computation.  */
3857   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
3858       && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
3859     return false;
3860
3861   /* 2. Has this been recognized as a reduction pattern?
3862
3863      Check if STMT represents a pattern that has been recognized
3864      in earlier analysis stages.  For stmts that represent a pattern,
3865      the STMT_VINFO_RELATED_STMT field records the last stmt in
3866      the original sequence that constitutes the pattern.  */
3867
3868   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3869   if (orig_stmt)
3870     {
3871       orig_stmt_info = vinfo_for_stmt (orig_stmt);
3872       gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt);
3873       gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
3874       gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
3875     }
3876
3877   /* 3. Check the operands of the operation. The first operands are defined
3878         inside the loop body. The last operand is the reduction variable,
3879         which is defined by the loop-header-phi.  */
3880
3881   gcc_assert (is_gimple_assign (stmt));
3882
3883   /* Flatten RHS */
3884   switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
3885     {
3886     case GIMPLE_SINGLE_RHS:
3887       op_type = TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt));
3888       if (op_type == ternary_op)
3889         {
3890           tree rhs = gimple_assign_rhs1 (stmt);
3891           ops[0] = TREE_OPERAND (rhs, 0);
3892           ops[1] = TREE_OPERAND (rhs, 1);
3893           ops[2] = TREE_OPERAND (rhs, 2);
3894           code = TREE_CODE (rhs);
3895         }
3896       else
3897         return false;
3898       break;
3899
3900     case GIMPLE_BINARY_RHS:
3901       code = gimple_assign_rhs_code (stmt);
3902       op_type = TREE_CODE_LENGTH (code);
3903       gcc_assert (op_type == binary_op);
3904       ops[0] = gimple_assign_rhs1 (stmt);
3905       ops[1] = gimple_assign_rhs2 (stmt);
3906       break;
3907
3908     case GIMPLE_UNARY_RHS:
3909       return false;
3910
3911     default:
3912       gcc_unreachable ();
3913     }
3914
3915   scalar_dest = gimple_assign_lhs (stmt);
3916   scalar_type = TREE_TYPE (scalar_dest);
3917   if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
3918       && !SCALAR_FLOAT_TYPE_P (scalar_type))
3919     return false;
3920
3921   /* All uses but the last are expected to be defined in the loop.
3922      The last use is the reduction variable. In case of nested cycle this
3923      assumption is not true: we use reduc_index to record the index of the
3924      reduction variable.  */
3925   for (i = 0; i < op_type-1; i++)
3926     {
3927       tree tem;
3928
3929       /* The condition of COND_EXPR is checked in vectorizable_condition().  */
3930       if (i == 0 && code == COND_EXPR)
3931         continue;
3932
3933       is_simple_use = vect_is_simple_use_1 (ops[i], loop_vinfo, NULL,
3934                                             &def_stmt, &def, &dt, &tem);
3935       if (!vectype_in)
3936         vectype_in = tem;
3937       gcc_assert (is_simple_use);
3938       if (dt != vect_internal_def
3939           && dt != vect_external_def
3940           && dt != vect_constant_def
3941           && dt != vect_induction_def
3942           && !(dt == vect_nested_cycle && nested_cycle))
3943         return false;
3944
3945       if (dt == vect_nested_cycle)
3946         {
3947           found_nested_cycle_def = true;
3948           reduc_def_stmt = def_stmt;
3949           reduc_index = i;
3950         }
3951     }
3952
3953   is_simple_use = vect_is_simple_use (ops[i], loop_vinfo, NULL, &def_stmt,
3954                                       &def, &dt);
3955   gcc_assert (is_simple_use);
3956   gcc_assert (dt == vect_reduction_def
3957               || dt == vect_nested_cycle
3958               || ((dt == vect_internal_def || dt == vect_external_def
3959                    || dt == vect_constant_def || dt == vect_induction_def)
3960                    && nested_cycle && found_nested_cycle_def));
3961   if (!found_nested_cycle_def)
3962     reduc_def_stmt = def_stmt;
3963
3964   gcc_assert (gimple_code (reduc_def_stmt) == GIMPLE_PHI);
3965   if (orig_stmt)
3966     gcc_assert (orig_stmt == vect_is_simple_reduction (loop_vinfo,
3967                                                        reduc_def_stmt,
3968                                                        !nested_cycle,
3969                                                        &dummy));
3970   else
3971     gcc_assert (stmt == vect_is_simple_reduction (loop_vinfo, reduc_def_stmt,
3972                                                   !nested_cycle, &dummy));
3973
3974   if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt)))
3975     return false;
3976
3977   if (slp_node)
3978     ncopies = 1;
3979   else
3980     ncopies = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
3981                / TYPE_VECTOR_SUBPARTS (vectype_in));
3982
3983   gcc_assert (ncopies >= 1);
3984
3985   vec_mode = TYPE_MODE (vectype_in);
3986
3987   if (code == COND_EXPR)
3988     {
3989       if (!vectorizable_condition (stmt, gsi, NULL, ops[reduc_index], 0))
3990         {
3991           if (vect_print_dump_info (REPORT_DETAILS))
3992             fprintf (vect_dump, "unsupported condition in reduction");
3993
3994             return false;
3995         }
3996     }
3997   else
3998     {
3999       /* 4. Supportable by target?  */
4000
4001       /* 4.1. check support for the operation in the loop  */
4002       optab = optab_for_tree_code (code, vectype_in, optab_default);
4003       if (!optab)
4004         {
4005           if (vect_print_dump_info (REPORT_DETAILS))
4006             fprintf (vect_dump, "no optab.");
4007
4008           return false;
4009         }
4010
4011       if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
4012         {
4013           if (vect_print_dump_info (REPORT_DETAILS))
4014             fprintf (vect_dump, "op not supported by target.");
4015
4016           if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
4017               || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4018                   < vect_min_worthwhile_factor (code))
4019             return false;
4020
4021           if (vect_print_dump_info (REPORT_DETAILS))
4022             fprintf (vect_dump, "proceeding using word mode.");
4023         }
4024
4025       /* Worthwhile without SIMD support?  */
4026       if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
4027           && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4028              < vect_min_worthwhile_factor (code))
4029         {
4030           if (vect_print_dump_info (REPORT_DETAILS))
4031             fprintf (vect_dump, "not worthwhile without SIMD support.");
4032
4033           return false;
4034         }
4035     }
4036
4037   /* 4.2. Check support for the epilog operation.
4038
4039           If STMT represents a reduction pattern, then the type of the
4040           reduction variable may be different than the type of the rest
4041           of the arguments.  For example, consider the case of accumulation
4042           of shorts into an int accumulator; The original code:
4043                         S1: int_a = (int) short_a;
4044           orig_stmt->   S2: int_acc = plus <int_a ,int_acc>;
4045
4046           was replaced with:
4047                         STMT: int_acc = widen_sum <short_a, int_acc>
4048
4049           This means that:
4050           1. The tree-code that is used to create the vector operation in the
4051              epilog code (that reduces the partial results) is not the
4052              tree-code of STMT, but is rather the tree-code of the original
4053              stmt from the pattern that STMT is replacing. I.e, in the example
4054              above we want to use 'widen_sum' in the loop, but 'plus' in the
4055              epilog.
4056           2. The type (mode) we use to check available target support
4057              for the vector operation to be created in the *epilog*, is
4058              determined by the type of the reduction variable (in the example
4059              above we'd check this: optab_handler (plus_optab, vect_int_mode])).
4060              However the type (mode) we use to check available target support
4061              for the vector operation to be created *inside the loop*, is
4062              determined by the type of the other arguments to STMT (in the
4063              example we'd check this: optab_handler (widen_sum_optab,
4064              vect_short_mode)).
4065
4066           This is contrary to "regular" reductions, in which the types of all
4067           the arguments are the same as the type of the reduction variable.
4068           For "regular" reductions we can therefore use the same vector type
4069           (and also the same tree-code) when generating the epilog code and
4070           when generating the code inside the loop.  */
4071
4072   if (orig_stmt)
4073     {
4074       /* This is a reduction pattern: get the vectype from the type of the
4075          reduction variable, and get the tree-code from orig_stmt.  */
4076       orig_code = gimple_assign_rhs_code (orig_stmt);
4077       gcc_assert (vectype_out);
4078       vec_mode = TYPE_MODE (vectype_out);
4079     }
4080   else
4081     {
4082       /* Regular reduction: use the same vectype and tree-code as used for
4083          the vector code inside the loop can be used for the epilog code. */
4084       orig_code = code;
4085     }
4086
4087   if (nested_cycle)
4088     {
4089       def_bb = gimple_bb (reduc_def_stmt);
4090       def_stmt_loop = def_bb->loop_father;
4091       def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
4092                                        loop_preheader_edge (def_stmt_loop));
4093       if (TREE_CODE (def_arg) == SSA_NAME
4094           && (def_arg_stmt = SSA_NAME_DEF_STMT (def_arg))
4095           && gimple_code (def_arg_stmt) == GIMPLE_PHI
4096           && flow_bb_inside_loop_p (outer_loop, gimple_bb (def_arg_stmt))
4097           && vinfo_for_stmt (def_arg_stmt)
4098           && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_arg_stmt))
4099               == vect_double_reduction_def)
4100         double_reduc = true;
4101     }
4102
4103   epilog_reduc_code = ERROR_MARK;
4104   if (reduction_code_for_scalar_code (orig_code, &epilog_reduc_code))
4105     {
4106       reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype_out,
4107                                          optab_default);
4108       if (!reduc_optab)
4109         {
4110           if (vect_print_dump_info (REPORT_DETAILS))
4111             fprintf (vect_dump, "no optab for reduction.");
4112
4113           epilog_reduc_code = ERROR_MARK;
4114         }
4115
4116       if (reduc_optab
4117           && optab_handler (reduc_optab, vec_mode) == CODE_FOR_nothing)
4118         {
4119           if (vect_print_dump_info (REPORT_DETAILS))
4120             fprintf (vect_dump, "reduc op not supported by target.");
4121
4122           epilog_reduc_code = ERROR_MARK;
4123         }
4124     }
4125   else
4126     {
4127       if (!nested_cycle || double_reduc)
4128         {
4129           if (vect_print_dump_info (REPORT_DETAILS))
4130             fprintf (vect_dump, "no reduc code for scalar code.");
4131
4132           return false;
4133         }
4134     }
4135
4136   if (double_reduc && ncopies > 1)
4137     {
4138       if (vect_print_dump_info (REPORT_DETAILS))
4139         fprintf (vect_dump, "multiple types in double reduction");
4140
4141       return false;
4142     }
4143
4144   if (!vec_stmt) /* transformation not required.  */
4145     {
4146       STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
4147       if (!vect_model_reduction_cost (stmt_info, epilog_reduc_code, ncopies))
4148         return false;
4149       return true;
4150     }
4151
4152   /** Transform.  **/
4153
4154   if (vect_print_dump_info (REPORT_DETAILS))
4155     fprintf (vect_dump, "transform reduction.");
4156
4157   /* FORNOW: Multiple types are not supported for condition.  */
4158   if (code == COND_EXPR)
4159     gcc_assert (ncopies == 1);
4160
4161   /* Create the destination vector  */
4162   vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
4163
4164   /* In case the vectorization factor (VF) is bigger than the number
4165      of elements that we can fit in a vectype (nunits), we have to generate
4166      more than one vector stmt - i.e - we need to "unroll" the
4167      vector stmt by a factor VF/nunits.  For more details see documentation
4168      in vectorizable_operation.  */
4169
4170   /* If the reduction is used in an outer loop we need to generate
4171      VF intermediate results, like so (e.g. for ncopies=2):
4172         r0 = phi (init, r0)
4173         r1 = phi (init, r1)
4174         r0 = x0 + r0;
4175         r1 = x1 + r1;
4176     (i.e. we generate VF results in 2 registers).
4177     In this case we have a separate def-use cycle for each copy, and therefore
4178     for each copy we get the vector def for the reduction variable from the
4179     respective phi node created for this copy.
4180
4181     Otherwise (the reduction is unused in the loop nest), we can combine
4182     together intermediate results, like so (e.g. for ncopies=2):
4183         r = phi (init, r)
4184         r = x0 + r;
4185         r = x1 + r;
4186    (i.e. we generate VF/2 results in a single register).
4187    In this case for each copy we get the vector def for the reduction variable
4188    from the vectorized reduction operation generated in the previous iteration.
4189   */
4190
4191   if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope)
4192     {
4193       single_defuse_cycle = true;
4194       epilog_copies = 1;
4195     }
4196   else
4197     epilog_copies = ncopies;
4198
4199   prev_stmt_info = NULL;
4200   prev_phi_info = NULL;
4201   if (slp_node)
4202     {
4203       vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
4204       gcc_assert (TYPE_VECTOR_SUBPARTS (vectype_out)
4205                   == TYPE_VECTOR_SUBPARTS (vectype_in));
4206     }
4207   else
4208     {
4209       vec_num = 1;
4210       vec_oprnds0 = VEC_alloc (tree, heap, 1);
4211       if (op_type == ternary_op)
4212         vec_oprnds1 = VEC_alloc (tree, heap, 1);
4213     }
4214
4215   phis = VEC_alloc (gimple, heap, vec_num);
4216   vect_defs = VEC_alloc (tree, heap, vec_num);
4217   if (!slp_node)
4218     VEC_quick_push (tree, vect_defs, NULL_TREE);
4219
4220   for (j = 0; j < ncopies; j++)
4221     {
4222       if (j == 0 || !single_defuse_cycle)
4223         {
4224           for (i = 0; i < vec_num; i++)
4225             {
4226               /* Create the reduction-phi that defines the reduction
4227                  operand.  */
4228               new_phi = create_phi_node (vec_dest, loop->header);
4229               set_vinfo_for_stmt (new_phi,
4230                                   new_stmt_vec_info (new_phi, loop_vinfo,
4231                                                      NULL));
4232                if (j == 0 || slp_node)
4233                  VEC_quick_push (gimple, phis, new_phi);
4234             }
4235         }
4236
4237       if (code == COND_EXPR)
4238         {
4239           gcc_assert (!slp_node);
4240           vectorizable_condition (stmt, gsi, vec_stmt,
4241                                   PHI_RESULT (VEC_index (gimple, phis, 0)),
4242                                   reduc_index);
4243           /* Multiple types are not supported for condition.  */
4244           break;
4245         }
4246
4247       /* Handle uses.  */
4248       if (j == 0)
4249         {
4250           if (slp_node)
4251             vect_get_slp_defs (slp_node, &vec_oprnds0, &vec_oprnds1, -1);
4252           else
4253             {
4254               loop_vec_def0 = vect_get_vec_def_for_operand (ops[!reduc_index],
4255                                                             stmt, NULL);
4256               VEC_quick_push (tree, vec_oprnds0, loop_vec_def0);
4257               if (op_type == ternary_op)
4258                {
4259                  if (reduc_index == 0)
4260                    loop_vec_def1 = vect_get_vec_def_for_operand (ops[2], stmt,
4261                                                                  NULL);
4262                  else
4263                    loop_vec_def1 = vect_get_vec_def_for_operand (ops[1], stmt,
4264                                                                  NULL);
4265
4266                  VEC_quick_push (tree, vec_oprnds1, loop_vec_def1);
4267                }
4268             }
4269         }
4270       else
4271         {
4272           if (!slp_node)
4273             {
4274               enum vect_def_type dt = vect_unknown_def_type; /* Dummy */
4275               loop_vec_def0 = vect_get_vec_def_for_stmt_copy (dt, loop_vec_def0);
4276               VEC_replace (tree, vec_oprnds0, 0, loop_vec_def0);
4277               if (op_type == ternary_op)
4278                 {
4279                   loop_vec_def1 = vect_get_vec_def_for_stmt_copy (dt,
4280                                                                 loop_vec_def1);
4281                   VEC_replace (tree, vec_oprnds1, 0, loop_vec_def1);
4282                 }
4283             }
4284
4285           if (single_defuse_cycle)
4286             reduc_def = gimple_assign_lhs (new_stmt);
4287
4288           STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
4289         }
4290
4291       for (i = 0; VEC_iterate (tree, vec_oprnds0, i, def0); i++)
4292         {
4293           if (slp_node)
4294             reduc_def = PHI_RESULT (VEC_index (gimple, phis, i));
4295           else
4296             {
4297               if (!single_defuse_cycle || j == 0)
4298                 reduc_def = PHI_RESULT (new_phi);
4299             }
4300
4301           def1 = ((op_type == ternary_op)
4302                   ? VEC_index (tree, vec_oprnds1, i) : NULL);
4303           if (op_type == binary_op)
4304             {
4305               if (reduc_index == 0)
4306                 expr = build2 (code, vectype_out, reduc_def, def0);
4307               else
4308                 expr = build2 (code, vectype_out, def0, reduc_def);
4309             }
4310           else
4311             {
4312               if (reduc_index == 0)
4313                 expr = build3 (code, vectype_out, reduc_def, def0, def1);
4314               else
4315                 {
4316                   if (reduc_index == 1)
4317                     expr = build3 (code, vectype_out, def0, reduc_def, def1);
4318                   else
4319                     expr = build3 (code, vectype_out, def0, def1, reduc_def);
4320                 }
4321             }
4322
4323           new_stmt = gimple_build_assign (vec_dest, expr);
4324           new_temp = make_ssa_name (vec_dest, new_stmt);
4325           gimple_assign_set_lhs (new_stmt, new_temp);
4326           vect_finish_stmt_generation (stmt, new_stmt, gsi);
4327           if (slp_node)
4328             {
4329               VEC_quick_push (gimple, SLP_TREE_VEC_STMTS (slp_node), new_stmt);
4330               VEC_quick_push (tree, vect_defs, new_temp);
4331             }
4332           else
4333             VEC_replace (tree, vect_defs, 0, new_temp);
4334         }
4335
4336       if (slp_node)
4337         continue;
4338
4339       if (j == 0)
4340         STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
4341       else
4342         STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
4343
4344       prev_stmt_info = vinfo_for_stmt (new_stmt);
4345       prev_phi_info = vinfo_for_stmt (new_phi);
4346     }
4347
4348   /* Finalize the reduction-phi (set its arguments) and create the
4349      epilog reduction code.  */
4350   if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
4351     {
4352       new_temp = gimple_assign_lhs (*vec_stmt);
4353       VEC_replace (tree, vect_defs, 0, new_temp);
4354     }
4355
4356   vect_create_epilog_for_reduction (vect_defs, stmt, epilog_copies,
4357                                     epilog_reduc_code, phis, reduc_index,
4358                                     double_reduc, slp_node);
4359
4360   VEC_free (gimple, heap, phis);
4361   VEC_free (tree, heap, vec_oprnds0);
4362   if (vec_oprnds1)
4363     VEC_free (tree, heap, vec_oprnds1);
4364
4365   return true;
4366 }
4367
4368 /* Function vect_min_worthwhile_factor.
4369
4370    For a loop where we could vectorize the operation indicated by CODE,
4371    return the minimum vectorization factor that makes it worthwhile
4372    to use generic vectors.  */
4373 int
4374 vect_min_worthwhile_factor (enum tree_code code)
4375 {
4376   switch (code)
4377     {
4378     case PLUS_EXPR:
4379     case MINUS_EXPR:
4380     case NEGATE_EXPR:
4381       return 4;
4382
4383     case BIT_AND_EXPR:
4384     case BIT_IOR_EXPR:
4385     case BIT_XOR_EXPR:
4386     case BIT_NOT_EXPR:
4387       return 2;
4388
4389     default:
4390       return INT_MAX;
4391     }
4392 }
4393
4394
4395 /* Function vectorizable_induction
4396
4397    Check if PHI performs an induction computation that can be vectorized.
4398    If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
4399    phi to replace it, put it in VEC_STMT, and add it to the same basic block.
4400    Return FALSE if not a vectorizable STMT, TRUE otherwise.  */
4401
4402 bool
4403 vectorizable_induction (gimple phi, gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
4404                         gimple *vec_stmt)
4405 {
4406   stmt_vec_info stmt_info = vinfo_for_stmt (phi);
4407   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4408   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4409   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4410   int nunits = TYPE_VECTOR_SUBPARTS (vectype);
4411   int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
4412   tree vec_def;
4413
4414   gcc_assert (ncopies >= 1);
4415   /* FORNOW. This restriction should be relaxed.  */
4416   if (nested_in_vect_loop_p (loop, phi) && ncopies > 1)
4417     {
4418       if (vect_print_dump_info (REPORT_DETAILS))
4419         fprintf (vect_dump, "multiple types in nested loop.");
4420       return false;
4421     }
4422
4423   if (!STMT_VINFO_RELEVANT_P (stmt_info))
4424     return false;
4425
4426   /* FORNOW: SLP not supported.  */
4427   if (STMT_SLP_TYPE (stmt_info))
4428     return false;
4429
4430   gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def);
4431
4432   if (gimple_code (phi) != GIMPLE_PHI)
4433     return false;
4434
4435   if (!vec_stmt) /* transformation not required.  */
4436     {
4437       STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
4438       if (vect_print_dump_info (REPORT_DETAILS))
4439         fprintf (vect_dump, "=== vectorizable_induction ===");
4440       vect_model_induction_cost (stmt_info, ncopies);
4441       return true;
4442     }
4443
4444   /** Transform.  **/
4445
4446   if (vect_print_dump_info (REPORT_DETAILS))
4447     fprintf (vect_dump, "transform induction phi.");
4448
4449   vec_def = get_initial_def_for_induction (phi);
4450   *vec_stmt = SSA_NAME_DEF_STMT (vec_def);
4451   return true;
4452 }
4453
4454 /* Function vectorizable_live_operation.
4455
4456    STMT computes a value that is used outside the loop. Check if
4457    it can be supported.  */
4458
4459 bool
4460 vectorizable_live_operation (gimple stmt,
4461                              gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
4462                              gimple *vec_stmt ATTRIBUTE_UNUSED)
4463 {
4464   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4465   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4466   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4467   int i;
4468   int op_type;
4469   tree op;
4470   tree def;
4471   gimple def_stmt;
4472   enum vect_def_type dt;
4473   enum tree_code code;
4474   enum gimple_rhs_class rhs_class;
4475
4476   gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
4477
4478   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
4479     return false;
4480
4481   if (!is_gimple_assign (stmt))
4482     return false;
4483
4484   if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
4485     return false;
4486
4487   /* FORNOW. CHECKME. */
4488   if (nested_in_vect_loop_p (loop, stmt))
4489     return false;
4490
4491   code = gimple_assign_rhs_code (stmt);
4492   op_type = TREE_CODE_LENGTH (code);
4493   rhs_class = get_gimple_rhs_class (code);
4494   gcc_assert (rhs_class != GIMPLE_UNARY_RHS || op_type == unary_op);
4495   gcc_assert (rhs_class != GIMPLE_BINARY_RHS || op_type == binary_op);
4496
4497   /* FORNOW: support only if all uses are invariant. This means
4498      that the scalar operations can remain in place, unvectorized.
4499      The original last scalar value that they compute will be used.  */
4500
4501   for (i = 0; i < op_type; i++)
4502     {
4503       if (rhs_class == GIMPLE_SINGLE_RHS)
4504         op = TREE_OPERAND (gimple_op (stmt, 1), i);
4505       else
4506         op = gimple_op (stmt, i + 1);
4507       if (op
4508           && !vect_is_simple_use (op, loop_vinfo, NULL, &def_stmt, &def, &dt))
4509         {
4510           if (vect_print_dump_info (REPORT_DETAILS))
4511             fprintf (vect_dump, "use not simple.");
4512           return false;
4513         }
4514
4515       if (dt != vect_external_def && dt != vect_constant_def)
4516         return false;
4517     }
4518
4519   /* No transformation is required for the cases we currently support.  */
4520   return true;
4521 }
4522
4523 /* Kill any debug uses outside LOOP of SSA names defined in STMT.  */
4524
4525 static void
4526 vect_loop_kill_debug_uses (struct loop *loop, gimple stmt)
4527 {
4528   ssa_op_iter op_iter;
4529   imm_use_iterator imm_iter;
4530   def_operand_p def_p;
4531   gimple ustmt;
4532
4533   FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt, op_iter, SSA_OP_DEF)
4534     {
4535       FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
4536         {
4537           basic_block bb;
4538
4539           if (!is_gimple_debug (ustmt))
4540             continue;
4541
4542           bb = gimple_bb (ustmt);
4543
4544           if (!flow_bb_inside_loop_p (loop, bb))
4545             {
4546               if (gimple_debug_bind_p (ustmt))
4547                 {
4548                   if (vect_print_dump_info (REPORT_DETAILS))
4549                     fprintf (vect_dump, "killing debug use");
4550
4551                   gimple_debug_bind_reset_value (ustmt);
4552                   update_stmt (ustmt);
4553                 }
4554               else
4555                 gcc_unreachable ();
4556             }
4557         }
4558     }
4559 }
4560
4561 /* Function vect_transform_loop.
4562
4563    The analysis phase has determined that the loop is vectorizable.
4564    Vectorize the loop - created vectorized stmts to replace the scalar
4565    stmts in the loop, and update the loop exit condition.  */
4566
4567 void
4568 vect_transform_loop (loop_vec_info loop_vinfo)
4569 {
4570   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4571   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
4572   int nbbs = loop->num_nodes;
4573   gimple_stmt_iterator si;
4574   int i;
4575   tree ratio = NULL;
4576   int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
4577   bool strided_store;
4578   bool slp_scheduled = false;
4579   unsigned int nunits;
4580   tree cond_expr = NULL_TREE;
4581   gimple_seq cond_expr_stmt_list = NULL;
4582   bool do_peeling_for_loop_bound;
4583
4584   if (vect_print_dump_info (REPORT_DETAILS))
4585     fprintf (vect_dump, "=== vec_transform_loop ===");
4586
4587   /* Peel the loop if there are data refs with unknown alignment.
4588      Only one data ref with unknown store is allowed.  */
4589
4590   if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
4591     vect_do_peeling_for_alignment (loop_vinfo);
4592
4593   do_peeling_for_loop_bound
4594     = (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
4595        || (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
4596            && LOOP_VINFO_INT_NITERS (loop_vinfo) % vectorization_factor != 0));
4597
4598   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
4599       || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
4600     vect_loop_versioning (loop_vinfo,
4601                           !do_peeling_for_loop_bound,
4602                           &cond_expr, &cond_expr_stmt_list);
4603
4604   /* If the loop has a symbolic number of iterations 'n' (i.e. it's not a
4605      compile time constant), or it is a constant that doesn't divide by the
4606      vectorization factor, then an epilog loop needs to be created.
4607      We therefore duplicate the loop: the original loop will be vectorized,
4608      and will compute the first (n/VF) iterations. The second copy of the loop
4609      will remain scalar and will compute the remaining (n%VF) iterations.
4610      (VF is the vectorization factor).  */
4611
4612   if (do_peeling_for_loop_bound)
4613     vect_do_peeling_for_loop_bound (loop_vinfo, &ratio,
4614                                     cond_expr, cond_expr_stmt_list);
4615   else
4616     ratio = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
4617                 LOOP_VINFO_INT_NITERS (loop_vinfo) / vectorization_factor);
4618
4619   /* 1) Make sure the loop header has exactly two entries
4620      2) Make sure we have a preheader basic block.  */
4621
4622   gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
4623
4624   split_edge (loop_preheader_edge (loop));
4625
4626   /* FORNOW: the vectorizer supports only loops which body consist
4627      of one basic block (header + empty latch). When the vectorizer will
4628      support more involved loop forms, the order by which the BBs are
4629      traversed need to be reconsidered.  */
4630
4631   for (i = 0; i < nbbs; i++)
4632     {
4633       basic_block bb = bbs[i];
4634       stmt_vec_info stmt_info;
4635       gimple phi;
4636
4637       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
4638         {
4639           phi = gsi_stmt (si);
4640           if (vect_print_dump_info (REPORT_DETAILS))
4641             {
4642               fprintf (vect_dump, "------>vectorizing phi: ");
4643               print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
4644             }
4645           stmt_info = vinfo_for_stmt (phi);
4646           if (!stmt_info)
4647             continue;
4648
4649           if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
4650             vect_loop_kill_debug_uses (loop, phi);
4651
4652           if (!STMT_VINFO_RELEVANT_P (stmt_info)
4653               && !STMT_VINFO_LIVE_P (stmt_info))
4654             continue;
4655
4656           if ((TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
4657                 != (unsigned HOST_WIDE_INT) vectorization_factor)
4658               && vect_print_dump_info (REPORT_DETAILS))
4659             fprintf (vect_dump, "multiple-types.");
4660
4661           if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
4662             {
4663               if (vect_print_dump_info (REPORT_DETAILS))
4664                 fprintf (vect_dump, "transform phi.");
4665               vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
4666             }
4667         }
4668
4669       for (si = gsi_start_bb (bb); !gsi_end_p (si);)
4670         {
4671           gimple stmt = gsi_stmt (si);
4672           bool is_store;
4673
4674           if (vect_print_dump_info (REPORT_DETAILS))
4675             {
4676               fprintf (vect_dump, "------>vectorizing statement: ");
4677               print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
4678             }
4679
4680           stmt_info = vinfo_for_stmt (stmt);
4681
4682           /* vector stmts created in the outer-loop during vectorization of
4683              stmts in an inner-loop may not have a stmt_info, and do not
4684              need to be vectorized.  */
4685           if (!stmt_info)
4686             {
4687               gsi_next (&si);
4688               continue;
4689             }
4690
4691           if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
4692             vect_loop_kill_debug_uses (loop, stmt);
4693
4694           if (!STMT_VINFO_RELEVANT_P (stmt_info)
4695               && !STMT_VINFO_LIVE_P (stmt_info))
4696             {
4697               gsi_next (&si);
4698               continue;
4699             }
4700
4701           gcc_assert (STMT_VINFO_VECTYPE (stmt_info));
4702           nunits =
4703             (unsigned int) TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
4704           if (!STMT_SLP_TYPE (stmt_info)
4705               && nunits != (unsigned int) vectorization_factor
4706               && vect_print_dump_info (REPORT_DETAILS))
4707             /* For SLP VF is set according to unrolling factor, and not to
4708                vector size, hence for SLP this print is not valid.  */
4709             fprintf (vect_dump, "multiple-types.");
4710
4711           /* SLP. Schedule all the SLP instances when the first SLP stmt is
4712              reached.  */
4713           if (STMT_SLP_TYPE (stmt_info))
4714             {
4715               if (!slp_scheduled)
4716                 {
4717                   slp_scheduled = true;
4718
4719                   if (vect_print_dump_info (REPORT_DETAILS))
4720                     fprintf (vect_dump, "=== scheduling SLP instances ===");
4721
4722                   vect_schedule_slp (loop_vinfo, NULL);
4723                 }
4724
4725               /* Hybrid SLP stmts must be vectorized in addition to SLP.  */
4726               if (!vinfo_for_stmt (stmt) || PURE_SLP_STMT (stmt_info))
4727                 {
4728                   gsi_next (&si);
4729                   continue;
4730                 }
4731             }
4732
4733           /* -------- vectorize statement ------------ */
4734           if (vect_print_dump_info (REPORT_DETAILS))
4735             fprintf (vect_dump, "transform statement.");
4736
4737           strided_store = false;
4738           is_store = vect_transform_stmt (stmt, &si, &strided_store, NULL, NULL);
4739           if (is_store)
4740             {
4741               if (STMT_VINFO_STRIDED_ACCESS (stmt_info))
4742                 {
4743                   /* Interleaving. If IS_STORE is TRUE, the vectorization of the
4744                      interleaving chain was completed - free all the stores in
4745                      the chain.  */
4746                   vect_remove_stores (DR_GROUP_FIRST_DR (stmt_info));
4747                   gsi_remove (&si, true);
4748                   continue;
4749                 }
4750               else
4751                 {
4752                   /* Free the attached stmt_vec_info and remove the stmt.  */
4753                   free_stmt_vec_info (stmt);
4754                   gsi_remove (&si, true);
4755                   continue;
4756                 }
4757             }
4758           gsi_next (&si);
4759         }                       /* stmts in BB */
4760     }                           /* BBs in loop */
4761
4762   slpeel_make_loop_iterate_ntimes (loop, ratio);
4763
4764   /* The memory tags and pointers in vectorized statements need to
4765      have their SSA forms updated.  FIXME, why can't this be delayed
4766      until all the loops have been transformed?  */
4767   update_ssa (TODO_update_ssa);
4768
4769   if (vect_print_dump_info (REPORT_VECTORIZED_LOCATIONS))
4770     fprintf (vect_dump, "LOOP VECTORIZED.");
4771   if (loop->inner && vect_print_dump_info (REPORT_VECTORIZED_LOCATIONS))
4772     fprintf (vect_dump, "OUTER LOOP VECTORIZED.");
4773 }