gcc/tree-vect-loop.c

   1 /* Loop Vectorization
   2    Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009 Free Software
   3    Foundation, Inc.
   4    Contributed by Dorit Naishlos <dorit@il.ibm.com> and
   5    Ira Rosen <irar@il.ibm.com>
   6
   7 This file is part of GCC.
   8
   9 GCC is free software; you can redistribute it and/or modify it under
  10 the terms of the GNU General Public License as published by the Free
  11 Software Foundation; either version 3, or (at your option) any later
  12 version.
  13
  14 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  15 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  16 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  17 for more details.
  18
  19 You should have received a copy of the GNU General Public License
  20 along with GCC; see the file COPYING3.  If not see
  21 <http://www.gnu.org/licenses/>.  */
  22
  23 #include "config.h"
  24 #include "system.h"
  25 #include "coretypes.h"
  26 #include "tm.h"
  27 #include "ggc.h"
  28 #include "tree.h"
  29 #include "basic-block.h"
  30 #include "diagnostic.h"
  31 #include "tree-flow.h"
  32 #include "tree-dump.h"
  33 #include "cfgloop.h"
  34 #include "cfglayout.h"
  35 #include "expr.h"
  36 #include "recog.h"
  37 #include "optabs.h"
  38 #include "params.h"
  39 #include "toplev.h"
  40 #include "tree-chrec.h"
  41 #include "tree-scalar-evolution.h"
  42 #include "tree-vectorizer.h"
  43
  44 /* Loop Vectorization Pass.
  45
  46    This pass tries to vectorize loops.
  47
  48    For example, the vectorizer transforms the following simple loop:
  49
  50         short a[N]; short b[N]; short c[N]; int i;
  51
  52         for (i=0; i<N; i++){
  53           a[i] = b[i] + c[i];
  54         }
  55
  56    as if it was manually vectorized by rewriting the source code into:
  57
  58         typedef int __attribute__((mode(V8HI))) v8hi;
  59         short a[N];  short b[N]; short c[N];   int i;
  60         v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
  61         v8hi va, vb, vc;
  62
  63         for (i=0; i<N/8; i++){
  64           vb = pb[i];
  65           vc = pc[i];
  66           va = vb + vc;
  67           pa[i] = va;
  68         }
  69
  70         The main entry to this pass is vectorize_loops(), in which
  71    the vectorizer applies a set of analyses on a given set of loops,
  72    followed by the actual vectorization transformation for the loops that
  73    had successfully passed the analysis phase.
  74         Throughout this pass we make a distinction between two types of
  75    data: scalars (which are represented by SSA_NAMES), and memory references
  76    ("data-refs"). These two types of data require different handling both
  77    during analysis and transformation. The types of data-refs that the
  78    vectorizer currently supports are ARRAY_REFS which base is an array DECL
  79    (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
  80    accesses are required to have a simple (consecutive) access pattern.
  81
  82    Analysis phase:
  83    ===============
  84         The driver for the analysis phase is vect_analyze_loop().
  85    It applies a set of analyses, some of which rely on the scalar evolution
  86    analyzer (scev) developed by Sebastian Pop.
  87
  88         During the analysis phase the vectorizer records some information
  89    per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
  90    loop, as well as general information about the loop as a whole, which is
  91    recorded in a "loop_vec_info" struct attached to each loop.
  92
  93    Transformation phase:
  94    =====================
  95         The loop transformation phase scans all the stmts in the loop, and
  96    creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
  97    the loop that needs to be vectorized. It inserts the vector code sequence
  98    just before the scalar stmt S, and records a pointer to the vector code
  99    in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
 100    attached to S). This pointer will be used for the vectorization of following
 101    stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
 102    otherwise, we rely on dead code elimination for removing it.
 103
 104         For example, say stmt S1 was vectorized into stmt VS1:
 105
 106    VS1: vb = px[i];
 107    S1:  b = x[i];    STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
 108    S2:  a = b;
 109
 110    To vectorize stmt S2, the vectorizer first finds the stmt that defines
 111    the operand 'b' (S1), and gets the relevant vector def 'vb' from the
 112    vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
 113    resulting sequence would be:
 114
 115    VS1: vb = px[i];
 116    S1:  b = x[i];       STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
 117    VS2: va = vb;
 118    S2:  a = b;          STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
 119
 120         Operands that are not SSA_NAMEs, are data-refs that appear in
 121    load/store operations (like 'x[i]' in S1), and are handled differently.
 122
 123    Target modeling:
 124    =================
 125         Currently the only target specific information that is used is the
 126    size of the vector (in bytes) - "UNITS_PER_SIMD_WORD". Targets that can
 127    support different sizes of vectors, for now will need to specify one value
 128    for "UNITS_PER_SIMD_WORD". More flexibility will be added in the future.
 129
 130         Since we only vectorize operations which vector form can be
 131    expressed using existing tree codes, to verify that an operation is
 132    supported, the vectorizer checks the relevant optab at the relevant
 133    machine_mode (e.g, optab_handler (add_optab, V8HImode)->insn_code). If
 134    the value found is CODE_FOR_nothing, then there's no target support, and
 135    we can't vectorize the stmt.
 136
 137    For additional information on this project see:
 138    http://gcc.gnu.org/projects/tree-ssa/vectorization.html
 139 */
 140
 141 /* Function vect_determine_vectorization_factor
 142
 143    Determine the vectorization factor (VF). VF is the number of data elements
 144    that are operated upon in parallel in a single iteration of the vectorized
 145    loop. For example, when vectorizing a loop that operates on 4byte elements,
 146    on a target with vector size (VS) 16byte, the VF is set to 4, since 4
 147    elements can fit in a single vector register.
 148
 149    We currently support vectorization of loops in which all types operated upon
 150    are of the same size. Therefore this function currently sets VF according to
 151    the size of the types operated upon, and fails if there are multiple sizes
 152    in the loop.
 153
 154    VF is also the factor by which the loop iterations are strip-mined, e.g.:
 155    original loop:
 156         for (i=0; i<N; i++){
 157           a[i] = b[i] + c[i];
 158         }
 159
 160    vectorized loop:
 161         for (i=0; i<N; i+=VF){
 162           a[i:VF] = b[i:VF] + c[i:VF];
 163         }
 164 */
 165
 166 static bool
 167 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
 168 {
 169   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 170   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
 171   int nbbs = loop->num_nodes;
 172   gimple_stmt_iterator si;
 173   unsigned int vectorization_factor = 0;
 174   tree scalar_type;
 175   gimple phi;
 176   tree vectype;
 177   unsigned int nunits;
 178   stmt_vec_info stmt_info;
 179   int i;
 180   HOST_WIDE_INT dummy;
 181
 182   if (vect_print_dump_info (REPORT_DETAILS))
 183     fprintf (vect_dump, "=== vect_determine_vectorization_factor ===");
 184
 185   for (i = 0; i < nbbs; i++)
 186     {
 187       basic_block bb = bbs[i];
 188
 189       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
 190         {
 191           phi = gsi_stmt (si);
 192           stmt_info = vinfo_for_stmt (phi);
 193           if (vect_print_dump_info (REPORT_DETAILS))
 194             {
 195               fprintf (vect_dump, "==> examining phi: ");
 196               print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
 197             }
 198
 199           gcc_assert (stmt_info);
 200
 201           if (STMT_VINFO_RELEVANT_P (stmt_info))
 202             {
 203               gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
 204               scalar_type = TREE_TYPE (PHI_RESULT (phi));
 205
 206               if (vect_print_dump_info (REPORT_DETAILS))
 207                 {
 208                   fprintf (vect_dump, "get vectype for scalar type:  ");
 209                   print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
 210                 }
 211
 212               vectype = get_vectype_for_scalar_type (scalar_type);
 213               if (!vectype)
 214                 {
 215                   if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
 216                     {
 217                       fprintf (vect_dump,
 218                                "not vectorized: unsupported data-type ");
 219                       print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
 220                     }
 221                   return false;
 222                 }
 223               STMT_VINFO_VECTYPE (stmt_info) = vectype;
 224
 225               if (vect_print_dump_info (REPORT_DETAILS))
 226                 {
 227                   fprintf (vect_dump, "vectype: ");
 228                   print_generic_expr (vect_dump, vectype, TDF_SLIM);
 229                 }
 230
 231               nunits = TYPE_VECTOR_SUBPARTS (vectype);
 232               if (vect_print_dump_info (REPORT_DETAILS))
 233                 fprintf (vect_dump, "nunits = %d", nunits);
 234
 235               if (!vectorization_factor
 236                   || (nunits > vectorization_factor))
 237                 vectorization_factor = nunits;
 238             }
 239         }
 240
 241       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
 242         {
 243           gimple stmt = gsi_stmt (si);
 244           stmt_info = vinfo_for_stmt (stmt);
 245
 246           if (vect_print_dump_info (REPORT_DETAILS))
 247             {
 248               fprintf (vect_dump, "==> examining statement: ");
 249               print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
 250             }
 251
 252           gcc_assert (stmt_info);
 253
 254           /* skip stmts which do not need to be vectorized.  */
 255           if (!STMT_VINFO_RELEVANT_P (stmt_info)
 256               && !STMT_VINFO_LIVE_P (stmt_info))
 257             {
 258               if (vect_print_dump_info (REPORT_DETAILS))
 259                 fprintf (vect_dump, "skip.");
 260               continue;
 261             }
 262
 263           if (gimple_get_lhs (stmt) == NULL_TREE)
 264             {
 265               if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
 266                 {
 267                   fprintf (vect_dump, "not vectorized: irregular stmt.");
 268                   print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
 269                 }
 270               return false;
 271             }
 272
 273           if (VECTOR_MODE_P (TYPE_MODE (gimple_expr_type (stmt))))
 274             {
 275               if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
 276                 {
 277                   fprintf (vect_dump, "not vectorized: vector stmt in loop:");
 278                   print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
 279                 }
 280               return false;
 281             }
 282
 283           if (STMT_VINFO_VECTYPE (stmt_info))
 284             {
 285               /* The only case when a vectype had been already set is for stmts
 286                  that contain a dataref, or for "pattern-stmts" (stmts generated
 287                  by the vectorizer to represent/replace a certain idiom).  */
 288               gcc_assert (STMT_VINFO_DATA_REF (stmt_info)
 289                           || is_pattern_stmt_p (stmt_info));
 290               vectype = STMT_VINFO_VECTYPE (stmt_info);
 291             }
 292           else
 293             {
 294               gcc_assert (!STMT_VINFO_DATA_REF (stmt_info)
 295                           && !is_pattern_stmt_p (stmt_info));
 296
 297               scalar_type = vect_get_smallest_scalar_type (stmt, &dummy,
 298                                                            &dummy);
 299               if (vect_print_dump_info (REPORT_DETAILS))
 300                 {
 301                   fprintf (vect_dump, "get vectype for scalar type:  ");
 302                   print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
 303                 }
 304
 305               vectype = get_vectype_for_scalar_type (scalar_type);
 306               if (!vectype)
 307                 {
 308                   if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
 309                     {
 310                       fprintf (vect_dump,
 311                                "not vectorized: unsupported data-type ");
 312                       print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
 313                     }
 314                   return false;
 315                 }
 316               STMT_VINFO_VECTYPE (stmt_info) = vectype;
 317             }
 318
 319           if (vect_print_dump_info (REPORT_DETAILS))
 320             {
 321               fprintf (vect_dump, "vectype: ");
 322               print_generic_expr (vect_dump, vectype, TDF_SLIM);
 323             }
 324
 325           nunits = TYPE_VECTOR_SUBPARTS (vectype);
 326           if (vect_print_dump_info (REPORT_DETAILS))
 327             fprintf (vect_dump, "nunits = %d", nunits);
 328
 329           if (!vectorization_factor
 330               || (nunits > vectorization_factor))
 331             vectorization_factor = nunits;
 332
 333         }
 334     }
 335
 336   /* TODO: Analyze cost. Decide if worth while to vectorize.  */
 337   if (vect_print_dump_info (REPORT_DETAILS))
 338     fprintf (vect_dump, "vectorization factor = %d", vectorization_factor);
 339   if (vectorization_factor <= 1)
 340     {
 341       if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
 342         fprintf (vect_dump, "not vectorized: unsupported data-type");
 343       return false;
 344     }
 345   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
 346
 347   return true;
 348 }
 349
 350
 351 /* Function vect_is_simple_iv_evolution.
 352
 353    FORNOW: A simple evolution of an induction variables in the loop is
 354    considered a polynomial evolution with constant step.  */
 355
 356 static bool
 357 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
 358                              tree * step)
 359 {
 360   tree init_expr;
 361   tree step_expr;
 362   tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
 363
 364   /* When there is no evolution in this loop, the evolution function
 365      is not "simple".  */
 366   if (evolution_part == NULL_TREE)
 367     return false;
 368
 369   /* When the evolution is a polynomial of degree >= 2
 370      the evolution function is not "simple".  */
 371   if (tree_is_chrec (evolution_part))
 372     return false;
 373
 374   step_expr = evolution_part;
 375   init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
 376
 377   if (vect_print_dump_info (REPORT_DETAILS))
 378     {
 379       fprintf (vect_dump, "step: ");
 380       print_generic_expr (vect_dump, step_expr, TDF_SLIM);
 381       fprintf (vect_dump, ",  init: ");
 382       print_generic_expr (vect_dump, init_expr, TDF_SLIM);
 383     }
 384
 385   *init = init_expr;
 386   *step = step_expr;
 387
 388   if (TREE_CODE (step_expr) != INTEGER_CST)
 389     {
 390       if (vect_print_dump_info (REPORT_DETAILS))
 391         fprintf (vect_dump, "step unknown.");
 392       return false;
 393     }
 394
 395   return true;
 396 }
 397
 398 /* Function vect_analyze_scalar_cycles_1.
 399
 400    Examine the cross iteration def-use cycles of scalar variables
 401    in LOOP. LOOP_VINFO represents the loop that is now being
 402    considered for vectorization (can be LOOP, or an outer-loop
 403    enclosing LOOP).  */
 404
 405 static void
 406 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
 407 {
 408   basic_block bb = loop->header;
 409   tree dumy;
 410   VEC(gimple,heap) *worklist = VEC_alloc (gimple, heap, 64);
 411   gimple_stmt_iterator gsi;
 412   bool double_reduc;
 413
 414   if (vect_print_dump_info (REPORT_DETAILS))
 415     fprintf (vect_dump, "=== vect_analyze_scalar_cycles ===");
 416
 417   /* First - identify all inductions. Reduction detection assumes that all the
 418      inductions have been identified, therefore, this order must not be
 419      changed.  */
 420   for (gsi = gsi_start_phis  (bb); !gsi_end_p (gsi); gsi_next (&gsi))
 421     {
 422       gimple phi = gsi_stmt (gsi);
 423       tree access_fn = NULL;
 424       tree def = PHI_RESULT (phi);
 425       stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
 426
 427       if (vect_print_dump_info (REPORT_DETAILS))
 428         {
 429           fprintf (vect_dump, "Analyze phi: ");
 430           print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
 431         }
 432
 433       /* Skip virtual phi's. The data dependences that are associated with
 434          virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.  */
 435       if (!is_gimple_reg (SSA_NAME_VAR (def)))
 436         continue;
 437
 438       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
 439
 440       /* Analyze the evolution function.  */
 441       access_fn = analyze_scalar_evolution (loop, def);
 442       if (access_fn && vect_print_dump_info (REPORT_DETAILS))
 443         {
 444           fprintf (vect_dump, "Access function of PHI: ");
 445           print_generic_expr (vect_dump, access_fn, TDF_SLIM);
 446         }
 447
 448       if (!access_fn
 449           || !vect_is_simple_iv_evolution (loop->num, access_fn, &dumy, &dumy))
 450         {
 451           VEC_safe_push (gimple, heap, worklist, phi);
 452           continue;
 453         }
 454
 455       if (vect_print_dump_info (REPORT_DETAILS))
 456         fprintf (vect_dump, "Detected induction.");
 457       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
 458     }
 459
 460
 461   /* Second - identify all reductions and nested cycles.  */
 462   while (VEC_length (gimple, worklist) > 0)
 463     {
 464       gimple phi = VEC_pop (gimple, worklist);
 465       tree def = PHI_RESULT (phi);
 466       stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
 467       gimple reduc_stmt;
 468       bool nested_cycle;
 469
 470       if (vect_print_dump_info (REPORT_DETAILS))
 471         {
 472           fprintf (vect_dump, "Analyze phi: ");
 473           print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
 474         }
 475
 476       gcc_assert (is_gimple_reg (SSA_NAME_VAR (def)));
 477       gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
 478
 479       nested_cycle = (loop != LOOP_VINFO_LOOP (loop_vinfo));
 480       reduc_stmt = vect_is_simple_reduction (loop_vinfo, phi, !nested_cycle,
 481                                              &double_reduc);
 482       if (reduc_stmt)
 483         {
 484           if (double_reduc)
 485             {
 486               if (vect_print_dump_info (REPORT_DETAILS))
 487                 fprintf (vect_dump, "Detected double reduction.");
 488
 489               STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
 490               STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 491                                                     vect_double_reduction_def;
 492             }
 493           else
 494             {
 495               if (nested_cycle)
 496                 {
 497                   if (vect_print_dump_info (REPORT_DETAILS))
 498                     fprintf (vect_dump, "Detected vectorizable nested cycle.");
 499
 500                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
 501                   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 502                                                              vect_nested_cycle;
 503                 }
 504               else
 505                 {
 506                   if (vect_print_dump_info (REPORT_DETAILS))
 507                     fprintf (vect_dump, "Detected reduction.");
 508
 509                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
 510                   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 511                                                            vect_reduction_def;
 512                 }
 513             }
 514         }
 515       else
 516         if (vect_print_dump_info (REPORT_DETAILS))
 517           fprintf (vect_dump, "Unknown def-use cycle pattern.");
 518     }
 519
 520   VEC_free (gimple, heap, worklist);
 521 }
 522
 523
 524 /* Function vect_analyze_scalar_cycles.
 525
 526    Examine the cross iteration def-use cycles of scalar variables, by
 527    analyzing the loop-header PHIs of scalar variables; Classify each
 528    cycle as one of the following: invariant, induction, reduction, unknown.
 529    We do that for the loop represented by LOOP_VINFO, and also to its
 530    inner-loop, if exists.
 531    Examples for scalar cycles:
 532
 533    Example1: reduction:
 534
 535               loop1:
 536               for (i=0; i<N; i++)
 537                  sum += a[i];
 538
 539    Example2: induction:
 540
 541               loop2:
 542               for (i=0; i<N; i++)
 543                  a[i] = i;  */
 544
 545 static void
 546 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
 547 {
 548   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 549
 550   vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
 551
 552   /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
 553      Reductions in such inner-loop therefore have different properties than
 554      the reductions in the nest that gets vectorized:
 555      1. When vectorized, they are executed in the same order as in the original
 556         scalar loop, so we can't change the order of computation when
 557         vectorizing them.
 558      2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
 559         current checks are too strict.  */
 560
 561   if (loop->inner)
 562     vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
 563 }
 564
 565 /* Function vect_get_loop_niters.
 566
 567    Determine how many iterations the loop is executed.
 568    If an expression that represents the number of iterations
 569    can be constructed, place it in NUMBER_OF_ITERATIONS.
 570    Return the loop exit condition.  */
 571
 572 static gimple
 573 vect_get_loop_niters (struct loop *loop, tree *number_of_iterations)
 574 {
 575   tree niters;
 576
 577   if (vect_print_dump_info (REPORT_DETAILS))
 578     fprintf (vect_dump, "=== get_loop_niters ===");
 579
 580   niters = number_of_exit_cond_executions (loop);
 581
 582   if (niters != NULL_TREE
 583       && niters != chrec_dont_know)
 584     {
 585       *number_of_iterations = niters;
 586
 587       if (vect_print_dump_info (REPORT_DETAILS))
 588         {
 589           fprintf (vect_dump, "==> get_loop_niters:" );
 590           print_generic_expr (vect_dump, *number_of_iterations, TDF_SLIM);
 591         }
 592     }
 593
 594   return get_loop_exit_condition (loop);
 595 }
 596
 597
 598 /* Function bb_in_loop_p
 599
 600    Used as predicate for dfs order traversal of the loop bbs.  */
 601
 602 static bool
 603 bb_in_loop_p (const_basic_block bb, const void *data)
 604 {
 605   const struct loop *const loop = (const struct loop *)data;
 606   if (flow_bb_inside_loop_p (loop, bb))
 607     return true;
 608   return false;
 609 }
 610
 611
 612 /* Function new_loop_vec_info.
 613
 614    Create and initialize a new loop_vec_info struct for LOOP, as well as
 615    stmt_vec_info structs for all the stmts in LOOP.  */
 616
 617 static loop_vec_info
 618 new_loop_vec_info (struct loop *loop)
 619 {
 620   loop_vec_info res;
 621   basic_block *bbs;
 622   gimple_stmt_iterator si;
 623   unsigned int i, nbbs;
 624
 625   res = (loop_vec_info) xcalloc (1, sizeof (struct _loop_vec_info));
 626   LOOP_VINFO_LOOP (res) = loop;
 627
 628   bbs = get_loop_body (loop);
 629
 630   /* Create/Update stmt_info for all stmts in the loop.  */
 631   for (i = 0; i < loop->num_nodes; i++)
 632     {
 633       basic_block bb = bbs[i];
 634
 635       /* BBs in a nested inner-loop will have been already processed (because
 636          we will have called vect_analyze_loop_form for any nested inner-loop).
 637          Therefore, for stmts in an inner-loop we just want to update the
 638          STMT_VINFO_LOOP_VINFO field of their stmt_info to point to the new
 639          loop_info of the outer-loop we are currently considering to vectorize
 640          (instead of the loop_info of the inner-loop).
 641          For stmts in other BBs we need to create a stmt_info from scratch.  */
 642       if (bb->loop_father != loop)
 643         {
 644           /* Inner-loop bb.  */
 645           gcc_assert (loop->inner && bb->loop_father == loop->inner);
 646           for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
 647             {
 648               gimple phi = gsi_stmt (si);
 649               stmt_vec_info stmt_info = vinfo_for_stmt (phi);
 650               loop_vec_info inner_loop_vinfo =
 651                 STMT_VINFO_LOOP_VINFO (stmt_info);
 652               gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
 653               STMT_VINFO_LOOP_VINFO (stmt_info) = res;
 654             }
 655           for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
 656            {
 657               gimple stmt = gsi_stmt (si);
 658               stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
 659               loop_vec_info inner_loop_vinfo =
 660                  STMT_VINFO_LOOP_VINFO (stmt_info);
 661               gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
 662               STMT_VINFO_LOOP_VINFO (stmt_info) = res;
 663            }
 664         }
 665       else
 666         {
 667           /* bb in current nest.  */
 668           for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
 669             {
 670               gimple phi = gsi_stmt (si);
 671               gimple_set_uid (phi, 0);
 672               set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, res, NULL));
 673             }
 674
 675           for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
 676             {
 677               gimple stmt = gsi_stmt (si);
 678               gimple_set_uid (stmt, 0);
 679               set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, res, NULL));
 680             }
 681         }
 682     }
 683
 684   /* CHECKME: We want to visit all BBs before their successors (except for
 685      latch blocks, for which this assertion wouldn't hold).  In the simple
 686      case of the loop forms we allow, a dfs order of the BBs would the same
 687      as reversed postorder traversal, so we are safe.  */
 688
 689    free (bbs);
 690    bbs = XCNEWVEC (basic_block, loop->num_nodes);
 691    nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
 692                               bbs, loop->num_nodes, loop);
 693    gcc_assert (nbbs == loop->num_nodes);
 694
 695   LOOP_VINFO_BBS (res) = bbs;
 696   LOOP_VINFO_NITERS (res) = NULL;
 697   LOOP_VINFO_NITERS_UNCHANGED (res) = NULL;
 698   LOOP_VINFO_COST_MODEL_MIN_ITERS (res) = 0;
 699   LOOP_VINFO_VECTORIZABLE_P (res) = 0;
 700   LOOP_PEELING_FOR_ALIGNMENT (res) = 0;
 701   LOOP_VINFO_VECT_FACTOR (res) = 0;
 702   LOOP_VINFO_DATAREFS (res) = VEC_alloc (data_reference_p, heap, 10);
 703   LOOP_VINFO_DDRS (res) = VEC_alloc (ddr_p, heap, 10 * 10);
 704   LOOP_VINFO_UNALIGNED_DR (res) = NULL;
 705   LOOP_VINFO_MAY_MISALIGN_STMTS (res) =
 706     VEC_alloc (gimple, heap,
 707                PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS));
 708   LOOP_VINFO_MAY_ALIAS_DDRS (res) =
 709     VEC_alloc (ddr_p, heap,
 710                PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
 711   LOOP_VINFO_STRIDED_STORES (res) = VEC_alloc (gimple, heap, 10);
 712   LOOP_VINFO_SLP_INSTANCES (res) = VEC_alloc (slp_instance, heap, 10);
 713   LOOP_VINFO_SLP_UNROLLING_FACTOR (res) = 1;
 714
 715   return res;
 716 }
 717
 718
 719 /* Function destroy_loop_vec_info.
 720
 721    Free LOOP_VINFO struct, as well as all the stmt_vec_info structs of all the
 722    stmts in the loop.  */
 723
 724 void
 725 destroy_loop_vec_info (loop_vec_info loop_vinfo, bool clean_stmts)
 726 {
 727   struct loop *loop;
 728   basic_block *bbs;
 729   int nbbs;
 730   gimple_stmt_iterator si;
 731   int j;
 732   VEC (slp_instance, heap) *slp_instances;
 733   slp_instance instance;
 734
 735   if (!loop_vinfo)
 736     return;
 737
 738   loop = LOOP_VINFO_LOOP (loop_vinfo);
 739
 740   bbs = LOOP_VINFO_BBS (loop_vinfo);
 741   nbbs = loop->num_nodes;
 742
 743   if (!clean_stmts)
 744     {
 745       free (LOOP_VINFO_BBS (loop_vinfo));
 746       free_data_refs (LOOP_VINFO_DATAREFS (loop_vinfo));
 747       free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
 748       VEC_free (gimple, heap, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));
 749
 750       free (loop_vinfo);
 751       loop->aux = NULL;
 752       return;
 753     }
 754
 755   for (j = 0; j < nbbs; j++)
 756     {
 757       basic_block bb = bbs[j];
 758       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
 759         free_stmt_vec_info (gsi_stmt (si));
 760
 761       for (si = gsi_start_bb (bb); !gsi_end_p (si); )
 762         {
 763           gimple stmt = gsi_stmt (si);
 764           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
 765
 766           if (stmt_info)
 767             {
 768               /* Check if this is a "pattern stmt" (introduced by the
 769                  vectorizer during the pattern recognition pass).  */
 770               bool remove_stmt_p = false;
 771               gimple orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
 772               if (orig_stmt)
 773                 {
 774                   stmt_vec_info orig_stmt_info = vinfo_for_stmt (orig_stmt);
 775                   if (orig_stmt_info
 776                       && STMT_VINFO_IN_PATTERN_P (orig_stmt_info))
 777                     remove_stmt_p = true;
 778                 }
 779
 780               /* Free stmt_vec_info.  */
 781               free_stmt_vec_info (stmt);
 782
 783               /* Remove dead "pattern stmts".  */
 784               if (remove_stmt_p)
 785                 gsi_remove (&si, true);
 786             }
 787           gsi_next (&si);
 788         }
 789     }
 790
 791   free (LOOP_VINFO_BBS (loop_vinfo));
 792   free_data_refs (LOOP_VINFO_DATAREFS (loop_vinfo));
 793   free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
 794   VEC_free (gimple, heap, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));
 795   VEC_free (ddr_p, heap, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo));
 796   slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
 797   for (j = 0; VEC_iterate (slp_instance, slp_instances, j, instance); j++)
 798     vect_free_slp_instance (instance);
 799
 800   VEC_free (slp_instance, heap, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
 801   VEC_free (gimple, heap, LOOP_VINFO_STRIDED_STORES (loop_vinfo));
 802
 803   free (loop_vinfo);
 804   loop->aux = NULL;
 805 }
 806
 807
 808 /* Function vect_analyze_loop_1.
 809
 810    Apply a set of analyses on LOOP, and create a loop_vec_info struct
 811    for it. The different analyses will record information in the
 812    loop_vec_info struct.  This is a subset of the analyses applied in
 813    vect_analyze_loop, to be applied on an inner-loop nested in the loop
 814    that is now considered for (outer-loop) vectorization.  */
 815
 816 static loop_vec_info
 817 vect_analyze_loop_1 (struct loop *loop)
 818 {
 819   loop_vec_info loop_vinfo;
 820
 821   if (vect_print_dump_info (REPORT_DETAILS))
 822     fprintf (vect_dump, "===== analyze_loop_nest_1 =====");
 823
 824   /* Check the CFG characteristics of the loop (nesting, entry/exit, etc.  */
 825
 826   loop_vinfo = vect_analyze_loop_form (loop);
 827   if (!loop_vinfo)
 828     {
 829       if (vect_print_dump_info (REPORT_DETAILS))
 830         fprintf (vect_dump, "bad inner-loop form.");
 831       return NULL;
 832     }
 833
 834   return loop_vinfo;
 835 }
 836
 837
 838 /* Function vect_analyze_loop_form.
 839
 840    Verify that certain CFG restrictions hold, including:
 841    - the loop has a pre-header
 842    - the loop has a single entry and exit
 843    - the loop exit condition is simple enough, and the number of iterations
 844      can be analyzed (a countable loop).  */
 845
 846 loop_vec_info
 847 vect_analyze_loop_form (struct loop *loop)
 848 {
 849   loop_vec_info loop_vinfo;
 850   gimple loop_cond;
 851   tree number_of_iterations = NULL;
 852   loop_vec_info inner_loop_vinfo = NULL;
 853
 854   if (vect_print_dump_info (REPORT_DETAILS))
 855     fprintf (vect_dump, "=== vect_analyze_loop_form ===");
 856
 857   /* Different restrictions apply when we are considering an inner-most loop,
 858      vs. an outer (nested) loop.
 859      (FORNOW. May want to relax some of these restrictions in the future).  */
 860
 861   if (!loop->inner)
 862     {
 863       /* Inner-most loop.  We currently require that the number of BBs is
 864          exactly 2 (the header and latch).  Vectorizable inner-most loops
 865          look like this:
 866
 867                         (pre-header)
 868                            |
 869                           header <--------+
 870                            | |            |
 871                            | +--> latch --+
 872                            |
 873                         (exit-bb)  */
 874
 875       if (loop->num_nodes != 2)
 876         {
 877           if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
 878             fprintf (vect_dump, "not vectorized: control flow in loop.");
 879           return NULL;
 880         }
 881
 882       if (empty_block_p (loop->header))
 883     {
 884           if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
 885             fprintf (vect_dump, "not vectorized: empty loop.");
 886       return NULL;
 887     }
 888     }
 889   else
 890     {
 891       struct loop *innerloop = loop->inner;
 892       edge entryedge;
 893
 894       /* Nested loop. We currently require that the loop is doubly-nested,
 895          contains a single inner loop, and the number of BBs is exactly 5.
 896          Vectorizable outer-loops look like this:
 897
 898                         (pre-header)
 899                            |
 900                           header <---+
 901                            |         |
 902                           inner-loop |
 903                            |         |
 904                           tail ------+
 905                            |
 906                         (exit-bb)
 907
 908          The inner-loop has the properties expected of inner-most loops
 909          as described above.  */
 910
 911       if ((loop->inner)->inner || (loop->inner)->next)
 912         {
 913           if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
 914             fprintf (vect_dump, "not vectorized: multiple nested loops.");
 915           return NULL;
 916         }
 917
 918       /* Analyze the inner-loop.  */
 919       inner_loop_vinfo = vect_analyze_loop_1 (loop->inner);
 920       if (!inner_loop_vinfo)
 921         {
 922           if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
 923             fprintf (vect_dump, "not vectorized: Bad inner loop.");
 924           return NULL;
 925         }
 926
 927       if (!expr_invariant_in_loop_p (loop,
 928                                         LOOP_VINFO_NITERS (inner_loop_vinfo)))
 929         {
 930           if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
 931             fprintf (vect_dump,
 932                      "not vectorized: inner-loop count not invariant.");
 933           destroy_loop_vec_info (inner_loop_vinfo, true);
 934           return NULL;
 935         }
 936
 937       if (loop->num_nodes != 5)
 938         {
 939           if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
 940             fprintf (vect_dump, "not vectorized: control flow in loop.");
 941           destroy_loop_vec_info (inner_loop_vinfo, true);
 942           return NULL;
 943         }
 944
 945       gcc_assert (EDGE_COUNT (innerloop->header->preds) == 2);
 946       entryedge = EDGE_PRED (innerloop->header, 0);
 947       if (EDGE_PRED (innerloop->header, 0)->src == innerloop->latch)
 948         entryedge = EDGE_PRED (innerloop->header, 1);
 949
 950       if (entryedge->src != loop->header
 951           || !single_exit (innerloop)
 952           || single_exit (innerloop)->dest !=  EDGE_PRED (loop->latch, 0)->src)
 953         {
 954           if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
 955             fprintf (vect_dump, "not vectorized: unsupported outerloop form.");
 956           destroy_loop_vec_info (inner_loop_vinfo, true);
 957           return NULL;
 958         }
 959
 960       if (vect_print_dump_info (REPORT_DETAILS))
 961         fprintf (vect_dump, "Considering outer-loop vectorization.");
 962     }
 963
 964   if (!single_exit (loop)
 965       || EDGE_COUNT (loop->header->preds) != 2)
 966     {
 967       if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
 968         {
 969           if (!single_exit (loop))
 970             fprintf (vect_dump, "not vectorized: multiple exits.");
 971           else if (EDGE_COUNT (loop->header->preds) != 2)
 972             fprintf (vect_dump, "not vectorized: too many incoming edges.");
 973         }
 974       if (inner_loop_vinfo)
 975         destroy_loop_vec_info (inner_loop_vinfo, true);
 976       return NULL;
 977     }
 978
 979   /* We assume that the loop exit condition is at the end of the loop. i.e,
 980      that the loop is represented as a do-while (with a proper if-guard
 981      before the loop if needed), where the loop header contains all the
 982      executable statements, and the latch is empty.  */
 983   if (!empty_block_p (loop->latch)
 984         || phi_nodes (loop->latch))
 985     {
 986       if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
 987         fprintf (vect_dump, "not vectorized: unexpected loop form.");
 988       if (inner_loop_vinfo)
 989         destroy_loop_vec_info (inner_loop_vinfo, true);
 990       return NULL;
 991     }
 992
 993   /* Make sure there exists a single-predecessor exit bb:  */
 994   if (!single_pred_p (single_exit (loop)->dest))
 995     {
 996       edge e = single_exit (loop);
 997       if (!(e->flags & EDGE_ABNORMAL))
 998         {
 999           split_loop_exit_edge (e);
1000           if (vect_print_dump_info (REPORT_DETAILS))
1001             fprintf (vect_dump, "split exit edge.");
1002         }
1003       else
1004         {
1005           if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1006             fprintf (vect_dump, "not vectorized: abnormal loop exit edge.");
1007           if (inner_loop_vinfo)
1008             destroy_loop_vec_info (inner_loop_vinfo, true);
1009           return NULL;
1010         }
1011     }
1012
1013   loop_cond = vect_get_loop_niters (loop, &number_of_iterations);
1014   if (!loop_cond)
1015     {
1016       if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1017         fprintf (vect_dump, "not vectorized: complicated exit condition.");
1018       if (inner_loop_vinfo)
1019         destroy_loop_vec_info (inner_loop_vinfo, true);
1020       return NULL;
1021     }
1022
1023   if (!number_of_iterations)
1024     {
1025       if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1026         fprintf (vect_dump,
1027                  "not vectorized: number of iterations cannot be computed.");
1028       if (inner_loop_vinfo)
1029         destroy_loop_vec_info (inner_loop_vinfo, true);
1030       return NULL;
1031     }
1032
1033   if (chrec_contains_undetermined (number_of_iterations))
1034     {
1035       if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1036         fprintf (vect_dump, "Infinite number of iterations.");
1037       if (inner_loop_vinfo)
1038         destroy_loop_vec_info (inner_loop_vinfo, true);
1039       return NULL;
1040     }
1041
1042   if (!NITERS_KNOWN_P (number_of_iterations))
1043     {
1044       if (vect_print_dump_info (REPORT_DETAILS))
1045         {
1046           fprintf (vect_dump, "Symbolic number of iterations is ");
1047           print_generic_expr (vect_dump, number_of_iterations, TDF_DETAILS);
1048         }
1049     }
1050   else if (TREE_INT_CST_LOW (number_of_iterations) == 0)
1051     {
1052       if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1053         fprintf (vect_dump, "not vectorized: number of iterations = 0.");
1054       if (inner_loop_vinfo)
1055         destroy_loop_vec_info (inner_loop_vinfo, false);
1056       return NULL;
1057     }
1058
1059   loop_vinfo = new_loop_vec_info (loop);
1060   LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1061   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1062
1063   STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond)) = loop_exit_ctrl_vec_info_type;
1064
1065   /* CHECKME: May want to keep it around it in the future.  */
1066   if (inner_loop_vinfo)
1067     destroy_loop_vec_info (inner_loop_vinfo, false);
1068
1069   gcc_assert (!loop->aux);
1070   loop->aux = loop_vinfo;
1071   return loop_vinfo;
1072 }
1073
1074
1075 /* Function vect_analyze_loop_operations.
1076
1077    Scan the loop stmts and make sure they are all vectorizable.  */
1078
1079 static bool
1080 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1081 {
1082   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1083   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1084   int nbbs = loop->num_nodes;
1085   gimple_stmt_iterator si;
1086   unsigned int vectorization_factor = 0;
1087   int i;
1088   gimple phi;
1089   stmt_vec_info stmt_info;
1090   bool need_to_vectorize = false;
1091   int min_profitable_iters;
1092   int min_scalar_loop_bound;
1093   unsigned int th;
1094   bool only_slp_in_loop = true, ok;
1095
1096   if (vect_print_dump_info (REPORT_DETAILS))
1097     fprintf (vect_dump, "=== vect_analyze_loop_operations ===");
1098
1099   gcc_assert (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
1100   vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1101
1102   for (i = 0; i < nbbs; i++)
1103     {
1104       basic_block bb = bbs[i];
1105
1106       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1107         {
1108           phi = gsi_stmt (si);
1109           ok = true;
1110
1111           stmt_info = vinfo_for_stmt (phi);
1112           if (vect_print_dump_info (REPORT_DETAILS))
1113             {
1114               fprintf (vect_dump, "examining phi: ");
1115               print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
1116             }
1117
1118           if (! is_loop_header_bb_p (bb))
1119             {
1120               /* inner-loop loop-closed exit phi in outer-loop vectorization
1121                  (i.e. a phi in the tail of the outer-loop).
1122                  FORNOW: we currently don't support the case that these phis
1123                  are not used in the outerloop (unless it is double reduction,
1124                  i.e., this phi is vect_reduction_def), cause this case
1125                  requires to actually do something here.  */
1126               if ((!STMT_VINFO_RELEVANT_P (stmt_info)
1127                    || STMT_VINFO_LIVE_P (stmt_info))
1128                   && STMT_VINFO_DEF_TYPE (stmt_info)
1129                      != vect_double_reduction_def)
1130                 {
1131                   if (vect_print_dump_info (REPORT_DETAILS))
1132                     fprintf (vect_dump,
1133                              "Unsupported loop-closed phi in outer-loop.");
1134                   return false;
1135                 }
1136               continue;
1137             }
1138
1139           gcc_assert (stmt_info);
1140
1141           if (STMT_VINFO_LIVE_P (stmt_info))
1142             {
1143               /* FORNOW: not yet supported.  */
1144               if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1145                 fprintf (vect_dump, "not vectorized: value used after loop.");
1146               return false;
1147             }
1148
1149           if (STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1150               && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1151             {
1152               /* A scalar-dependence cycle that we don't support.  */
1153               if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1154                 fprintf (vect_dump, "not vectorized: scalar dependence cycle.");
1155               return false;
1156             }
1157
1158           if (STMT_VINFO_RELEVANT_P (stmt_info))
1159             {
1160               need_to_vectorize = true;
1161               if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
1162                 ok = vectorizable_induction (phi, NULL, NULL);
1163             }
1164
1165           if (!ok)
1166             {
1167               if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1168                 {
1169                   fprintf (vect_dump,
1170                            "not vectorized: relevant phi not supported: ");
1171                   print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
1172                 }
1173               return false;
1174             }
1175         }
1176
1177       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1178         {
1179           gimple stmt = gsi_stmt (si);
1180           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1181
1182           gcc_assert (stmt_info);
1183
1184           if (!vect_analyze_stmt (stmt, &need_to_vectorize, NULL))
1185             return false;
1186
1187           if (STMT_VINFO_RELEVANT_P (stmt_info) && !PURE_SLP_STMT (stmt_info))
1188             /* STMT needs both SLP and loop-based vectorization.  */
1189             only_slp_in_loop = false;
1190         }
1191     } /* bbs */
1192
1193   /* All operations in the loop are either irrelevant (deal with loop
1194      control, or dead), or only used outside the loop and can be moved
1195      out of the loop (e.g. invariants, inductions).  The loop can be
1196      optimized away by scalar optimizations.  We're better off not
1197      touching this loop.  */
1198   if (!need_to_vectorize)
1199     {
1200       if (vect_print_dump_info (REPORT_DETAILS))
1201         fprintf (vect_dump,
1202                  "All the computation can be taken out of the loop.");
1203       if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1204         fprintf (vect_dump,
1205                  "not vectorized: redundant loop. no profit to vectorize.");
1206       return false;
1207     }
1208
1209   /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1210      vectorization factor of the loop is the unrolling factor required by the
1211      SLP instances.  If that unrolling factor is 1, we say, that we perform
1212      pure SLP on loop - cross iteration parallelism is not exploited.  */
1213   if (only_slp_in_loop)
1214     vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1215   else
1216     vectorization_factor = least_common_multiple (vectorization_factor,
1217                                 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1218
1219   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1220
1221   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1222       && vect_print_dump_info (REPORT_DETAILS))
1223     fprintf (vect_dump,
1224         "vectorization_factor = %d, niters = " HOST_WIDE_INT_PRINT_DEC,
1225         vectorization_factor, LOOP_VINFO_INT_NITERS (loop_vinfo));
1226
1227   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1228       && (LOOP_VINFO_INT_NITERS (loop_vinfo) < vectorization_factor))
1229     {
1230       if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1231         fprintf (vect_dump, "not vectorized: iteration count too small.");
1232       if (vect_print_dump_info (REPORT_DETAILS))
1233         fprintf (vect_dump,"not vectorized: iteration count smaller than "
1234                  "vectorization factor.");
1235       return false;
1236     }
1237
1238   /* Analyze cost. Decide if worth while to vectorize.  */
1239
1240   /* Once VF is set, SLP costs should be updated since the number of created
1241      vector stmts depends on VF.  */
1242   vect_update_slp_costs_according_to_vf (loop_vinfo);
1243
1244   min_profitable_iters = vect_estimate_min_profitable_iters (loop_vinfo);
1245   LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo) = min_profitable_iters;
1246
1247   if (min_profitable_iters < 0)
1248     {
1249       if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1250         fprintf (vect_dump, "not vectorized: vectorization not profitable.");
1251       if (vect_print_dump_info (REPORT_DETAILS))
1252         fprintf (vect_dump, "not vectorized: vector version will never be "
1253                  "profitable.");
1254       return false;
1255     }
1256
1257   min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
1258                             * vectorization_factor) - 1);
1259
1260   /* Use the cost model only if it is more conservative than user specified
1261      threshold.  */
1262
1263   th = (unsigned) min_scalar_loop_bound;
1264   if (min_profitable_iters
1265       && (!min_scalar_loop_bound
1266           || min_profitable_iters > min_scalar_loop_bound))
1267     th = (unsigned) min_profitable_iters;
1268
1269   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1270       && LOOP_VINFO_INT_NITERS (loop_vinfo) <= th)
1271     {
1272       if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1273         fprintf (vect_dump, "not vectorized: vectorization not "
1274                  "profitable.");
1275       if (vect_print_dump_info (REPORT_DETAILS))
1276         fprintf (vect_dump, "not vectorized: iteration count smaller than "
1277                  "user specified loop bound parameter or minimum "
1278                  "profitable iterations (whichever is more conservative).");
1279       return false;
1280     }
1281
1282   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1283       || LOOP_VINFO_INT_NITERS (loop_vinfo) % vectorization_factor != 0
1284       || LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
1285     {
1286       if (vect_print_dump_info (REPORT_DETAILS))
1287         fprintf (vect_dump, "epilog loop required.");
1288       if (!vect_can_advance_ivs_p (loop_vinfo))
1289         {
1290           if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1291             fprintf (vect_dump,
1292                      "not vectorized: can't create epilog loop 1.");
1293           return false;
1294         }
1295       if (!slpeel_can_duplicate_loop_p (loop, single_exit (loop)))
1296         {
1297           if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1298             fprintf (vect_dump,
1299                      "not vectorized: can't create epilog loop 2.");
1300           return false;
1301         }
1302     }
1303
1304   return true;
1305 }
1306
1307
1308 /* Function vect_analyze_loop.
1309
1310    Apply a set of analyses on LOOP, and create a loop_vec_info struct
1311    for it. The different analyses will record information in the
1312    loop_vec_info struct.  */
1313 loop_vec_info
1314 vect_analyze_loop (struct loop *loop)
1315 {
1316   bool ok;
1317   loop_vec_info loop_vinfo;
1318
1319   if (vect_print_dump_info (REPORT_DETAILS))
1320     fprintf (vect_dump, "===== analyze_loop_nest =====");
1321
1322   if (loop_outer (loop)
1323       && loop_vec_info_for_loop (loop_outer (loop))
1324       && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
1325     {
1326       if (vect_print_dump_info (REPORT_DETAILS))
1327         fprintf (vect_dump, "outer-loop already vectorized.");
1328       return NULL;
1329     }
1330
1331   /* Check the CFG characteristics of the loop (nesting, entry/exit, etc.  */
1332
1333   loop_vinfo = vect_analyze_loop_form (loop);
1334   if (!loop_vinfo)
1335     {
1336       if (vect_print_dump_info (REPORT_DETAILS))
1337         fprintf (vect_dump, "bad loop form.");
1338       return NULL;
1339     }
1340
1341   /* Find all data references in the loop (which correspond to vdefs/vuses)
1342      and analyze their evolution in the loop.
1343
1344      FORNOW: Handle only simple, array references, which
1345      alignment can be forced, and aligned pointer-references.  */
1346
1347   ok = vect_analyze_data_refs (loop_vinfo, NULL);
1348   if (!ok)
1349     {
1350       if (vect_print_dump_info (REPORT_DETAILS))
1351         fprintf (vect_dump, "bad data references.");
1352       destroy_loop_vec_info (loop_vinfo, true);
1353       return NULL;
1354     }
1355
1356   /* Classify all cross-iteration scalar data-flow cycles.
1357      Cross-iteration cycles caused by virtual phis are analyzed separately.  */
1358
1359   vect_analyze_scalar_cycles (loop_vinfo);
1360
1361   vect_pattern_recog (loop_vinfo);
1362
1363   /* Data-flow analysis to detect stmts that do not need to be vectorized.  */
1364
1365   ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
1366   if (!ok)
1367     {
1368       if (vect_print_dump_info (REPORT_DETAILS))
1369         fprintf (vect_dump, "unexpected pattern.");
1370       destroy_loop_vec_info (loop_vinfo, true);
1371       return NULL;
1372     }
1373
1374   /* Analyze the alignment of the data-refs in the loop.
1375      Fail if a data reference is found that cannot be vectorized.  */
1376
1377   ok = vect_analyze_data_refs_alignment (loop_vinfo, NULL);
1378   if (!ok)
1379     {
1380       if (vect_print_dump_info (REPORT_DETAILS))
1381         fprintf (vect_dump, "bad data alignment.");
1382       destroy_loop_vec_info (loop_vinfo, true);
1383       return NULL;
1384     }
1385
1386   ok = vect_determine_vectorization_factor (loop_vinfo);
1387   if (!ok)
1388     {
1389       if (vect_print_dump_info (REPORT_DETAILS))
1390         fprintf (vect_dump, "can't determine vectorization factor.");
1391       destroy_loop_vec_info (loop_vinfo, true);
1392       return NULL;
1393     }
1394
1395   /* Analyze data dependences between the data-refs in the loop.
1396      FORNOW: fail at the first data dependence that we encounter.  */
1397
1398   ok = vect_analyze_data_ref_dependences (loop_vinfo, NULL);
1399   if (!ok)
1400     {
1401       if (vect_print_dump_info (REPORT_DETAILS))
1402         fprintf (vect_dump, "bad data dependence.");
1403       destroy_loop_vec_info (loop_vinfo, true);
1404       return NULL;
1405     }
1406
1407   /* Analyze the access patterns of the data-refs in the loop (consecutive,
1408      complex, etc.). FORNOW: Only handle consecutive access pattern.  */
1409
1410   ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
1411   if (!ok)
1412     {
1413       if (vect_print_dump_info (REPORT_DETAILS))
1414         fprintf (vect_dump, "bad data access.");
1415       destroy_loop_vec_info (loop_vinfo, true);
1416       return NULL;
1417     }
1418
1419   /* Prune the list of ddrs to be tested at run-time by versioning for alias.
1420      It is important to call pruning after vect_analyze_data_ref_accesses,
1421      since we use grouping information gathered by interleaving analysis.  */
1422   ok = vect_prune_runtime_alias_test_list (loop_vinfo);
1423   if (!ok)
1424     {
1425       if (vect_print_dump_info (REPORT_DETAILS))
1426         fprintf (vect_dump, "too long list of versioning for alias "
1427                             "run-time tests.");
1428       destroy_loop_vec_info (loop_vinfo, true);
1429       return NULL;
1430     }
1431
1432   /* Check the SLP opportunities in the loop, analyze and build SLP trees.  */
1433   ok = vect_analyze_slp (loop_vinfo, NULL);
1434   if (ok)
1435     {
1436       /* Decide which possible SLP instances to SLP.  */
1437       vect_make_slp_decision (loop_vinfo);
1438
1439       /* Find stmts that need to be both vectorized and SLPed.  */
1440       vect_detect_hybrid_slp (loop_vinfo);
1441     }
1442
1443   /* This pass will decide on using loop versioning and/or loop peeling in
1444      order to enhance the alignment of data references in the loop.  */
1445
1446   ok = vect_enhance_data_refs_alignment (loop_vinfo);
1447   if (!ok)
1448     {
1449       if (vect_print_dump_info (REPORT_DETAILS))
1450         fprintf (vect_dump, "bad data alignment.");
1451       destroy_loop_vec_info (loop_vinfo, true);
1452       return NULL;
1453     }
1454
1455   /* Scan all the operations in the loop and make sure they are
1456      vectorizable.  */
1457
1458   ok = vect_analyze_loop_operations (loop_vinfo);
1459   if (!ok)
1460     {
1461       if (vect_print_dump_info (REPORT_DETAILS))
1462         fprintf (vect_dump, "bad operation or unsupported loop bound.");
1463       destroy_loop_vec_info (loop_vinfo, true);
1464       return NULL;
1465     }
1466
1467   LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
1468
1469   return loop_vinfo;
1470 }
1471
1472
1473 /* Function reduction_code_for_scalar_code
1474
1475    Input:
1476    CODE - tree_code of a reduction operations.
1477
1478    Output:
1479    REDUC_CODE - the corresponding tree-code to be used to reduce the
1480       vector of partial results into a single scalar result (which
1481       will also reside in a vector) or ERROR_MARK if the operation is
1482       a supported reduction operation, but does not have such tree-code.
1483
1484    Return FALSE if CODE currently cannot be vectorized as reduction.  */
1485
1486 static bool
1487 reduction_code_for_scalar_code (enum tree_code code,
1488                                 enum tree_code *reduc_code)
1489 {
1490   switch (code)
1491     {
1492       case MAX_EXPR:
1493         *reduc_code = REDUC_MAX_EXPR;
1494         return true;
1495
1496       case MIN_EXPR:
1497         *reduc_code = REDUC_MIN_EXPR;
1498         return true;
1499
1500       case PLUS_EXPR:
1501         *reduc_code = REDUC_PLUS_EXPR;
1502         return true;
1503
1504       case MULT_EXPR:
1505       case MINUS_EXPR:
1506       case BIT_IOR_EXPR:
1507       case BIT_XOR_EXPR:
1508       case BIT_AND_EXPR:
1509         *reduc_code = ERROR_MARK;
1510         return true;
1511
1512       default:
1513        return false;
1514     }
1515 }
1516
1517
1518 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
1519    STMT is printed with a message MSG. */
1520
1521 static void
1522 report_vect_op (gimple stmt, const char *msg)
1523 {
1524   fprintf (vect_dump, "%s", msg);
1525   print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
1526 }
1527
1528
1529 /* Function vect_is_simple_reduction
1530
1531    (1) Detect a cross-iteration def-use cycle that represents a simple
1532    reduction computation. We look for the following pattern:
1533
1534    loop_header:
1535      a1 = phi < a0, a2 >
1536      a3 = ...
1537      a2 = operation (a3, a1)
1538
1539    such that:
1540    1. operation is commutative and associative and it is safe to
1541       change the order of the computation (if CHECK_REDUCTION is true)
1542    2. no uses for a2 in the loop (a2 is used out of the loop)
1543    3. no uses of a1 in the loop besides the reduction operation.
1544
1545    Condition 1 is tested here.
1546    Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
1547
1548    (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
1549    nested cycles, if CHECK_REDUCTION is false.
1550
1551    (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
1552    reductions:
1553
1554      a1 = phi < a0, a2 >
1555      inner loop (def of a3)
1556      a2 = phi < a3 >
1557 */
1558
1559 gimple
1560 vect_is_simple_reduction (loop_vec_info loop_info, gimple phi,
1561                           bool check_reduction, bool *double_reduc)
1562 {
1563   struct loop *loop = (gimple_bb (phi))->loop_father;
1564   struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
1565   edge latch_e = loop_latch_edge (loop);
1566   tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
1567   gimple def_stmt, def1 = NULL, def2 = NULL;
1568   enum tree_code code;
1569   tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
1570   tree type;
1571   int nloop_uses;
1572   tree name;
1573   imm_use_iterator imm_iter;
1574   use_operand_p use_p;
1575   bool phi_def;
1576
1577   *double_reduc = false;
1578
1579   /* If CHECK_REDUCTION is true, we assume inner-most loop vectorization,
1580      otherwise, we assume outer loop vectorization.  */
1581   gcc_assert ((check_reduction && loop == vect_loop)
1582               || (!check_reduction && flow_loop_nested_p (vect_loop, loop)));
1583
1584   name = PHI_RESULT (phi);
1585   nloop_uses = 0;
1586   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
1587     {
1588       gimple use_stmt = USE_STMT (use_p);
1589       if (is_gimple_debug (use_stmt))
1590         continue;
1591       if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
1592           && vinfo_for_stmt (use_stmt)
1593           && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
1594         nloop_uses++;
1595       if (nloop_uses > 1)
1596         {
1597           if (vect_print_dump_info (REPORT_DETAILS))
1598             fprintf (vect_dump, "reduction used in loop.");
1599           return NULL;
1600         }
1601     }
1602
1603   if (TREE_CODE (loop_arg) != SSA_NAME)
1604     {
1605       if (vect_print_dump_info (REPORT_DETAILS))
1606         {
1607           fprintf (vect_dump, "reduction: not ssa_name: ");
1608           print_generic_expr (vect_dump, loop_arg, TDF_SLIM);
1609         }
1610       return NULL;
1611     }
1612
1613   def_stmt = SSA_NAME_DEF_STMT (loop_arg);
1614   if (!def_stmt)
1615     {
1616       if (vect_print_dump_info (REPORT_DETAILS))
1617         fprintf (vect_dump, "reduction: no def_stmt.");
1618       return NULL;
1619     }
1620
1621   if (!is_gimple_assign (def_stmt) && gimple_code (def_stmt) != GIMPLE_PHI)
1622     {
1623       if (vect_print_dump_info (REPORT_DETAILS))
1624         print_gimple_stmt (vect_dump, def_stmt, 0, TDF_SLIM);
1625       return NULL;
1626     }
1627
1628   if (is_gimple_assign (def_stmt))
1629     {
1630       name = gimple_assign_lhs (def_stmt);
1631       phi_def = false;
1632     }
1633   else
1634     {
1635       name = PHI_RESULT (def_stmt);
1636       phi_def = true;
1637     }
1638
1639   nloop_uses = 0;
1640   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
1641     {
1642       gimple use_stmt = USE_STMT (use_p);
1643       if (is_gimple_debug (use_stmt))
1644         continue;
1645       if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
1646           && vinfo_for_stmt (use_stmt)
1647           && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
1648         nloop_uses++;
1649       if (nloop_uses > 1)
1650         {
1651           if (vect_print_dump_info (REPORT_DETAILS))
1652             fprintf (vect_dump, "reduction used in loop.");
1653           return NULL;
1654         }
1655     }
1656
1657   /* If DEF_STMT is a phi node itself, we expect it to have a single argument
1658      defined in the inner loop.  */
1659   if (phi_def)
1660     {
1661       op1 = PHI_ARG_DEF (def_stmt, 0);
1662
1663       if (gimple_phi_num_args (def_stmt) != 1
1664           || TREE_CODE (op1) != SSA_NAME)
1665         {
1666           if (vect_print_dump_info (REPORT_DETAILS))
1667             fprintf (vect_dump, "unsupported phi node definition.");
1668
1669           return NULL;
1670         }
1671
1672       def1 = SSA_NAME_DEF_STMT (op1);
1673       if (flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
1674           && loop->inner
1675           && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
1676           && is_gimple_assign (def1))
1677         {
1678           if (vect_print_dump_info (REPORT_DETAILS))
1679             report_vect_op (def_stmt, "detected double reduction: ");
1680
1681           *double_reduc = true;
1682           return def_stmt;
1683         }
1684
1685       return NULL;
1686     }
1687
1688   code = gimple_assign_rhs_code (def_stmt);
1689
1690   if (check_reduction
1691       && (!commutative_tree_code (code) || !associative_tree_code (code)))
1692     {
1693       if (vect_print_dump_info (REPORT_DETAILS))
1694         report_vect_op (def_stmt, "reduction: not commutative/associative: ");
1695       return NULL;
1696     }
1697
1698   if (get_gimple_rhs_class (code) != GIMPLE_BINARY_RHS)
1699     {
1700       if (code != COND_EXPR)
1701         {
1702           if (vect_print_dump_info (REPORT_DETAILS))
1703             report_vect_op (def_stmt, "reduction: not binary operation: ");
1704
1705           return NULL;
1706         }
1707
1708       op3 = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
1709       if (COMPARISON_CLASS_P (op3))
1710         {
1711           op4 = TREE_OPERAND (op3, 1);
1712           op3 = TREE_OPERAND (op3, 0);
1713         }
1714
1715       op1 = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 1);
1716       op2 = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 2);
1717
1718       if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
1719         {
1720           if (vect_print_dump_info (REPORT_DETAILS))
1721             report_vect_op (def_stmt, "reduction: uses not ssa_names: ");
1722
1723           return NULL;
1724         }
1725     }
1726   else
1727     {
1728       op1 = gimple_assign_rhs1 (def_stmt);
1729       op2 = gimple_assign_rhs2 (def_stmt);
1730
1731       if (TREE_CODE (op1) != SSA_NAME || TREE_CODE (op2) != SSA_NAME)
1732         {
1733           if (vect_print_dump_info (REPORT_DETAILS))
1734             report_vect_op (def_stmt, "reduction: uses not ssa_names: ");
1735
1736           return NULL;
1737         }
1738    }
1739
1740   type = TREE_TYPE (gimple_assign_lhs (def_stmt));
1741   if ((TREE_CODE (op1) == SSA_NAME
1742        && !types_compatible_p (type,TREE_TYPE (op1)))
1743       || (TREE_CODE (op2) == SSA_NAME
1744           && !types_compatible_p (type, TREE_TYPE (op2)))
1745       || (op3 && TREE_CODE (op3) == SSA_NAME
1746           && !types_compatible_p (type, TREE_TYPE (op3)))
1747       || (op4 && TREE_CODE (op4) == SSA_NAME
1748           && !types_compatible_p (type, TREE_TYPE (op4))))
1749     {
1750       if (vect_print_dump_info (REPORT_DETAILS))
1751         {
1752           fprintf (vect_dump, "reduction: multiple types: operation type: ");
1753           print_generic_expr (vect_dump, type, TDF_SLIM);
1754           fprintf (vect_dump, ", operands types: ");
1755           print_generic_expr (vect_dump, TREE_TYPE (op1), TDF_SLIM);
1756           fprintf (vect_dump, ",");
1757           print_generic_expr (vect_dump, TREE_TYPE (op2), TDF_SLIM);
1758           if (op3)
1759             {
1760               fprintf (vect_dump, ",");
1761               print_generic_expr (vect_dump, TREE_TYPE (op3), TDF_SLIM);
1762             }
1763
1764           if (op4)
1765             {
1766               fprintf (vect_dump, ",");
1767               print_generic_expr (vect_dump, TREE_TYPE (op4), TDF_SLIM);
1768             }
1769         }
1770
1771       return NULL;
1772     }
1773
1774   /* Check that it's ok to change the order of the computation.
1775      Generally, when vectorizing a reduction we change the order of the
1776      computation.  This may change the behavior of the program in some
1777      cases, so we need to check that this is ok.  One exception is when
1778      vectorizing an outer-loop: the inner-loop is executed sequentially,
1779      and therefore vectorizing reductions in the inner-loop during
1780      outer-loop vectorization is safe.  */
1781
1782   /* CHECKME: check for !flag_finite_math_only too?  */
1783   if (SCALAR_FLOAT_TYPE_P (type) && !flag_associative_math
1784       && check_reduction)
1785     {
1786       /* Changing the order of operations changes the semantics.  */
1787       if (vect_print_dump_info (REPORT_DETAILS))
1788         report_vect_op (def_stmt, "reduction: unsafe fp math optimization: ");
1789       return NULL;
1790     }
1791   else if (INTEGRAL_TYPE_P (type) && TYPE_OVERFLOW_TRAPS (type)
1792            && check_reduction)
1793     {
1794       /* Changing the order of operations changes the semantics.  */
1795       if (vect_print_dump_info (REPORT_DETAILS))
1796         report_vect_op (def_stmt, "reduction: unsafe int math optimization: ");
1797       return NULL;
1798     }
1799   else if (SAT_FIXED_POINT_TYPE_P (type) && check_reduction)
1800     {
1801       /* Changing the order of operations changes the semantics.  */
1802       if (vect_print_dump_info (REPORT_DETAILS))
1803         report_vect_op (def_stmt,
1804                         "reduction: unsafe fixed-point math optimization: ");
1805       return NULL;
1806     }
1807
1808   /* Reduction is safe. We're dealing with one of the following:
1809      1) integer arithmetic and no trapv
1810      2) floating point arithmetic, and special flags permit this optimization
1811      3) nested cycle (i.e., outer loop vectorization).  */
1812   if (TREE_CODE (op1) == SSA_NAME)
1813     def1 = SSA_NAME_DEF_STMT (op1);
1814
1815   if (TREE_CODE (op2) == SSA_NAME)
1816     def2 = SSA_NAME_DEF_STMT (op2);
1817
1818   if (code != COND_EXPR
1819       && (!def1 || !def2 || gimple_nop_p (def1) || gimple_nop_p (def2)))
1820     {
1821       if (vect_print_dump_info (REPORT_DETAILS))
1822         report_vect_op (def_stmt, "reduction: no defs for operands: ");
1823       return NULL;
1824     }
1825
1826   /* Check that one def is the reduction def, defined by PHI,
1827      the other def is either defined in the loop ("vect_internal_def"),
1828      or it's an induction (defined by a loop-header phi-node).  */
1829
1830   if (def2 && def2 == phi
1831       && (code == COND_EXPR
1832           || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1))
1833               && (is_gimple_assign (def1)
1834                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
1835                       == vect_induction_def
1836                   || (gimple_code (def1) == GIMPLE_PHI
1837                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
1838                           == vect_internal_def
1839                       && !is_loop_header_bb_p (gimple_bb (def1)))))))
1840     {
1841       if (vect_print_dump_info (REPORT_DETAILS))
1842         report_vect_op (def_stmt, "detected reduction: ");
1843       return def_stmt;
1844     }
1845   else if (def1 && def1 == phi
1846            && (code == COND_EXPR
1847                || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2))
1848                    && (is_gimple_assign (def2)
1849                        || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
1850                            == vect_induction_def
1851                        || (gimple_code (def2) == GIMPLE_PHI
1852                            && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
1853                                == vect_internal_def
1854                            && !is_loop_header_bb_p (gimple_bb (def2)))))))
1855     {
1856       if (check_reduction)
1857         {
1858           /* Swap operands (just for simplicity - so that the rest of the code
1859              can assume that the reduction variable is always the last (second)
1860              argument).  */
1861           if (vect_print_dump_info (REPORT_DETAILS))
1862             report_vect_op (def_stmt,
1863                             "detected reduction: need to swap operands: ");
1864
1865           swap_tree_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
1866                               gimple_assign_rhs2_ptr (def_stmt));
1867         }
1868       else
1869         {
1870           if (vect_print_dump_info (REPORT_DETAILS))
1871             report_vect_op (def_stmt, "detected reduction: ");
1872         }
1873
1874       return def_stmt;
1875     }
1876   else
1877     {
1878       if (vect_print_dump_info (REPORT_DETAILS))
1879         report_vect_op (def_stmt, "reduction: unknown pattern: ");
1880
1881       return NULL;
1882     }
1883 }
1884
1885
1886 /* Function vect_estimate_min_profitable_iters
1887
1888    Return the number of iterations required for the vector version of the
1889    loop to be profitable relative to the cost of the scalar version of the
1890    loop.
1891
1892    TODO: Take profile info into account before making vectorization
1893    decisions, if available.  */
1894
1895 int
1896 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo)
1897 {
1898   int i;
1899   int min_profitable_iters;
1900   int peel_iters_prologue;
1901   int peel_iters_epilogue;
1902   int vec_inside_cost = 0;
1903   int vec_outside_cost = 0;
1904   int scalar_single_iter_cost = 0;
1905   int scalar_outside_cost = 0;
1906   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1907   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1908   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1909   int nbbs = loop->num_nodes;
1910   int byte_misalign = LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo);
1911   int peel_guard_costs = 0;
1912   int innerloop_iters = 0, factor;
1913   VEC (slp_instance, heap) *slp_instances;
1914   slp_instance instance;
1915
1916   /* Cost model disabled.  */
1917   if (!flag_vect_cost_model)
1918     {
1919       if (vect_print_dump_info (REPORT_COST))
1920         fprintf (vect_dump, "cost model disabled.");
1921       return 0;
1922     }
1923
1924   /* Requires loop versioning tests to handle misalignment.  */
1925   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
1926     {
1927       /*  FIXME: Make cost depend on complexity of individual check.  */
1928       vec_outside_cost +=
1929         VEC_length (gimple, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));
1930       if (vect_print_dump_info (REPORT_COST))
1931         fprintf (vect_dump, "cost model: Adding cost of checks for loop "
1932                  "versioning to treat misalignment.\n");
1933     }
1934
1935   /* Requires loop versioning with alias checks.  */
1936   if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
1937     {
1938       /*  FIXME: Make cost depend on complexity of individual check.  */
1939       vec_outside_cost +=
1940         VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo));
1941       if (vect_print_dump_info (REPORT_COST))
1942         fprintf (vect_dump, "cost model: Adding cost of checks for loop "
1943                  "versioning aliasing.\n");
1944     }
1945
1946   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
1947       || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
1948     vec_outside_cost += TARG_COND_TAKEN_BRANCH_COST;
1949
1950   /* Count statements in scalar loop.  Using this as scalar cost for a single
1951      iteration for now.
1952
1953      TODO: Add outer loop support.
1954
1955      TODO: Consider assigning different costs to different scalar
1956      statements.  */
1957
1958   /* FORNOW.  */
1959   if (loop->inner)
1960     innerloop_iters = 50; /* FIXME */
1961
1962   for (i = 0; i < nbbs; i++)
1963     {
1964       gimple_stmt_iterator si;
1965       basic_block bb = bbs[i];
1966
1967       if (bb->loop_father == loop->inner)
1968         factor = innerloop_iters;
1969       else
1970         factor = 1;
1971
1972       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1973         {
1974           gimple stmt = gsi_stmt (si);
1975           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1976           /* Skip stmts that are not vectorized inside the loop.  */
1977           if (!STMT_VINFO_RELEVANT_P (stmt_info)
1978               && (!STMT_VINFO_LIVE_P (stmt_info)
1979                   || STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def))
1980             continue;
1981           scalar_single_iter_cost += cost_for_stmt (stmt) * factor;
1982           vec_inside_cost += STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) * factor;
1983           /* FIXME: for stmts in the inner-loop in outer-loop vectorization,
1984              some of the "outside" costs are generated inside the outer-loop.  */
1985           vec_outside_cost += STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info);
1986         }
1987     }
1988
1989   /* Add additional cost for the peeled instructions in prologue and epilogue
1990      loop.
1991
1992      FORNOW: If we don't know the value of peel_iters for prologue or epilogue
1993      at compile-time - we assume it's vf/2 (the worst would be vf-1).
1994
1995      TODO: Build an expression that represents peel_iters for prologue and
1996      epilogue to be used in a run-time test.  */
1997
1998   if (byte_misalign < 0)
1999     {
2000       peel_iters_prologue = vf/2;
2001       if (vect_print_dump_info (REPORT_COST))
2002         fprintf (vect_dump, "cost model: "
2003                  "prologue peel iters set to vf/2.");
2004
2005       /* If peeling for alignment is unknown, loop bound of main loop becomes
2006          unknown.  */
2007       peel_iters_epilogue = vf/2;
2008       if (vect_print_dump_info (REPORT_COST))
2009         fprintf (vect_dump, "cost model: "
2010                  "epilogue peel iters set to vf/2 because "
2011                  "peeling for alignment is unknown .");
2012
2013       /* If peeled iterations are unknown, count a taken branch and a not taken
2014          branch per peeled loop. Even if scalar loop iterations are known,
2015          vector iterations are not known since peeled prologue iterations are
2016          not known. Hence guards remain the same.  */
2017       peel_guard_costs +=  2 * (TARG_COND_TAKEN_BRANCH_COST
2018                               + TARG_COND_NOT_TAKEN_BRANCH_COST);
2019     }
2020   else
2021     {
2022       if (byte_misalign)
2023         {
2024           struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2025           int element_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
2026           tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr)));
2027           int nelements = TYPE_VECTOR_SUBPARTS (vectype);
2028
2029           peel_iters_prologue = nelements - (byte_misalign / element_size);
2030         }
2031       else
2032         peel_iters_prologue = 0;
2033
2034       if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2035         {
2036           peel_iters_epilogue = vf/2;
2037           if (vect_print_dump_info (REPORT_COST))
2038             fprintf (vect_dump, "cost model: "
2039                      "epilogue peel iters set to vf/2 because "
2040                      "loop iterations are unknown .");
2041
2042           /* If peeled iterations are known but number of scalar loop
2043              iterations are unknown, count a taken branch per peeled loop.  */
2044           peel_guard_costs +=  2 * TARG_COND_TAKEN_BRANCH_COST;
2045
2046         }
2047       else
2048         {
2049           int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
2050           peel_iters_prologue = niters < peel_iters_prologue ?
2051                                         niters : peel_iters_prologue;
2052           peel_iters_epilogue = (niters - peel_iters_prologue) % vf;
2053         }
2054     }
2055
2056   vec_outside_cost += (peel_iters_prologue * scalar_single_iter_cost)
2057                       + (peel_iters_epilogue * scalar_single_iter_cost)
2058                       + peel_guard_costs;
2059
2060   /* FORNOW: The scalar outside cost is incremented in one of the
2061      following ways:
2062
2063      1. The vectorizer checks for alignment and aliasing and generates
2064      a condition that allows dynamic vectorization.  A cost model
2065      check is ANDED with the versioning condition.  Hence scalar code
2066      path now has the added cost of the versioning check.
2067
2068        if (cost > th & versioning_check)
2069          jmp to vector code
2070
2071      Hence run-time scalar is incremented by not-taken branch cost.
2072
2073      2. The vectorizer then checks if a prologue is required.  If the
2074      cost model check was not done before during versioning, it has to
2075      be done before the prologue check.
2076
2077        if (cost <= th)
2078          prologue = scalar_iters
2079        if (prologue == 0)
2080          jmp to vector code
2081        else
2082          execute prologue
2083        if (prologue == num_iters)
2084          go to exit
2085
2086      Hence the run-time scalar cost is incremented by a taken branch,
2087      plus a not-taken branch, plus a taken branch cost.
2088
2089      3. The vectorizer then checks if an epilogue is required.  If the
2090      cost model check was not done before during prologue check, it
2091      has to be done with the epilogue check.
2092
2093        if (prologue == 0)
2094          jmp to vector code
2095        else
2096          execute prologue
2097        if (prologue == num_iters)
2098          go to exit
2099        vector code:
2100          if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
2101            jmp to epilogue
2102
2103      Hence the run-time scalar cost should be incremented by 2 taken
2104      branches.
2105
2106      TODO: The back end may reorder the BBS's differently and reverse
2107      conditions/branch directions.  Change the estimates below to
2108      something more reasonable.  */
2109
2110   /* If the number of iterations is known and we do not do versioning, we can
2111      decide whether to vectorize at compile time. Hence the scalar version
2112      do not carry cost model guard costs.  */
2113   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2114       || LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2115       || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2116     {
2117       /* Cost model check occurs at versioning.  */
2118       if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2119           || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2120         scalar_outside_cost += TARG_COND_NOT_TAKEN_BRANCH_COST;
2121       else
2122         {
2123           /* Cost model check occurs at prologue generation.  */
2124           if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2125             scalar_outside_cost += 2 * TARG_COND_TAKEN_BRANCH_COST
2126               + TARG_COND_NOT_TAKEN_BRANCH_COST;
2127           /* Cost model check occurs at epilogue generation.  */
2128           else
2129             scalar_outside_cost += 2 * TARG_COND_TAKEN_BRANCH_COST;
2130         }
2131     }
2132
2133   /* Add SLP costs.  */
2134   slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
2135   for (i = 0; VEC_iterate (slp_instance, slp_instances, i, instance); i++)
2136     {
2137       vec_outside_cost += SLP_INSTANCE_OUTSIDE_OF_LOOP_COST (instance);
2138       vec_inside_cost += SLP_INSTANCE_INSIDE_OF_LOOP_COST (instance);
2139     }
2140
2141   /* Calculate number of iterations required to make the vector version
2142      profitable, relative to the loop bodies only. The following condition
2143      must hold true:
2144      SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
2145      where
2146      SIC = scalar iteration cost, VIC = vector iteration cost,
2147      VOC = vector outside cost, VF = vectorization factor,
2148      PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
2149      SOC = scalar outside cost for run time cost model check.  */
2150
2151   if ((scalar_single_iter_cost * vf) > vec_inside_cost)
2152     {
2153       if (vec_outside_cost <= 0)
2154         min_profitable_iters = 1;
2155       else
2156         {
2157           min_profitable_iters = ((vec_outside_cost - scalar_outside_cost) * vf
2158                                   - vec_inside_cost * peel_iters_prologue
2159                                   - vec_inside_cost * peel_iters_epilogue)
2160                                  / ((scalar_single_iter_cost * vf)
2161                                     - vec_inside_cost);
2162
2163           if ((scalar_single_iter_cost * vf * min_profitable_iters)
2164               <= ((vec_inside_cost * min_profitable_iters)
2165                   + ((vec_outside_cost - scalar_outside_cost) * vf)))
2166             min_profitable_iters++;
2167         }
2168     }
2169   /* vector version will never be profitable.  */
2170   else
2171     {
2172       if (vect_print_dump_info (REPORT_COST))
2173         fprintf (vect_dump, "cost model: vector iteration cost = %d "
2174                  "is divisible by scalar iteration cost = %d by a factor "
2175                  "greater than or equal to the vectorization factor = %d .",
2176                  vec_inside_cost, scalar_single_iter_cost, vf);
2177       return -1;
2178     }
2179
2180   if (vect_print_dump_info (REPORT_COST))
2181     {
2182       fprintf (vect_dump, "Cost model analysis: \n");
2183       fprintf (vect_dump, "  Vector inside of loop cost: %d\n",
2184                vec_inside_cost);
2185       fprintf (vect_dump, "  Vector outside of loop cost: %d\n",
2186                vec_outside_cost);
2187       fprintf (vect_dump, "  Scalar iteration cost: %d\n",
2188                scalar_single_iter_cost);
2189       fprintf (vect_dump, "  Scalar outside cost: %d\n", scalar_outside_cost);
2190       fprintf (vect_dump, "  prologue iterations: %d\n",
2191                peel_iters_prologue);
2192       fprintf (vect_dump, "  epilogue iterations: %d\n",
2193                peel_iters_epilogue);
2194       fprintf (vect_dump, "  Calculated minimum iters for profitability: %d\n",
2195                min_profitable_iters);
2196     }
2197
2198   min_profitable_iters =
2199         min_profitable_iters < vf ? vf : min_profitable_iters;
2200
2201   /* Because the condition we create is:
2202      if (niters <= min_profitable_iters)
2203        then skip the vectorized loop.  */
2204   min_profitable_iters--;
2205
2206   if (vect_print_dump_info (REPORT_COST))
2207     fprintf (vect_dump, "  Profitability threshold = %d\n",
2208              min_profitable_iters);
2209
2210   return min_profitable_iters;
2211 }
2212
2213
2214 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
2215    functions. Design better to avoid maintenance issues.  */
2216
2217 /* Function vect_model_reduction_cost.
2218
2219    Models cost for a reduction operation, including the vector ops
2220    generated within the strip-mine loop, the initial definition before
2221    the loop, and the epilogue code that must be generated.  */
2222
2223 static bool
2224 vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
2225                            int ncopies)
2226 {
2227   int outer_cost = 0;
2228   enum tree_code code;
2229   optab optab;
2230   tree vectype;
2231   gimple stmt, orig_stmt;
2232   tree reduction_op;
2233   enum machine_mode mode;
2234   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2235   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2236
2237
2238   /* Cost of reduction op inside loop.  */
2239   STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) += ncopies * TARG_VEC_STMT_COST;
2240
2241   stmt = STMT_VINFO_STMT (stmt_info);
2242
2243   switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
2244     {
2245     case GIMPLE_SINGLE_RHS:
2246       gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt)) == ternary_op);
2247       reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), 2);
2248       break;
2249     case GIMPLE_UNARY_RHS:
2250       reduction_op = gimple_assign_rhs1 (stmt);
2251       break;
2252     case GIMPLE_BINARY_RHS:
2253       reduction_op = gimple_assign_rhs2 (stmt);
2254       break;
2255     default:
2256       gcc_unreachable ();
2257     }
2258
2259   vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
2260   if (!vectype)
2261     {
2262       if (vect_print_dump_info (REPORT_COST))
2263         {
2264           fprintf (vect_dump, "unsupported data-type ");
2265           print_generic_expr (vect_dump, TREE_TYPE (reduction_op), TDF_SLIM);
2266         }
2267       return false;
2268    }
2269
2270   mode = TYPE_MODE (vectype);
2271   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
2272
2273   if (!orig_stmt)
2274     orig_stmt = STMT_VINFO_STMT (stmt_info);
2275
2276   code = gimple_assign_rhs_code (orig_stmt);
2277
2278   /* Add in cost for initial definition.  */
2279   outer_cost += TARG_SCALAR_TO_VEC_COST;
2280
2281   /* Determine cost of epilogue code.
2282
2283      We have a reduction operator that will reduce the vector in one statement.
2284      Also requires scalar extract.  */
2285
2286   if (!nested_in_vect_loop_p (loop, orig_stmt))
2287     {
2288       if (reduc_code != ERROR_MARK)
2289         outer_cost += TARG_VEC_STMT_COST + TARG_VEC_TO_SCALAR_COST;
2290       else
2291         {
2292           int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
2293           tree bitsize =
2294             TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
2295           int element_bitsize = tree_low_cst (bitsize, 1);
2296           int nelements = vec_size_in_bits / element_bitsize;
2297
2298           optab = optab_for_tree_code (code, vectype, optab_default);
2299
2300           /* We have a whole vector shift available.  */
2301           if (VECTOR_MODE_P (mode)
2302               && optab_handler (optab, mode)->insn_code != CODE_FOR_nothing
2303               && optab_handler (vec_shr_optab, mode)->insn_code != CODE_FOR_nothing)
2304             /* Final reduction via vector shifts and the reduction operator. Also
2305                requires scalar extract.  */
2306             outer_cost += ((exact_log2(nelements) * 2) * TARG_VEC_STMT_COST
2307                                 + TARG_VEC_TO_SCALAR_COST);
2308           else
2309             /* Use extracts and reduction op for final reduction.  For N elements,
2310                we have N extracts and N-1 reduction ops.  */
2311             outer_cost += ((nelements + nelements - 1) * TARG_VEC_STMT_COST);
2312         }
2313     }
2314
2315   STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info) = outer_cost;
2316
2317   if (vect_print_dump_info (REPORT_COST))
2318     fprintf (vect_dump, "vect_model_reduction_cost: inside_cost = %d, "
2319              "outside_cost = %d .", STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info),
2320              STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info));
2321
2322   return true;
2323 }
2324
2325
2326 /* Function vect_model_induction_cost.
2327
2328    Models cost for induction operations.  */
2329
2330 static void
2331 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
2332 {
2333   /* loop cost for vec_loop.  */
2334   STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) = ncopies * TARG_VEC_STMT_COST;
2335   /* prologue cost for vec_init and vec_step.  */
2336   STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info) = 2 * TARG_SCALAR_TO_VEC_COST;
2337
2338   if (vect_print_dump_info (REPORT_COST))
2339     fprintf (vect_dump, "vect_model_induction_cost: inside_cost = %d, "
2340              "outside_cost = %d .", STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info),
2341              STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info));
2342 }
2343
2344
2345 /* Function get_initial_def_for_induction
2346
2347    Input:
2348    STMT - a stmt that performs an induction operation in the loop.
2349    IV_PHI - the initial value of the induction variable
2350
2351    Output:
2352    Return a vector variable, initialized with the first VF values of
2353    the induction variable. E.g., for an iv with IV_PHI='X' and
2354    evolution S, for a vector of 4 units, we want to return:
2355    [X, X + S, X + 2*S, X + 3*S].  */
2356
2357 static tree
2358 get_initial_def_for_induction (gimple iv_phi)
2359 {
2360   stmt_vec_info stmt_vinfo = vinfo_for_stmt (iv_phi);
2361   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
2362   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2363   tree scalar_type = TREE_TYPE (gimple_phi_result (iv_phi));
2364   tree vectype;
2365   int nunits;
2366   edge pe = loop_preheader_edge (loop);
2367   struct loop *iv_loop;
2368   basic_block new_bb;
2369   tree vec, vec_init, vec_step, t;
2370   tree access_fn;
2371   tree new_var;
2372   tree new_name;
2373   gimple init_stmt, induction_phi, new_stmt;
2374   tree induc_def, vec_def, vec_dest;
2375   tree init_expr, step_expr;
2376   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2377   int i;
2378   bool ok;
2379   int ncopies;
2380   tree expr;
2381   stmt_vec_info phi_info = vinfo_for_stmt (iv_phi);
2382   bool nested_in_vect_loop = false;
2383   gimple_seq stmts = NULL;
2384   imm_use_iterator imm_iter;
2385   use_operand_p use_p;
2386   gimple exit_phi;
2387   edge latch_e;
2388   tree loop_arg;
2389   gimple_stmt_iterator si;
2390   basic_block bb = gimple_bb (iv_phi);
2391   tree stepvectype;
2392
2393   vectype = get_vectype_for_scalar_type (scalar_type);
2394   gcc_assert (vectype);
2395   nunits = TYPE_VECTOR_SUBPARTS (vectype);
2396   ncopies = vf / nunits;
2397
2398   gcc_assert (phi_info);
2399   gcc_assert (ncopies >= 1);
2400
2401   /* Find the first insertion point in the BB.  */
2402   si = gsi_after_labels (bb);
2403
2404   if (INTEGRAL_TYPE_P (scalar_type))
2405     step_expr = build_int_cst (scalar_type, 0);
2406   else if (POINTER_TYPE_P (scalar_type))
2407     step_expr = build_int_cst (sizetype, 0);
2408   else
2409     step_expr = build_real (scalar_type, dconst0);
2410
2411   /* Is phi in an inner-loop, while vectorizing an enclosing outer-loop?  */
2412   if (nested_in_vect_loop_p (loop, iv_phi))
2413     {
2414       nested_in_vect_loop = true;
2415       iv_loop = loop->inner;
2416     }
2417   else
2418     iv_loop = loop;
2419   gcc_assert (iv_loop == (gimple_bb (iv_phi))->loop_father);
2420
2421   latch_e = loop_latch_edge (iv_loop);
2422   loop_arg = PHI_ARG_DEF_FROM_EDGE (iv_phi, latch_e);
2423
2424   access_fn = analyze_scalar_evolution (iv_loop, PHI_RESULT (iv_phi));
2425   gcc_assert (access_fn);
2426   ok = vect_is_simple_iv_evolution (iv_loop->num, access_fn,
2427                                     &init_expr, &step_expr);
2428   gcc_assert (ok);
2429   pe = loop_preheader_edge (iv_loop);
2430
2431   /* Create the vector that holds the initial_value of the induction.  */
2432   if (nested_in_vect_loop)
2433     {
2434       /* iv_loop is nested in the loop to be vectorized.  init_expr had already
2435          been created during vectorization of previous stmts; We obtain it from
2436          the STMT_VINFO_VEC_STMT of the defining stmt. */
2437       tree iv_def = PHI_ARG_DEF_FROM_EDGE (iv_phi,
2438                                            loop_preheader_edge (iv_loop));
2439       vec_init = vect_get_vec_def_for_operand (iv_def, iv_phi, NULL);
2440     }
2441   else
2442     {
2443       /* iv_loop is the loop to be vectorized. Create:
2444          vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr)  */
2445       new_var = vect_get_new_vect_var (scalar_type, vect_scalar_var, "var_");
2446       add_referenced_var (new_var);
2447
2448       new_name = force_gimple_operand (init_expr, &stmts, false, new_var);
2449       if (stmts)
2450         {
2451           new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
2452           gcc_assert (!new_bb);
2453         }
2454
2455       t = NULL_TREE;
2456       t = tree_cons (NULL_TREE, init_expr, t);
2457       for (i = 1; i < nunits; i++)
2458         {
2459           /* Create: new_name_i = new_name + step_expr  */
2460           enum tree_code code = POINTER_TYPE_P (scalar_type)
2461                                 ? POINTER_PLUS_EXPR : PLUS_EXPR;
2462           init_stmt = gimple_build_assign_with_ops (code, new_var,
2463                                                     new_name, step_expr);
2464           new_name = make_ssa_name (new_var, init_stmt);
2465           gimple_assign_set_lhs (init_stmt, new_name);
2466
2467           new_bb = gsi_insert_on_edge_immediate (pe, init_stmt);
2468           gcc_assert (!new_bb);
2469
2470           if (vect_print_dump_info (REPORT_DETAILS))
2471             {
2472               fprintf (vect_dump, "created new init_stmt: ");
2473               print_gimple_stmt (vect_dump, init_stmt, 0, TDF_SLIM);
2474             }
2475           t = tree_cons (NULL_TREE, new_name, t);
2476         }
2477       /* Create a vector from [new_name_0, new_name_1, ..., new_name_nunits-1]  */
2478       vec = build_constructor_from_list (vectype, nreverse (t));
2479       vec_init = vect_init_vector (iv_phi, vec, vectype, NULL);
2480     }
2481
2482
2483   /* Create the vector that holds the step of the induction.  */
2484   if (nested_in_vect_loop)
2485     /* iv_loop is nested in the loop to be vectorized. Generate:
2486        vec_step = [S, S, S, S]  */
2487     new_name = step_expr;
2488   else
2489     {
2490       /* iv_loop is the loop to be vectorized. Generate:
2491           vec_step = [VF*S, VF*S, VF*S, VF*S]  */
2492       expr = build_int_cst (TREE_TYPE (step_expr), vf);
2493       new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
2494                               expr, step_expr);
2495     }
2496
2497   t = NULL_TREE;
2498   for (i = 0; i < nunits; i++)
2499     t = tree_cons (NULL_TREE, unshare_expr (new_name), t);
2500   gcc_assert (CONSTANT_CLASS_P (new_name));
2501   stepvectype = get_vectype_for_scalar_type (TREE_TYPE (new_name));
2502   gcc_assert (stepvectype);
2503   vec = build_vector (stepvectype, t);
2504   vec_step = vect_init_vector (iv_phi, vec, stepvectype, NULL);
2505
2506
2507   /* Create the following def-use cycle:
2508      loop prolog:
2509          vec_init = ...
2510          vec_step = ...
2511      loop:
2512          vec_iv = PHI <vec_init, vec_loop>
2513          ...
2514          STMT
2515          ...
2516          vec_loop = vec_iv + vec_step;  */
2517
2518   /* Create the induction-phi that defines the induction-operand.  */
2519   vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
2520   add_referenced_var (vec_dest);
2521   induction_phi = create_phi_node (vec_dest, iv_loop->header);
2522   set_vinfo_for_stmt (induction_phi,
2523                       new_stmt_vec_info (induction_phi, loop_vinfo, NULL));
2524   induc_def = PHI_RESULT (induction_phi);
2525
2526   /* Create the iv update inside the loop  */
2527   new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, vec_dest,
2528                                            induc_def, vec_step);
2529   vec_def = make_ssa_name (vec_dest, new_stmt);
2530   gimple_assign_set_lhs (new_stmt, vec_def);
2531   gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
2532   set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo,
2533                                                    NULL));
2534
2535   /* Set the arguments of the phi node:  */
2536   add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
2537   add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
2538                UNKNOWN_LOCATION);
2539
2540
2541   /* In case that vectorization factor (VF) is bigger than the number
2542      of elements that we can fit in a vectype (nunits), we have to generate
2543      more than one vector stmt - i.e - we need to "unroll" the
2544      vector stmt by a factor VF/nunits.  For more details see documentation
2545      in vectorizable_operation.  */
2546
2547   if (ncopies > 1)
2548     {
2549       stmt_vec_info prev_stmt_vinfo;
2550       /* FORNOW. This restriction should be relaxed.  */
2551       gcc_assert (!nested_in_vect_loop);
2552
2553       /* Create the vector that holds the step of the induction.  */
2554       expr = build_int_cst (TREE_TYPE (step_expr), nunits);
2555       new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
2556                               expr, step_expr);
2557       t = NULL_TREE;
2558       for (i = 0; i < nunits; i++)
2559         t = tree_cons (NULL_TREE, unshare_expr (new_name), t);
2560       gcc_assert (CONSTANT_CLASS_P (new_name));
2561       vec = build_vector (stepvectype, t);
2562       vec_step = vect_init_vector (iv_phi, vec, stepvectype, NULL);
2563
2564       vec_def = induc_def;
2565       prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
2566       for (i = 1; i < ncopies; i++)
2567         {
2568           /* vec_i = vec_prev + vec_step  */
2569           new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, vec_dest,
2570                                                    vec_def, vec_step);
2571           vec_def = make_ssa_name (vec_dest, new_stmt);
2572           gimple_assign_set_lhs (new_stmt, vec_def);
2573
2574           gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
2575           set_vinfo_for_stmt (new_stmt,
2576                               new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
2577           STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
2578           prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
2579         }
2580     }
2581
2582   if (nested_in_vect_loop)
2583     {
2584       /* Find the loop-closed exit-phi of the induction, and record
2585          the final vector of induction results:  */
2586       exit_phi = NULL;
2587       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
2588         {
2589           if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (USE_STMT (use_p))))
2590             {
2591               exit_phi = USE_STMT (use_p);
2592               break;
2593             }
2594         }
2595       if (exit_phi)
2596         {
2597           stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
2598           /* FORNOW. Currently not supporting the case that an inner-loop induction
2599              is not used in the outer-loop (i.e. only outside the outer-loop).  */
2600           gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
2601                       && !STMT_VINFO_LIVE_P (stmt_vinfo));
2602
2603           STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
2604           if (vect_print_dump_info (REPORT_DETAILS))
2605             {
2606               fprintf (vect_dump, "vector of inductions after inner-loop:");
2607               print_gimple_stmt (vect_dump, new_stmt, 0, TDF_SLIM);
2608             }
2609         }
2610     }
2611
2612
2613   if (vect_print_dump_info (REPORT_DETAILS))
2614     {
2615       fprintf (vect_dump, "transform induction: created def-use cycle: ");
2616       print_gimple_stmt (vect_dump, induction_phi, 0, TDF_SLIM);
2617       fprintf (vect_dump, "\n");
2618       print_gimple_stmt (vect_dump, SSA_NAME_DEF_STMT (vec_def), 0, TDF_SLIM);
2619     }
2620
2621   STMT_VINFO_VEC_STMT (phi_info) = induction_phi;
2622   return induc_def;
2623 }
2624
2625
2626 /* Function get_initial_def_for_reduction
2627
2628    Input:
2629    STMT - a stmt that performs a reduction operation in the loop.
2630    INIT_VAL - the initial value of the reduction variable
2631
2632    Output:
2633    ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
2634         of the reduction (used for adjusting the epilog - see below).
2635    Return a vector variable, initialized according to the operation that STMT
2636         performs. This vector will be used as the initial value of the
2637         vector of partial results.
2638
2639    Option1 (adjust in epilog): Initialize the vector as follows:
2640      add/bit or/xor:    [0,0,...,0,0]
2641      mult/bit and:      [1,1,...,1,1]
2642      min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
2643    and when necessary (e.g. add/mult case) let the caller know
2644    that it needs to adjust the result by init_val.
2645
2646    Option2: Initialize the vector as follows:
2647      add/bit or/xor:    [init_val,0,0,...,0]
2648      mult/bit and:      [init_val,1,1,...,1]
2649      min/max/cond_expr: [init_val,init_val,...,init_val]
2650    and no adjustments are needed.
2651
2652    For example, for the following code:
2653
2654    s = init_val;
2655    for (i=0;i<n;i++)
2656      s = s + a[i];
2657
2658    STMT is 's = s + a[i]', and the reduction variable is 's'.
2659    For a vector of 4 units, we want to return either [0,0,0,init_val],
2660    or [0,0,0,0] and let the caller know that it needs to adjust
2661    the result at the end by 'init_val'.
2662
2663    FORNOW, we are using the 'adjust in epilog' scheme, because this way the
2664    initialization vector is simpler (same element in all entries), if
2665    ADJUSTMENT_DEF is not NULL, and Option2 otherwise.
2666
2667    A cost model should help decide between these two schemes.  */
2668
2669 tree
2670 get_initial_def_for_reduction (gimple stmt, tree init_val,
2671                                tree *adjustment_def)
2672 {
2673   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
2674   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
2675   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2676   tree scalar_type = TREE_TYPE (init_val);
2677   tree vectype = get_vectype_for_scalar_type (scalar_type);
2678   int nunits;
2679   enum tree_code code = gimple_assign_rhs_code (stmt);
2680   tree def_for_init;
2681   tree init_def;
2682   tree t = NULL_TREE;
2683   int i;
2684   bool nested_in_vect_loop = false;
2685   tree init_value;
2686   REAL_VALUE_TYPE real_init_val = dconst0;
2687   int int_init_val = 0;
2688   gimple def_stmt = NULL;
2689
2690   gcc_assert (vectype);
2691   nunits = TYPE_VECTOR_SUBPARTS (vectype);
2692
2693   gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
2694               || SCALAR_FLOAT_TYPE_P (scalar_type));
2695
2696   if (nested_in_vect_loop_p (loop, stmt))
2697     nested_in_vect_loop = true;
2698   else
2699     gcc_assert (loop == (gimple_bb (stmt))->loop_father);
2700
2701   /* In case of double reduction we only create a vector variable to be put
2702      in the reduction phi node. The actual statement creation is done in
2703      vect_create_epilog_for_reduction.  */
2704   if (adjustment_def && nested_in_vect_loop
2705       && TREE_CODE (init_val) == SSA_NAME
2706       && (def_stmt = SSA_NAME_DEF_STMT (init_val))
2707       && gimple_code (def_stmt) == GIMPLE_PHI
2708       && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2709       && vinfo_for_stmt (def_stmt)
2710       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2711           == vect_double_reduction_def)
2712     {
2713       *adjustment_def = NULL;
2714       return vect_create_destination_var (init_val, vectype);
2715     }
2716
2717   if (TREE_CONSTANT (init_val))
2718     {
2719       if (SCALAR_FLOAT_TYPE_P (scalar_type))
2720         init_value = build_real (scalar_type, TREE_REAL_CST (init_val));
2721       else
2722         init_value = build_int_cst (scalar_type, TREE_INT_CST_LOW (init_val));
2723     }
2724   else
2725     init_value = init_val;
2726
2727   switch (code)
2728     {
2729       case WIDEN_SUM_EXPR:
2730       case DOT_PROD_EXPR:
2731       case PLUS_EXPR:
2732       case MINUS_EXPR:
2733       case BIT_IOR_EXPR:
2734       case BIT_XOR_EXPR:
2735       case MULT_EXPR:
2736       case BIT_AND_EXPR:
2737         /* ADJUSMENT_DEF is NULL when called from
2738            vect_create_epilog_for_reduction to vectorize double reduction.  */
2739         if (adjustment_def)
2740           {
2741             if (nested_in_vect_loop)
2742               *adjustment_def = vect_get_vec_def_for_operand (init_val, stmt,
2743                                                               NULL);
2744             else
2745               *adjustment_def = init_val;
2746           }
2747
2748         if (code == MULT_EXPR || code == BIT_AND_EXPR)
2749           {
2750             real_init_val = dconst1;
2751             int_init_val = 1;
2752           }
2753
2754         if (SCALAR_FLOAT_TYPE_P (scalar_type))
2755           def_for_init = build_real (scalar_type, real_init_val);
2756         else
2757           def_for_init = build_int_cst (scalar_type, int_init_val);
2758
2759         /* Create a vector of '0' or '1' except the first element.  */
2760         for (i = nunits - 2; i >= 0; --i)
2761           t = tree_cons (NULL_TREE, def_for_init, t);
2762
2763         /* Option1: the first element is '0' or '1' as well.  */
2764         if (adjustment_def)
2765           {
2766             t = tree_cons (NULL_TREE, def_for_init, t);
2767             init_def = build_vector (vectype, t);
2768             break;
2769           }
2770
2771         /* Option2: the first element is INIT_VAL.  */
2772         t = tree_cons (NULL_TREE, init_value, t);
2773         if (TREE_CONSTANT (init_val))
2774           init_def = build_vector (vectype, t);
2775         else
2776           init_def = build_constructor_from_list (vectype, t);
2777
2778         break;
2779
2780       case MIN_EXPR:
2781       case MAX_EXPR:
2782       case COND_EXPR:
2783         if (adjustment_def)
2784           {
2785             *adjustment_def = NULL_TREE;
2786             init_def = vect_get_vec_def_for_operand (init_val, stmt, NULL);
2787             break;
2788           }
2789
2790         for (i = nunits - 1; i >= 0; --i)
2791           t = tree_cons (NULL_TREE, init_value, t);
2792
2793         if (TREE_CONSTANT (init_val))
2794           init_def = build_vector (vectype, t);
2795         else
2796           init_def = build_constructor_from_list (vectype, t);
2797
2798         break;
2799
2800       default:
2801         gcc_unreachable ();
2802     }
2803
2804   return init_def;
2805 }
2806
2807
2808 /* Function vect_create_epilog_for_reduction
2809
2810    Create code at the loop-epilog to finalize the result of a reduction
2811    computation.
2812
2813    VECT_DEF is a vector of partial results.
2814    REDUC_CODE is the tree-code for the epilog reduction.
2815    NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
2816      number of elements that we can fit in a vectype (nunits). In this case
2817      we have to generate more than one vector stmt - i.e - we need to "unroll"
2818      the vector stmt by a factor VF/nunits.  For more details see documentation
2819      in vectorizable_operation.
2820    STMT is the scalar reduction stmt that is being vectorized.
2821    REDUCTION_PHI is the phi-node that carries the reduction computation.
2822    REDUC_INDEX is the index of the operand in the right hand side of the
2823      statement that is defined by REDUCTION_PHI.
2824    DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
2825
2826    This function:
2827    1. Creates the reduction def-use cycle: sets the arguments for
2828       REDUCTION_PHI:
2829       The loop-entry argument is the vectorized initial-value of the reduction.
2830       The loop-latch argument is VECT_DEF - the vector of partial sums.
2831    2. "Reduces" the vector of partial results VECT_DEF into a single result,
2832       by applying the operation specified by REDUC_CODE if available, or by
2833       other means (whole-vector shifts or a scalar loop).
2834       The function also creates a new phi node at the loop exit to preserve
2835       loop-closed form, as illustrated below.
2836
2837      The flow at the entry to this function:
2838
2839         loop:
2840           vec_def = phi <null, null>            # REDUCTION_PHI
2841           VECT_DEF = vector_stmt                # vectorized form of STMT
2842           s_loop = scalar_stmt                  # (scalar) STMT
2843         loop_exit:
2844           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
2845           use <s_out0>
2846           use <s_out0>
2847
2848      The above is transformed by this function into:
2849
2850         loop:
2851           vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
2852           VECT_DEF = vector_stmt                # vectorized form of STMT
2853           s_loop = scalar_stmt                  # (scalar) STMT
2854         loop_exit:
2855           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
2856           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
2857           v_out2 = reduce <v_out1>
2858           s_out3 = extract_field <v_out2, 0>
2859           s_out4 = adjust_result <s_out3>
2860           use <s_out4>
2861           use <s_out4>
2862 */
2863
2864 static void
2865 vect_create_epilog_for_reduction (tree vect_def, gimple stmt,
2866                                   int ncopies,
2867                                   enum tree_code reduc_code,
2868                                   gimple reduction_phi,
2869                                   int reduc_index,
2870                                   bool double_reduc)
2871 {
2872   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2873   stmt_vec_info prev_phi_info;
2874   tree vectype;
2875   enum machine_mode mode;
2876   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2877   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
2878   basic_block exit_bb;
2879   tree scalar_dest;
2880   tree scalar_type;
2881   gimple new_phi = NULL, phi;
2882   gimple_stmt_iterator exit_gsi;
2883   tree vec_dest;
2884   tree new_temp = NULL_TREE;
2885   tree new_name;
2886   gimple epilog_stmt = NULL;
2887   tree new_scalar_dest, new_dest;
2888   gimple exit_phi;
2889   tree bitsize, bitpos;
2890   enum tree_code code = gimple_assign_rhs_code (stmt);
2891   tree adjustment_def;
2892   tree vec_initial_def, def;
2893   tree orig_name;
2894   imm_use_iterator imm_iter;
2895   use_operand_p use_p;
2896   bool extract_scalar_result = false;
2897   tree reduction_op, expr;
2898   gimple orig_stmt;
2899   gimple use_stmt;
2900   bool nested_in_vect_loop = false;
2901   VEC(gimple,heap) *phis = NULL;
2902   enum vect_def_type dt = vect_unknown_def_type;
2903   int j, i;
2904
2905   if (nested_in_vect_loop_p (loop, stmt))
2906     {
2907       outer_loop = loop;
2908       loop = loop->inner;
2909       nested_in_vect_loop = true;
2910     }
2911
2912   switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
2913     {
2914     case GIMPLE_SINGLE_RHS:
2915       gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt))
2916                                        == ternary_op);
2917       reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), reduc_index);
2918       break;
2919     case GIMPLE_UNARY_RHS:
2920       reduction_op = gimple_assign_rhs1 (stmt);
2921       break;
2922     case GIMPLE_BINARY_RHS:
2923       reduction_op = reduc_index ?
2924                      gimple_assign_rhs2 (stmt) : gimple_assign_rhs1 (stmt);
2925       break;
2926     default:
2927       gcc_unreachable ();
2928     }
2929
2930   vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
2931   gcc_assert (vectype);
2932   mode = TYPE_MODE (vectype);
2933
2934   /*** 1. Create the reduction def-use cycle  ***/
2935
2936   /* For the case of reduction, vect_get_vec_def_for_operand returns
2937      the scalar def before the loop, that defines the initial value
2938      of the reduction variable.  */
2939   vec_initial_def = vect_get_vec_def_for_operand (reduction_op, stmt,
2940                                                   &adjustment_def);
2941
2942   phi = reduction_phi;
2943   def = vect_def;
2944   for (j = 0; j < ncopies; j++)
2945     {
2946       /* 1.1 set the loop-entry arg of the reduction-phi:  */
2947       add_phi_arg (phi, vec_initial_def, loop_preheader_edge (loop),
2948                    UNKNOWN_LOCATION);
2949
2950       /* 1.2 set the loop-latch arg for the reduction-phi:  */
2951       if (j > 0)
2952         def = vect_get_vec_def_for_stmt_copy (dt, def);
2953       add_phi_arg (phi, def, loop_latch_edge (loop), UNKNOWN_LOCATION);
2954
2955       if (vect_print_dump_info (REPORT_DETAILS))
2956         {
2957           fprintf (vect_dump, "transform reduction: created def-use cycle: ");
2958           print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
2959           fprintf (vect_dump, "\n");
2960           print_gimple_stmt (vect_dump, SSA_NAME_DEF_STMT (def), 0, TDF_SLIM);
2961         }
2962
2963       phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
2964     }
2965
2966   /*** 2. Create epilog code
2967           The reduction epilog code operates across the elements of the vector
2968           of partial results computed by the vectorized loop.
2969           The reduction epilog code consists of:
2970           step 1: compute the scalar result in a vector (v_out2)
2971           step 2: extract the scalar result (s_out3) from the vector (v_out2)
2972           step 3: adjust the scalar result (s_out3) if needed.
2973
2974           Step 1 can be accomplished using one the following three schemes:
2975           (scheme 1) using reduc_code, if available.
2976           (scheme 2) using whole-vector shifts, if available.
2977           (scheme 3) using a scalar loop. In this case steps 1+2 above are
2978                      combined.
2979
2980           The overall epilog code looks like this:
2981
2982           s_out0 = phi <s_loop>         # original EXIT_PHI
2983           v_out1 = phi <VECT_DEF>       # NEW_EXIT_PHI
2984           v_out2 = reduce <v_out1>              # step 1
2985           s_out3 = extract_field <v_out2, 0>    # step 2
2986           s_out4 = adjust_result <s_out3>       # step 3
2987
2988           (step 3 is optional, and steps 1 and 2 may be combined).
2989           Lastly, the uses of s_out0 are replaced by s_out4.
2990
2991           ***/
2992
2993   /* 2.1 Create new loop-exit-phi to preserve loop-closed form:
2994         v_out1 = phi <v_loop>  */
2995
2996   exit_bb = single_exit (loop)->dest;
2997   def = vect_def;
2998   prev_phi_info = NULL;
2999   for (j = 0; j < ncopies; j++)
3000     {
3001       phi = create_phi_node (SSA_NAME_VAR (vect_def), exit_bb);
3002       set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo, NULL));
3003       if (j == 0)
3004         new_phi = phi;
3005       else
3006         {
3007           def = vect_get_vec_def_for_stmt_copy (dt, def);
3008           STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
3009         }
3010       SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
3011       prev_phi_info = vinfo_for_stmt (phi);
3012     }
3013
3014   exit_gsi = gsi_after_labels (exit_bb);
3015
3016   /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
3017          (i.e. when reduc_code is not available) and in the final adjustment
3018          code (if needed).  Also get the original scalar reduction variable as
3019          defined in the loop.  In case STMT is a "pattern-stmt" (i.e. - it
3020          represents a reduction pattern), the tree-code and scalar-def are
3021          taken from the original stmt that the pattern-stmt (STMT) replaces.
3022          Otherwise (it is a regular reduction) - the tree-code and scalar-def
3023          are taken from STMT.  */
3024
3025   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3026   if (!orig_stmt)
3027     {
3028       /* Regular reduction  */
3029       orig_stmt = stmt;
3030     }
3031   else
3032     {
3033       /* Reduction pattern  */
3034       stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
3035       gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
3036       gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
3037     }
3038
3039   code = gimple_assign_rhs_code (orig_stmt);
3040   scalar_dest = gimple_assign_lhs (orig_stmt);
3041   scalar_type = TREE_TYPE (scalar_dest);
3042   new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
3043   bitsize = TYPE_SIZE (scalar_type);
3044
3045   /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
3046      partial results are added and not subtracted.  */
3047   if (code == MINUS_EXPR)
3048     code = PLUS_EXPR;
3049
3050   /* In case this is a reduction in an inner-loop while vectorizing an outer
3051      loop - we don't need to extract a single scalar result at the end of the
3052      inner-loop (unless it is double reduction, i.e., the use of reduction is
3053      outside the outer-loop). The final vector of partial results will be used
3054      in the vectorized outer-loop, or reduced to a scalar result at the end of
3055      the outer-loop.  */
3056   if (nested_in_vect_loop && !double_reduc)
3057     goto vect_finalize_reduction;
3058
3059   /* The epilogue is created for the outer-loop, i.e., for the loop being
3060      vectorized.  */
3061   if (double_reduc)
3062     loop = outer_loop;
3063
3064   /* FORNOW */
3065   gcc_assert (ncopies == 1);
3066
3067   /* 2.3 Create the reduction code, using one of the three schemes described
3068          above.  */
3069
3070   if (reduc_code != ERROR_MARK)
3071     {
3072       tree tmp;
3073
3074       /*** Case 1:  Create:
3075            v_out2 = reduc_expr <v_out1>  */
3076
3077       if (vect_print_dump_info (REPORT_DETAILS))
3078         fprintf (vect_dump, "Reduce using direct vector reduction.");
3079
3080       vec_dest = vect_create_destination_var (scalar_dest, vectype);
3081       tmp = build1 (reduc_code, vectype,  PHI_RESULT (new_phi));
3082       epilog_stmt = gimple_build_assign (vec_dest, tmp);
3083       new_temp = make_ssa_name (vec_dest, epilog_stmt);
3084       gimple_assign_set_lhs (epilog_stmt, new_temp);
3085       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3086
3087       extract_scalar_result = true;
3088     }
3089   else
3090     {
3091       enum tree_code shift_code = ERROR_MARK;
3092       bool have_whole_vector_shift = true;
3093       int bit_offset;
3094       int element_bitsize = tree_low_cst (bitsize, 1);
3095       int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
3096       tree vec_temp;
3097
3098       if (optab_handler (vec_shr_optab, mode)->insn_code != CODE_FOR_nothing)
3099         shift_code = VEC_RSHIFT_EXPR;
3100       else
3101         have_whole_vector_shift = false;
3102
3103       /* Regardless of whether we have a whole vector shift, if we're
3104          emulating the operation via tree-vect-generic, we don't want
3105          to use it.  Only the first round of the reduction is likely
3106          to still be profitable via emulation.  */
3107       /* ??? It might be better to emit a reduction tree code here, so that
3108          tree-vect-generic can expand the first round via bit tricks.  */
3109       if (!VECTOR_MODE_P (mode))
3110         have_whole_vector_shift = false;
3111       else
3112         {
3113           optab optab = optab_for_tree_code (code, vectype, optab_default);
3114           if (optab_handler (optab, mode)->insn_code == CODE_FOR_nothing)
3115             have_whole_vector_shift = false;
3116         }
3117
3118       if (have_whole_vector_shift)
3119         {
3120           /*** Case 2: Create:
3121              for (offset = VS/2; offset >= element_size; offset/=2)
3122                 {
3123                   Create:  va' = vec_shift <va, offset>
3124                   Create:  va = vop <va, va'>
3125                 }  */
3126
3127           if (vect_print_dump_info (REPORT_DETAILS))
3128             fprintf (vect_dump, "Reduce using vector shifts");
3129
3130           vec_dest = vect_create_destination_var (scalar_dest, vectype);
3131           new_temp = PHI_RESULT (new_phi);
3132
3133           for (bit_offset = vec_size_in_bits/2;
3134                bit_offset >= element_bitsize;
3135                bit_offset /= 2)
3136             {
3137               tree bitpos = size_int (bit_offset);
3138
3139               epilog_stmt = gimple_build_assign_with_ops (shift_code, vec_dest,
3140                                                           new_temp, bitpos);
3141               new_name = make_ssa_name (vec_dest, epilog_stmt);
3142               gimple_assign_set_lhs (epilog_stmt, new_name);
3143               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3144
3145               epilog_stmt = gimple_build_assign_with_ops (code, vec_dest,
3146                                                           new_name, new_temp);
3147               new_temp = make_ssa_name (vec_dest, epilog_stmt);
3148               gimple_assign_set_lhs (epilog_stmt, new_temp);
3149               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3150             }
3151
3152           extract_scalar_result = true;
3153         }
3154       else
3155         {
3156           tree rhs;
3157
3158           /*** Case 3: Create:
3159              s = extract_field <v_out2, 0>
3160              for (offset = element_size;
3161                   offset < vector_size;
3162                   offset += element_size;)
3163                {
3164                  Create:  s' = extract_field <v_out2, offset>
3165                  Create:  s = op <s, s'>
3166                }  */
3167
3168           if (vect_print_dump_info (REPORT_DETAILS))
3169             fprintf (vect_dump, "Reduce using scalar code. ");
3170
3171           vec_temp = PHI_RESULT (new_phi);
3172           vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
3173           rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
3174                          bitsize_zero_node);
3175           epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
3176           new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
3177           gimple_assign_set_lhs (epilog_stmt, new_temp);
3178           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3179
3180           for (bit_offset = element_bitsize;
3181                bit_offset < vec_size_in_bits;
3182                bit_offset += element_bitsize)
3183             {
3184               tree bitpos = bitsize_int (bit_offset);
3185               tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
3186                                  bitpos);
3187
3188               epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
3189               new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
3190               gimple_assign_set_lhs (epilog_stmt, new_name);
3191               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3192
3193               epilog_stmt = gimple_build_assign_with_ops (code,
3194                                                           new_scalar_dest,
3195                                                           new_name, new_temp);
3196               new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
3197               gimple_assign_set_lhs (epilog_stmt, new_temp);
3198               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3199             }
3200
3201           extract_scalar_result = false;
3202         }
3203     }
3204
3205   /* 2.4  Extract the final scalar result.  Create:
3206          s_out3 = extract_field <v_out2, bitpos>  */
3207
3208   if (extract_scalar_result)
3209     {
3210       tree rhs;
3211
3212       gcc_assert (!nested_in_vect_loop || double_reduc);
3213       if (vect_print_dump_info (REPORT_DETAILS))
3214         fprintf (vect_dump, "extract scalar result");
3215
3216       if (BYTES_BIG_ENDIAN)
3217         bitpos = size_binop (MULT_EXPR,
3218                        bitsize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1),
3219                        TYPE_SIZE (scalar_type));
3220       else
3221         bitpos = bitsize_zero_node;
3222
3223       rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp, bitsize, bitpos);
3224       epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
3225       new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
3226       gimple_assign_set_lhs (epilog_stmt, new_temp);
3227       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3228     }
3229
3230 vect_finalize_reduction:
3231
3232   if (double_reduc)
3233     loop = loop->inner;
3234
3235   /* 2.5 Adjust the final result by the initial value of the reduction
3236          variable. (When such adjustment is not needed, then
3237          'adjustment_def' is zero).  For example, if code is PLUS we create:
3238          new_temp = loop_exit_def + adjustment_def  */
3239
3240   if (adjustment_def)
3241     {
3242       if (nested_in_vect_loop)
3243         {
3244           gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
3245           expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
3246           new_dest = vect_create_destination_var (scalar_dest, vectype);
3247         }
3248       else
3249         {
3250           gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
3251           expr = build2 (code, scalar_type, new_temp, adjustment_def);
3252           new_dest = vect_create_destination_var (scalar_dest, scalar_type);
3253         }
3254
3255       epilog_stmt = gimple_build_assign (new_dest, expr);
3256       new_temp = make_ssa_name (new_dest, epilog_stmt);
3257       gimple_assign_set_lhs (epilog_stmt, new_temp);
3258       SSA_NAME_DEF_STMT (new_temp) = epilog_stmt;
3259       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3260     }
3261
3262
3263   /* 2.6  Handle the loop-exit phi  */
3264
3265   /* Replace uses of s_out0 with uses of s_out3:
3266      Find the loop-closed-use at the loop exit of the original scalar result.
3267      (The reduction result is expected to have two immediate uses - one at the
3268      latch block, and one at the loop exit).  */
3269   phis = VEC_alloc (gimple, heap, 10);
3270   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
3271     {
3272       if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
3273         {
3274           exit_phi = USE_STMT (use_p);
3275           VEC_quick_push (gimple, phis, exit_phi);
3276         }
3277     }
3278
3279   /* We expect to have found an exit_phi because of loop-closed-ssa form.  */
3280   gcc_assert (!VEC_empty (gimple, phis));
3281
3282   for (i = 0; VEC_iterate (gimple, phis, i, exit_phi); i++)
3283     {
3284       if (nested_in_vect_loop)
3285         {
3286           stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
3287           gimple vect_phi;
3288
3289           /* FORNOW. Currently not supporting the case that an inner-loop
3290              reduction is not used in the outer-loop (but only outside the
3291              outer-loop), unless it is double reduction.  */
3292           gcc_assert ((STMT_VINFO_RELEVANT_P (stmt_vinfo)
3293                       && !STMT_VINFO_LIVE_P (stmt_vinfo)) || double_reduc);
3294
3295           epilog_stmt = adjustment_def ? epilog_stmt : new_phi;
3296           STMT_VINFO_VEC_STMT (stmt_vinfo) = epilog_stmt;
3297           set_vinfo_for_stmt (epilog_stmt,
3298                               new_stmt_vec_info (epilog_stmt, loop_vinfo,
3299                                                  NULL));
3300           if (adjustment_def)
3301             STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
3302                 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
3303
3304           if (!double_reduc
3305               || STMT_VINFO_DEF_TYPE (stmt_vinfo) != vect_double_reduction_def)
3306             continue;
3307
3308           /* Handle double reduction:
3309
3310              stmt1: s1 = phi <s0, s2>  - double reduction phi (outer loop)
3311              stmt2:   s3 = phi <s1, s4> - (regular) reduction phi (inner loop)
3312              stmt3:   s4 = use (s3)     - (regular) reduction stmt (inner loop)
3313              stmt4: s2 = phi <s4>      - double reduction stmt (outer loop)
3314
3315              At that point the regular reduction (stmt2 and stmt3) is already
3316              vectorized, as well as the exit phi node, stmt4.
3317              Here we vectorize the phi node of double reduction, stmt1, and
3318              update all relevant statements.  */
3319
3320           /* Go through all the uses of s2 to find double reduction phi node,
3321              i.e., stmt1 above.  */
3322           orig_name = PHI_RESULT (exit_phi);
3323           FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
3324             {
3325               stmt_vec_info use_stmt_vinfo = vinfo_for_stmt (use_stmt);
3326               stmt_vec_info new_phi_vinfo;
3327               tree vect_phi_init, preheader_arg, vect_phi_res, init_def;
3328               basic_block bb = gimple_bb (use_stmt);
3329               gimple use;
3330
3331               /* Check that USE_STMT is really double reduction phi node.  */
3332               if (gimple_code (use_stmt) != GIMPLE_PHI
3333                   || gimple_phi_num_args (use_stmt) != 2
3334                   || !use_stmt_vinfo
3335                   || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
3336                       != vect_double_reduction_def
3337                   || bb->loop_father != outer_loop)
3338                 continue;
3339
3340               /* Create vector phi node for double reduction:
3341                  vs1 = phi <vs0, vs2>
3342                  vs1 was created previously in this function by a call to
3343                  vect_get_vec_def_for_operand and is stored in vec_initial_def;
3344                  vs2 is defined by EPILOG_STMT, the vectorized EXIT_PHI;
3345                  vs0 is created here.  */
3346
3347               /* Create vector phi node.  */
3348               vect_phi = create_phi_node (vec_initial_def, bb);
3349               new_phi_vinfo = new_stmt_vec_info (vect_phi,
3350                                     loop_vec_info_for_loop (outer_loop), NULL);
3351               set_vinfo_for_stmt (vect_phi, new_phi_vinfo);
3352
3353               /* Create vs0 - initial def of the double reduction phi.  */
3354               preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
3355                                              loop_preheader_edge (outer_loop));
3356               init_def = get_initial_def_for_reduction (stmt, preheader_arg,
3357                                                         NULL);
3358               vect_phi_init = vect_init_vector (use_stmt, init_def, vectype,
3359                                                 NULL);
3360
3361               /* Update phi node arguments with vs0 and vs2.  */
3362               add_phi_arg (vect_phi, vect_phi_init,
3363                            loop_preheader_edge (outer_loop), UNKNOWN_LOCATION);
3364               add_phi_arg (vect_phi, PHI_RESULT (epilog_stmt),
3365                            loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
3366               if (vect_print_dump_info (REPORT_DETAILS))
3367                 {
3368                   fprintf (vect_dump, "created double reduction phi node: ");
3369                   print_gimple_stmt (vect_dump, vect_phi, 0, TDF_SLIM);
3370                 }
3371
3372               vect_phi_res = PHI_RESULT (vect_phi);
3373
3374               /* Replace the use, i.e., set the correct vs1 in the regular
3375                  reduction phi node. FORNOW, NCOPIES is always 1, so the loop
3376                  is redundant.  */
3377               use = reduction_phi;
3378               for (j = 0; j < ncopies; j++)
3379                 {
3380                   edge pr_edge = loop_preheader_edge (loop);
3381                   SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res);
3382                   use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
3383                 }
3384             }
3385         }
3386
3387       /* Replace the uses:  */
3388       orig_name = PHI_RESULT (exit_phi);
3389       FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
3390         FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
3391           SET_USE (use_p, new_temp);
3392     }
3393
3394   VEC_free (gimple, heap, phis);
3395 }
3396
3397
3398 /* Function vectorizable_reduction.
3399
3400    Check if STMT performs a reduction operation that can be vectorized.
3401    If VEC_STMT is also passed, vectorize the STMT: create a vectorized
3402    stmt to replace it, put it in VEC_STMT, and insert it at GSI.
3403    Return FALSE if not a vectorizable STMT, TRUE otherwise.
3404
3405    This function also handles reduction idioms (patterns) that have been
3406    recognized in advance during vect_pattern_recog. In this case, STMT may be
3407    of this form:
3408      X = pattern_expr (arg0, arg1, ..., X)
3409    and it's STMT_VINFO_RELATED_STMT points to the last stmt in the original
3410    sequence that had been detected and replaced by the pattern-stmt (STMT).
3411
3412    In some cases of reduction patterns, the type of the reduction variable X is
3413    different than the type of the other arguments of STMT.
3414    In such cases, the vectype that is used when transforming STMT into a vector
3415    stmt is different than the vectype that is used to determine the
3416    vectorization factor, because it consists of a different number of elements
3417    than the actual number of elements that are being operated upon in parallel.
3418
3419    For example, consider an accumulation of shorts into an int accumulator.
3420    On some targets it's possible to vectorize this pattern operating on 8
3421    shorts at a time (hence, the vectype for purposes of determining the
3422    vectorization factor should be V8HI); on the other hand, the vectype that
3423    is used to create the vector form is actually V4SI (the type of the result).
3424
3425    Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
3426    indicates what is the actual level of parallelism (V8HI in the example), so
3427    that the right vectorization factor would be derived. This vectype
3428    corresponds to the type of arguments to the reduction stmt, and should *NOT*
3429    be used to create the vectorized stmt. The right vectype for the vectorized
3430    stmt is obtained from the type of the result X:
3431         get_vectype_for_scalar_type (TREE_TYPE (X))
3432
3433    This means that, contrary to "regular" reductions (or "regular" stmts in
3434    general), the following equation:
3435       STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
3436    does *NOT* necessarily hold for reduction patterns.  */
3437
3438 bool
3439 vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
3440                         gimple *vec_stmt)
3441 {
3442   tree vec_dest;
3443   tree scalar_dest;
3444   tree loop_vec_def0 = NULL_TREE, loop_vec_def1 = NULL_TREE;
3445   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3446   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3447   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3448   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3449   enum tree_code code, orig_code, epilog_reduc_code;
3450   enum machine_mode vec_mode;
3451   int op_type;
3452   optab optab, reduc_optab;
3453   tree new_temp = NULL_TREE;
3454   tree def;
3455   gimple def_stmt;
3456   enum vect_def_type dt;
3457   gimple new_phi = NULL;
3458   tree scalar_type;
3459   bool is_simple_use;
3460   gimple orig_stmt;
3461   stmt_vec_info orig_stmt_info;
3462   tree expr = NULL_TREE;
3463   int i;
3464   int nunits = TYPE_VECTOR_SUBPARTS (vectype);
3465   int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
3466   int epilog_copies;
3467   stmt_vec_info prev_stmt_info, prev_phi_info;
3468   gimple first_phi = NULL;
3469   bool single_defuse_cycle = false;
3470   tree reduc_def = NULL_TREE;
3471   gimple new_stmt = NULL;
3472   int j;
3473   tree ops[3];
3474   bool nested_cycle = false, found_nested_cycle_def = false;
3475   gimple reduc_def_stmt = NULL;
3476   /* The default is that the reduction variable is the last in statement.  */
3477   int reduc_index = 2;
3478   bool double_reduc = false, dummy;
3479   basic_block def_bb;
3480   struct loop * def_stmt_loop, *outer_loop = NULL;
3481   tree def_arg;
3482   gimple def_arg_stmt;
3483
3484   if (nested_in_vect_loop_p (loop, stmt))
3485     {
3486       outer_loop = loop;
3487       loop = loop->inner;
3488       nested_cycle = true;
3489     }
3490
3491   gcc_assert (ncopies >= 1);
3492
3493   /* FORNOW: SLP not supported.  */
3494   if (STMT_SLP_TYPE (stmt_info))
3495     return false;
3496
3497   /* 1. Is vectorizable reduction?  */
3498   /* Not supportable if the reduction variable is used in the loop.  */
3499   if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer)
3500     return false;
3501
3502   /* Reductions that are not used even in an enclosing outer-loop,
3503      are expected to be "live" (used out of the loop).  */
3504   if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
3505       && !STMT_VINFO_LIVE_P (stmt_info))
3506     return false;
3507
3508   /* Make sure it was already recognized as a reduction computation.  */
3509   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
3510       && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
3511     return false;
3512
3513   /* 2. Has this been recognized as a reduction pattern?
3514
3515      Check if STMT represents a pattern that has been recognized
3516      in earlier analysis stages.  For stmts that represent a pattern,
3517      the STMT_VINFO_RELATED_STMT field records the last stmt in
3518      the original sequence that constitutes the pattern.  */
3519
3520   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3521   if (orig_stmt)
3522     {
3523       orig_stmt_info = vinfo_for_stmt (orig_stmt);
3524       gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt);
3525       gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
3526       gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
3527     }
3528
3529   /* 3. Check the operands of the operation. The first operands are defined
3530         inside the loop body. The last operand is the reduction variable,
3531         which is defined by the loop-header-phi.  */
3532
3533   gcc_assert (is_gimple_assign (stmt));
3534
3535   /* Flatten RHS */
3536   switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
3537     {
3538     case GIMPLE_SINGLE_RHS:
3539       op_type = TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt));
3540       if (op_type == ternary_op)
3541         {
3542           tree rhs = gimple_assign_rhs1 (stmt);
3543           ops[0] = TREE_OPERAND (rhs, 0);
3544           ops[1] = TREE_OPERAND (rhs, 1);
3545           ops[2] = TREE_OPERAND (rhs, 2);
3546           code = TREE_CODE (rhs);
3547         }
3548       else
3549         return false;
3550       break;
3551
3552     case GIMPLE_BINARY_RHS:
3553       code = gimple_assign_rhs_code (stmt);
3554       op_type = TREE_CODE_LENGTH (code);
3555       gcc_assert (op_type == binary_op);
3556       ops[0] = gimple_assign_rhs1 (stmt);
3557       ops[1] = gimple_assign_rhs2 (stmt);
3558       break;
3559
3560     case GIMPLE_UNARY_RHS:
3561       return false;
3562
3563     default:
3564       gcc_unreachable ();
3565     }
3566
3567   scalar_dest = gimple_assign_lhs (stmt);
3568   scalar_type = TREE_TYPE (scalar_dest);
3569   if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
3570       && !SCALAR_FLOAT_TYPE_P (scalar_type))
3571     return false;
3572
3573   /* All uses but the last are expected to be defined in the loop.
3574      The last use is the reduction variable. In case of nested cycle this
3575      assumption is not true: we use reduc_index to record the index of the
3576      reduction variable.  */
3577   for (i = 0; i < op_type-1; i++)
3578     {
3579       /* The condition of COND_EXPR is checked in vectorizable_condition().  */
3580       if (i == 0 && code == COND_EXPR)
3581         continue;
3582
3583       is_simple_use = vect_is_simple_use (ops[i], loop_vinfo, NULL, &def_stmt,
3584                                           &def, &dt);
3585       gcc_assert (is_simple_use);
3586       if (dt != vect_internal_def
3587           && dt != vect_external_def
3588           && dt != vect_constant_def
3589           && dt != vect_induction_def
3590           && !(dt == vect_nested_cycle && nested_cycle))
3591         return false;
3592
3593       if (dt == vect_nested_cycle)
3594         {
3595           found_nested_cycle_def = true;
3596           reduc_def_stmt = def_stmt;
3597           reduc_index = i;
3598         }
3599     }
3600
3601   is_simple_use = vect_is_simple_use (ops[i], loop_vinfo, NULL, &def_stmt,
3602                                       &def, &dt);
3603   gcc_assert (is_simple_use);
3604   gcc_assert (dt == vect_reduction_def
3605               || dt == vect_nested_cycle
3606               || ((dt == vect_internal_def || dt == vect_external_def
3607                    || dt == vect_constant_def || dt == vect_induction_def)
3608                    && nested_cycle && found_nested_cycle_def));
3609   if (!found_nested_cycle_def)
3610     reduc_def_stmt = def_stmt;
3611
3612   gcc_assert (gimple_code (reduc_def_stmt) == GIMPLE_PHI);
3613   if (orig_stmt)
3614     gcc_assert (orig_stmt == vect_is_simple_reduction (loop_vinfo,
3615                                                        reduc_def_stmt,
3616                                                        !nested_cycle,
3617                                                        &dummy));
3618   else
3619     gcc_assert (stmt == vect_is_simple_reduction (loop_vinfo, reduc_def_stmt,
3620                                                   !nested_cycle, &dummy));
3621
3622   if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt)))
3623     return false;
3624
3625   vec_mode = TYPE_MODE (vectype);
3626
3627   if (code == COND_EXPR)
3628     {
3629       if (!vectorizable_condition (stmt, gsi, NULL, ops[reduc_index], 0))
3630         {
3631           if (vect_print_dump_info (REPORT_DETAILS))
3632             fprintf (vect_dump, "unsupported condition in reduction");
3633
3634             return false;
3635         }
3636     }
3637   else
3638     {
3639       /* 4. Supportable by target?  */
3640
3641       /* 4.1. check support for the operation in the loop  */
3642       optab = optab_for_tree_code (code, vectype, optab_default);
3643       if (!optab)
3644         {
3645           if (vect_print_dump_info (REPORT_DETAILS))
3646             fprintf (vect_dump, "no optab.");
3647
3648           return false;
3649         }
3650
3651       if (optab_handler (optab, vec_mode)->insn_code == CODE_FOR_nothing)
3652         {
3653           if (vect_print_dump_info (REPORT_DETAILS))
3654             fprintf (vect_dump, "op not supported by target.");
3655
3656           if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
3657               || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
3658                   < vect_min_worthwhile_factor (code))
3659             return false;
3660
3661           if (vect_print_dump_info (REPORT_DETAILS))
3662             fprintf (vect_dump, "proceeding using word mode.");
3663         }
3664
3665       /* Worthwhile without SIMD support?  */
3666       if (!VECTOR_MODE_P (TYPE_MODE (vectype))
3667           && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
3668              < vect_min_worthwhile_factor (code))
3669         {
3670           if (vect_print_dump_info (REPORT_DETAILS))
3671             fprintf (vect_dump, "not worthwhile without SIMD support.");
3672
3673           return false;
3674         }
3675     }
3676
3677   /* 4.2. Check support for the epilog operation.
3678
3679           If STMT represents a reduction pattern, then the type of the
3680           reduction variable may be different than the type of the rest
3681           of the arguments.  For example, consider the case of accumulation
3682           of shorts into an int accumulator; The original code:
3683                         S1: int_a = (int) short_a;
3684           orig_stmt->   S2: int_acc = plus <int_a ,int_acc>;
3685
3686           was replaced with:
3687                         STMT: int_acc = widen_sum <short_a, int_acc>
3688
3689           This means that:
3690           1. The tree-code that is used to create the vector operation in the
3691              epilog code (that reduces the partial results) is not the
3692              tree-code of STMT, but is rather the tree-code of the original
3693              stmt from the pattern that STMT is replacing. I.e, in the example
3694              above we want to use 'widen_sum' in the loop, but 'plus' in the
3695              epilog.
3696           2. The type (mode) we use to check available target support
3697              for the vector operation to be created in the *epilog*, is
3698              determined by the type of the reduction variable (in the example
3699              above we'd check this: plus_optab[vect_int_mode]).
3700              However the type (mode) we use to check available target support
3701              for the vector operation to be created *inside the loop*, is
3702              determined by the type of the other arguments to STMT (in the
3703              example we'd check this: widen_sum_optab[vect_short_mode]).
3704
3705           This is contrary to "regular" reductions, in which the types of all
3706           the arguments are the same as the type of the reduction variable.
3707           For "regular" reductions we can therefore use the same vector type
3708           (and also the same tree-code) when generating the epilog code and
3709           when generating the code inside the loop.  */
3710
3711   if (orig_stmt)
3712     {
3713       /* This is a reduction pattern: get the vectype from the type of the
3714          reduction variable, and get the tree-code from orig_stmt.  */
3715       orig_code = gimple_assign_rhs_code (orig_stmt);
3716       vectype = get_vectype_for_scalar_type (TREE_TYPE (def));
3717       if (!vectype)
3718         {
3719           if (vect_print_dump_info (REPORT_DETAILS))
3720             {
3721               fprintf (vect_dump, "unsupported data-type ");
3722               print_generic_expr (vect_dump, TREE_TYPE (def), TDF_SLIM);
3723             }
3724           return false;
3725         }
3726
3727       vec_mode = TYPE_MODE (vectype);
3728     }
3729   else
3730     {
3731       /* Regular reduction: use the same vectype and tree-code as used for
3732          the vector code inside the loop can be used for the epilog code. */
3733       orig_code = code;
3734     }
3735
3736   if (nested_cycle)
3737     {
3738       def_bb = gimple_bb (reduc_def_stmt);
3739       def_stmt_loop = def_bb->loop_father;
3740       def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
3741                                        loop_preheader_edge (def_stmt_loop));
3742       if (TREE_CODE (def_arg) == SSA_NAME
3743           && (def_arg_stmt = SSA_NAME_DEF_STMT (def_arg))
3744           && gimple_code (def_arg_stmt) == GIMPLE_PHI
3745           && flow_bb_inside_loop_p (outer_loop, gimple_bb (def_arg_stmt))
3746           && vinfo_for_stmt (def_arg_stmt)
3747           && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_arg_stmt))
3748               == vect_double_reduction_def)
3749         double_reduc = true;
3750     }
3751
3752   epilog_reduc_code = ERROR_MARK;
3753   if (reduction_code_for_scalar_code (orig_code, &epilog_reduc_code))
3754     {
3755       reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype,
3756                                          optab_default);
3757       if (!reduc_optab)
3758         {
3759           if (vect_print_dump_info (REPORT_DETAILS))
3760             fprintf (vect_dump, "no optab for reduction.");
3761
3762           epilog_reduc_code = ERROR_MARK;
3763         }
3764
3765       if (reduc_optab
3766           && optab_handler (reduc_optab, vec_mode)->insn_code
3767               == CODE_FOR_nothing)
3768         {
3769           if (vect_print_dump_info (REPORT_DETAILS))
3770             fprintf (vect_dump, "reduc op not supported by target.");
3771
3772           epilog_reduc_code = ERROR_MARK;
3773         }
3774     }
3775   else
3776     {
3777       if (!nested_cycle || double_reduc)
3778         {
3779           if (vect_print_dump_info (REPORT_DETAILS))
3780             fprintf (vect_dump, "no reduc code for scalar code.");
3781
3782           return false;
3783         }
3784     }
3785
3786   if (double_reduc && ncopies > 1)
3787     {
3788       if (vect_print_dump_info (REPORT_DETAILS))
3789         fprintf (vect_dump, "multiple types in double reduction");
3790
3791       return false;
3792     }
3793
3794   if (!vec_stmt) /* transformation not required.  */
3795     {
3796       STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
3797       if (!vect_model_reduction_cost (stmt_info, epilog_reduc_code, ncopies))
3798         return false;
3799       return true;
3800     }
3801
3802   /** Transform.  **/
3803
3804   if (vect_print_dump_info (REPORT_DETAILS))
3805     fprintf (vect_dump, "transform reduction.");
3806
3807   /* FORNOW: Multiple types are not supported for condition.  */
3808   if (code == COND_EXPR)
3809     gcc_assert (ncopies == 1);
3810
3811   /* Create the destination vector  */
3812   vec_dest = vect_create_destination_var (scalar_dest, vectype);
3813
3814   /* In case the vectorization factor (VF) is bigger than the number
3815      of elements that we can fit in a vectype (nunits), we have to generate
3816      more than one vector stmt - i.e - we need to "unroll" the
3817      vector stmt by a factor VF/nunits.  For more details see documentation
3818      in vectorizable_operation.  */
3819
3820   /* If the reduction is used in an outer loop we need to generate
3821      VF intermediate results, like so (e.g. for ncopies=2):
3822         r0 = phi (init, r0)
3823         r1 = phi (init, r1)
3824         r0 = x0 + r0;
3825         r1 = x1 + r1;
3826     (i.e. we generate VF results in 2 registers).
3827     In this case we have a separate def-use cycle for each copy, and therefore
3828     for each copy we get the vector def for the reduction variable from the
3829     respective phi node created for this copy.
3830
3831     Otherwise (the reduction is unused in the loop nest), we can combine
3832     together intermediate results, like so (e.g. for ncopies=2):
3833         r = phi (init, r)
3834         r = x0 + r;
3835         r = x1 + r;
3836    (i.e. we generate VF/2 results in a single register).
3837    In this case for each copy we get the vector def for the reduction variable
3838    from the vectorized reduction operation generated in the previous iteration.
3839   */
3840
3841   if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope)
3842     {
3843       single_defuse_cycle = true;
3844       epilog_copies = 1;
3845     }
3846   else
3847     epilog_copies = ncopies;
3848
3849   prev_stmt_info = NULL;
3850   prev_phi_info = NULL;
3851   for (j = 0; j < ncopies; j++)
3852     {
3853       if (j == 0 || !single_defuse_cycle)
3854         {
3855           /* Create the reduction-phi that defines the reduction-operand.  */
3856           new_phi = create_phi_node (vec_dest, loop->header);
3857           set_vinfo_for_stmt (new_phi, new_stmt_vec_info (new_phi, loop_vinfo,
3858                                                           NULL));
3859           /* Get the vector def for the reduction variable from the phi
3860              node.  */
3861           reduc_def = PHI_RESULT (new_phi);
3862         }
3863
3864       if (code == COND_EXPR)
3865         {
3866           first_phi = new_phi;
3867           vectorizable_condition (stmt, gsi, vec_stmt, reduc_def, reduc_index);
3868           /* Multiple types are not supported for condition.  */
3869           break;
3870         }
3871
3872       /* Handle uses.  */
3873       if (j == 0)
3874         {
3875           loop_vec_def0 = vect_get_vec_def_for_operand (ops[!reduc_index],
3876                                                         stmt, NULL);
3877           if (op_type == ternary_op)
3878             {
3879               if (reduc_index == 0)
3880                 loop_vec_def1 = vect_get_vec_def_for_operand (ops[2], stmt,
3881                                                               NULL);
3882               else
3883                 loop_vec_def1 = vect_get_vec_def_for_operand (ops[1], stmt,
3884                                                               NULL);
3885             }
3886
3887           /* Get the vector def for the reduction variable from the phi
3888              node.  */
3889           first_phi = new_phi;
3890         }
3891       else
3892         {
3893           enum vect_def_type dt = vect_unknown_def_type; /* Dummy */
3894           loop_vec_def0 = vect_get_vec_def_for_stmt_copy (dt, loop_vec_def0);
3895           if (op_type == ternary_op)
3896             loop_vec_def1 = vect_get_vec_def_for_stmt_copy (dt, loop_vec_def1);
3897
3898           if (single_defuse_cycle)
3899             reduc_def = gimple_assign_lhs (new_stmt);
3900           else
3901             reduc_def = PHI_RESULT (new_phi);
3902
3903           STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
3904         }
3905
3906       /* Arguments are ready. Create the new vector stmt.  */
3907       if (op_type == binary_op)
3908         {
3909           if (reduc_index == 0)
3910             expr = build2 (code, vectype, reduc_def, loop_vec_def0);
3911           else
3912             expr = build2 (code, vectype, loop_vec_def0, reduc_def);
3913         }
3914       else
3915         {
3916           if (reduc_index == 0)
3917             expr = build3 (code, vectype, reduc_def, loop_vec_def0,
3918                            loop_vec_def1);
3919           else
3920             {
3921               if (reduc_index == 1)
3922                 expr = build3 (code, vectype, loop_vec_def0, reduc_def,
3923                                loop_vec_def1);
3924               else
3925                 expr = build3 (code, vectype, loop_vec_def0, loop_vec_def1,
3926                                reduc_def);
3927             }
3928         }
3929
3930       new_stmt = gimple_build_assign (vec_dest, expr);
3931       new_temp = make_ssa_name (vec_dest, new_stmt);
3932       gimple_assign_set_lhs (new_stmt, new_temp);
3933       vect_finish_stmt_generation (stmt, new_stmt, gsi);
3934
3935       if (j == 0)
3936         STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
3937       else
3938         STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3939
3940       prev_stmt_info = vinfo_for_stmt (new_stmt);
3941       prev_phi_info = vinfo_for_stmt (new_phi);
3942     }
3943
3944   /* Finalize the reduction-phi (set its arguments) and create the
3945      epilog reduction code.  */
3946   if (!single_defuse_cycle || code == COND_EXPR)
3947     new_temp = gimple_assign_lhs (*vec_stmt);
3948
3949   vect_create_epilog_for_reduction (new_temp, stmt, epilog_copies,
3950                                     epilog_reduc_code, first_phi, reduc_index,
3951                                     double_reduc);
3952   return true;
3953 }
3954
3955 /* Function vect_min_worthwhile_factor.
3956
3957    For a loop where we could vectorize the operation indicated by CODE,
3958    return the minimum vectorization factor that makes it worthwhile
3959    to use generic vectors.  */
3960 int
3961 vect_min_worthwhile_factor (enum tree_code code)
3962 {
3963   switch (code)
3964     {
3965     case PLUS_EXPR:
3966     case MINUS_EXPR:
3967     case NEGATE_EXPR:
3968       return 4;
3969
3970     case BIT_AND_EXPR:
3971     case BIT_IOR_EXPR:
3972     case BIT_XOR_EXPR:
3973     case BIT_NOT_EXPR:
3974       return 2;
3975
3976     default:
3977       return INT_MAX;
3978     }
3979 }
3980
3981
3982 /* Function vectorizable_induction
3983
3984    Check if PHI performs an induction computation that can be vectorized.
3985    If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
3986    phi to replace it, put it in VEC_STMT, and add it to the same basic block.
3987    Return FALSE if not a vectorizable STMT, TRUE otherwise.  */
3988
3989 bool
3990 vectorizable_induction (gimple phi, gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
3991                         gimple *vec_stmt)
3992 {
3993   stmt_vec_info stmt_info = vinfo_for_stmt (phi);
3994   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3995   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3996   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3997   int nunits = TYPE_VECTOR_SUBPARTS (vectype);
3998   int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
3999   tree vec_def;
4000
4001   gcc_assert (ncopies >= 1);
4002   /* FORNOW. This restriction should be relaxed.  */
4003   if (nested_in_vect_loop_p (loop, phi) && ncopies > 1)
4004     {
4005       if (vect_print_dump_info (REPORT_DETAILS))
4006         fprintf (vect_dump, "multiple types in nested loop.");
4007       return false;
4008     }
4009
4010   if (!STMT_VINFO_RELEVANT_P (stmt_info))
4011     return false;
4012
4013   /* FORNOW: SLP not supported.  */
4014   if (STMT_SLP_TYPE (stmt_info))
4015     return false;
4016
4017   gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def);
4018
4019   if (gimple_code (phi) != GIMPLE_PHI)
4020     return false;
4021
4022   if (!vec_stmt) /* transformation not required.  */
4023     {
4024       STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
4025       if (vect_print_dump_info (REPORT_DETAILS))
4026         fprintf (vect_dump, "=== vectorizable_induction ===");
4027       vect_model_induction_cost (stmt_info, ncopies);
4028       return true;
4029     }
4030
4031   /** Transform.  **/
4032
4033   if (vect_print_dump_info (REPORT_DETAILS))
4034     fprintf (vect_dump, "transform induction phi.");
4035
4036   vec_def = get_initial_def_for_induction (phi);
4037   *vec_stmt = SSA_NAME_DEF_STMT (vec_def);
4038   return true;
4039 }
4040
4041 /* Function vectorizable_live_operation.
4042
4043    STMT computes a value that is used outside the loop. Check if
4044    it can be supported.  */
4045
4046 bool
4047 vectorizable_live_operation (gimple stmt,
4048                              gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
4049                              gimple *vec_stmt ATTRIBUTE_UNUSED)
4050 {
4051   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4052   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4053   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4054   int i;
4055   int op_type;
4056   tree op;
4057   tree def;
4058   gimple def_stmt;
4059   enum vect_def_type dt;
4060   enum tree_code code;
4061   enum gimple_rhs_class rhs_class;
4062
4063   gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
4064
4065   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
4066     return false;
4067
4068   if (!is_gimple_assign (stmt))
4069     return false;
4070
4071   if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
4072     return false;
4073
4074   /* FORNOW. CHECKME. */
4075   if (nested_in_vect_loop_p (loop, stmt))
4076     return false;
4077
4078   code = gimple_assign_rhs_code (stmt);
4079   op_type = TREE_CODE_LENGTH (code);
4080   rhs_class = get_gimple_rhs_class (code);
4081   gcc_assert (rhs_class != GIMPLE_UNARY_RHS || op_type == unary_op);
4082   gcc_assert (rhs_class != GIMPLE_BINARY_RHS || op_type == binary_op);
4083
4084   /* FORNOW: support only if all uses are invariant. This means
4085      that the scalar operations can remain in place, unvectorized.
4086      The original last scalar value that they compute will be used.  */
4087
4088   for (i = 0; i < op_type; i++)
4089     {
4090       if (rhs_class == GIMPLE_SINGLE_RHS)
4091         op = TREE_OPERAND (gimple_op (stmt, 1), i);
4092       else
4093         op = gimple_op (stmt, i + 1);
4094       if (op
4095           && !vect_is_simple_use (op, loop_vinfo, NULL, &def_stmt, &def, &dt))
4096         {
4097           if (vect_print_dump_info (REPORT_DETAILS))
4098             fprintf (vect_dump, "use not simple.");
4099           return false;
4100         }
4101
4102       if (dt != vect_external_def && dt != vect_constant_def)
4103         return false;
4104     }
4105
4106   /* No transformation is required for the cases we currently support.  */
4107   return true;
4108 }
4109
4110 /* Kill any debug uses outside LOOP of SSA names defined in STMT.  */
4111
4112 static void
4113 vect_loop_kill_debug_uses (struct loop *loop, gimple stmt)
4114 {
4115   ssa_op_iter op_iter;
4116   imm_use_iterator imm_iter;
4117   def_operand_p def_p;
4118   gimple ustmt;
4119
4120   FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt, op_iter, SSA_OP_DEF)
4121     {
4122       FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
4123         {
4124           basic_block bb;
4125
4126           if (!is_gimple_debug (ustmt))
4127             continue;
4128
4129           bb = gimple_bb (ustmt);
4130
4131           if (!flow_bb_inside_loop_p (loop, bb))
4132             {
4133               if (gimple_debug_bind_p (ustmt))
4134                 {
4135                   if (vect_print_dump_info (REPORT_DETAILS))
4136                     fprintf (vect_dump, "killing debug use");
4137
4138                   gimple_debug_bind_reset_value (ustmt);
4139                   update_stmt (ustmt);
4140                 }
4141               else
4142                 gcc_unreachable ();
4143             }
4144         }
4145     }
4146 }
4147
4148 /* Function vect_transform_loop.
4149
4150    The analysis phase has determined that the loop is vectorizable.
4151    Vectorize the loop - created vectorized stmts to replace the scalar
4152    stmts in the loop, and update the loop exit condition.  */
4153
4154 void
4155 vect_transform_loop (loop_vec_info loop_vinfo)
4156 {
4157   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4158   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
4159   int nbbs = loop->num_nodes;
4160   gimple_stmt_iterator si;
4161   int i;
4162   tree ratio = NULL;
4163   int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
4164   bool strided_store;
4165   bool slp_scheduled = false;
4166   unsigned int nunits;
4167   tree cond_expr = NULL_TREE;
4168   gimple_seq cond_expr_stmt_list = NULL;
4169   bool do_peeling_for_loop_bound;
4170
4171   if (vect_print_dump_info (REPORT_DETAILS))
4172     fprintf (vect_dump, "=== vec_transform_loop ===");
4173
4174   /* Peel the loop if there are data refs with unknown alignment.
4175      Only one data ref with unknown store is allowed.  */
4176
4177   if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
4178     vect_do_peeling_for_alignment (loop_vinfo);
4179
4180   do_peeling_for_loop_bound
4181     = (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
4182        || (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
4183            && LOOP_VINFO_INT_NITERS (loop_vinfo) % vectorization_factor != 0));
4184
4185   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
4186       || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
4187     vect_loop_versioning (loop_vinfo,
4188                           !do_peeling_for_loop_bound,
4189                           &cond_expr, &cond_expr_stmt_list);
4190
4191   /* If the loop has a symbolic number of iterations 'n' (i.e. it's not a
4192      compile time constant), or it is a constant that doesn't divide by the
4193      vectorization factor, then an epilog loop needs to be created.
4194      We therefore duplicate the loop: the original loop will be vectorized,
4195      and will compute the first (n/VF) iterations. The second copy of the loop
4196      will remain scalar and will compute the remaining (n%VF) iterations.
4197      (VF is the vectorization factor).  */
4198
4199   if (do_peeling_for_loop_bound)
4200     vect_do_peeling_for_loop_bound (loop_vinfo, &ratio,
4201                                     cond_expr, cond_expr_stmt_list);
4202   else
4203     ratio = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
4204                 LOOP_VINFO_INT_NITERS (loop_vinfo) / vectorization_factor);
4205
4206   /* 1) Make sure the loop header has exactly two entries
4207      2) Make sure we have a preheader basic block.  */
4208
4209   gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
4210
4211   split_edge (loop_preheader_edge (loop));
4212
4213   /* FORNOW: the vectorizer supports only loops which body consist
4214      of one basic block (header + empty latch). When the vectorizer will
4215      support more involved loop forms, the order by which the BBs are
4216      traversed need to be reconsidered.  */
4217
4218   for (i = 0; i < nbbs; i++)
4219     {
4220       basic_block bb = bbs[i];
4221       stmt_vec_info stmt_info;
4222       gimple phi;
4223
4224       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
4225         {
4226           phi = gsi_stmt (si);
4227           if (vect_print_dump_info (REPORT_DETAILS))
4228             {
4229               fprintf (vect_dump, "------>vectorizing phi: ");
4230               print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
4231             }
4232           stmt_info = vinfo_for_stmt (phi);
4233           if (!stmt_info)
4234             continue;
4235
4236           if (!STMT_VINFO_RELEVANT_P (stmt_info)
4237               && !STMT_VINFO_LIVE_P (stmt_info))
4238             {
4239               if (MAY_HAVE_DEBUG_STMTS)
4240                 vect_loop_kill_debug_uses (loop, phi);
4241               continue;
4242             }
4243
4244           if ((TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
4245                 != (unsigned HOST_WIDE_INT) vectorization_factor)
4246               && vect_print_dump_info (REPORT_DETAILS))
4247             fprintf (vect_dump, "multiple-types.");
4248
4249           if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
4250             {
4251               if (vect_print_dump_info (REPORT_DETAILS))
4252                 fprintf (vect_dump, "transform phi.");
4253               vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
4254             }
4255         }
4256
4257       for (si = gsi_start_bb (bb); !gsi_end_p (si);)
4258         {
4259           gimple stmt = gsi_stmt (si);
4260           bool is_store;
4261
4262           if (vect_print_dump_info (REPORT_DETAILS))
4263             {
4264               fprintf (vect_dump, "------>vectorizing statement: ");
4265               print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
4266             }
4267
4268           stmt_info = vinfo_for_stmt (stmt);
4269
4270           /* vector stmts created in the outer-loop during vectorization of
4271              stmts in an inner-loop may not have a stmt_info, and do not
4272              need to be vectorized.  */
4273           if (!stmt_info)
4274             {
4275               gsi_next (&si);
4276               continue;
4277             }
4278
4279           if (!STMT_VINFO_RELEVANT_P (stmt_info)
4280               && !STMT_VINFO_LIVE_P (stmt_info))
4281             {
4282               if (MAY_HAVE_DEBUG_STMTS)
4283                 vect_loop_kill_debug_uses (loop, stmt);
4284               gsi_next (&si);
4285               continue;
4286             }
4287
4288           gcc_assert (STMT_VINFO_VECTYPE (stmt_info));
4289           nunits =
4290             (unsigned int) TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
4291           if (!STMT_SLP_TYPE (stmt_info)
4292               && nunits != (unsigned int) vectorization_factor
4293               && vect_print_dump_info (REPORT_DETAILS))
4294             /* For SLP VF is set according to unrolling factor, and not to
4295                vector size, hence for SLP this print is not valid.  */
4296             fprintf (vect_dump, "multiple-types.");
4297
4298           /* SLP. Schedule all the SLP instances when the first SLP stmt is
4299              reached.  */
4300           if (STMT_SLP_TYPE (stmt_info))
4301             {
4302               if (!slp_scheduled)
4303                 {
4304                   slp_scheduled = true;
4305
4306                   if (vect_print_dump_info (REPORT_DETAILS))
4307                     fprintf (vect_dump, "=== scheduling SLP instances ===");
4308
4309                   vect_schedule_slp (loop_vinfo, NULL);
4310                 }
4311
4312               /* Hybrid SLP stmts must be vectorized in addition to SLP.  */
4313               if (!vinfo_for_stmt (stmt) || PURE_SLP_STMT (stmt_info))
4314                 {
4315                   gsi_next (&si);
4316                   continue;
4317                 }
4318             }
4319
4320           /* -------- vectorize statement ------------ */
4321           if (vect_print_dump_info (REPORT_DETAILS))
4322             fprintf (vect_dump, "transform statement.");
4323
4324           strided_store = false;
4325           is_store = vect_transform_stmt (stmt, &si, &strided_store, NULL, NULL);
4326           if (is_store)
4327             {
4328               if (STMT_VINFO_STRIDED_ACCESS (stmt_info))
4329                 {
4330                   /* Interleaving. If IS_STORE is TRUE, the vectorization of the
4331                      interleaving chain was completed - free all the stores in
4332                      the chain.  */
4333                   vect_remove_stores (DR_GROUP_FIRST_DR (stmt_info));
4334                   gsi_remove (&si, true);
4335                   continue;
4336                 }
4337               else
4338                 {
4339                   /* Free the attached stmt_vec_info and remove the stmt.  */
4340                   free_stmt_vec_info (stmt);
4341                   gsi_remove (&si, true);
4342                   continue;
4343                 }
4344             }
4345           gsi_next (&si);
4346         }                       /* stmts in BB */
4347     }                           /* BBs in loop */
4348
4349   slpeel_make_loop_iterate_ntimes (loop, ratio);
4350
4351   /* The memory tags and pointers in vectorized statements need to
4352      have their SSA forms updated.  FIXME, why can't this be delayed
4353      until all the loops have been transformed?  */
4354   update_ssa (TODO_update_ssa);
4355
4356   if (vect_print_dump_info (REPORT_VECTORIZED_LOCATIONS))
4357     fprintf (vect_dump, "LOOP VECTORIZED.");
4358   if (loop->inner && vect_print_dump_info (REPORT_VECTORIZED_LOCATIONS))
4359     fprintf (vect_dump, "OUTER LOOP VECTORIZED.");
4360 }