gcc/tree-vect-loop.c

   1 /* Loop Vectorization
   2    Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009 Free Software
   3    Foundation, Inc.
   4    Contributed by Dorit Naishlos <dorit@il.ibm.com> and
   5    Ira Rosen <irar@il.ibm.com>
   6
   7 This file is part of GCC.
   8
   9 GCC is free software; you can redistribute it and/or modify it under
  10 the terms of the GNU General Public License as published by the Free
  11 Software Foundation; either version 3, or (at your option) any later
  12 version.
  13
  14 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  15 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  16 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  17 for more details.
  18
  19 You should have received a copy of the GNU General Public License
  20 along with GCC; see the file COPYING3.  If not see
  21 <http://www.gnu.org/licenses/>.  */
  22
  23 #include "config.h"
  24 #include "system.h"
  25 #include "coretypes.h"
  26 #include "tm.h"
  27 #include "ggc.h"
  28 #include "tree.h"
  29 #include "basic-block.h"
  30 #include "diagnostic.h"
  31 #include "tree-flow.h"
  32 #include "tree-dump.h"
  33 #include "cfgloop.h"
  34 #include "cfglayout.h"
  35 #include "expr.h"
  36 #include "recog.h"
  37 #include "optabs.h"
  38 #include "params.h"
  39 #include "toplev.h"
  40 #include "tree-chrec.h"
  41 #include "tree-scalar-evolution.h"
  42 #include "tree-vectorizer.h"
  43
  44 /* Loop Vectorization Pass.
  45
  46    This pass tries to vectorize loops.
  47
  48    For example, the vectorizer transforms the following simple loop:
  49
  50         short a[N]; short b[N]; short c[N]; int i;
  51
  52         for (i=0; i<N; i++){
  53           a[i] = b[i] + c[i];
  54         }
  55
  56    as if it was manually vectorized by rewriting the source code into:
  57
  58         typedef int __attribute__((mode(V8HI))) v8hi;
  59         short a[N];  short b[N]; short c[N];   int i;
  60         v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
  61         v8hi va, vb, vc;
  62
  63         for (i=0; i<N/8; i++){
  64           vb = pb[i];
  65           vc = pc[i];
  66           va = vb + vc;
  67           pa[i] = va;
  68         }
  69
  70         The main entry to this pass is vectorize_loops(), in which
  71    the vectorizer applies a set of analyses on a given set of loops,
  72    followed by the actual vectorization transformation for the loops that
  73    had successfully passed the analysis phase.
  74         Throughout this pass we make a distinction between two types of
  75    data: scalars (which are represented by SSA_NAMES), and memory references
  76    ("data-refs"). These two types of data require different handling both
  77    during analysis and transformation. The types of data-refs that the
  78    vectorizer currently supports are ARRAY_REFS which base is an array DECL
  79    (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
  80    accesses are required to have a simple (consecutive) access pattern.
  81
  82    Analysis phase:
  83    ===============
  84         The driver for the analysis phase is vect_analyze_loop().
  85    It applies a set of analyses, some of which rely on the scalar evolution
  86    analyzer (scev) developed by Sebastian Pop.
  87
  88         During the analysis phase the vectorizer records some information
  89    per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
  90    loop, as well as general information about the loop as a whole, which is
  91    recorded in a "loop_vec_info" struct attached to each loop.
  92
  93    Transformation phase:
  94    =====================
  95         The loop transformation phase scans all the stmts in the loop, and
  96    creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
  97    the loop that needs to be vectorized. It inserts the vector code sequence
  98    just before the scalar stmt S, and records a pointer to the vector code
  99    in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
 100    attached to S). This pointer will be used for the vectorization of following
 101    stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
 102    otherwise, we rely on dead code elimination for removing it.
 103
 104         For example, say stmt S1 was vectorized into stmt VS1:
 105
 106    VS1: vb = px[i];
 107    S1:  b = x[i];    STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
 108    S2:  a = b;
 109
 110    To vectorize stmt S2, the vectorizer first finds the stmt that defines
 111    the operand 'b' (S1), and gets the relevant vector def 'vb' from the
 112    vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
 113    resulting sequence would be:
 114
 115    VS1: vb = px[i];
 116    S1:  b = x[i];       STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
 117    VS2: va = vb;
 118    S2:  a = b;          STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
 119
 120         Operands that are not SSA_NAMEs, are data-refs that appear in
 121    load/store operations (like 'x[i]' in S1), and are handled differently.
 122
 123    Target modeling:
 124    =================
 125         Currently the only target specific information that is used is the
 126    size of the vector (in bytes) - "UNITS_PER_SIMD_WORD". Targets that can
 127    support different sizes of vectors, for now will need to specify one value
 128    for "UNITS_PER_SIMD_WORD". More flexibility will be added in the future.
 129
 130         Since we only vectorize operations which vector form can be
 131    expressed using existing tree codes, to verify that an operation is
 132    supported, the vectorizer checks the relevant optab at the relevant
 133    machine_mode (e.g, optab_handler (add_optab, V8HImode)->insn_code). If
 134    the value found is CODE_FOR_nothing, then there's no target support, and
 135    we can't vectorize the stmt.
 136
 137    For additional information on this project see:
 138    http://gcc.gnu.org/projects/tree-ssa/vectorization.html
 139 */
 140
 141 /* Function vect_determine_vectorization_factor
 142
 143    Determine the vectorization factor (VF). VF is the number of data elements
 144    that are operated upon in parallel in a single iteration of the vectorized
 145    loop. For example, when vectorizing a loop that operates on 4byte elements,
 146    on a target with vector size (VS) 16byte, the VF is set to 4, since 4
 147    elements can fit in a single vector register.
 148
 149    We currently support vectorization of loops in which all types operated upon
 150    are of the same size. Therefore this function currently sets VF according to
 151    the size of the types operated upon, and fails if there are multiple sizes
 152    in the loop.
 153
 154    VF is also the factor by which the loop iterations are strip-mined, e.g.:
 155    original loop:
 156         for (i=0; i<N; i++){
 157           a[i] = b[i] + c[i];
 158         }
 159
 160    vectorized loop:
 161         for (i=0; i<N; i+=VF){
 162           a[i:VF] = b[i:VF] + c[i:VF];
 163         }
 164 */
 165
 166 static bool
 167 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
 168 {
 169   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 170   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
 171   int nbbs = loop->num_nodes;
 172   gimple_stmt_iterator si;
 173   unsigned int vectorization_factor = 0;
 174   tree scalar_type;
 175   gimple phi;
 176   tree vectype;
 177   unsigned int nunits;
 178   stmt_vec_info stmt_info;
 179   int i;
 180   HOST_WIDE_INT dummy;
 181
 182   if (vect_print_dump_info (REPORT_DETAILS))
 183     fprintf (vect_dump, "=== vect_determine_vectorization_factor ===");
 184
 185   for (i = 0; i < nbbs; i++)
 186     {
 187       basic_block bb = bbs[i];
 188
 189       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
 190         {
 191           phi = gsi_stmt (si);
 192           stmt_info = vinfo_for_stmt (phi);
 193           if (vect_print_dump_info (REPORT_DETAILS))
 194             {
 195               fprintf (vect_dump, "==> examining phi: ");
 196               print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
 197             }
 198
 199           gcc_assert (stmt_info);
 200
 201           if (STMT_VINFO_RELEVANT_P (stmt_info))
 202             {
 203               gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
 204               scalar_type = TREE_TYPE (PHI_RESULT (phi));
 205
 206               if (vect_print_dump_info (REPORT_DETAILS))
 207                 {
 208                   fprintf (vect_dump, "get vectype for scalar type:  ");
 209                   print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
 210                 }
 211
 212               vectype = get_vectype_for_scalar_type (scalar_type);
 213               if (!vectype)
 214                 {
 215                   if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
 216                     {
 217                       fprintf (vect_dump,
 218                                "not vectorized: unsupported data-type ");
 219                       print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
 220                     }
 221                   return false;
 222                 }
 223               STMT_VINFO_VECTYPE (stmt_info) = vectype;
 224
 225               if (vect_print_dump_info (REPORT_DETAILS))
 226                 {
 227                   fprintf (vect_dump, "vectype: ");
 228                   print_generic_expr (vect_dump, vectype, TDF_SLIM);
 229                 }
 230
 231               nunits = TYPE_VECTOR_SUBPARTS (vectype);
 232               if (vect_print_dump_info (REPORT_DETAILS))
 233                 fprintf (vect_dump, "nunits = %d", nunits);
 234
 235               if (!vectorization_factor
 236                   || (nunits > vectorization_factor))
 237                 vectorization_factor = nunits;
 238             }
 239         }
 240
 241       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
 242         {
 243           gimple stmt = gsi_stmt (si);
 244           stmt_info = vinfo_for_stmt (stmt);
 245
 246           if (vect_print_dump_info (REPORT_DETAILS))
 247             {
 248               fprintf (vect_dump, "==> examining statement: ");
 249               print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
 250             }
 251
 252           gcc_assert (stmt_info);
 253
 254           /* skip stmts which do not need to be vectorized.  */
 255           if (!STMT_VINFO_RELEVANT_P (stmt_info)
 256               && !STMT_VINFO_LIVE_P (stmt_info))
 257             {
 258               if (vect_print_dump_info (REPORT_DETAILS))
 259                 fprintf (vect_dump, "skip.");
 260               continue;
 261             }
 262
 263           if (gimple_get_lhs (stmt) == NULL_TREE)
 264             {
 265               if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
 266                 {
 267                   fprintf (vect_dump, "not vectorized: irregular stmt.");
 268                   print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
 269                 }
 270               return false;
 271             }
 272
 273           if (VECTOR_MODE_P (TYPE_MODE (gimple_expr_type (stmt))))
 274             {
 275               if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
 276                 {
 277                   fprintf (vect_dump, "not vectorized: vector stmt in loop:");
 278                   print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
 279                 }
 280               return false;
 281             }
 282
 283           if (STMT_VINFO_VECTYPE (stmt_info))
 284             {
 285               /* The only case when a vectype had been already set is for stmts
 286                  that contain a dataref, or for "pattern-stmts" (stmts generated
 287                  by the vectorizer to represent/replace a certain idiom).  */
 288               gcc_assert (STMT_VINFO_DATA_REF (stmt_info)
 289                           || is_pattern_stmt_p (stmt_info));
 290               vectype = STMT_VINFO_VECTYPE (stmt_info);
 291             }
 292           else
 293             {
 294               gcc_assert (!STMT_VINFO_DATA_REF (stmt_info)
 295                           && !is_pattern_stmt_p (stmt_info));
 296
 297               scalar_type = vect_get_smallest_scalar_type (stmt, &dummy,
 298                                                            &dummy);
 299               if (vect_print_dump_info (REPORT_DETAILS))
 300                 {
 301                   fprintf (vect_dump, "get vectype for scalar type:  ");
 302                   print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
 303                 }
 304
 305               vectype = get_vectype_for_scalar_type (scalar_type);
 306               if (!vectype)
 307                 {
 308                   if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
 309                     {
 310                       fprintf (vect_dump,
 311                                "not vectorized: unsupported data-type ");
 312                       print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
 313                     }
 314                   return false;
 315                 }
 316               STMT_VINFO_VECTYPE (stmt_info) = vectype;
 317             }
 318
 319           if (vect_print_dump_info (REPORT_DETAILS))
 320             {
 321               fprintf (vect_dump, "vectype: ");
 322               print_generic_expr (vect_dump, vectype, TDF_SLIM);
 323             }
 324
 325           nunits = TYPE_VECTOR_SUBPARTS (vectype);
 326           if (vect_print_dump_info (REPORT_DETAILS))
 327             fprintf (vect_dump, "nunits = %d", nunits);
 328
 329           if (!vectorization_factor
 330               || (nunits > vectorization_factor))
 331             vectorization_factor = nunits;
 332
 333         }
 334     }
 335
 336   /* TODO: Analyze cost. Decide if worth while to vectorize.  */
 337   if (vect_print_dump_info (REPORT_DETAILS))
 338     fprintf (vect_dump, "vectorization factor = %d", vectorization_factor);
 339   if (vectorization_factor <= 1)
 340     {
 341       if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
 342         fprintf (vect_dump, "not vectorized: unsupported data-type");
 343       return false;
 344     }
 345   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
 346
 347   return true;
 348 }
 349
 350
 351 /* Function vect_is_simple_iv_evolution.
 352
 353    FORNOW: A simple evolution of an induction variables in the loop is
 354    considered a polynomial evolution with constant step.  */
 355
 356 static bool
 357 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
 358                              tree * step)
 359 {
 360   tree init_expr;
 361   tree step_expr;
 362   tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
 363
 364   /* When there is no evolution in this loop, the evolution function
 365      is not "simple".  */
 366   if (evolution_part == NULL_TREE)
 367     return false;
 368
 369   /* When the evolution is a polynomial of degree >= 2
 370      the evolution function is not "simple".  */
 371   if (tree_is_chrec (evolution_part))
 372     return false;
 373
 374   step_expr = evolution_part;
 375   init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
 376
 377   if (vect_print_dump_info (REPORT_DETAILS))
 378     {
 379       fprintf (vect_dump, "step: ");
 380       print_generic_expr (vect_dump, step_expr, TDF_SLIM);
 381       fprintf (vect_dump, ",  init: ");
 382       print_generic_expr (vect_dump, init_expr, TDF_SLIM);
 383     }
 384
 385   *init = init_expr;
 386   *step = step_expr;
 387
 388   if (TREE_CODE (step_expr) != INTEGER_CST)
 389     {
 390       if (vect_print_dump_info (REPORT_DETAILS))
 391         fprintf (vect_dump, "step unknown.");
 392       return false;
 393     }
 394
 395   return true;
 396 }
 397
 398 /* Function vect_analyze_scalar_cycles_1.
 399
 400    Examine the cross iteration def-use cycles of scalar variables
 401    in LOOP. LOOP_VINFO represents the loop that is now being
 402    considered for vectorization (can be LOOP, or an outer-loop
 403    enclosing LOOP).  */
 404
 405 static void
 406 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
 407 {
 408   basic_block bb = loop->header;
 409   tree dumy;
 410   VEC(gimple,heap) *worklist = VEC_alloc (gimple, heap, 64);
 411   gimple_stmt_iterator gsi;
 412   bool double_reduc;
 413
 414   if (vect_print_dump_info (REPORT_DETAILS))
 415     fprintf (vect_dump, "=== vect_analyze_scalar_cycles ===");
 416
 417   /* First - identify all inductions. Reduction detection assumes that all the
 418      inductions have been identified, therefore, this order must not be
 419      changed.  */
 420   for (gsi = gsi_start_phis  (bb); !gsi_end_p (gsi); gsi_next (&gsi))
 421     {
 422       gimple phi = gsi_stmt (gsi);
 423       tree access_fn = NULL;
 424       tree def = PHI_RESULT (phi);
 425       stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
 426
 427       if (vect_print_dump_info (REPORT_DETAILS))
 428         {
 429           fprintf (vect_dump, "Analyze phi: ");
 430           print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
 431         }
 432
 433       /* Skip virtual phi's. The data dependences that are associated with
 434          virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.  */
 435       if (!is_gimple_reg (SSA_NAME_VAR (def)))
 436         continue;
 437
 438       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
 439
 440       /* Analyze the evolution function.  */
 441       access_fn = analyze_scalar_evolution (loop, def);
 442       if (access_fn && vect_print_dump_info (REPORT_DETAILS))
 443         {
 444           fprintf (vect_dump, "Access function of PHI: ");
 445           print_generic_expr (vect_dump, access_fn, TDF_SLIM);
 446         }
 447
 448       if (!access_fn
 449           || !vect_is_simple_iv_evolution (loop->num, access_fn, &dumy, &dumy))
 450         {
 451           VEC_safe_push (gimple, heap, worklist, phi);
 452           continue;
 453         }
 454
 455       if (vect_print_dump_info (REPORT_DETAILS))
 456         fprintf (vect_dump, "Detected induction.");
 457       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
 458     }
 459
 460
 461   /* Second - identify all reductions and nested cycles.  */
 462   while (VEC_length (gimple, worklist) > 0)
 463     {
 464       gimple phi = VEC_pop (gimple, worklist);
 465       tree def = PHI_RESULT (phi);
 466       stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
 467       gimple reduc_stmt;
 468       bool nested_cycle;
 469
 470       if (vect_print_dump_info (REPORT_DETAILS))
 471         {
 472           fprintf (vect_dump, "Analyze phi: ");
 473           print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
 474         }
 475
 476       gcc_assert (is_gimple_reg (SSA_NAME_VAR (def)));
 477       gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
 478
 479       nested_cycle = (loop != LOOP_VINFO_LOOP (loop_vinfo));
 480       reduc_stmt = vect_is_simple_reduction (loop_vinfo, phi, !nested_cycle,
 481                                              &double_reduc);
 482       if (reduc_stmt)
 483         {
 484           if (double_reduc)
 485             {
 486               if (vect_print_dump_info (REPORT_DETAILS))
 487                 fprintf (vect_dump, "Detected double reduction.");
 488
 489               STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
 490               STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 491                                                     vect_double_reduction_def;
 492             }
 493           else
 494             {
 495               if (nested_cycle)
 496                 {
 497                   if (vect_print_dump_info (REPORT_DETAILS))
 498                     fprintf (vect_dump, "Detected vectorizable nested cycle.");
 499
 500                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
 501                   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 502                                                              vect_nested_cycle;
 503                 }
 504               else
 505                 {
 506                   if (vect_print_dump_info (REPORT_DETAILS))
 507                     fprintf (vect_dump, "Detected reduction.");
 508
 509                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
 510                   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 511                                                            vect_reduction_def;
 512                 }
 513             }
 514         }
 515       else
 516         if (vect_print_dump_info (REPORT_DETAILS))
 517           fprintf (vect_dump, "Unknown def-use cycle pattern.");
 518     }
 519
 520   VEC_free (gimple, heap, worklist);
 521 }
 522
 523
 524 /* Function vect_analyze_scalar_cycles.
 525
 526    Examine the cross iteration def-use cycles of scalar variables, by
 527    analyzing the loop-header PHIs of scalar variables; Classify each
 528    cycle as one of the following: invariant, induction, reduction, unknown.
 529    We do that for the loop represented by LOOP_VINFO, and also to its
 530    inner-loop, if exists.
 531    Examples for scalar cycles:
 532
 533    Example1: reduction:
 534
 535               loop1:
 536               for (i=0; i<N; i++)
 537                  sum += a[i];
 538
 539    Example2: induction:
 540
 541               loop2:
 542               for (i=0; i<N; i++)
 543                  a[i] = i;  */
 544
 545 static void
 546 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
 547 {
 548   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 549
 550   vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
 551
 552   /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
 553      Reductions in such inner-loop therefore have different properties than
 554      the reductions in the nest that gets vectorized:
 555      1. When vectorized, they are executed in the same order as in the original
 556         scalar loop, so we can't change the order of computation when
 557         vectorizing them.
 558      2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
 559         current checks are too strict.  */
 560
 561   if (loop->inner)
 562     vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
 563 }
 564
 565 /* Function vect_get_loop_niters.
 566
 567    Determine how many iterations the loop is executed.
 568    If an expression that represents the number of iterations
 569    can be constructed, place it in NUMBER_OF_ITERATIONS.
 570    Return the loop exit condition.  */
 571
 572 static gimple
 573 vect_get_loop_niters (struct loop *loop, tree *number_of_iterations)
 574 {
 575   tree niters;
 576
 577   if (vect_print_dump_info (REPORT_DETAILS))
 578     fprintf (vect_dump, "=== get_loop_niters ===");
 579
 580   niters = number_of_exit_cond_executions (loop);
 581
 582   if (niters != NULL_TREE
 583       && niters != chrec_dont_know)
 584     {
 585       *number_of_iterations = niters;
 586
 587       if (vect_print_dump_info (REPORT_DETAILS))
 588         {
 589           fprintf (vect_dump, "==> get_loop_niters:" );
 590           print_generic_expr (vect_dump, *number_of_iterations, TDF_SLIM);
 591         }
 592     }
 593
 594   return get_loop_exit_condition (loop);
 595 }
 596
 597
 598 /* Function bb_in_loop_p
 599
 600    Used as predicate for dfs order traversal of the loop bbs.  */
 601
 602 static bool
 603 bb_in_loop_p (const_basic_block bb, const void *data)
 604 {
 605   const struct loop *const loop = (const struct loop *)data;
 606   if (flow_bb_inside_loop_p (loop, bb))
 607     return true;
 608   return false;
 609 }
 610
 611
 612 /* Function new_loop_vec_info.
 613
 614    Create and initialize a new loop_vec_info struct for LOOP, as well as
 615    stmt_vec_info structs for all the stmts in LOOP.  */
 616
 617 static loop_vec_info
 618 new_loop_vec_info (struct loop *loop)
 619 {
 620   loop_vec_info res;
 621   basic_block *bbs;
 622   gimple_stmt_iterator si;
 623   unsigned int i, nbbs;
 624
 625   res = (loop_vec_info) xcalloc (1, sizeof (struct _loop_vec_info));
 626   LOOP_VINFO_LOOP (res) = loop;
 627
 628   bbs = get_loop_body (loop);
 629
 630   /* Create/Update stmt_info for all stmts in the loop.  */
 631   for (i = 0; i < loop->num_nodes; i++)
 632     {
 633       basic_block bb = bbs[i];
 634
 635       /* BBs in a nested inner-loop will have been already processed (because
 636          we will have called vect_analyze_loop_form for any nested inner-loop).
 637          Therefore, for stmts in an inner-loop we just want to update the
 638          STMT_VINFO_LOOP_VINFO field of their stmt_info to point to the new
 639          loop_info of the outer-loop we are currently considering to vectorize
 640          (instead of the loop_info of the inner-loop).
 641          For stmts in other BBs we need to create a stmt_info from scratch.  */
 642       if (bb->loop_father != loop)
 643         {
 644           /* Inner-loop bb.  */
 645           gcc_assert (loop->inner && bb->loop_father == loop->inner);
 646           for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
 647             {
 648               gimple phi = gsi_stmt (si);
 649               stmt_vec_info stmt_info = vinfo_for_stmt (phi);
 650               loop_vec_info inner_loop_vinfo =
 651                 STMT_VINFO_LOOP_VINFO (stmt_info);
 652               gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
 653               STMT_VINFO_LOOP_VINFO (stmt_info) = res;
 654             }
 655           for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
 656            {
 657               gimple stmt = gsi_stmt (si);
 658               stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
 659               loop_vec_info inner_loop_vinfo =
 660                  STMT_VINFO_LOOP_VINFO (stmt_info);
 661               gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
 662               STMT_VINFO_LOOP_VINFO (stmt_info) = res;
 663            }
 664         }
 665       else
 666         {
 667           /* bb in current nest.  */
 668           for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
 669             {
 670               gimple phi = gsi_stmt (si);
 671               gimple_set_uid (phi, 0);
 672               set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, res, NULL));
 673             }
 674
 675           for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
 676             {
 677               gimple stmt = gsi_stmt (si);
 678               gimple_set_uid (stmt, 0);
 679               set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, res, NULL));
 680             }
 681         }
 682     }
 683
 684   /* CHECKME: We want to visit all BBs before their successors (except for
 685      latch blocks, for which this assertion wouldn't hold).  In the simple
 686      case of the loop forms we allow, a dfs order of the BBs would the same
 687      as reversed postorder traversal, so we are safe.  */
 688
 689    free (bbs);
 690    bbs = XCNEWVEC (basic_block, loop->num_nodes);
 691    nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
 692                               bbs, loop->num_nodes, loop);
 693    gcc_assert (nbbs == loop->num_nodes);
 694
 695   LOOP_VINFO_BBS (res) = bbs;
 696   LOOP_VINFO_NITERS (res) = NULL;
 697   LOOP_VINFO_NITERS_UNCHANGED (res) = NULL;
 698   LOOP_VINFO_COST_MODEL_MIN_ITERS (res) = 0;
 699   LOOP_VINFO_VECTORIZABLE_P (res) = 0;
 700   LOOP_PEELING_FOR_ALIGNMENT (res) = 0;
 701   LOOP_VINFO_VECT_FACTOR (res) = 0;
 702   LOOP_VINFO_DATAREFS (res) = VEC_alloc (data_reference_p, heap, 10);
 703   LOOP_VINFO_DDRS (res) = VEC_alloc (ddr_p, heap, 10 * 10);
 704   LOOP_VINFO_UNALIGNED_DR (res) = NULL;
 705   LOOP_VINFO_MAY_MISALIGN_STMTS (res) =
 706     VEC_alloc (gimple, heap,
 707                PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS));
 708   LOOP_VINFO_MAY_ALIAS_DDRS (res) =
 709     VEC_alloc (ddr_p, heap,
 710                PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
 711   LOOP_VINFO_STRIDED_STORES (res) = VEC_alloc (gimple, heap, 10);
 712   LOOP_VINFO_SLP_INSTANCES (res) = VEC_alloc (slp_instance, heap, 10);
 713   LOOP_VINFO_SLP_UNROLLING_FACTOR (res) = 1;
 714
 715   return res;
 716 }
 717
 718
 719 /* Function destroy_loop_vec_info.
 720
 721    Free LOOP_VINFO struct, as well as all the stmt_vec_info structs of all the
 722    stmts in the loop.  */
 723
 724 void
 725 destroy_loop_vec_info (loop_vec_info loop_vinfo, bool clean_stmts)
 726 {
 727   struct loop *loop;
 728   basic_block *bbs;
 729   int nbbs;
 730   gimple_stmt_iterator si;
 731   int j;
 732   VEC (slp_instance, heap) *slp_instances;
 733   slp_instance instance;
 734
 735   if (!loop_vinfo)
 736     return;
 737
 738   loop = LOOP_VINFO_LOOP (loop_vinfo);
 739
 740   bbs = LOOP_VINFO_BBS (loop_vinfo);
 741   nbbs = loop->num_nodes;
 742
 743   if (!clean_stmts)
 744     {
 745       free (LOOP_VINFO_BBS (loop_vinfo));
 746       free_data_refs (LOOP_VINFO_DATAREFS (loop_vinfo));
 747       free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
 748       VEC_free (gimple, heap, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));
 749
 750       free (loop_vinfo);
 751       loop->aux = NULL;
 752       return;
 753     }
 754
 755   for (j = 0; j < nbbs; j++)
 756     {
 757       basic_block bb = bbs[j];
 758       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
 759         free_stmt_vec_info (gsi_stmt (si));
 760
 761       for (si = gsi_start_bb (bb); !gsi_end_p (si); )
 762         {
 763           gimple stmt = gsi_stmt (si);
 764           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
 765
 766           if (stmt_info)
 767             {
 768               /* Check if this is a "pattern stmt" (introduced by the
 769                  vectorizer during the pattern recognition pass).  */
 770               bool remove_stmt_p = false;
 771               gimple orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
 772               if (orig_stmt)
 773                 {
 774                   stmt_vec_info orig_stmt_info = vinfo_for_stmt (orig_stmt);
 775                   if (orig_stmt_info
 776                       && STMT_VINFO_IN_PATTERN_P (orig_stmt_info))
 777                     remove_stmt_p = true;
 778                 }
 779
 780               /* Free stmt_vec_info.  */
 781               free_stmt_vec_info (stmt);
 782
 783               /* Remove dead "pattern stmts".  */
 784               if (remove_stmt_p)
 785                 gsi_remove (&si, true);
 786             }
 787           gsi_next (&si);
 788         }
 789     }
 790
 791   free (LOOP_VINFO_BBS (loop_vinfo));
 792   free_data_refs (LOOP_VINFO_DATAREFS (loop_vinfo));
 793   free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
 794   VEC_free (gimple, heap, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));
 795   VEC_free (ddr_p, heap, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo));
 796   slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
 797   for (j = 0; VEC_iterate (slp_instance, slp_instances, j, instance); j++)
 798     vect_free_slp_instance (instance);
 799
 800   VEC_free (slp_instance, heap, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
 801   VEC_free (gimple, heap, LOOP_VINFO_STRIDED_STORES (loop_vinfo));
 802
 803   free (loop_vinfo);
 804   loop->aux = NULL;
 805 }
 806
 807
 808 /* Function vect_analyze_loop_1.
 809
 810    Apply a set of analyses on LOOP, and create a loop_vec_info struct
 811    for it. The different analyses will record information in the
 812    loop_vec_info struct.  This is a subset of the analyses applied in
 813    vect_analyze_loop, to be applied on an inner-loop nested in the loop
 814    that is now considered for (outer-loop) vectorization.  */
 815
 816 static loop_vec_info
 817 vect_analyze_loop_1 (struct loop *loop)
 818 {
 819   loop_vec_info loop_vinfo;
 820
 821   if (vect_print_dump_info (REPORT_DETAILS))
 822     fprintf (vect_dump, "===== analyze_loop_nest_1 =====");
 823
 824   /* Check the CFG characteristics of the loop (nesting, entry/exit, etc.  */
 825
 826   loop_vinfo = vect_analyze_loop_form (loop);
 827   if (!loop_vinfo)
 828     {
 829       if (vect_print_dump_info (REPORT_DETAILS))
 830         fprintf (vect_dump, "bad inner-loop form.");
 831       return NULL;
 832     }
 833
 834   return loop_vinfo;
 835 }
 836
 837
 838 /* Function vect_analyze_loop_form.
 839
 840    Verify that certain CFG restrictions hold, including:
 841    - the loop has a pre-header
 842    - the loop has a single entry and exit
 843    - the loop exit condition is simple enough, and the number of iterations
 844      can be analyzed (a countable loop).  */
 845
 846 loop_vec_info
 847 vect_analyze_loop_form (struct loop *loop)
 848 {
 849   loop_vec_info loop_vinfo;
 850   gimple loop_cond;
 851   tree number_of_iterations = NULL;
 852   loop_vec_info inner_loop_vinfo = NULL;
 853
 854   if (vect_print_dump_info (REPORT_DETAILS))
 855     fprintf (vect_dump, "=== vect_analyze_loop_form ===");
 856
 857   /* Different restrictions apply when we are considering an inner-most loop,
 858      vs. an outer (nested) loop.
 859      (FORNOW. May want to relax some of these restrictions in the future).  */
 860
 861   if (!loop->inner)
 862     {
 863       /* Inner-most loop.  We currently require that the number of BBs is
 864          exactly 2 (the header and latch).  Vectorizable inner-most loops
 865          look like this:
 866
 867                         (pre-header)
 868                            |
 869                           header <--------+
 870                            | |            |
 871                            | +--> latch --+
 872                            |
 873                         (exit-bb)  */
 874
 875       if (loop->num_nodes != 2)
 876         {
 877           if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
 878             fprintf (vect_dump, "not vectorized: control flow in loop.");
 879           return NULL;
 880         }
 881
 882       if (empty_block_p (loop->header))
 883     {
 884           if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
 885             fprintf (vect_dump, "not vectorized: empty loop.");
 886       return NULL;
 887     }
 888     }
 889   else
 890     {
 891       struct loop *innerloop = loop->inner;
 892       edge entryedge;
 893
 894       /* Nested loop. We currently require that the loop is doubly-nested,
 895          contains a single inner loop, and the number of BBs is exactly 5.
 896          Vectorizable outer-loops look like this:
 897
 898                         (pre-header)
 899                            |
 900                           header <---+
 901                            |         |
 902                           inner-loop |
 903                            |         |
 904                           tail ------+
 905                            |
 906                         (exit-bb)
 907
 908          The inner-loop has the properties expected of inner-most loops
 909          as described above.  */
 910
 911       if ((loop->inner)->inner || (loop->inner)->next)
 912         {
 913           if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
 914             fprintf (vect_dump, "not vectorized: multiple nested loops.");
 915           return NULL;
 916         }
 917
 918       /* Analyze the inner-loop.  */
 919       inner_loop_vinfo = vect_analyze_loop_1 (loop->inner);
 920       if (!inner_loop_vinfo)
 921         {
 922           if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
 923             fprintf (vect_dump, "not vectorized: Bad inner loop.");
 924           return NULL;
 925         }
 926
 927       if (!expr_invariant_in_loop_p (loop,
 928                                         LOOP_VINFO_NITERS (inner_loop_vinfo)))
 929         {
 930           if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
 931             fprintf (vect_dump,
 932                      "not vectorized: inner-loop count not invariant.");
 933           destroy_loop_vec_info (inner_loop_vinfo, true);
 934           return NULL;
 935         }
 936
 937       if (loop->num_nodes != 5)
 938         {
 939           if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
 940             fprintf (vect_dump, "not vectorized: control flow in loop.");
 941           destroy_loop_vec_info (inner_loop_vinfo, true);
 942           return NULL;
 943         }
 944
 945       gcc_assert (EDGE_COUNT (innerloop->header->preds) == 2);
 946       entryedge = EDGE_PRED (innerloop->header, 0);
 947       if (EDGE_PRED (innerloop->header, 0)->src == innerloop->latch)
 948         entryedge = EDGE_PRED (innerloop->header, 1);
 949
 950       if (entryedge->src != loop->header
 951           || !single_exit (innerloop)
 952           || single_exit (innerloop)->dest !=  EDGE_PRED (loop->latch, 0)->src)
 953         {
 954           if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
 955             fprintf (vect_dump, "not vectorized: unsupported outerloop form.");
 956           destroy_loop_vec_info (inner_loop_vinfo, true);
 957           return NULL;
 958         }
 959
 960       if (vect_print_dump_info (REPORT_DETAILS))
 961         fprintf (vect_dump, "Considering outer-loop vectorization.");
 962     }
 963
 964   if (!single_exit (loop)
 965       || EDGE_COUNT (loop->header->preds) != 2)
 966     {
 967       if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
 968         {
 969           if (!single_exit (loop))
 970             fprintf (vect_dump, "not vectorized: multiple exits.");
 971           else if (EDGE_COUNT (loop->header->preds) != 2)
 972             fprintf (vect_dump, "not vectorized: too many incoming edges.");
 973         }
 974       if (inner_loop_vinfo)
 975         destroy_loop_vec_info (inner_loop_vinfo, true);
 976       return NULL;
 977     }
 978
 979   /* We assume that the loop exit condition is at the end of the loop. i.e,
 980      that the loop is represented as a do-while (with a proper if-guard
 981      before the loop if needed), where the loop header contains all the
 982      executable statements, and the latch is empty.  */
 983   if (!empty_block_p (loop->latch)
 984         || !gimple_seq_empty_p (phi_nodes (loop->latch)))
 985     {
 986       if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
 987         fprintf (vect_dump, "not vectorized: unexpected loop form.");
 988       if (inner_loop_vinfo)
 989         destroy_loop_vec_info (inner_loop_vinfo, true);
 990       return NULL;
 991     }
 992
 993   /* Make sure there exists a single-predecessor exit bb:  */
 994   if (!single_pred_p (single_exit (loop)->dest))
 995     {
 996       edge e = single_exit (loop);
 997       if (!(e->flags & EDGE_ABNORMAL))
 998         {
 999           split_loop_exit_edge (e);
1000           if (vect_print_dump_info (REPORT_DETAILS))
1001             fprintf (vect_dump, "split exit edge.");
1002         }
1003       else
1004         {
1005           if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1006             fprintf (vect_dump, "not vectorized: abnormal loop exit edge.");
1007           if (inner_loop_vinfo)
1008             destroy_loop_vec_info (inner_loop_vinfo, true);
1009           return NULL;
1010         }
1011     }
1012
1013   loop_cond = vect_get_loop_niters (loop, &number_of_iterations);
1014   if (!loop_cond)
1015     {
1016       if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1017         fprintf (vect_dump, "not vectorized: complicated exit condition.");
1018       if (inner_loop_vinfo)
1019         destroy_loop_vec_info (inner_loop_vinfo, true);
1020       return NULL;
1021     }
1022
1023   if (!number_of_iterations)
1024     {
1025       if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1026         fprintf (vect_dump,
1027                  "not vectorized: number of iterations cannot be computed.");
1028       if (inner_loop_vinfo)
1029         destroy_loop_vec_info (inner_loop_vinfo, true);
1030       return NULL;
1031     }
1032
1033   if (chrec_contains_undetermined (number_of_iterations))
1034     {
1035       if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1036         fprintf (vect_dump, "Infinite number of iterations.");
1037       if (inner_loop_vinfo)
1038         destroy_loop_vec_info (inner_loop_vinfo, true);
1039       return NULL;
1040     }
1041
1042   if (!NITERS_KNOWN_P (number_of_iterations))
1043     {
1044       if (vect_print_dump_info (REPORT_DETAILS))
1045         {
1046           fprintf (vect_dump, "Symbolic number of iterations is ");
1047           print_generic_expr (vect_dump, number_of_iterations, TDF_DETAILS);
1048         }
1049     }
1050   else if (TREE_INT_CST_LOW (number_of_iterations) == 0)
1051     {
1052       if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1053         fprintf (vect_dump, "not vectorized: number of iterations = 0.");
1054       if (inner_loop_vinfo)
1055         destroy_loop_vec_info (inner_loop_vinfo, false);
1056       return NULL;
1057     }
1058
1059   loop_vinfo = new_loop_vec_info (loop);
1060   LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1061   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1062
1063   STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond)) = loop_exit_ctrl_vec_info_type;
1064
1065   /* CHECKME: May want to keep it around it in the future.  */
1066   if (inner_loop_vinfo)
1067     destroy_loop_vec_info (inner_loop_vinfo, false);
1068
1069   gcc_assert (!loop->aux);
1070   loop->aux = loop_vinfo;
1071   return loop_vinfo;
1072 }
1073
1074
1075 /* Function vect_analyze_loop_operations.
1076
1077    Scan the loop stmts and make sure they are all vectorizable.  */
1078
1079 static bool
1080 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1081 {
1082   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1083   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1084   int nbbs = loop->num_nodes;
1085   gimple_stmt_iterator si;
1086   unsigned int vectorization_factor = 0;
1087   int i;
1088   gimple phi;
1089   stmt_vec_info stmt_info;
1090   bool need_to_vectorize = false;
1091   int min_profitable_iters;
1092   int min_scalar_loop_bound;
1093   unsigned int th;
1094   bool only_slp_in_loop = true, ok;
1095
1096   if (vect_print_dump_info (REPORT_DETAILS))
1097     fprintf (vect_dump, "=== vect_analyze_loop_operations ===");
1098
1099   gcc_assert (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
1100   vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1101
1102   for (i = 0; i < nbbs; i++)
1103     {
1104       basic_block bb = bbs[i];
1105
1106       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1107         {
1108           phi = gsi_stmt (si);
1109           ok = true;
1110
1111           stmt_info = vinfo_for_stmt (phi);
1112           if (vect_print_dump_info (REPORT_DETAILS))
1113             {
1114               fprintf (vect_dump, "examining phi: ");
1115               print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
1116             }
1117
1118           if (! is_loop_header_bb_p (bb))
1119             {
1120               /* inner-loop loop-closed exit phi in outer-loop vectorization
1121                  (i.e. a phi in the tail of the outer-loop).
1122                  FORNOW: we currently don't support the case that these phis
1123                  are not used in the outerloop (unless it is double reduction,
1124                  i.e., this phi is vect_reduction_def), cause this case
1125                  requires to actually do something here.  */
1126               if ((!STMT_VINFO_RELEVANT_P (stmt_info)
1127                    || STMT_VINFO_LIVE_P (stmt_info))
1128                   && STMT_VINFO_DEF_TYPE (stmt_info)
1129                      != vect_double_reduction_def)
1130                 {
1131                   if (vect_print_dump_info (REPORT_DETAILS))
1132                     fprintf (vect_dump,
1133                              "Unsupported loop-closed phi in outer-loop.");
1134                   return false;
1135                 }
1136               continue;
1137             }
1138
1139           gcc_assert (stmt_info);
1140
1141           if (STMT_VINFO_LIVE_P (stmt_info))
1142             {
1143               /* FORNOW: not yet supported.  */
1144               if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1145                 fprintf (vect_dump, "not vectorized: value used after loop.");
1146               return false;
1147             }
1148
1149           if (STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1150               && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1151             {
1152               /* A scalar-dependence cycle that we don't support.  */
1153               if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1154                 fprintf (vect_dump, "not vectorized: scalar dependence cycle.");
1155               return false;
1156             }
1157
1158           if (STMT_VINFO_RELEVANT_P (stmt_info))
1159             {
1160               need_to_vectorize = true;
1161               if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
1162                 ok = vectorizable_induction (phi, NULL, NULL);
1163             }
1164
1165           if (!ok)
1166             {
1167               if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1168                 {
1169                   fprintf (vect_dump,
1170                            "not vectorized: relevant phi not supported: ");
1171                   print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
1172                 }
1173               return false;
1174             }
1175         }
1176
1177       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1178         {
1179           gimple stmt = gsi_stmt (si);
1180           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1181
1182           gcc_assert (stmt_info);
1183
1184           if (!vect_analyze_stmt (stmt, &need_to_vectorize, NULL))
1185             return false;
1186
1187           if ((STMT_VINFO_RELEVANT_P (stmt_info)
1188                || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1189               && !PURE_SLP_STMT (stmt_info))
1190
1191             /* STMT needs both SLP and loop-based vectorization.  */
1192             only_slp_in_loop = false;
1193         }
1194     } /* bbs */
1195
1196   /* All operations in the loop are either irrelevant (deal with loop
1197      control, or dead), or only used outside the loop and can be moved
1198      out of the loop (e.g. invariants, inductions).  The loop can be
1199      optimized away by scalar optimizations.  We're better off not
1200      touching this loop.  */
1201   if (!need_to_vectorize)
1202     {
1203       if (vect_print_dump_info (REPORT_DETAILS))
1204         fprintf (vect_dump,
1205                  "All the computation can be taken out of the loop.");
1206       if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1207         fprintf (vect_dump,
1208                  "not vectorized: redundant loop. no profit to vectorize.");
1209       return false;
1210     }
1211
1212   /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1213      vectorization factor of the loop is the unrolling factor required by the
1214      SLP instances.  If that unrolling factor is 1, we say, that we perform
1215      pure SLP on loop - cross iteration parallelism is not exploited.  */
1216   if (only_slp_in_loop)
1217     vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1218   else
1219     vectorization_factor = least_common_multiple (vectorization_factor,
1220                                 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1221
1222   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1223
1224   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1225       && vect_print_dump_info (REPORT_DETAILS))
1226     fprintf (vect_dump,
1227         "vectorization_factor = %d, niters = " HOST_WIDE_INT_PRINT_DEC,
1228         vectorization_factor, LOOP_VINFO_INT_NITERS (loop_vinfo));
1229
1230   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1231       && (LOOP_VINFO_INT_NITERS (loop_vinfo) < vectorization_factor))
1232     {
1233       if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1234         fprintf (vect_dump, "not vectorized: iteration count too small.");
1235       if (vect_print_dump_info (REPORT_DETAILS))
1236         fprintf (vect_dump,"not vectorized: iteration count smaller than "
1237                  "vectorization factor.");
1238       return false;
1239     }
1240
1241   /* Analyze cost. Decide if worth while to vectorize.  */
1242
1243   /* Once VF is set, SLP costs should be updated since the number of created
1244      vector stmts depends on VF.  */
1245   vect_update_slp_costs_according_to_vf (loop_vinfo);
1246
1247   min_profitable_iters = vect_estimate_min_profitable_iters (loop_vinfo);
1248   LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo) = min_profitable_iters;
1249
1250   if (min_profitable_iters < 0)
1251     {
1252       if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1253         fprintf (vect_dump, "not vectorized: vectorization not profitable.");
1254       if (vect_print_dump_info (REPORT_DETAILS))
1255         fprintf (vect_dump, "not vectorized: vector version will never be "
1256                  "profitable.");
1257       return false;
1258     }
1259
1260   min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
1261                             * vectorization_factor) - 1);
1262
1263   /* Use the cost model only if it is more conservative than user specified
1264      threshold.  */
1265
1266   th = (unsigned) min_scalar_loop_bound;
1267   if (min_profitable_iters
1268       && (!min_scalar_loop_bound
1269           || min_profitable_iters > min_scalar_loop_bound))
1270     th = (unsigned) min_profitable_iters;
1271
1272   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1273       && LOOP_VINFO_INT_NITERS (loop_vinfo) <= th)
1274     {
1275       if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1276         fprintf (vect_dump, "not vectorized: vectorization not "
1277                  "profitable.");
1278       if (vect_print_dump_info (REPORT_DETAILS))
1279         fprintf (vect_dump, "not vectorized: iteration count smaller than "
1280                  "user specified loop bound parameter or minimum "
1281                  "profitable iterations (whichever is more conservative).");
1282       return false;
1283     }
1284
1285   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1286       || LOOP_VINFO_INT_NITERS (loop_vinfo) % vectorization_factor != 0
1287       || LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
1288     {
1289       if (vect_print_dump_info (REPORT_DETAILS))
1290         fprintf (vect_dump, "epilog loop required.");
1291       if (!vect_can_advance_ivs_p (loop_vinfo))
1292         {
1293           if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1294             fprintf (vect_dump,
1295                      "not vectorized: can't create epilog loop 1.");
1296           return false;
1297         }
1298       if (!slpeel_can_duplicate_loop_p (loop, single_exit (loop)))
1299         {
1300           if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1301             fprintf (vect_dump,
1302                      "not vectorized: can't create epilog loop 2.");
1303           return false;
1304         }
1305     }
1306
1307   return true;
1308 }
1309
1310
1311 /* Function vect_analyze_loop.
1312
1313    Apply a set of analyses on LOOP, and create a loop_vec_info struct
1314    for it. The different analyses will record information in the
1315    loop_vec_info struct.  */
1316 loop_vec_info
1317 vect_analyze_loop (struct loop *loop)
1318 {
1319   bool ok;
1320   loop_vec_info loop_vinfo;
1321
1322   if (vect_print_dump_info (REPORT_DETAILS))
1323     fprintf (vect_dump, "===== analyze_loop_nest =====");
1324
1325   if (loop_outer (loop)
1326       && loop_vec_info_for_loop (loop_outer (loop))
1327       && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
1328     {
1329       if (vect_print_dump_info (REPORT_DETAILS))
1330         fprintf (vect_dump, "outer-loop already vectorized.");
1331       return NULL;
1332     }
1333
1334   /* Check the CFG characteristics of the loop (nesting, entry/exit, etc.  */
1335
1336   loop_vinfo = vect_analyze_loop_form (loop);
1337   if (!loop_vinfo)
1338     {
1339       if (vect_print_dump_info (REPORT_DETAILS))
1340         fprintf (vect_dump, "bad loop form.");
1341       return NULL;
1342     }
1343
1344   /* Find all data references in the loop (which correspond to vdefs/vuses)
1345      and analyze their evolution in the loop.
1346
1347      FORNOW: Handle only simple, array references, which
1348      alignment can be forced, and aligned pointer-references.  */
1349
1350   ok = vect_analyze_data_refs (loop_vinfo, NULL);
1351   if (!ok)
1352     {
1353       if (vect_print_dump_info (REPORT_DETAILS))
1354         fprintf (vect_dump, "bad data references.");
1355       destroy_loop_vec_info (loop_vinfo, true);
1356       return NULL;
1357     }
1358
1359   /* Classify all cross-iteration scalar data-flow cycles.
1360      Cross-iteration cycles caused by virtual phis are analyzed separately.  */
1361
1362   vect_analyze_scalar_cycles (loop_vinfo);
1363
1364   vect_pattern_recog (loop_vinfo);
1365
1366   /* Data-flow analysis to detect stmts that do not need to be vectorized.  */
1367
1368   ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
1369   if (!ok)
1370     {
1371       if (vect_print_dump_info (REPORT_DETAILS))
1372         fprintf (vect_dump, "unexpected pattern.");
1373       destroy_loop_vec_info (loop_vinfo, true);
1374       return NULL;
1375     }
1376
1377   /* Analyze the alignment of the data-refs in the loop.
1378      Fail if a data reference is found that cannot be vectorized.  */
1379
1380   ok = vect_analyze_data_refs_alignment (loop_vinfo, NULL);
1381   if (!ok)
1382     {
1383       if (vect_print_dump_info (REPORT_DETAILS))
1384         fprintf (vect_dump, "bad data alignment.");
1385       destroy_loop_vec_info (loop_vinfo, true);
1386       return NULL;
1387     }
1388
1389   ok = vect_determine_vectorization_factor (loop_vinfo);
1390   if (!ok)
1391     {
1392       if (vect_print_dump_info (REPORT_DETAILS))
1393         fprintf (vect_dump, "can't determine vectorization factor.");
1394       destroy_loop_vec_info (loop_vinfo, true);
1395       return NULL;
1396     }
1397
1398   /* Analyze data dependences between the data-refs in the loop.
1399      FORNOW: fail at the first data dependence that we encounter.  */
1400
1401   ok = vect_analyze_data_ref_dependences (loop_vinfo, NULL);
1402   if (!ok)
1403     {
1404       if (vect_print_dump_info (REPORT_DETAILS))
1405         fprintf (vect_dump, "bad data dependence.");
1406       destroy_loop_vec_info (loop_vinfo, true);
1407       return NULL;
1408     }
1409
1410   /* Analyze the access patterns of the data-refs in the loop (consecutive,
1411      complex, etc.). FORNOW: Only handle consecutive access pattern.  */
1412
1413   ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
1414   if (!ok)
1415     {
1416       if (vect_print_dump_info (REPORT_DETAILS))
1417         fprintf (vect_dump, "bad data access.");
1418       destroy_loop_vec_info (loop_vinfo, true);
1419       return NULL;
1420     }
1421
1422   /* Prune the list of ddrs to be tested at run-time by versioning for alias.
1423      It is important to call pruning after vect_analyze_data_ref_accesses,
1424      since we use grouping information gathered by interleaving analysis.  */
1425   ok = vect_prune_runtime_alias_test_list (loop_vinfo);
1426   if (!ok)
1427     {
1428       if (vect_print_dump_info (REPORT_DETAILS))
1429         fprintf (vect_dump, "too long list of versioning for alias "
1430                             "run-time tests.");
1431       destroy_loop_vec_info (loop_vinfo, true);
1432       return NULL;
1433     }
1434
1435   /* Check the SLP opportunities in the loop, analyze and build SLP trees.  */
1436   ok = vect_analyze_slp (loop_vinfo, NULL);
1437   if (ok)
1438     {
1439       /* Decide which possible SLP instances to SLP.  */
1440       vect_make_slp_decision (loop_vinfo);
1441
1442       /* Find stmts that need to be both vectorized and SLPed.  */
1443       vect_detect_hybrid_slp (loop_vinfo);
1444     }
1445
1446   /* This pass will decide on using loop versioning and/or loop peeling in
1447      order to enhance the alignment of data references in the loop.  */
1448
1449   ok = vect_enhance_data_refs_alignment (loop_vinfo);
1450   if (!ok)
1451     {
1452       if (vect_print_dump_info (REPORT_DETAILS))
1453         fprintf (vect_dump, "bad data alignment.");
1454       destroy_loop_vec_info (loop_vinfo, true);
1455       return NULL;
1456     }
1457
1458   /* Scan all the operations in the loop and make sure they are
1459      vectorizable.  */
1460
1461   ok = vect_analyze_loop_operations (loop_vinfo);
1462   if (!ok)
1463     {
1464       if (vect_print_dump_info (REPORT_DETAILS))
1465         fprintf (vect_dump, "bad operation or unsupported loop bound.");
1466       destroy_loop_vec_info (loop_vinfo, true);
1467       return NULL;
1468     }
1469
1470   LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
1471
1472   return loop_vinfo;
1473 }
1474
1475
1476 /* Function reduction_code_for_scalar_code
1477
1478    Input:
1479    CODE - tree_code of a reduction operations.
1480
1481    Output:
1482    REDUC_CODE - the corresponding tree-code to be used to reduce the
1483       vector of partial results into a single scalar result (which
1484       will also reside in a vector) or ERROR_MARK if the operation is
1485       a supported reduction operation, but does not have such tree-code.
1486
1487    Return FALSE if CODE currently cannot be vectorized as reduction.  */
1488
1489 static bool
1490 reduction_code_for_scalar_code (enum tree_code code,
1491                                 enum tree_code *reduc_code)
1492 {
1493   switch (code)
1494     {
1495       case MAX_EXPR:
1496         *reduc_code = REDUC_MAX_EXPR;
1497         return true;
1498
1499       case MIN_EXPR:
1500         *reduc_code = REDUC_MIN_EXPR;
1501         return true;
1502
1503       case PLUS_EXPR:
1504         *reduc_code = REDUC_PLUS_EXPR;
1505         return true;
1506
1507       case MULT_EXPR:
1508       case MINUS_EXPR:
1509       case BIT_IOR_EXPR:
1510       case BIT_XOR_EXPR:
1511       case BIT_AND_EXPR:
1512         *reduc_code = ERROR_MARK;
1513         return true;
1514
1515       default:
1516        return false;
1517     }
1518 }
1519
1520
1521 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
1522    STMT is printed with a message MSG. */
1523
1524 static void
1525 report_vect_op (gimple stmt, const char *msg)
1526 {
1527   fprintf (vect_dump, "%s", msg);
1528   print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
1529 }
1530
1531
1532 /* Function vect_is_simple_reduction
1533
1534    (1) Detect a cross-iteration def-use cycle that represents a simple
1535    reduction computation. We look for the following pattern:
1536
1537    loop_header:
1538      a1 = phi < a0, a2 >
1539      a3 = ...
1540      a2 = operation (a3, a1)
1541
1542    such that:
1543    1. operation is commutative and associative and it is safe to
1544       change the order of the computation (if CHECK_REDUCTION is true)
1545    2. no uses for a2 in the loop (a2 is used out of the loop)
1546    3. no uses of a1 in the loop besides the reduction operation.
1547
1548    Condition 1 is tested here.
1549    Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
1550
1551    (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
1552    nested cycles, if CHECK_REDUCTION is false.
1553
1554    (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
1555    reductions:
1556
1557      a1 = phi < a0, a2 >
1558      inner loop (def of a3)
1559      a2 = phi < a3 >
1560 */
1561
1562 gimple
1563 vect_is_simple_reduction (loop_vec_info loop_info, gimple phi,
1564                           bool check_reduction, bool *double_reduc)
1565 {
1566   struct loop *loop = (gimple_bb (phi))->loop_father;
1567   struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
1568   edge latch_e = loop_latch_edge (loop);
1569   tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
1570   gimple def_stmt, def1 = NULL, def2 = NULL;
1571   enum tree_code code;
1572   tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
1573   tree type;
1574   int nloop_uses;
1575   tree name;
1576   imm_use_iterator imm_iter;
1577   use_operand_p use_p;
1578   bool phi_def;
1579
1580   *double_reduc = false;
1581
1582   /* If CHECK_REDUCTION is true, we assume inner-most loop vectorization,
1583      otherwise, we assume outer loop vectorization.  */
1584   gcc_assert ((check_reduction && loop == vect_loop)
1585               || (!check_reduction && flow_loop_nested_p (vect_loop, loop)));
1586
1587   name = PHI_RESULT (phi);
1588   nloop_uses = 0;
1589   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
1590     {
1591       gimple use_stmt = USE_STMT (use_p);
1592       if (is_gimple_debug (use_stmt))
1593         continue;
1594       if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
1595           && vinfo_for_stmt (use_stmt)
1596           && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
1597         nloop_uses++;
1598       if (nloop_uses > 1)
1599         {
1600           if (vect_print_dump_info (REPORT_DETAILS))
1601             fprintf (vect_dump, "reduction used in loop.");
1602           return NULL;
1603         }
1604     }
1605
1606   if (TREE_CODE (loop_arg) != SSA_NAME)
1607     {
1608       if (vect_print_dump_info (REPORT_DETAILS))
1609         {
1610           fprintf (vect_dump, "reduction: not ssa_name: ");
1611           print_generic_expr (vect_dump, loop_arg, TDF_SLIM);
1612         }
1613       return NULL;
1614     }
1615
1616   def_stmt = SSA_NAME_DEF_STMT (loop_arg);
1617   if (!def_stmt)
1618     {
1619       if (vect_print_dump_info (REPORT_DETAILS))
1620         fprintf (vect_dump, "reduction: no def_stmt.");
1621       return NULL;
1622     }
1623
1624   if (!is_gimple_assign (def_stmt) && gimple_code (def_stmt) != GIMPLE_PHI)
1625     {
1626       if (vect_print_dump_info (REPORT_DETAILS))
1627         print_gimple_stmt (vect_dump, def_stmt, 0, TDF_SLIM);
1628       return NULL;
1629     }
1630
1631   if (is_gimple_assign (def_stmt))
1632     {
1633       name = gimple_assign_lhs (def_stmt);
1634       phi_def = false;
1635     }
1636   else
1637     {
1638       name = PHI_RESULT (def_stmt);
1639       phi_def = true;
1640     }
1641
1642   nloop_uses = 0;
1643   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
1644     {
1645       gimple use_stmt = USE_STMT (use_p);
1646       if (is_gimple_debug (use_stmt))
1647         continue;
1648       if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
1649           && vinfo_for_stmt (use_stmt)
1650           && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
1651         nloop_uses++;
1652       if (nloop_uses > 1)
1653         {
1654           if (vect_print_dump_info (REPORT_DETAILS))
1655             fprintf (vect_dump, "reduction used in loop.");
1656           return NULL;
1657         }
1658     }
1659
1660   /* If DEF_STMT is a phi node itself, we expect it to have a single argument
1661      defined in the inner loop.  */
1662   if (phi_def)
1663     {
1664       op1 = PHI_ARG_DEF (def_stmt, 0);
1665
1666       if (gimple_phi_num_args (def_stmt) != 1
1667           || TREE_CODE (op1) != SSA_NAME)
1668         {
1669           if (vect_print_dump_info (REPORT_DETAILS))
1670             fprintf (vect_dump, "unsupported phi node definition.");
1671
1672           return NULL;
1673         }
1674
1675       def1 = SSA_NAME_DEF_STMT (op1);
1676       if (flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
1677           && loop->inner
1678           && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
1679           && is_gimple_assign (def1))
1680         {
1681           if (vect_print_dump_info (REPORT_DETAILS))
1682             report_vect_op (def_stmt, "detected double reduction: ");
1683
1684           *double_reduc = true;
1685           return def_stmt;
1686         }
1687
1688       return NULL;
1689     }
1690
1691   code = gimple_assign_rhs_code (def_stmt);
1692
1693   if (check_reduction
1694       && (!commutative_tree_code (code) || !associative_tree_code (code)))
1695     {
1696       if (vect_print_dump_info (REPORT_DETAILS))
1697         report_vect_op (def_stmt, "reduction: not commutative/associative: ");
1698       return NULL;
1699     }
1700
1701   if (get_gimple_rhs_class (code) != GIMPLE_BINARY_RHS)
1702     {
1703       if (code != COND_EXPR)
1704         {
1705           if (vect_print_dump_info (REPORT_DETAILS))
1706             report_vect_op (def_stmt, "reduction: not binary operation: ");
1707
1708           return NULL;
1709         }
1710
1711       op3 = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
1712       if (COMPARISON_CLASS_P (op3))
1713         {
1714           op4 = TREE_OPERAND (op3, 1);
1715           op3 = TREE_OPERAND (op3, 0);
1716         }
1717
1718       op1 = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 1);
1719       op2 = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 2);
1720
1721       if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
1722         {
1723           if (vect_print_dump_info (REPORT_DETAILS))
1724             report_vect_op (def_stmt, "reduction: uses not ssa_names: ");
1725
1726           return NULL;
1727         }
1728     }
1729   else
1730     {
1731       op1 = gimple_assign_rhs1 (def_stmt);
1732       op2 = gimple_assign_rhs2 (def_stmt);
1733
1734       if (TREE_CODE (op1) != SSA_NAME || TREE_CODE (op2) != SSA_NAME)
1735         {
1736           if (vect_print_dump_info (REPORT_DETAILS))
1737             report_vect_op (def_stmt, "reduction: uses not ssa_names: ");
1738
1739           return NULL;
1740         }
1741    }
1742
1743   type = TREE_TYPE (gimple_assign_lhs (def_stmt));
1744   if ((TREE_CODE (op1) == SSA_NAME
1745        && !types_compatible_p (type,TREE_TYPE (op1)))
1746       || (TREE_CODE (op2) == SSA_NAME
1747           && !types_compatible_p (type, TREE_TYPE (op2)))
1748       || (op3 && TREE_CODE (op3) == SSA_NAME
1749           && !types_compatible_p (type, TREE_TYPE (op3)))
1750       || (op4 && TREE_CODE (op4) == SSA_NAME
1751           && !types_compatible_p (type, TREE_TYPE (op4))))
1752     {
1753       if (vect_print_dump_info (REPORT_DETAILS))
1754         {
1755           fprintf (vect_dump, "reduction: multiple types: operation type: ");
1756           print_generic_expr (vect_dump, type, TDF_SLIM);
1757           fprintf (vect_dump, ", operands types: ");
1758           print_generic_expr (vect_dump, TREE_TYPE (op1), TDF_SLIM);
1759           fprintf (vect_dump, ",");
1760           print_generic_expr (vect_dump, TREE_TYPE (op2), TDF_SLIM);
1761           if (op3)
1762             {
1763               fprintf (vect_dump, ",");
1764               print_generic_expr (vect_dump, TREE_TYPE (op3), TDF_SLIM);
1765             }
1766
1767           if (op4)
1768             {
1769               fprintf (vect_dump, ",");
1770               print_generic_expr (vect_dump, TREE_TYPE (op4), TDF_SLIM);
1771             }
1772         }
1773
1774       return NULL;
1775     }
1776
1777   /* Check that it's ok to change the order of the computation.
1778      Generally, when vectorizing a reduction we change the order of the
1779      computation.  This may change the behavior of the program in some
1780      cases, so we need to check that this is ok.  One exception is when
1781      vectorizing an outer-loop: the inner-loop is executed sequentially,
1782      and therefore vectorizing reductions in the inner-loop during
1783      outer-loop vectorization is safe.  */
1784
1785   /* CHECKME: check for !flag_finite_math_only too?  */
1786   if (SCALAR_FLOAT_TYPE_P (type) && !flag_associative_math
1787       && check_reduction)
1788     {
1789       /* Changing the order of operations changes the semantics.  */
1790       if (vect_print_dump_info (REPORT_DETAILS))
1791         report_vect_op (def_stmt, "reduction: unsafe fp math optimization: ");
1792       return NULL;
1793     }
1794   else if (INTEGRAL_TYPE_P (type) && TYPE_OVERFLOW_TRAPS (type)
1795            && check_reduction)
1796     {
1797       /* Changing the order of operations changes the semantics.  */
1798       if (vect_print_dump_info (REPORT_DETAILS))
1799         report_vect_op (def_stmt, "reduction: unsafe int math optimization: ");
1800       return NULL;
1801     }
1802   else if (SAT_FIXED_POINT_TYPE_P (type) && check_reduction)
1803     {
1804       /* Changing the order of operations changes the semantics.  */
1805       if (vect_print_dump_info (REPORT_DETAILS))
1806         report_vect_op (def_stmt,
1807                         "reduction: unsafe fixed-point math optimization: ");
1808       return NULL;
1809     }
1810
1811   /* Reduction is safe. We're dealing with one of the following:
1812      1) integer arithmetic and no trapv
1813      2) floating point arithmetic, and special flags permit this optimization
1814      3) nested cycle (i.e., outer loop vectorization).  */
1815   if (TREE_CODE (op1) == SSA_NAME)
1816     def1 = SSA_NAME_DEF_STMT (op1);
1817
1818   if (TREE_CODE (op2) == SSA_NAME)
1819     def2 = SSA_NAME_DEF_STMT (op2);
1820
1821   if (code != COND_EXPR
1822       && (!def1 || !def2 || gimple_nop_p (def1) || gimple_nop_p (def2)))
1823     {
1824       if (vect_print_dump_info (REPORT_DETAILS))
1825         report_vect_op (def_stmt, "reduction: no defs for operands: ");
1826       return NULL;
1827     }
1828
1829   /* Check that one def is the reduction def, defined by PHI,
1830      the other def is either defined in the loop ("vect_internal_def"),
1831      or it's an induction (defined by a loop-header phi-node).  */
1832
1833   if (def2 && def2 == phi
1834       && (code == COND_EXPR
1835           || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1))
1836               && (is_gimple_assign (def1)
1837                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
1838                       == vect_induction_def
1839                   || (gimple_code (def1) == GIMPLE_PHI
1840                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
1841                           == vect_internal_def
1842                       && !is_loop_header_bb_p (gimple_bb (def1)))))))
1843     {
1844       if (vect_print_dump_info (REPORT_DETAILS))
1845         report_vect_op (def_stmt, "detected reduction: ");
1846       return def_stmt;
1847     }
1848   else if (def1 && def1 == phi
1849            && (code == COND_EXPR
1850                || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2))
1851                    && (is_gimple_assign (def2)
1852                        || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
1853                            == vect_induction_def
1854                        || (gimple_code (def2) == GIMPLE_PHI
1855                            && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
1856                                == vect_internal_def
1857                            && !is_loop_header_bb_p (gimple_bb (def2)))))))
1858     {
1859       if (check_reduction)
1860         {
1861           /* Swap operands (just for simplicity - so that the rest of the code
1862              can assume that the reduction variable is always the last (second)
1863              argument).  */
1864           if (vect_print_dump_info (REPORT_DETAILS))
1865             report_vect_op (def_stmt,
1866                             "detected reduction: need to swap operands: ");
1867
1868           swap_tree_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
1869                               gimple_assign_rhs2_ptr (def_stmt));
1870         }
1871       else
1872         {
1873           if (vect_print_dump_info (REPORT_DETAILS))
1874             report_vect_op (def_stmt, "detected reduction: ");
1875         }
1876
1877       return def_stmt;
1878     }
1879   else
1880     {
1881       if (vect_print_dump_info (REPORT_DETAILS))
1882         report_vect_op (def_stmt, "reduction: unknown pattern: ");
1883
1884       return NULL;
1885     }
1886 }
1887
1888
1889 /* Function vect_estimate_min_profitable_iters
1890
1891    Return the number of iterations required for the vector version of the
1892    loop to be profitable relative to the cost of the scalar version of the
1893    loop.
1894
1895    TODO: Take profile info into account before making vectorization
1896    decisions, if available.  */
1897
1898 int
1899 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo)
1900 {
1901   int i;
1902   int min_profitable_iters;
1903   int peel_iters_prologue;
1904   int peel_iters_epilogue;
1905   int vec_inside_cost = 0;
1906   int vec_outside_cost = 0;
1907   int scalar_single_iter_cost = 0;
1908   int scalar_outside_cost = 0;
1909   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1910   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1911   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1912   int nbbs = loop->num_nodes;
1913   int byte_misalign = LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo);
1914   int peel_guard_costs = 0;
1915   int innerloop_iters = 0, factor;
1916   VEC (slp_instance, heap) *slp_instances;
1917   slp_instance instance;
1918
1919   /* Cost model disabled.  */
1920   if (!flag_vect_cost_model)
1921     {
1922       if (vect_print_dump_info (REPORT_COST))
1923         fprintf (vect_dump, "cost model disabled.");
1924       return 0;
1925     }
1926
1927   /* Requires loop versioning tests to handle misalignment.  */
1928   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
1929     {
1930       /*  FIXME: Make cost depend on complexity of individual check.  */
1931       vec_outside_cost +=
1932         VEC_length (gimple, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));
1933       if (vect_print_dump_info (REPORT_COST))
1934         fprintf (vect_dump, "cost model: Adding cost of checks for loop "
1935                  "versioning to treat misalignment.\n");
1936     }
1937
1938   /* Requires loop versioning with alias checks.  */
1939   if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
1940     {
1941       /*  FIXME: Make cost depend on complexity of individual check.  */
1942       vec_outside_cost +=
1943         VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo));
1944       if (vect_print_dump_info (REPORT_COST))
1945         fprintf (vect_dump, "cost model: Adding cost of checks for loop "
1946                  "versioning aliasing.\n");
1947     }
1948
1949   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
1950       || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
1951     vec_outside_cost += TARG_COND_TAKEN_BRANCH_COST;
1952
1953   /* Count statements in scalar loop.  Using this as scalar cost for a single
1954      iteration for now.
1955
1956      TODO: Add outer loop support.
1957
1958      TODO: Consider assigning different costs to different scalar
1959      statements.  */
1960
1961   /* FORNOW.  */
1962   if (loop->inner)
1963     innerloop_iters = 50; /* FIXME */
1964
1965   for (i = 0; i < nbbs; i++)
1966     {
1967       gimple_stmt_iterator si;
1968       basic_block bb = bbs[i];
1969
1970       if (bb->loop_father == loop->inner)
1971         factor = innerloop_iters;
1972       else
1973         factor = 1;
1974
1975       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1976         {
1977           gimple stmt = gsi_stmt (si);
1978           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1979           /* Skip stmts that are not vectorized inside the loop.  */
1980           if (!STMT_VINFO_RELEVANT_P (stmt_info)
1981               && (!STMT_VINFO_LIVE_P (stmt_info)
1982                   || STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def))
1983             continue;
1984           scalar_single_iter_cost += cost_for_stmt (stmt) * factor;
1985           vec_inside_cost += STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) * factor;
1986           /* FIXME: for stmts in the inner-loop in outer-loop vectorization,
1987              some of the "outside" costs are generated inside the outer-loop.  */
1988           vec_outside_cost += STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info);
1989         }
1990     }
1991
1992   /* Add additional cost for the peeled instructions in prologue and epilogue
1993      loop.
1994
1995      FORNOW: If we don't know the value of peel_iters for prologue or epilogue
1996      at compile-time - we assume it's vf/2 (the worst would be vf-1).
1997
1998      TODO: Build an expression that represents peel_iters for prologue and
1999      epilogue to be used in a run-time test.  */
2000
2001   if (byte_misalign < 0)
2002     {
2003       peel_iters_prologue = vf/2;
2004       if (vect_print_dump_info (REPORT_COST))
2005         fprintf (vect_dump, "cost model: "
2006                  "prologue peel iters set to vf/2.");
2007
2008       /* If peeling for alignment is unknown, loop bound of main loop becomes
2009          unknown.  */
2010       peel_iters_epilogue = vf/2;
2011       if (vect_print_dump_info (REPORT_COST))
2012         fprintf (vect_dump, "cost model: "
2013                  "epilogue peel iters set to vf/2 because "
2014                  "peeling for alignment is unknown .");
2015
2016       /* If peeled iterations are unknown, count a taken branch and a not taken
2017          branch per peeled loop. Even if scalar loop iterations are known,
2018          vector iterations are not known since peeled prologue iterations are
2019          not known. Hence guards remain the same.  */
2020       peel_guard_costs +=  2 * (TARG_COND_TAKEN_BRANCH_COST
2021                               + TARG_COND_NOT_TAKEN_BRANCH_COST);
2022     }
2023   else
2024     {
2025       if (byte_misalign)
2026         {
2027           struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2028           int element_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
2029           tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr)));
2030           int nelements = TYPE_VECTOR_SUBPARTS (vectype);
2031
2032           peel_iters_prologue = nelements - (byte_misalign / element_size);
2033         }
2034       else
2035         peel_iters_prologue = 0;
2036
2037       if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2038         {
2039           peel_iters_epilogue = vf/2;
2040           if (vect_print_dump_info (REPORT_COST))
2041             fprintf (vect_dump, "cost model: "
2042                      "epilogue peel iters set to vf/2 because "
2043                      "loop iterations are unknown .");
2044
2045           /* If peeled iterations are known but number of scalar loop
2046              iterations are unknown, count a taken branch per peeled loop.  */
2047           peel_guard_costs +=  2 * TARG_COND_TAKEN_BRANCH_COST;
2048
2049         }
2050       else
2051         {
2052           int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
2053           peel_iters_prologue = niters < peel_iters_prologue ?
2054                                         niters : peel_iters_prologue;
2055           peel_iters_epilogue = (niters - peel_iters_prologue) % vf;
2056         }
2057     }
2058
2059   vec_outside_cost += (peel_iters_prologue * scalar_single_iter_cost)
2060                       + (peel_iters_epilogue * scalar_single_iter_cost)
2061                       + peel_guard_costs;
2062
2063   /* FORNOW: The scalar outside cost is incremented in one of the
2064      following ways:
2065
2066      1. The vectorizer checks for alignment and aliasing and generates
2067      a condition that allows dynamic vectorization.  A cost model
2068      check is ANDED with the versioning condition.  Hence scalar code
2069      path now has the added cost of the versioning check.
2070
2071        if (cost > th & versioning_check)
2072          jmp to vector code
2073
2074      Hence run-time scalar is incremented by not-taken branch cost.
2075
2076      2. The vectorizer then checks if a prologue is required.  If the
2077      cost model check was not done before during versioning, it has to
2078      be done before the prologue check.
2079
2080        if (cost <= th)
2081          prologue = scalar_iters
2082        if (prologue == 0)
2083          jmp to vector code
2084        else
2085          execute prologue
2086        if (prologue == num_iters)
2087          go to exit
2088
2089      Hence the run-time scalar cost is incremented by a taken branch,
2090      plus a not-taken branch, plus a taken branch cost.
2091
2092      3. The vectorizer then checks if an epilogue is required.  If the
2093      cost model check was not done before during prologue check, it
2094      has to be done with the epilogue check.
2095
2096        if (prologue == 0)
2097          jmp to vector code
2098        else
2099          execute prologue
2100        if (prologue == num_iters)
2101          go to exit
2102        vector code:
2103          if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
2104            jmp to epilogue
2105
2106      Hence the run-time scalar cost should be incremented by 2 taken
2107      branches.
2108
2109      TODO: The back end may reorder the BBS's differently and reverse
2110      conditions/branch directions.  Change the estimates below to
2111      something more reasonable.  */
2112
2113   /* If the number of iterations is known and we do not do versioning, we can
2114      decide whether to vectorize at compile time. Hence the scalar version
2115      do not carry cost model guard costs.  */
2116   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2117       || LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2118       || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2119     {
2120       /* Cost model check occurs at versioning.  */
2121       if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2122           || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2123         scalar_outside_cost += TARG_COND_NOT_TAKEN_BRANCH_COST;
2124       else
2125         {
2126           /* Cost model check occurs at prologue generation.  */
2127           if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2128             scalar_outside_cost += 2 * TARG_COND_TAKEN_BRANCH_COST
2129               + TARG_COND_NOT_TAKEN_BRANCH_COST;
2130           /* Cost model check occurs at epilogue generation.  */
2131           else
2132             scalar_outside_cost += 2 * TARG_COND_TAKEN_BRANCH_COST;
2133         }
2134     }
2135
2136   /* Add SLP costs.  */
2137   slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
2138   for (i = 0; VEC_iterate (slp_instance, slp_instances, i, instance); i++)
2139     {
2140       vec_outside_cost += SLP_INSTANCE_OUTSIDE_OF_LOOP_COST (instance);
2141       vec_inside_cost += SLP_INSTANCE_INSIDE_OF_LOOP_COST (instance);
2142     }
2143
2144   /* Calculate number of iterations required to make the vector version
2145      profitable, relative to the loop bodies only. The following condition
2146      must hold true:
2147      SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
2148      where
2149      SIC = scalar iteration cost, VIC = vector iteration cost,
2150      VOC = vector outside cost, VF = vectorization factor,
2151      PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
2152      SOC = scalar outside cost for run time cost model check.  */
2153
2154   if ((scalar_single_iter_cost * vf) > vec_inside_cost)
2155     {
2156       if (vec_outside_cost <= 0)
2157         min_profitable_iters = 1;
2158       else
2159         {
2160           min_profitable_iters = ((vec_outside_cost - scalar_outside_cost) * vf
2161                                   - vec_inside_cost * peel_iters_prologue
2162                                   - vec_inside_cost * peel_iters_epilogue)
2163                                  / ((scalar_single_iter_cost * vf)
2164                                     - vec_inside_cost);
2165
2166           if ((scalar_single_iter_cost * vf * min_profitable_iters)
2167               <= ((vec_inside_cost * min_profitable_iters)
2168                   + ((vec_outside_cost - scalar_outside_cost) * vf)))
2169             min_profitable_iters++;
2170         }
2171     }
2172   /* vector version will never be profitable.  */
2173   else
2174     {
2175       if (vect_print_dump_info (REPORT_COST))
2176         fprintf (vect_dump, "cost model: vector iteration cost = %d "
2177                  "is divisible by scalar iteration cost = %d by a factor "
2178                  "greater than or equal to the vectorization factor = %d .",
2179                  vec_inside_cost, scalar_single_iter_cost, vf);
2180       return -1;
2181     }
2182
2183   if (vect_print_dump_info (REPORT_COST))
2184     {
2185       fprintf (vect_dump, "Cost model analysis: \n");
2186       fprintf (vect_dump, "  Vector inside of loop cost: %d\n",
2187                vec_inside_cost);
2188       fprintf (vect_dump, "  Vector outside of loop cost: %d\n",
2189                vec_outside_cost);
2190       fprintf (vect_dump, "  Scalar iteration cost: %d\n",
2191                scalar_single_iter_cost);
2192       fprintf (vect_dump, "  Scalar outside cost: %d\n", scalar_outside_cost);
2193       fprintf (vect_dump, "  prologue iterations: %d\n",
2194                peel_iters_prologue);
2195       fprintf (vect_dump, "  epilogue iterations: %d\n",
2196                peel_iters_epilogue);
2197       fprintf (vect_dump, "  Calculated minimum iters for profitability: %d\n",
2198                min_profitable_iters);
2199     }
2200
2201   min_profitable_iters =
2202         min_profitable_iters < vf ? vf : min_profitable_iters;
2203
2204   /* Because the condition we create is:
2205      if (niters <= min_profitable_iters)
2206        then skip the vectorized loop.  */
2207   min_profitable_iters--;
2208
2209   if (vect_print_dump_info (REPORT_COST))
2210     fprintf (vect_dump, "  Profitability threshold = %d\n",
2211              min_profitable_iters);
2212
2213   return min_profitable_iters;
2214 }
2215
2216
2217 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
2218    functions. Design better to avoid maintenance issues.  */
2219
2220 /* Function vect_model_reduction_cost.
2221
2222    Models cost for a reduction operation, including the vector ops
2223    generated within the strip-mine loop, the initial definition before
2224    the loop, and the epilogue code that must be generated.  */
2225
2226 static bool
2227 vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
2228                            int ncopies)
2229 {
2230   int outer_cost = 0;
2231   enum tree_code code;
2232   optab optab;
2233   tree vectype;
2234   gimple stmt, orig_stmt;
2235   tree reduction_op;
2236   enum machine_mode mode;
2237   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2238   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2239
2240
2241   /* Cost of reduction op inside loop.  */
2242   STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) += ncopies * TARG_VEC_STMT_COST;
2243
2244   stmt = STMT_VINFO_STMT (stmt_info);
2245
2246   switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
2247     {
2248     case GIMPLE_SINGLE_RHS:
2249       gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt)) == ternary_op);
2250       reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), 2);
2251       break;
2252     case GIMPLE_UNARY_RHS:
2253       reduction_op = gimple_assign_rhs1 (stmt);
2254       break;
2255     case GIMPLE_BINARY_RHS:
2256       reduction_op = gimple_assign_rhs2 (stmt);
2257       break;
2258     default:
2259       gcc_unreachable ();
2260     }
2261
2262   vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
2263   if (!vectype)
2264     {
2265       if (vect_print_dump_info (REPORT_COST))
2266         {
2267           fprintf (vect_dump, "unsupported data-type ");
2268           print_generic_expr (vect_dump, TREE_TYPE (reduction_op), TDF_SLIM);
2269         }
2270       return false;
2271    }
2272
2273   mode = TYPE_MODE (vectype);
2274   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
2275
2276   if (!orig_stmt)
2277     orig_stmt = STMT_VINFO_STMT (stmt_info);
2278
2279   code = gimple_assign_rhs_code (orig_stmt);
2280
2281   /* Add in cost for initial definition.  */
2282   outer_cost += TARG_SCALAR_TO_VEC_COST;
2283
2284   /* Determine cost of epilogue code.
2285
2286      We have a reduction operator that will reduce the vector in one statement.
2287      Also requires scalar extract.  */
2288
2289   if (!nested_in_vect_loop_p (loop, orig_stmt))
2290     {
2291       if (reduc_code != ERROR_MARK)
2292         outer_cost += TARG_VEC_STMT_COST + TARG_VEC_TO_SCALAR_COST;
2293       else
2294         {
2295           int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
2296           tree bitsize =
2297             TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
2298           int element_bitsize = tree_low_cst (bitsize, 1);
2299           int nelements = vec_size_in_bits / element_bitsize;
2300
2301           optab = optab_for_tree_code (code, vectype, optab_default);
2302
2303           /* We have a whole vector shift available.  */
2304           if (VECTOR_MODE_P (mode)
2305               && optab_handler (optab, mode)->insn_code != CODE_FOR_nothing
2306               && optab_handler (vec_shr_optab, mode)->insn_code != CODE_FOR_nothing)
2307             /* Final reduction via vector shifts and the reduction operator. Also
2308                requires scalar extract.  */
2309             outer_cost += ((exact_log2(nelements) * 2) * TARG_VEC_STMT_COST
2310                                 + TARG_VEC_TO_SCALAR_COST);
2311           else
2312             /* Use extracts and reduction op for final reduction.  For N elements,
2313                we have N extracts and N-1 reduction ops.  */
2314             outer_cost += ((nelements + nelements - 1) * TARG_VEC_STMT_COST);
2315         }
2316     }
2317
2318   STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info) = outer_cost;
2319
2320   if (vect_print_dump_info (REPORT_COST))
2321     fprintf (vect_dump, "vect_model_reduction_cost: inside_cost = %d, "
2322              "outside_cost = %d .", STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info),
2323              STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info));
2324
2325   return true;
2326 }
2327
2328
2329 /* Function vect_model_induction_cost.
2330
2331    Models cost for induction operations.  */
2332
2333 static void
2334 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
2335 {
2336   /* loop cost for vec_loop.  */
2337   STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) = ncopies * TARG_VEC_STMT_COST;
2338   /* prologue cost for vec_init and vec_step.  */
2339   STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info) = 2 * TARG_SCALAR_TO_VEC_COST;
2340
2341   if (vect_print_dump_info (REPORT_COST))
2342     fprintf (vect_dump, "vect_model_induction_cost: inside_cost = %d, "
2343              "outside_cost = %d .", STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info),
2344              STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info));
2345 }
2346
2347
2348 /* Function get_initial_def_for_induction
2349
2350    Input:
2351    STMT - a stmt that performs an induction operation in the loop.
2352    IV_PHI - the initial value of the induction variable
2353
2354    Output:
2355    Return a vector variable, initialized with the first VF values of
2356    the induction variable. E.g., for an iv with IV_PHI='X' and
2357    evolution S, for a vector of 4 units, we want to return:
2358    [X, X + S, X + 2*S, X + 3*S].  */
2359
2360 static tree
2361 get_initial_def_for_induction (gimple iv_phi)
2362 {
2363   stmt_vec_info stmt_vinfo = vinfo_for_stmt (iv_phi);
2364   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
2365   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2366   tree scalar_type = TREE_TYPE (gimple_phi_result (iv_phi));
2367   tree vectype;
2368   int nunits;
2369   edge pe = loop_preheader_edge (loop);
2370   struct loop *iv_loop;
2371   basic_block new_bb;
2372   tree vec, vec_init, vec_step, t;
2373   tree access_fn;
2374   tree new_var;
2375   tree new_name;
2376   gimple init_stmt, induction_phi, new_stmt;
2377   tree induc_def, vec_def, vec_dest;
2378   tree init_expr, step_expr;
2379   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2380   int i;
2381   bool ok;
2382   int ncopies;
2383   tree expr;
2384   stmt_vec_info phi_info = vinfo_for_stmt (iv_phi);
2385   bool nested_in_vect_loop = false;
2386   gimple_seq stmts = NULL;
2387   imm_use_iterator imm_iter;
2388   use_operand_p use_p;
2389   gimple exit_phi;
2390   edge latch_e;
2391   tree loop_arg;
2392   gimple_stmt_iterator si;
2393   basic_block bb = gimple_bb (iv_phi);
2394   tree stepvectype;
2395
2396   vectype = get_vectype_for_scalar_type (scalar_type);
2397   gcc_assert (vectype);
2398   nunits = TYPE_VECTOR_SUBPARTS (vectype);
2399   ncopies = vf / nunits;
2400
2401   gcc_assert (phi_info);
2402   gcc_assert (ncopies >= 1);
2403
2404   /* Find the first insertion point in the BB.  */
2405   si = gsi_after_labels (bb);
2406
2407   if (INTEGRAL_TYPE_P (scalar_type))
2408     step_expr = build_int_cst (scalar_type, 0);
2409   else if (POINTER_TYPE_P (scalar_type))
2410     step_expr = build_int_cst (sizetype, 0);
2411   else
2412     step_expr = build_real (scalar_type, dconst0);
2413
2414   /* Is phi in an inner-loop, while vectorizing an enclosing outer-loop?  */
2415   if (nested_in_vect_loop_p (loop, iv_phi))
2416     {
2417       nested_in_vect_loop = true;
2418       iv_loop = loop->inner;
2419     }
2420   else
2421     iv_loop = loop;
2422   gcc_assert (iv_loop == (gimple_bb (iv_phi))->loop_father);
2423
2424   latch_e = loop_latch_edge (iv_loop);
2425   loop_arg = PHI_ARG_DEF_FROM_EDGE (iv_phi, latch_e);
2426
2427   access_fn = analyze_scalar_evolution (iv_loop, PHI_RESULT (iv_phi));
2428   gcc_assert (access_fn);
2429   ok = vect_is_simple_iv_evolution (iv_loop->num, access_fn,
2430                                     &init_expr, &step_expr);
2431   gcc_assert (ok);
2432   pe = loop_preheader_edge (iv_loop);
2433
2434   /* Create the vector that holds the initial_value of the induction.  */
2435   if (nested_in_vect_loop)
2436     {
2437       /* iv_loop is nested in the loop to be vectorized.  init_expr had already
2438          been created during vectorization of previous stmts; We obtain it from
2439          the STMT_VINFO_VEC_STMT of the defining stmt. */
2440       tree iv_def = PHI_ARG_DEF_FROM_EDGE (iv_phi,
2441                                            loop_preheader_edge (iv_loop));
2442       vec_init = vect_get_vec_def_for_operand (iv_def, iv_phi, NULL);
2443     }
2444   else
2445     {
2446       /* iv_loop is the loop to be vectorized. Create:
2447          vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr)  */
2448       new_var = vect_get_new_vect_var (scalar_type, vect_scalar_var, "var_");
2449       add_referenced_var (new_var);
2450
2451       new_name = force_gimple_operand (init_expr, &stmts, false, new_var);
2452       if (stmts)
2453         {
2454           new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
2455           gcc_assert (!new_bb);
2456         }
2457
2458       t = NULL_TREE;
2459       t = tree_cons (NULL_TREE, init_expr, t);
2460       for (i = 1; i < nunits; i++)
2461         {
2462           /* Create: new_name_i = new_name + step_expr  */
2463           enum tree_code code = POINTER_TYPE_P (scalar_type)
2464                                 ? POINTER_PLUS_EXPR : PLUS_EXPR;
2465           init_stmt = gimple_build_assign_with_ops (code, new_var,
2466                                                     new_name, step_expr);
2467           new_name = make_ssa_name (new_var, init_stmt);
2468           gimple_assign_set_lhs (init_stmt, new_name);
2469
2470           new_bb = gsi_insert_on_edge_immediate (pe, init_stmt);
2471           gcc_assert (!new_bb);
2472
2473           if (vect_print_dump_info (REPORT_DETAILS))
2474             {
2475               fprintf (vect_dump, "created new init_stmt: ");
2476               print_gimple_stmt (vect_dump, init_stmt, 0, TDF_SLIM);
2477             }
2478           t = tree_cons (NULL_TREE, new_name, t);
2479         }
2480       /* Create a vector from [new_name_0, new_name_1, ..., new_name_nunits-1]  */
2481       vec = build_constructor_from_list (vectype, nreverse (t));
2482       vec_init = vect_init_vector (iv_phi, vec, vectype, NULL);
2483     }
2484
2485
2486   /* Create the vector that holds the step of the induction.  */
2487   if (nested_in_vect_loop)
2488     /* iv_loop is nested in the loop to be vectorized. Generate:
2489        vec_step = [S, S, S, S]  */
2490     new_name = step_expr;
2491   else
2492     {
2493       /* iv_loop is the loop to be vectorized. Generate:
2494           vec_step = [VF*S, VF*S, VF*S, VF*S]  */
2495       expr = build_int_cst (TREE_TYPE (step_expr), vf);
2496       new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
2497                               expr, step_expr);
2498     }
2499
2500   t = NULL_TREE;
2501   for (i = 0; i < nunits; i++)
2502     t = tree_cons (NULL_TREE, unshare_expr (new_name), t);
2503   gcc_assert (CONSTANT_CLASS_P (new_name));
2504   stepvectype = get_vectype_for_scalar_type (TREE_TYPE (new_name));
2505   gcc_assert (stepvectype);
2506   vec = build_vector (stepvectype, t);
2507   vec_step = vect_init_vector (iv_phi, vec, stepvectype, NULL);
2508
2509
2510   /* Create the following def-use cycle:
2511      loop prolog:
2512          vec_init = ...
2513          vec_step = ...
2514      loop:
2515          vec_iv = PHI <vec_init, vec_loop>
2516          ...
2517          STMT
2518          ...
2519          vec_loop = vec_iv + vec_step;  */
2520
2521   /* Create the induction-phi that defines the induction-operand.  */
2522   vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
2523   add_referenced_var (vec_dest);
2524   induction_phi = create_phi_node (vec_dest, iv_loop->header);
2525   set_vinfo_for_stmt (induction_phi,
2526                       new_stmt_vec_info (induction_phi, loop_vinfo, NULL));
2527   induc_def = PHI_RESULT (induction_phi);
2528
2529   /* Create the iv update inside the loop  */
2530   new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, vec_dest,
2531                                            induc_def, vec_step);
2532   vec_def = make_ssa_name (vec_dest, new_stmt);
2533   gimple_assign_set_lhs (new_stmt, vec_def);
2534   gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
2535   set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo,
2536                                                    NULL));
2537
2538   /* Set the arguments of the phi node:  */
2539   add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
2540   add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
2541                UNKNOWN_LOCATION);
2542
2543
2544   /* In case that vectorization factor (VF) is bigger than the number
2545      of elements that we can fit in a vectype (nunits), we have to generate
2546      more than one vector stmt - i.e - we need to "unroll" the
2547      vector stmt by a factor VF/nunits.  For more details see documentation
2548      in vectorizable_operation.  */
2549
2550   if (ncopies > 1)
2551     {
2552       stmt_vec_info prev_stmt_vinfo;
2553       /* FORNOW. This restriction should be relaxed.  */
2554       gcc_assert (!nested_in_vect_loop);
2555
2556       /* Create the vector that holds the step of the induction.  */
2557       expr = build_int_cst (TREE_TYPE (step_expr), nunits);
2558       new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
2559                               expr, step_expr);
2560       t = NULL_TREE;
2561       for (i = 0; i < nunits; i++)
2562         t = tree_cons (NULL_TREE, unshare_expr (new_name), t);
2563       gcc_assert (CONSTANT_CLASS_P (new_name));
2564       vec = build_vector (stepvectype, t);
2565       vec_step = vect_init_vector (iv_phi, vec, stepvectype, NULL);
2566
2567       vec_def = induc_def;
2568       prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
2569       for (i = 1; i < ncopies; i++)
2570         {
2571           /* vec_i = vec_prev + vec_step  */
2572           new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, vec_dest,
2573                                                    vec_def, vec_step);
2574           vec_def = make_ssa_name (vec_dest, new_stmt);
2575           gimple_assign_set_lhs (new_stmt, vec_def);
2576
2577           gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
2578           set_vinfo_for_stmt (new_stmt,
2579                               new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
2580           STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
2581           prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
2582         }
2583     }
2584
2585   if (nested_in_vect_loop)
2586     {
2587       /* Find the loop-closed exit-phi of the induction, and record
2588          the final vector of induction results:  */
2589       exit_phi = NULL;
2590       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
2591         {
2592           if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (USE_STMT (use_p))))
2593             {
2594               exit_phi = USE_STMT (use_p);
2595               break;
2596             }
2597         }
2598       if (exit_phi)
2599         {
2600           stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
2601           /* FORNOW. Currently not supporting the case that an inner-loop induction
2602              is not used in the outer-loop (i.e. only outside the outer-loop).  */
2603           gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
2604                       && !STMT_VINFO_LIVE_P (stmt_vinfo));
2605
2606           STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
2607           if (vect_print_dump_info (REPORT_DETAILS))
2608             {
2609               fprintf (vect_dump, "vector of inductions after inner-loop:");
2610               print_gimple_stmt (vect_dump, new_stmt, 0, TDF_SLIM);
2611             }
2612         }
2613     }
2614
2615
2616   if (vect_print_dump_info (REPORT_DETAILS))
2617     {
2618       fprintf (vect_dump, "transform induction: created def-use cycle: ");
2619       print_gimple_stmt (vect_dump, induction_phi, 0, TDF_SLIM);
2620       fprintf (vect_dump, "\n");
2621       print_gimple_stmt (vect_dump, SSA_NAME_DEF_STMT (vec_def), 0, TDF_SLIM);
2622     }
2623
2624   STMT_VINFO_VEC_STMT (phi_info) = induction_phi;
2625   return induc_def;
2626 }
2627
2628
2629 /* Function get_initial_def_for_reduction
2630
2631    Input:
2632    STMT - a stmt that performs a reduction operation in the loop.
2633    INIT_VAL - the initial value of the reduction variable
2634
2635    Output:
2636    ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
2637         of the reduction (used for adjusting the epilog - see below).
2638    Return a vector variable, initialized according to the operation that STMT
2639         performs. This vector will be used as the initial value of the
2640         vector of partial results.
2641
2642    Option1 (adjust in epilog): Initialize the vector as follows:
2643      add/bit or/xor:    [0,0,...,0,0]
2644      mult/bit and:      [1,1,...,1,1]
2645      min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
2646    and when necessary (e.g. add/mult case) let the caller know
2647    that it needs to adjust the result by init_val.
2648
2649    Option2: Initialize the vector as follows:
2650      add/bit or/xor:    [init_val,0,0,...,0]
2651      mult/bit and:      [init_val,1,1,...,1]
2652      min/max/cond_expr: [init_val,init_val,...,init_val]
2653    and no adjustments are needed.
2654
2655    For example, for the following code:
2656
2657    s = init_val;
2658    for (i=0;i<n;i++)
2659      s = s + a[i];
2660
2661    STMT is 's = s + a[i]', and the reduction variable is 's'.
2662    For a vector of 4 units, we want to return either [0,0,0,init_val],
2663    or [0,0,0,0] and let the caller know that it needs to adjust
2664    the result at the end by 'init_val'.
2665
2666    FORNOW, we are using the 'adjust in epilog' scheme, because this way the
2667    initialization vector is simpler (same element in all entries), if
2668    ADJUSTMENT_DEF is not NULL, and Option2 otherwise.
2669
2670    A cost model should help decide between these two schemes.  */
2671
2672 tree
2673 get_initial_def_for_reduction (gimple stmt, tree init_val,
2674                                tree *adjustment_def)
2675 {
2676   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
2677   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
2678   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2679   tree scalar_type = TREE_TYPE (init_val);
2680   tree vectype = get_vectype_for_scalar_type (scalar_type);
2681   int nunits;
2682   enum tree_code code = gimple_assign_rhs_code (stmt);
2683   tree def_for_init;
2684   tree init_def;
2685   tree t = NULL_TREE;
2686   int i;
2687   bool nested_in_vect_loop = false;
2688   tree init_value;
2689   REAL_VALUE_TYPE real_init_val = dconst0;
2690   int int_init_val = 0;
2691   gimple def_stmt = NULL;
2692
2693   gcc_assert (vectype);
2694   nunits = TYPE_VECTOR_SUBPARTS (vectype);
2695
2696   gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
2697               || SCALAR_FLOAT_TYPE_P (scalar_type));
2698
2699   if (nested_in_vect_loop_p (loop, stmt))
2700     nested_in_vect_loop = true;
2701   else
2702     gcc_assert (loop == (gimple_bb (stmt))->loop_father);
2703
2704   /* In case of double reduction we only create a vector variable to be put
2705      in the reduction phi node. The actual statement creation is done in
2706      vect_create_epilog_for_reduction.  */
2707   if (adjustment_def && nested_in_vect_loop
2708       && TREE_CODE (init_val) == SSA_NAME
2709       && (def_stmt = SSA_NAME_DEF_STMT (init_val))
2710       && gimple_code (def_stmt) == GIMPLE_PHI
2711       && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2712       && vinfo_for_stmt (def_stmt)
2713       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2714           == vect_double_reduction_def)
2715     {
2716       *adjustment_def = NULL;
2717       return vect_create_destination_var (init_val, vectype);
2718     }
2719
2720   if (TREE_CONSTANT (init_val))
2721     {
2722       if (SCALAR_FLOAT_TYPE_P (scalar_type))
2723         init_value = build_real (scalar_type, TREE_REAL_CST (init_val));
2724       else
2725         init_value = build_int_cst (scalar_type, TREE_INT_CST_LOW (init_val));
2726     }
2727   else
2728     init_value = init_val;
2729
2730   switch (code)
2731     {
2732       case WIDEN_SUM_EXPR:
2733       case DOT_PROD_EXPR:
2734       case PLUS_EXPR:
2735       case MINUS_EXPR:
2736       case BIT_IOR_EXPR:
2737       case BIT_XOR_EXPR:
2738       case MULT_EXPR:
2739       case BIT_AND_EXPR:
2740         /* ADJUSMENT_DEF is NULL when called from
2741            vect_create_epilog_for_reduction to vectorize double reduction.  */
2742         if (adjustment_def)
2743           {
2744             if (nested_in_vect_loop)
2745               *adjustment_def = vect_get_vec_def_for_operand (init_val, stmt,
2746                                                               NULL);
2747             else
2748               *adjustment_def = init_val;
2749           }
2750
2751         if (code == MULT_EXPR || code == BIT_AND_EXPR)
2752           {
2753             real_init_val = dconst1;
2754             int_init_val = 1;
2755           }
2756
2757         if (SCALAR_FLOAT_TYPE_P (scalar_type))
2758           def_for_init = build_real (scalar_type, real_init_val);
2759         else
2760           def_for_init = build_int_cst (scalar_type, int_init_val);
2761
2762         /* Create a vector of '0' or '1' except the first element.  */
2763         for (i = nunits - 2; i >= 0; --i)
2764           t = tree_cons (NULL_TREE, def_for_init, t);
2765
2766         /* Option1: the first element is '0' or '1' as well.  */
2767         if (adjustment_def)
2768           {
2769             t = tree_cons (NULL_TREE, def_for_init, t);
2770             init_def = build_vector (vectype, t);
2771             break;
2772           }
2773
2774         /* Option2: the first element is INIT_VAL.  */
2775         t = tree_cons (NULL_TREE, init_value, t);
2776         if (TREE_CONSTANT (init_val))
2777           init_def = build_vector (vectype, t);
2778         else
2779           init_def = build_constructor_from_list (vectype, t);
2780
2781         break;
2782
2783       case MIN_EXPR:
2784       case MAX_EXPR:
2785       case COND_EXPR:
2786         if (adjustment_def)
2787           {
2788             *adjustment_def = NULL_TREE;
2789             init_def = vect_get_vec_def_for_operand (init_val, stmt, NULL);
2790             break;
2791           }
2792
2793         for (i = nunits - 1; i >= 0; --i)
2794           t = tree_cons (NULL_TREE, init_value, t);
2795
2796         if (TREE_CONSTANT (init_val))
2797           init_def = build_vector (vectype, t);
2798         else
2799           init_def = build_constructor_from_list (vectype, t);
2800
2801         break;
2802
2803       default:
2804         gcc_unreachable ();
2805     }
2806
2807   return init_def;
2808 }
2809
2810
2811 /* Function vect_create_epilog_for_reduction
2812
2813    Create code at the loop-epilog to finalize the result of a reduction
2814    computation.
2815
2816    VECT_DEF is a vector of partial results.
2817    REDUC_CODE is the tree-code for the epilog reduction.
2818    NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
2819      number of elements that we can fit in a vectype (nunits). In this case
2820      we have to generate more than one vector stmt - i.e - we need to "unroll"
2821      the vector stmt by a factor VF/nunits.  For more details see documentation
2822      in vectorizable_operation.
2823    STMT is the scalar reduction stmt that is being vectorized.
2824    REDUCTION_PHI is the phi-node that carries the reduction computation.
2825    REDUC_INDEX is the index of the operand in the right hand side of the
2826      statement that is defined by REDUCTION_PHI.
2827    DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
2828
2829    This function:
2830    1. Creates the reduction def-use cycle: sets the arguments for
2831       REDUCTION_PHI:
2832       The loop-entry argument is the vectorized initial-value of the reduction.
2833       The loop-latch argument is VECT_DEF - the vector of partial sums.
2834    2. "Reduces" the vector of partial results VECT_DEF into a single result,
2835       by applying the operation specified by REDUC_CODE if available, or by
2836       other means (whole-vector shifts or a scalar loop).
2837       The function also creates a new phi node at the loop exit to preserve
2838       loop-closed form, as illustrated below.
2839
2840      The flow at the entry to this function:
2841
2842         loop:
2843           vec_def = phi <null, null>            # REDUCTION_PHI
2844           VECT_DEF = vector_stmt                # vectorized form of STMT
2845           s_loop = scalar_stmt                  # (scalar) STMT
2846         loop_exit:
2847           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
2848           use <s_out0>
2849           use <s_out0>
2850
2851      The above is transformed by this function into:
2852
2853         loop:
2854           vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
2855           VECT_DEF = vector_stmt                # vectorized form of STMT
2856           s_loop = scalar_stmt                  # (scalar) STMT
2857         loop_exit:
2858           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
2859           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
2860           v_out2 = reduce <v_out1>
2861           s_out3 = extract_field <v_out2, 0>
2862           s_out4 = adjust_result <s_out3>
2863           use <s_out4>
2864           use <s_out4>
2865 */
2866
2867 static void
2868 vect_create_epilog_for_reduction (tree vect_def, gimple stmt,
2869                                   int ncopies,
2870                                   enum tree_code reduc_code,
2871                                   gimple reduction_phi,
2872                                   int reduc_index,
2873                                   bool double_reduc)
2874 {
2875   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2876   stmt_vec_info prev_phi_info;
2877   tree vectype;
2878   enum machine_mode mode;
2879   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2880   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
2881   basic_block exit_bb;
2882   tree scalar_dest;
2883   tree scalar_type;
2884   gimple new_phi = NULL, phi;
2885   gimple_stmt_iterator exit_gsi;
2886   tree vec_dest;
2887   tree new_temp = NULL_TREE;
2888   tree new_name;
2889   gimple epilog_stmt = NULL;
2890   tree new_scalar_dest, new_dest;
2891   gimple exit_phi;
2892   tree bitsize, bitpos;
2893   enum tree_code code = gimple_assign_rhs_code (stmt);
2894   tree adjustment_def;
2895   tree vec_initial_def, def;
2896   tree orig_name;
2897   imm_use_iterator imm_iter;
2898   use_operand_p use_p;
2899   bool extract_scalar_result = false;
2900   tree reduction_op, expr;
2901   gimple orig_stmt;
2902   gimple use_stmt;
2903   bool nested_in_vect_loop = false;
2904   VEC(gimple,heap) *phis = NULL;
2905   enum vect_def_type dt = vect_unknown_def_type;
2906   int j, i;
2907
2908   if (nested_in_vect_loop_p (loop, stmt))
2909     {
2910       outer_loop = loop;
2911       loop = loop->inner;
2912       nested_in_vect_loop = true;
2913     }
2914
2915   switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
2916     {
2917     case GIMPLE_SINGLE_RHS:
2918       gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt))
2919                                        == ternary_op);
2920       reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), reduc_index);
2921       break;
2922     case GIMPLE_UNARY_RHS:
2923       reduction_op = gimple_assign_rhs1 (stmt);
2924       break;
2925     case GIMPLE_BINARY_RHS:
2926       reduction_op = reduc_index ?
2927                      gimple_assign_rhs2 (stmt) : gimple_assign_rhs1 (stmt);
2928       break;
2929     default:
2930       gcc_unreachable ();
2931     }
2932
2933   vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
2934   gcc_assert (vectype);
2935   mode = TYPE_MODE (vectype);
2936
2937   /*** 1. Create the reduction def-use cycle  ***/
2938
2939   /* For the case of reduction, vect_get_vec_def_for_operand returns
2940      the scalar def before the loop, that defines the initial value
2941      of the reduction variable.  */
2942   vec_initial_def = vect_get_vec_def_for_operand (reduction_op, stmt,
2943                                                   &adjustment_def);
2944
2945   phi = reduction_phi;
2946   def = vect_def;
2947   for (j = 0; j < ncopies; j++)
2948     {
2949       /* 1.1 set the loop-entry arg of the reduction-phi:  */
2950       add_phi_arg (phi, vec_initial_def, loop_preheader_edge (loop),
2951                    UNKNOWN_LOCATION);
2952
2953       /* 1.2 set the loop-latch arg for the reduction-phi:  */
2954       if (j > 0)
2955         def = vect_get_vec_def_for_stmt_copy (dt, def);
2956       add_phi_arg (phi, def, loop_latch_edge (loop), UNKNOWN_LOCATION);
2957
2958       if (vect_print_dump_info (REPORT_DETAILS))
2959         {
2960           fprintf (vect_dump, "transform reduction: created def-use cycle: ");
2961           print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
2962           fprintf (vect_dump, "\n");
2963           print_gimple_stmt (vect_dump, SSA_NAME_DEF_STMT (def), 0, TDF_SLIM);
2964         }
2965
2966       phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
2967     }
2968
2969   /*** 2. Create epilog code
2970           The reduction epilog code operates across the elements of the vector
2971           of partial results computed by the vectorized loop.
2972           The reduction epilog code consists of:
2973           step 1: compute the scalar result in a vector (v_out2)
2974           step 2: extract the scalar result (s_out3) from the vector (v_out2)
2975           step 3: adjust the scalar result (s_out3) if needed.
2976
2977           Step 1 can be accomplished using one the following three schemes:
2978           (scheme 1) using reduc_code, if available.
2979           (scheme 2) using whole-vector shifts, if available.
2980           (scheme 3) using a scalar loop. In this case steps 1+2 above are
2981                      combined.
2982
2983           The overall epilog code looks like this:
2984
2985           s_out0 = phi <s_loop>         # original EXIT_PHI
2986           v_out1 = phi <VECT_DEF>       # NEW_EXIT_PHI
2987           v_out2 = reduce <v_out1>              # step 1
2988           s_out3 = extract_field <v_out2, 0>    # step 2
2989           s_out4 = adjust_result <s_out3>       # step 3
2990
2991           (step 3 is optional, and steps 1 and 2 may be combined).
2992           Lastly, the uses of s_out0 are replaced by s_out4.
2993
2994           ***/
2995
2996   /* 2.1 Create new loop-exit-phi to preserve loop-closed form:
2997         v_out1 = phi <v_loop>  */
2998
2999   exit_bb = single_exit (loop)->dest;
3000   def = vect_def;
3001   prev_phi_info = NULL;
3002   for (j = 0; j < ncopies; j++)
3003     {
3004       phi = create_phi_node (SSA_NAME_VAR (vect_def), exit_bb);
3005       set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo, NULL));
3006       if (j == 0)
3007         new_phi = phi;
3008       else
3009         {
3010           def = vect_get_vec_def_for_stmt_copy (dt, def);
3011           STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
3012         }
3013       SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
3014       prev_phi_info = vinfo_for_stmt (phi);
3015     }
3016
3017   exit_gsi = gsi_after_labels (exit_bb);
3018
3019   /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
3020          (i.e. when reduc_code is not available) and in the final adjustment
3021          code (if needed).  Also get the original scalar reduction variable as
3022          defined in the loop.  In case STMT is a "pattern-stmt" (i.e. - it
3023          represents a reduction pattern), the tree-code and scalar-def are
3024          taken from the original stmt that the pattern-stmt (STMT) replaces.
3025          Otherwise (it is a regular reduction) - the tree-code and scalar-def
3026          are taken from STMT.  */
3027
3028   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3029   if (!orig_stmt)
3030     {
3031       /* Regular reduction  */
3032       orig_stmt = stmt;
3033     }
3034   else
3035     {
3036       /* Reduction pattern  */
3037       stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
3038       gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
3039       gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
3040     }
3041
3042   code = gimple_assign_rhs_code (orig_stmt);
3043   scalar_dest = gimple_assign_lhs (orig_stmt);
3044   scalar_type = TREE_TYPE (scalar_dest);
3045   new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
3046   bitsize = TYPE_SIZE (scalar_type);
3047
3048   /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
3049      partial results are added and not subtracted.  */
3050   if (code == MINUS_EXPR)
3051     code = PLUS_EXPR;
3052
3053   /* In case this is a reduction in an inner-loop while vectorizing an outer
3054      loop - we don't need to extract a single scalar result at the end of the
3055      inner-loop (unless it is double reduction, i.e., the use of reduction is
3056      outside the outer-loop). The final vector of partial results will be used
3057      in the vectorized outer-loop, or reduced to a scalar result at the end of
3058      the outer-loop.  */
3059   if (nested_in_vect_loop && !double_reduc)
3060     goto vect_finalize_reduction;
3061
3062   /* The epilogue is created for the outer-loop, i.e., for the loop being
3063      vectorized.  */
3064   if (double_reduc)
3065     loop = outer_loop;
3066
3067   /* FORNOW */
3068   gcc_assert (ncopies == 1);
3069
3070   /* 2.3 Create the reduction code, using one of the three schemes described
3071          above.  */
3072
3073   if (reduc_code != ERROR_MARK)
3074     {
3075       tree tmp;
3076
3077       /*** Case 1:  Create:
3078            v_out2 = reduc_expr <v_out1>  */
3079
3080       if (vect_print_dump_info (REPORT_DETAILS))
3081         fprintf (vect_dump, "Reduce using direct vector reduction.");
3082
3083       vec_dest = vect_create_destination_var (scalar_dest, vectype);
3084       tmp = build1 (reduc_code, vectype,  PHI_RESULT (new_phi));
3085       epilog_stmt = gimple_build_assign (vec_dest, tmp);
3086       new_temp = make_ssa_name (vec_dest, epilog_stmt);
3087       gimple_assign_set_lhs (epilog_stmt, new_temp);
3088       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3089
3090       extract_scalar_result = true;
3091     }
3092   else
3093     {
3094       enum tree_code shift_code = ERROR_MARK;
3095       bool have_whole_vector_shift = true;
3096       int bit_offset;
3097       int element_bitsize = tree_low_cst (bitsize, 1);
3098       int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
3099       tree vec_temp;
3100
3101       if (optab_handler (vec_shr_optab, mode)->insn_code != CODE_FOR_nothing)
3102         shift_code = VEC_RSHIFT_EXPR;
3103       else
3104         have_whole_vector_shift = false;
3105
3106       /* Regardless of whether we have a whole vector shift, if we're
3107          emulating the operation via tree-vect-generic, we don't want
3108          to use it.  Only the first round of the reduction is likely
3109          to still be profitable via emulation.  */
3110       /* ??? It might be better to emit a reduction tree code here, so that
3111          tree-vect-generic can expand the first round via bit tricks.  */
3112       if (!VECTOR_MODE_P (mode))
3113         have_whole_vector_shift = false;
3114       else
3115         {
3116           optab optab = optab_for_tree_code (code, vectype, optab_default);
3117           if (optab_handler (optab, mode)->insn_code == CODE_FOR_nothing)
3118             have_whole_vector_shift = false;
3119         }
3120
3121       if (have_whole_vector_shift)
3122         {
3123           /*** Case 2: Create:
3124              for (offset = VS/2; offset >= element_size; offset/=2)
3125                 {
3126                   Create:  va' = vec_shift <va, offset>
3127                   Create:  va = vop <va, va'>
3128                 }  */
3129
3130           if (vect_print_dump_info (REPORT_DETAILS))
3131             fprintf (vect_dump, "Reduce using vector shifts");
3132
3133           vec_dest = vect_create_destination_var (scalar_dest, vectype);
3134           new_temp = PHI_RESULT (new_phi);
3135
3136           for (bit_offset = vec_size_in_bits/2;
3137                bit_offset >= element_bitsize;
3138                bit_offset /= 2)
3139             {
3140               tree bitpos = size_int (bit_offset);
3141
3142               epilog_stmt = gimple_build_assign_with_ops (shift_code, vec_dest,
3143                                                           new_temp, bitpos);
3144               new_name = make_ssa_name (vec_dest, epilog_stmt);
3145               gimple_assign_set_lhs (epilog_stmt, new_name);
3146               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3147
3148               epilog_stmt = gimple_build_assign_with_ops (code, vec_dest,
3149                                                           new_name, new_temp);
3150               new_temp = make_ssa_name (vec_dest, epilog_stmt);
3151               gimple_assign_set_lhs (epilog_stmt, new_temp);
3152               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3153             }
3154
3155           extract_scalar_result = true;
3156         }
3157       else
3158         {
3159           tree rhs;
3160
3161           /*** Case 3: Create:
3162              s = extract_field <v_out2, 0>
3163              for (offset = element_size;
3164                   offset < vector_size;
3165                   offset += element_size;)
3166                {
3167                  Create:  s' = extract_field <v_out2, offset>
3168                  Create:  s = op <s, s'>
3169                }  */
3170
3171           if (vect_print_dump_info (REPORT_DETAILS))
3172             fprintf (vect_dump, "Reduce using scalar code. ");
3173
3174           vec_temp = PHI_RESULT (new_phi);
3175           vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
3176           rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
3177                          bitsize_zero_node);
3178           epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
3179           new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
3180           gimple_assign_set_lhs (epilog_stmt, new_temp);
3181           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3182
3183           for (bit_offset = element_bitsize;
3184                bit_offset < vec_size_in_bits;
3185                bit_offset += element_bitsize)
3186             {
3187               tree bitpos = bitsize_int (bit_offset);
3188               tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
3189                                  bitpos);
3190
3191               epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
3192               new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
3193               gimple_assign_set_lhs (epilog_stmt, new_name);
3194               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3195
3196               epilog_stmt = gimple_build_assign_with_ops (code,
3197                                                           new_scalar_dest,
3198                                                           new_name, new_temp);
3199               new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
3200               gimple_assign_set_lhs (epilog_stmt, new_temp);
3201               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3202             }
3203
3204           extract_scalar_result = false;
3205         }
3206     }
3207
3208   /* 2.4  Extract the final scalar result.  Create:
3209          s_out3 = extract_field <v_out2, bitpos>  */
3210
3211   if (extract_scalar_result)
3212     {
3213       tree rhs;
3214
3215       gcc_assert (!nested_in_vect_loop || double_reduc);
3216       if (vect_print_dump_info (REPORT_DETAILS))
3217         fprintf (vect_dump, "extract scalar result");
3218
3219       if (BYTES_BIG_ENDIAN)
3220         bitpos = size_binop (MULT_EXPR,
3221                        bitsize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1),
3222                        TYPE_SIZE (scalar_type));
3223       else
3224         bitpos = bitsize_zero_node;
3225
3226       rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp, bitsize, bitpos);
3227       epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
3228       new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
3229       gimple_assign_set_lhs (epilog_stmt, new_temp);
3230       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3231     }
3232
3233 vect_finalize_reduction:
3234
3235   if (double_reduc)
3236     loop = loop->inner;
3237
3238   /* 2.5 Adjust the final result by the initial value of the reduction
3239          variable. (When such adjustment is not needed, then
3240          'adjustment_def' is zero).  For example, if code is PLUS we create:
3241          new_temp = loop_exit_def + adjustment_def  */
3242
3243   if (adjustment_def)
3244     {
3245       if (nested_in_vect_loop)
3246         {
3247           gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
3248           expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
3249           new_dest = vect_create_destination_var (scalar_dest, vectype);
3250         }
3251       else
3252         {
3253           gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
3254           expr = build2 (code, scalar_type, new_temp, adjustment_def);
3255           new_dest = vect_create_destination_var (scalar_dest, scalar_type);
3256         }
3257
3258       epilog_stmt = gimple_build_assign (new_dest, expr);
3259       new_temp = make_ssa_name (new_dest, epilog_stmt);
3260       gimple_assign_set_lhs (epilog_stmt, new_temp);
3261       SSA_NAME_DEF_STMT (new_temp) = epilog_stmt;
3262       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3263     }
3264
3265
3266   /* 2.6  Handle the loop-exit phi  */
3267
3268   /* Replace uses of s_out0 with uses of s_out3:
3269      Find the loop-closed-use at the loop exit of the original scalar result.
3270      (The reduction result is expected to have two immediate uses - one at the
3271      latch block, and one at the loop exit).  */
3272   phis = VEC_alloc (gimple, heap, 10);
3273   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
3274     {
3275       if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
3276         {
3277           exit_phi = USE_STMT (use_p);
3278           VEC_quick_push (gimple, phis, exit_phi);
3279         }
3280     }
3281
3282   /* We expect to have found an exit_phi because of loop-closed-ssa form.  */
3283   gcc_assert (!VEC_empty (gimple, phis));
3284
3285   for (i = 0; VEC_iterate (gimple, phis, i, exit_phi); i++)
3286     {
3287       if (nested_in_vect_loop)
3288         {
3289           stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
3290           gimple vect_phi;
3291
3292           /* FORNOW. Currently not supporting the case that an inner-loop
3293              reduction is not used in the outer-loop (but only outside the
3294              outer-loop), unless it is double reduction.  */
3295           gcc_assert ((STMT_VINFO_RELEVANT_P (stmt_vinfo)
3296                       && !STMT_VINFO_LIVE_P (stmt_vinfo)) || double_reduc);
3297
3298           epilog_stmt = adjustment_def ? epilog_stmt : new_phi;
3299           STMT_VINFO_VEC_STMT (stmt_vinfo) = epilog_stmt;
3300           set_vinfo_for_stmt (epilog_stmt,
3301                               new_stmt_vec_info (epilog_stmt, loop_vinfo,
3302                                                  NULL));
3303           if (adjustment_def)
3304             STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
3305                 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
3306
3307           if (!double_reduc
3308               || STMT_VINFO_DEF_TYPE (stmt_vinfo) != vect_double_reduction_def)
3309             continue;
3310
3311           /* Handle double reduction:
3312
3313              stmt1: s1 = phi <s0, s2>  - double reduction phi (outer loop)
3314              stmt2:   s3 = phi <s1, s4> - (regular) reduction phi (inner loop)
3315              stmt3:   s4 = use (s3)     - (regular) reduction stmt (inner loop)
3316              stmt4: s2 = phi <s4>      - double reduction stmt (outer loop)
3317
3318              At that point the regular reduction (stmt2 and stmt3) is already
3319              vectorized, as well as the exit phi node, stmt4.
3320              Here we vectorize the phi node of double reduction, stmt1, and
3321              update all relevant statements.  */
3322
3323           /* Go through all the uses of s2 to find double reduction phi node,
3324              i.e., stmt1 above.  */
3325           orig_name = PHI_RESULT (exit_phi);
3326           FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
3327             {
3328               stmt_vec_info use_stmt_vinfo = vinfo_for_stmt (use_stmt);
3329               stmt_vec_info new_phi_vinfo;
3330               tree vect_phi_init, preheader_arg, vect_phi_res, init_def;
3331               basic_block bb = gimple_bb (use_stmt);
3332               gimple use;
3333
3334               /* Check that USE_STMT is really double reduction phi node.  */
3335               if (gimple_code (use_stmt) != GIMPLE_PHI
3336                   || gimple_phi_num_args (use_stmt) != 2
3337                   || !use_stmt_vinfo
3338                   || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
3339                       != vect_double_reduction_def
3340                   || bb->loop_father != outer_loop)
3341                 continue;
3342
3343               /* Create vector phi node for double reduction:
3344                  vs1 = phi <vs0, vs2>
3345                  vs1 was created previously in this function by a call to
3346                  vect_get_vec_def_for_operand and is stored in vec_initial_def;
3347                  vs2 is defined by EPILOG_STMT, the vectorized EXIT_PHI;
3348                  vs0 is created here.  */
3349
3350               /* Create vector phi node.  */
3351               vect_phi = create_phi_node (vec_initial_def, bb);
3352               new_phi_vinfo = new_stmt_vec_info (vect_phi,
3353                                     loop_vec_info_for_loop (outer_loop), NULL);
3354               set_vinfo_for_stmt (vect_phi, new_phi_vinfo);
3355
3356               /* Create vs0 - initial def of the double reduction phi.  */
3357               preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
3358                                              loop_preheader_edge (outer_loop));
3359               init_def = get_initial_def_for_reduction (stmt, preheader_arg,
3360                                                         NULL);
3361               vect_phi_init = vect_init_vector (use_stmt, init_def, vectype,
3362                                                 NULL);
3363
3364               /* Update phi node arguments with vs0 and vs2.  */
3365               add_phi_arg (vect_phi, vect_phi_init,
3366                            loop_preheader_edge (outer_loop), UNKNOWN_LOCATION);
3367               add_phi_arg (vect_phi, PHI_RESULT (epilog_stmt),
3368                            loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
3369               if (vect_print_dump_info (REPORT_DETAILS))
3370                 {
3371                   fprintf (vect_dump, "created double reduction phi node: ");
3372                   print_gimple_stmt (vect_dump, vect_phi, 0, TDF_SLIM);
3373                 }
3374
3375               vect_phi_res = PHI_RESULT (vect_phi);
3376
3377               /* Replace the use, i.e., set the correct vs1 in the regular
3378                  reduction phi node. FORNOW, NCOPIES is always 1, so the loop
3379                  is redundant.  */
3380               use = reduction_phi;
3381               for (j = 0; j < ncopies; j++)
3382                 {
3383                   edge pr_edge = loop_preheader_edge (loop);
3384                   SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res);
3385                   use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
3386                 }
3387             }
3388         }
3389
3390       /* Replace the uses:  */
3391       orig_name = PHI_RESULT (exit_phi);
3392       FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
3393         FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
3394           SET_USE (use_p, new_temp);
3395     }
3396
3397   VEC_free (gimple, heap, phis);
3398 }
3399
3400
3401 /* Function vectorizable_reduction.
3402
3403    Check if STMT performs a reduction operation that can be vectorized.
3404    If VEC_STMT is also passed, vectorize the STMT: create a vectorized
3405    stmt to replace it, put it in VEC_STMT, and insert it at GSI.
3406    Return FALSE if not a vectorizable STMT, TRUE otherwise.
3407
3408    This function also handles reduction idioms (patterns) that have been
3409    recognized in advance during vect_pattern_recog. In this case, STMT may be
3410    of this form:
3411      X = pattern_expr (arg0, arg1, ..., X)
3412    and it's STMT_VINFO_RELATED_STMT points to the last stmt in the original
3413    sequence that had been detected and replaced by the pattern-stmt (STMT).
3414
3415    In some cases of reduction patterns, the type of the reduction variable X is
3416    different than the type of the other arguments of STMT.
3417    In such cases, the vectype that is used when transforming STMT into a vector
3418    stmt is different than the vectype that is used to determine the
3419    vectorization factor, because it consists of a different number of elements
3420    than the actual number of elements that are being operated upon in parallel.
3421
3422    For example, consider an accumulation of shorts into an int accumulator.
3423    On some targets it's possible to vectorize this pattern operating on 8
3424    shorts at a time (hence, the vectype for purposes of determining the
3425    vectorization factor should be V8HI); on the other hand, the vectype that
3426    is used to create the vector form is actually V4SI (the type of the result).
3427
3428    Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
3429    indicates what is the actual level of parallelism (V8HI in the example), so
3430    that the right vectorization factor would be derived. This vectype
3431    corresponds to the type of arguments to the reduction stmt, and should *NOT*
3432    be used to create the vectorized stmt. The right vectype for the vectorized
3433    stmt is obtained from the type of the result X:
3434         get_vectype_for_scalar_type (TREE_TYPE (X))
3435
3436    This means that, contrary to "regular" reductions (or "regular" stmts in
3437    general), the following equation:
3438       STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
3439    does *NOT* necessarily hold for reduction patterns.  */
3440
3441 bool
3442 vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
3443                         gimple *vec_stmt)
3444 {
3445   tree vec_dest;
3446   tree scalar_dest;
3447   tree loop_vec_def0 = NULL_TREE, loop_vec_def1 = NULL_TREE;
3448   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3449   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3450   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3451   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3452   enum tree_code code, orig_code, epilog_reduc_code;
3453   enum machine_mode vec_mode;
3454   int op_type;
3455   optab optab, reduc_optab;
3456   tree new_temp = NULL_TREE;
3457   tree def;
3458   gimple def_stmt;
3459   enum vect_def_type dt;
3460   gimple new_phi = NULL;
3461   tree scalar_type;
3462   bool is_simple_use;
3463   gimple orig_stmt;
3464   stmt_vec_info orig_stmt_info;
3465   tree expr = NULL_TREE;
3466   int i;
3467   int nunits = TYPE_VECTOR_SUBPARTS (vectype);
3468   int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
3469   int epilog_copies;
3470   stmt_vec_info prev_stmt_info, prev_phi_info;
3471   gimple first_phi = NULL;
3472   bool single_defuse_cycle = false;
3473   tree reduc_def = NULL_TREE;
3474   gimple new_stmt = NULL;
3475   int j;
3476   tree ops[3];
3477   bool nested_cycle = false, found_nested_cycle_def = false;
3478   gimple reduc_def_stmt = NULL;
3479   /* The default is that the reduction variable is the last in statement.  */
3480   int reduc_index = 2;
3481   bool double_reduc = false, dummy;
3482   basic_block def_bb;
3483   struct loop * def_stmt_loop, *outer_loop = NULL;
3484   tree def_arg;
3485   gimple def_arg_stmt;
3486
3487   if (nested_in_vect_loop_p (loop, stmt))
3488     {
3489       outer_loop = loop;
3490       loop = loop->inner;
3491       nested_cycle = true;
3492     }
3493
3494   gcc_assert (ncopies >= 1);
3495
3496   /* FORNOW: SLP not supported.  */
3497   if (STMT_SLP_TYPE (stmt_info))
3498     return false;
3499
3500   /* 1. Is vectorizable reduction?  */
3501   /* Not supportable if the reduction variable is used in the loop.  */
3502   if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer)
3503     return false;
3504
3505   /* Reductions that are not used even in an enclosing outer-loop,
3506      are expected to be "live" (used out of the loop).  */
3507   if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
3508       && !STMT_VINFO_LIVE_P (stmt_info))
3509     return false;
3510
3511   /* Make sure it was already recognized as a reduction computation.  */
3512   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
3513       && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
3514     return false;
3515
3516   /* 2. Has this been recognized as a reduction pattern?
3517
3518      Check if STMT represents a pattern that has been recognized
3519      in earlier analysis stages.  For stmts that represent a pattern,
3520      the STMT_VINFO_RELATED_STMT field records the last stmt in
3521      the original sequence that constitutes the pattern.  */
3522
3523   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3524   if (orig_stmt)
3525     {
3526       orig_stmt_info = vinfo_for_stmt (orig_stmt);
3527       gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt);
3528       gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
3529       gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
3530     }
3531
3532   /* 3. Check the operands of the operation. The first operands are defined
3533         inside the loop body. The last operand is the reduction variable,
3534         which is defined by the loop-header-phi.  */
3535
3536   gcc_assert (is_gimple_assign (stmt));
3537
3538   /* Flatten RHS */
3539   switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
3540     {
3541     case GIMPLE_SINGLE_RHS:
3542       op_type = TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt));
3543       if (op_type == ternary_op)
3544         {
3545           tree rhs = gimple_assign_rhs1 (stmt);
3546           ops[0] = TREE_OPERAND (rhs, 0);
3547           ops[1] = TREE_OPERAND (rhs, 1);
3548           ops[2] = TREE_OPERAND (rhs, 2);
3549           code = TREE_CODE (rhs);
3550         }
3551       else
3552         return false;
3553       break;
3554
3555     case GIMPLE_BINARY_RHS:
3556       code = gimple_assign_rhs_code (stmt);
3557       op_type = TREE_CODE_LENGTH (code);
3558       gcc_assert (op_type == binary_op);
3559       ops[0] = gimple_assign_rhs1 (stmt);
3560       ops[1] = gimple_assign_rhs2 (stmt);
3561       break;
3562
3563     case GIMPLE_UNARY_RHS:
3564       return false;
3565
3566     default:
3567       gcc_unreachable ();
3568     }
3569
3570   scalar_dest = gimple_assign_lhs (stmt);
3571   scalar_type = TREE_TYPE (scalar_dest);
3572   if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
3573       && !SCALAR_FLOAT_TYPE_P (scalar_type))
3574     return false;
3575
3576   /* All uses but the last are expected to be defined in the loop.
3577      The last use is the reduction variable. In case of nested cycle this
3578      assumption is not true: we use reduc_index to record the index of the
3579      reduction variable.  */
3580   for (i = 0; i < op_type-1; i++)
3581     {
3582       /* The condition of COND_EXPR is checked in vectorizable_condition().  */
3583       if (i == 0 && code == COND_EXPR)
3584         continue;
3585
3586       is_simple_use = vect_is_simple_use (ops[i], loop_vinfo, NULL, &def_stmt,
3587                                           &def, &dt);
3588       gcc_assert (is_simple_use);
3589       if (dt != vect_internal_def
3590           && dt != vect_external_def
3591           && dt != vect_constant_def
3592           && dt != vect_induction_def
3593           && !(dt == vect_nested_cycle && nested_cycle))
3594         return false;
3595
3596       if (dt == vect_nested_cycle)
3597         {
3598           found_nested_cycle_def = true;
3599           reduc_def_stmt = def_stmt;
3600           reduc_index = i;
3601         }
3602     }
3603
3604   is_simple_use = vect_is_simple_use (ops[i], loop_vinfo, NULL, &def_stmt,
3605                                       &def, &dt);
3606   gcc_assert (is_simple_use);
3607   gcc_assert (dt == vect_reduction_def
3608               || dt == vect_nested_cycle
3609               || ((dt == vect_internal_def || dt == vect_external_def
3610                    || dt == vect_constant_def || dt == vect_induction_def)
3611                    && nested_cycle && found_nested_cycle_def));
3612   if (!found_nested_cycle_def)
3613     reduc_def_stmt = def_stmt;
3614
3615   gcc_assert (gimple_code (reduc_def_stmt) == GIMPLE_PHI);
3616   if (orig_stmt)
3617     gcc_assert (orig_stmt == vect_is_simple_reduction (loop_vinfo,
3618                                                        reduc_def_stmt,
3619                                                        !nested_cycle,
3620                                                        &dummy));
3621   else
3622     gcc_assert (stmt == vect_is_simple_reduction (loop_vinfo, reduc_def_stmt,
3623                                                   !nested_cycle, &dummy));
3624
3625   if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt)))
3626     return false;
3627
3628   vec_mode = TYPE_MODE (vectype);
3629
3630   if (code == COND_EXPR)
3631     {
3632       if (!vectorizable_condition (stmt, gsi, NULL, ops[reduc_index], 0))
3633         {
3634           if (vect_print_dump_info (REPORT_DETAILS))
3635             fprintf (vect_dump, "unsupported condition in reduction");
3636
3637             return false;
3638         }
3639     }
3640   else
3641     {
3642       /* 4. Supportable by target?  */
3643
3644       /* 4.1. check support for the operation in the loop  */
3645       optab = optab_for_tree_code (code, vectype, optab_default);
3646       if (!optab)
3647         {
3648           if (vect_print_dump_info (REPORT_DETAILS))
3649             fprintf (vect_dump, "no optab.");
3650
3651           return false;
3652         }
3653
3654       if (optab_handler (optab, vec_mode)->insn_code == CODE_FOR_nothing)
3655         {
3656           if (vect_print_dump_info (REPORT_DETAILS))
3657             fprintf (vect_dump, "op not supported by target.");
3658
3659           if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
3660               || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
3661                   < vect_min_worthwhile_factor (code))
3662             return false;
3663
3664           if (vect_print_dump_info (REPORT_DETAILS))
3665             fprintf (vect_dump, "proceeding using word mode.");
3666         }
3667
3668       /* Worthwhile without SIMD support?  */
3669       if (!VECTOR_MODE_P (TYPE_MODE (vectype))
3670           && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
3671              < vect_min_worthwhile_factor (code))
3672         {
3673           if (vect_print_dump_info (REPORT_DETAILS))
3674             fprintf (vect_dump, "not worthwhile without SIMD support.");
3675
3676           return false;
3677         }
3678     }
3679
3680   /* 4.2. Check support for the epilog operation.
3681
3682           If STMT represents a reduction pattern, then the type of the
3683           reduction variable may be different than the type of the rest
3684           of the arguments.  For example, consider the case of accumulation
3685           of shorts into an int accumulator; The original code:
3686                         S1: int_a = (int) short_a;
3687           orig_stmt->   S2: int_acc = plus <int_a ,int_acc>;
3688
3689           was replaced with:
3690                         STMT: int_acc = widen_sum <short_a, int_acc>
3691
3692           This means that:
3693           1. The tree-code that is used to create the vector operation in the
3694              epilog code (that reduces the partial results) is not the
3695              tree-code of STMT, but is rather the tree-code of the original
3696              stmt from the pattern that STMT is replacing. I.e, in the example
3697              above we want to use 'widen_sum' in the loop, but 'plus' in the
3698              epilog.
3699           2. The type (mode) we use to check available target support
3700              for the vector operation to be created in the *epilog*, is
3701              determined by the type of the reduction variable (in the example
3702              above we'd check this: plus_optab[vect_int_mode]).
3703              However the type (mode) we use to check available target support
3704              for the vector operation to be created *inside the loop*, is
3705              determined by the type of the other arguments to STMT (in the
3706              example we'd check this: widen_sum_optab[vect_short_mode]).
3707
3708           This is contrary to "regular" reductions, in which the types of all
3709           the arguments are the same as the type of the reduction variable.
3710           For "regular" reductions we can therefore use the same vector type
3711           (and also the same tree-code) when generating the epilog code and
3712           when generating the code inside the loop.  */
3713
3714   if (orig_stmt)
3715     {
3716       /* This is a reduction pattern: get the vectype from the type of the
3717          reduction variable, and get the tree-code from orig_stmt.  */
3718       orig_code = gimple_assign_rhs_code (orig_stmt);
3719       vectype = get_vectype_for_scalar_type (TREE_TYPE (def));
3720       if (!vectype)
3721         {
3722           if (vect_print_dump_info (REPORT_DETAILS))
3723             {
3724               fprintf (vect_dump, "unsupported data-type ");
3725               print_generic_expr (vect_dump, TREE_TYPE (def), TDF_SLIM);
3726             }
3727           return false;
3728         }
3729
3730       vec_mode = TYPE_MODE (vectype);
3731     }
3732   else
3733     {
3734       /* Regular reduction: use the same vectype and tree-code as used for
3735          the vector code inside the loop can be used for the epilog code. */
3736       orig_code = code;
3737     }
3738
3739   if (nested_cycle)
3740     {
3741       def_bb = gimple_bb (reduc_def_stmt);
3742       def_stmt_loop = def_bb->loop_father;
3743       def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
3744                                        loop_preheader_edge (def_stmt_loop));
3745       if (TREE_CODE (def_arg) == SSA_NAME
3746           && (def_arg_stmt = SSA_NAME_DEF_STMT (def_arg))
3747           && gimple_code (def_arg_stmt) == GIMPLE_PHI
3748           && flow_bb_inside_loop_p (outer_loop, gimple_bb (def_arg_stmt))
3749           && vinfo_for_stmt (def_arg_stmt)
3750           && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_arg_stmt))
3751               == vect_double_reduction_def)
3752         double_reduc = true;
3753     }
3754
3755   epilog_reduc_code = ERROR_MARK;
3756   if (reduction_code_for_scalar_code (orig_code, &epilog_reduc_code))
3757     {
3758       reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype,
3759                                          optab_default);
3760       if (!reduc_optab)
3761         {
3762           if (vect_print_dump_info (REPORT_DETAILS))
3763             fprintf (vect_dump, "no optab for reduction.");
3764
3765           epilog_reduc_code = ERROR_MARK;
3766         }
3767
3768       if (reduc_optab
3769           && optab_handler (reduc_optab, vec_mode)->insn_code
3770               == CODE_FOR_nothing)
3771         {
3772           if (vect_print_dump_info (REPORT_DETAILS))
3773             fprintf (vect_dump, "reduc op not supported by target.");
3774
3775           epilog_reduc_code = ERROR_MARK;
3776         }
3777     }
3778   else
3779     {
3780       if (!nested_cycle || double_reduc)
3781         {
3782           if (vect_print_dump_info (REPORT_DETAILS))
3783             fprintf (vect_dump, "no reduc code for scalar code.");
3784
3785           return false;
3786         }
3787     }
3788
3789   if (double_reduc && ncopies > 1)
3790     {
3791       if (vect_print_dump_info (REPORT_DETAILS))
3792         fprintf (vect_dump, "multiple types in double reduction");
3793
3794       return false;
3795     }
3796
3797   if (!vec_stmt) /* transformation not required.  */
3798     {
3799       STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
3800       if (!vect_model_reduction_cost (stmt_info, epilog_reduc_code, ncopies))
3801         return false;
3802       return true;
3803     }
3804
3805   /** Transform.  **/
3806
3807   if (vect_print_dump_info (REPORT_DETAILS))
3808     fprintf (vect_dump, "transform reduction.");
3809
3810   /* FORNOW: Multiple types are not supported for condition.  */
3811   if (code == COND_EXPR)
3812     gcc_assert (ncopies == 1);
3813
3814   /* Create the destination vector  */
3815   vec_dest = vect_create_destination_var (scalar_dest, vectype);
3816
3817   /* In case the vectorization factor (VF) is bigger than the number
3818      of elements that we can fit in a vectype (nunits), we have to generate
3819      more than one vector stmt - i.e - we need to "unroll" the
3820      vector stmt by a factor VF/nunits.  For more details see documentation
3821      in vectorizable_operation.  */
3822
3823   /* If the reduction is used in an outer loop we need to generate
3824      VF intermediate results, like so (e.g. for ncopies=2):
3825         r0 = phi (init, r0)
3826         r1 = phi (init, r1)
3827         r0 = x0 + r0;
3828         r1 = x1 + r1;
3829     (i.e. we generate VF results in 2 registers).
3830     In this case we have a separate def-use cycle for each copy, and therefore
3831     for each copy we get the vector def for the reduction variable from the
3832     respective phi node created for this copy.
3833
3834     Otherwise (the reduction is unused in the loop nest), we can combine
3835     together intermediate results, like so (e.g. for ncopies=2):
3836         r = phi (init, r)
3837         r = x0 + r;
3838         r = x1 + r;
3839    (i.e. we generate VF/2 results in a single register).
3840    In this case for each copy we get the vector def for the reduction variable
3841    from the vectorized reduction operation generated in the previous iteration.
3842   */
3843
3844   if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope)
3845     {
3846       single_defuse_cycle = true;
3847       epilog_copies = 1;
3848     }
3849   else
3850     epilog_copies = ncopies;
3851
3852   prev_stmt_info = NULL;
3853   prev_phi_info = NULL;
3854   for (j = 0; j < ncopies; j++)
3855     {
3856       if (j == 0 || !single_defuse_cycle)
3857         {
3858           /* Create the reduction-phi that defines the reduction-operand.  */
3859           new_phi = create_phi_node (vec_dest, loop->header);
3860           set_vinfo_for_stmt (new_phi, new_stmt_vec_info (new_phi, loop_vinfo,
3861                                                           NULL));
3862           /* Get the vector def for the reduction variable from the phi
3863              node.  */
3864           reduc_def = PHI_RESULT (new_phi);
3865         }
3866
3867       if (code == COND_EXPR)
3868         {
3869           first_phi = new_phi;
3870           vectorizable_condition (stmt, gsi, vec_stmt, reduc_def, reduc_index);
3871           /* Multiple types are not supported for condition.  */
3872           break;
3873         }
3874
3875       /* Handle uses.  */
3876       if (j == 0)
3877         {
3878           loop_vec_def0 = vect_get_vec_def_for_operand (ops[!reduc_index],
3879                                                         stmt, NULL);
3880           if (op_type == ternary_op)
3881             {
3882               if (reduc_index == 0)
3883                 loop_vec_def1 = vect_get_vec_def_for_operand (ops[2], stmt,
3884                                                               NULL);
3885               else
3886                 loop_vec_def1 = vect_get_vec_def_for_operand (ops[1], stmt,
3887                                                               NULL);
3888             }
3889
3890           /* Get the vector def for the reduction variable from the phi
3891              node.  */
3892           first_phi = new_phi;
3893         }
3894       else
3895         {
3896           enum vect_def_type dt = vect_unknown_def_type; /* Dummy */
3897           loop_vec_def0 = vect_get_vec_def_for_stmt_copy (dt, loop_vec_def0);
3898           if (op_type == ternary_op)
3899             loop_vec_def1 = vect_get_vec_def_for_stmt_copy (dt, loop_vec_def1);
3900
3901           if (single_defuse_cycle)
3902             reduc_def = gimple_assign_lhs (new_stmt);
3903           else
3904             reduc_def = PHI_RESULT (new_phi);
3905
3906           STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
3907         }
3908
3909       /* Arguments are ready. Create the new vector stmt.  */
3910       if (op_type == binary_op)
3911         {
3912           if (reduc_index == 0)
3913             expr = build2 (code, vectype, reduc_def, loop_vec_def0);
3914           else
3915             expr = build2 (code, vectype, loop_vec_def0, reduc_def);
3916         }
3917       else
3918         {
3919           if (reduc_index == 0)
3920             expr = build3 (code, vectype, reduc_def, loop_vec_def0,
3921                            loop_vec_def1);
3922           else
3923             {
3924               if (reduc_index == 1)
3925                 expr = build3 (code, vectype, loop_vec_def0, reduc_def,
3926                                loop_vec_def1);
3927               else
3928                 expr = build3 (code, vectype, loop_vec_def0, loop_vec_def1,
3929                                reduc_def);
3930             }
3931         }
3932
3933       new_stmt = gimple_build_assign (vec_dest, expr);
3934       new_temp = make_ssa_name (vec_dest, new_stmt);
3935       gimple_assign_set_lhs (new_stmt, new_temp);
3936       vect_finish_stmt_generation (stmt, new_stmt, gsi);
3937
3938       if (j == 0)
3939         STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
3940       else
3941         STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3942
3943       prev_stmt_info = vinfo_for_stmt (new_stmt);
3944       prev_phi_info = vinfo_for_stmt (new_phi);
3945     }
3946
3947   /* Finalize the reduction-phi (set its arguments) and create the
3948      epilog reduction code.  */
3949   if (!single_defuse_cycle || code == COND_EXPR)
3950     new_temp = gimple_assign_lhs (*vec_stmt);
3951
3952   vect_create_epilog_for_reduction (new_temp, stmt, epilog_copies,
3953                                     epilog_reduc_code, first_phi, reduc_index,
3954                                     double_reduc);
3955   return true;
3956 }
3957
3958 /* Function vect_min_worthwhile_factor.
3959
3960    For a loop where we could vectorize the operation indicated by CODE,
3961    return the minimum vectorization factor that makes it worthwhile
3962    to use generic vectors.  */
3963 int
3964 vect_min_worthwhile_factor (enum tree_code code)
3965 {
3966   switch (code)
3967     {
3968     case PLUS_EXPR:
3969     case MINUS_EXPR:
3970     case NEGATE_EXPR:
3971       return 4;
3972
3973     case BIT_AND_EXPR:
3974     case BIT_IOR_EXPR:
3975     case BIT_XOR_EXPR:
3976     case BIT_NOT_EXPR:
3977       return 2;
3978
3979     default:
3980       return INT_MAX;
3981     }
3982 }
3983
3984
3985 /* Function vectorizable_induction
3986
3987    Check if PHI performs an induction computation that can be vectorized.
3988    If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
3989    phi to replace it, put it in VEC_STMT, and add it to the same basic block.
3990    Return FALSE if not a vectorizable STMT, TRUE otherwise.  */
3991
3992 bool
3993 vectorizable_induction (gimple phi, gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
3994                         gimple *vec_stmt)
3995 {
3996   stmt_vec_info stmt_info = vinfo_for_stmt (phi);
3997   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3998   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3999   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4000   int nunits = TYPE_VECTOR_SUBPARTS (vectype);
4001   int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
4002   tree vec_def;
4003
4004   gcc_assert (ncopies >= 1);
4005   /* FORNOW. This restriction should be relaxed.  */
4006   if (nested_in_vect_loop_p (loop, phi) && ncopies > 1)
4007     {
4008       if (vect_print_dump_info (REPORT_DETAILS))
4009         fprintf (vect_dump, "multiple types in nested loop.");
4010       return false;
4011     }
4012
4013   if (!STMT_VINFO_RELEVANT_P (stmt_info))
4014     return false;
4015
4016   /* FORNOW: SLP not supported.  */
4017   if (STMT_SLP_TYPE (stmt_info))
4018     return false;
4019
4020   gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def);
4021
4022   if (gimple_code (phi) != GIMPLE_PHI)
4023     return false;
4024
4025   if (!vec_stmt) /* transformation not required.  */
4026     {
4027       STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
4028       if (vect_print_dump_info (REPORT_DETAILS))
4029         fprintf (vect_dump, "=== vectorizable_induction ===");
4030       vect_model_induction_cost (stmt_info, ncopies);
4031       return true;
4032     }
4033
4034   /** Transform.  **/
4035
4036   if (vect_print_dump_info (REPORT_DETAILS))
4037     fprintf (vect_dump, "transform induction phi.");
4038
4039   vec_def = get_initial_def_for_induction (phi);
4040   *vec_stmt = SSA_NAME_DEF_STMT (vec_def);
4041   return true;
4042 }
4043
4044 /* Function vectorizable_live_operation.
4045
4046    STMT computes a value that is used outside the loop. Check if
4047    it can be supported.  */
4048
4049 bool
4050 vectorizable_live_operation (gimple stmt,
4051                              gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
4052                              gimple *vec_stmt ATTRIBUTE_UNUSED)
4053 {
4054   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4055   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4056   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4057   int i;
4058   int op_type;
4059   tree op;
4060   tree def;
4061   gimple def_stmt;
4062   enum vect_def_type dt;
4063   enum tree_code code;
4064   enum gimple_rhs_class rhs_class;
4065
4066   gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
4067
4068   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
4069     return false;
4070
4071   if (!is_gimple_assign (stmt))
4072     return false;
4073
4074   if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
4075     return false;
4076
4077   /* FORNOW. CHECKME. */
4078   if (nested_in_vect_loop_p (loop, stmt))
4079     return false;
4080
4081   code = gimple_assign_rhs_code (stmt);
4082   op_type = TREE_CODE_LENGTH (code);
4083   rhs_class = get_gimple_rhs_class (code);
4084   gcc_assert (rhs_class != GIMPLE_UNARY_RHS || op_type == unary_op);
4085   gcc_assert (rhs_class != GIMPLE_BINARY_RHS || op_type == binary_op);
4086
4087   /* FORNOW: support only if all uses are invariant. This means
4088      that the scalar operations can remain in place, unvectorized.
4089      The original last scalar value that they compute will be used.  */
4090
4091   for (i = 0; i < op_type; i++)
4092     {
4093       if (rhs_class == GIMPLE_SINGLE_RHS)
4094         op = TREE_OPERAND (gimple_op (stmt, 1), i);
4095       else
4096         op = gimple_op (stmt, i + 1);
4097       if (op
4098           && !vect_is_simple_use (op, loop_vinfo, NULL, &def_stmt, &def, &dt))
4099         {
4100           if (vect_print_dump_info (REPORT_DETAILS))
4101             fprintf (vect_dump, "use not simple.");
4102           return false;
4103         }
4104
4105       if (dt != vect_external_def && dt != vect_constant_def)
4106         return false;
4107     }
4108
4109   /* No transformation is required for the cases we currently support.  */
4110   return true;
4111 }
4112
4113 /* Kill any debug uses outside LOOP of SSA names defined in STMT.  */
4114
4115 static void
4116 vect_loop_kill_debug_uses (struct loop *loop, gimple stmt)
4117 {
4118   ssa_op_iter op_iter;
4119   imm_use_iterator imm_iter;
4120   def_operand_p def_p;
4121   gimple ustmt;
4122
4123   FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt, op_iter, SSA_OP_DEF)
4124     {
4125       FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
4126         {
4127           basic_block bb;
4128
4129           if (!is_gimple_debug (ustmt))
4130             continue;
4131
4132           bb = gimple_bb (ustmt);
4133
4134           if (!flow_bb_inside_loop_p (loop, bb))
4135             {
4136               if (gimple_debug_bind_p (ustmt))
4137                 {
4138                   if (vect_print_dump_info (REPORT_DETAILS))
4139                     fprintf (vect_dump, "killing debug use");
4140
4141                   gimple_debug_bind_reset_value (ustmt);
4142                   update_stmt (ustmt);
4143                 }
4144               else
4145                 gcc_unreachable ();
4146             }
4147         }
4148     }
4149 }
4150
4151 /* Function vect_transform_loop.
4152
4153    The analysis phase has determined that the loop is vectorizable.
4154    Vectorize the loop - created vectorized stmts to replace the scalar
4155    stmts in the loop, and update the loop exit condition.  */
4156
4157 void
4158 vect_transform_loop (loop_vec_info loop_vinfo)
4159 {
4160   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4161   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
4162   int nbbs = loop->num_nodes;
4163   gimple_stmt_iterator si;
4164   int i;
4165   tree ratio = NULL;
4166   int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
4167   bool strided_store;
4168   bool slp_scheduled = false;
4169   unsigned int nunits;
4170   tree cond_expr = NULL_TREE;
4171   gimple_seq cond_expr_stmt_list = NULL;
4172   bool do_peeling_for_loop_bound;
4173
4174   if (vect_print_dump_info (REPORT_DETAILS))
4175     fprintf (vect_dump, "=== vec_transform_loop ===");
4176
4177   /* Peel the loop if there are data refs with unknown alignment.
4178      Only one data ref with unknown store is allowed.  */
4179
4180   if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
4181     vect_do_peeling_for_alignment (loop_vinfo);
4182
4183   do_peeling_for_loop_bound
4184     = (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
4185        || (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
4186            && LOOP_VINFO_INT_NITERS (loop_vinfo) % vectorization_factor != 0));
4187
4188   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
4189       || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
4190     vect_loop_versioning (loop_vinfo,
4191                           !do_peeling_for_loop_bound,
4192                           &cond_expr, &cond_expr_stmt_list);
4193
4194   /* If the loop has a symbolic number of iterations 'n' (i.e. it's not a
4195      compile time constant), or it is a constant that doesn't divide by the
4196      vectorization factor, then an epilog loop needs to be created.
4197      We therefore duplicate the loop: the original loop will be vectorized,
4198      and will compute the first (n/VF) iterations. The second copy of the loop
4199      will remain scalar and will compute the remaining (n%VF) iterations.
4200      (VF is the vectorization factor).  */
4201
4202   if (do_peeling_for_loop_bound)
4203     vect_do_peeling_for_loop_bound (loop_vinfo, &ratio,
4204                                     cond_expr, cond_expr_stmt_list);
4205   else
4206     ratio = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
4207                 LOOP_VINFO_INT_NITERS (loop_vinfo) / vectorization_factor);
4208
4209   /* 1) Make sure the loop header has exactly two entries
4210      2) Make sure we have a preheader basic block.  */
4211
4212   gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
4213
4214   split_edge (loop_preheader_edge (loop));
4215
4216   /* FORNOW: the vectorizer supports only loops which body consist
4217      of one basic block (header + empty latch). When the vectorizer will
4218      support more involved loop forms, the order by which the BBs are
4219      traversed need to be reconsidered.  */
4220
4221   for (i = 0; i < nbbs; i++)
4222     {
4223       basic_block bb = bbs[i];
4224       stmt_vec_info stmt_info;
4225       gimple phi;
4226
4227       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
4228         {
4229           phi = gsi_stmt (si);
4230           if (vect_print_dump_info (REPORT_DETAILS))
4231             {
4232               fprintf (vect_dump, "------>vectorizing phi: ");
4233               print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
4234             }
4235           stmt_info = vinfo_for_stmt (phi);
4236           if (!stmt_info)
4237             continue;
4238
4239           if (!STMT_VINFO_RELEVANT_P (stmt_info)
4240               && !STMT_VINFO_LIVE_P (stmt_info))
4241             {
4242               if (MAY_HAVE_DEBUG_STMTS)
4243                 vect_loop_kill_debug_uses (loop, phi);
4244               continue;
4245             }
4246
4247           if ((TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
4248                 != (unsigned HOST_WIDE_INT) vectorization_factor)
4249               && vect_print_dump_info (REPORT_DETAILS))
4250             fprintf (vect_dump, "multiple-types.");
4251
4252           if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
4253             {
4254               if (vect_print_dump_info (REPORT_DETAILS))
4255                 fprintf (vect_dump, "transform phi.");
4256               vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
4257             }
4258         }
4259
4260       for (si = gsi_start_bb (bb); !gsi_end_p (si);)
4261         {
4262           gimple stmt = gsi_stmt (si);
4263           bool is_store;
4264
4265           if (vect_print_dump_info (REPORT_DETAILS))
4266             {
4267               fprintf (vect_dump, "------>vectorizing statement: ");
4268               print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
4269             }
4270
4271           stmt_info = vinfo_for_stmt (stmt);
4272
4273           /* vector stmts created in the outer-loop during vectorization of
4274              stmts in an inner-loop may not have a stmt_info, and do not
4275              need to be vectorized.  */
4276           if (!stmt_info)
4277             {
4278               gsi_next (&si);
4279               continue;
4280             }
4281
4282           if (!STMT_VINFO_RELEVANT_P (stmt_info)
4283               && !STMT_VINFO_LIVE_P (stmt_info))
4284             {
4285               if (MAY_HAVE_DEBUG_STMTS)
4286                 vect_loop_kill_debug_uses (loop, stmt);
4287               gsi_next (&si);
4288               continue;
4289             }
4290
4291           gcc_assert (STMT_VINFO_VECTYPE (stmt_info));
4292           nunits =
4293             (unsigned int) TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
4294           if (!STMT_SLP_TYPE (stmt_info)
4295               && nunits != (unsigned int) vectorization_factor
4296               && vect_print_dump_info (REPORT_DETAILS))
4297             /* For SLP VF is set according to unrolling factor, and not to
4298                vector size, hence for SLP this print is not valid.  */
4299             fprintf (vect_dump, "multiple-types.");
4300
4301           /* SLP. Schedule all the SLP instances when the first SLP stmt is
4302              reached.  */
4303           if (STMT_SLP_TYPE (stmt_info))
4304             {
4305               if (!slp_scheduled)
4306                 {
4307                   slp_scheduled = true;
4308
4309                   if (vect_print_dump_info (REPORT_DETAILS))
4310                     fprintf (vect_dump, "=== scheduling SLP instances ===");
4311
4312                   vect_schedule_slp (loop_vinfo, NULL);
4313                 }
4314
4315               /* Hybrid SLP stmts must be vectorized in addition to SLP.  */
4316               if (!vinfo_for_stmt (stmt) || PURE_SLP_STMT (stmt_info))
4317                 {
4318                   gsi_next (&si);
4319                   continue;
4320                 }
4321             }
4322
4323           /* -------- vectorize statement ------------ */
4324           if (vect_print_dump_info (REPORT_DETAILS))
4325             fprintf (vect_dump, "transform statement.");
4326
4327           strided_store = false;
4328           is_store = vect_transform_stmt (stmt, &si, &strided_store, NULL, NULL);
4329           if (is_store)
4330             {
4331               if (STMT_VINFO_STRIDED_ACCESS (stmt_info))
4332                 {
4333                   /* Interleaving. If IS_STORE is TRUE, the vectorization of the
4334                      interleaving chain was completed - free all the stores in
4335                      the chain.  */
4336                   vect_remove_stores (DR_GROUP_FIRST_DR (stmt_info));
4337                   gsi_remove (&si, true);
4338                   continue;
4339                 }
4340               else
4341                 {
4342                   /* Free the attached stmt_vec_info and remove the stmt.  */
4343                   free_stmt_vec_info (stmt);
4344                   gsi_remove (&si, true);
4345                   continue;
4346                 }
4347             }
4348           gsi_next (&si);
4349         }                       /* stmts in BB */
4350     }                           /* BBs in loop */
4351
4352   slpeel_make_loop_iterate_ntimes (loop, ratio);
4353
4354   /* The memory tags and pointers in vectorized statements need to
4355      have their SSA forms updated.  FIXME, why can't this be delayed
4356      until all the loops have been transformed?  */
4357   update_ssa (TODO_update_ssa);
4358
4359   if (vect_print_dump_info (REPORT_VECTORIZED_LOCATIONS))
4360     fprintf (vect_dump, "LOOP VECTORIZED.");
4361   if (loop->inner && vect_print_dump_info (REPORT_VECTORIZED_LOCATIONS))
4362     fprintf (vect_dump, "OUTER LOOP VECTORIZED.");
4363 }