gcc/tree-vect-loop.c

   1 /* Loop Vectorization
   2    Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009 Free Software
   3    Foundation, Inc.
   4    Contributed by Dorit Naishlos <dorit@il.ibm.com> and
   5    Ira Rosen <irar@il.ibm.com>
   6
   7 This file is part of GCC.
   8
   9 GCC is free software; you can redistribute it and/or modify it under
  10 the terms of the GNU General Public License as published by the Free
  11 Software Foundation; either version 3, or (at your option) any later
  12 version.
  13
  14 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  15 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  16 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  17 for more details.
  18
  19 You should have received a copy of the GNU General Public License
  20 along with GCC; see the file COPYING3.  If not see
  21 <http://www.gnu.org/licenses/>.  */
  22
  23 #include "config.h"
  24 #include "system.h"
  25 #include "coretypes.h"
  26 #include "tm.h"
  27 #include "ggc.h"
  28 #include "tree.h"
  29 #include "basic-block.h"
  30 #include "diagnostic.h"
  31 #include "tree-flow.h"
  32 #include "tree-dump.h"
  33 #include "cfgloop.h"
  34 #include "cfglayout.h"
  35 #include "expr.h"
  36 #include "recog.h"
  37 #include "optabs.h"
  38 #include "params.h"
  39 #include "toplev.h"
  40 #include "tree-chrec.h"
  41 #include "tree-scalar-evolution.h"
  42 #include "tree-vectorizer.h"
  43
  44 /* Loop Vectorization Pass.
  45
  46    This pass tries to vectorize loops.
  47
  48    For example, the vectorizer transforms the following simple loop:
  49
  50         short a[N]; short b[N]; short c[N]; int i;
  51
  52         for (i=0; i<N; i++){
  53           a[i] = b[i] + c[i];
  54         }
  55
  56    as if it was manually vectorized by rewriting the source code into:
  57
  58         typedef int __attribute__((mode(V8HI))) v8hi;
  59         short a[N];  short b[N]; short c[N];   int i;
  60         v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
  61         v8hi va, vb, vc;
  62
  63         for (i=0; i<N/8; i++){
  64           vb = pb[i];
  65           vc = pc[i];
  66           va = vb + vc;
  67           pa[i] = va;
  68         }
  69
  70         The main entry to this pass is vectorize_loops(), in which
  71    the vectorizer applies a set of analyses on a given set of loops,
  72    followed by the actual vectorization transformation for the loops that
  73    had successfully passed the analysis phase.
  74         Throughout this pass we make a distinction between two types of
  75    data: scalars (which are represented by SSA_NAMES), and memory references
  76    ("data-refs"). These two types of data require different handling both
  77    during analysis and transformation. The types of data-refs that the
   78    vectorizer currently supports are ARRAY_REFS whose base is an array DECL
  79    (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
  80    accesses are required to have a simple (consecutive) access pattern.
  81
  82    Analysis phase:
  83    ===============
  84         The driver for the analysis phase is vect_analyze_loop().
  85    It applies a set of analyses, some of which rely on the scalar evolution
  86    analyzer (scev) developed by Sebastian Pop.
  87
  88         During the analysis phase the vectorizer records some information
  89    per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
  90    loop, as well as general information about the loop as a whole, which is
  91    recorded in a "loop_vec_info" struct attached to each loop.
  92
  93    Transformation phase:
  94    =====================
  95         The loop transformation phase scans all the stmts in the loop, and
  96    creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
  97    the loop that needs to be vectorized. It inserts the vector code sequence
  98    just before the scalar stmt S, and records a pointer to the vector code
  99    in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
 100    attached to S). This pointer will be used for the vectorization of following
 101    stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
 102    otherwise, we rely on dead code elimination for removing it.
 103
 104         For example, say stmt S1 was vectorized into stmt VS1:
 105
 106    VS1: vb = px[i];
 107    S1:  b = x[i];    STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
 108    S2:  a = b;
 109
 110    To vectorize stmt S2, the vectorizer first finds the stmt that defines
 111    the operand 'b' (S1), and gets the relevant vector def 'vb' from the
 112    vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
 113    resulting sequence would be:
 114
 115    VS1: vb = px[i];
 116    S1:  b = x[i];       STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
 117    VS2: va = vb;
 118    S2:  a = b;          STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
 119
 120         Operands that are not SSA_NAMEs, are data-refs that appear in
 121    load/store operations (like 'x[i]' in S1), and are handled differently.
 122
 123    Target modeling:
 124    =================
 125         Currently the only target specific information that is used is the
 126    size of the vector (in bytes) - "UNITS_PER_SIMD_WORD". Targets that can
 127    support different sizes of vectors, for now will need to specify one value
 128    for "UNITS_PER_SIMD_WORD". More flexibility will be added in the future.
 129
  130         Since we only vectorize operations whose vector form can be
 131    expressed using existing tree codes, to verify that an operation is
 132    supported, the vectorizer checks the relevant optab at the relevant
 133    machine_mode (e.g, optab_handler (add_optab, V8HImode)->insn_code). If
 134    the value found is CODE_FOR_nothing, then there's no target support, and
 135    we can't vectorize the stmt.
 136
 137    For additional information on this project see:
 138    http://gcc.gnu.org/projects/tree-ssa/vectorization.html
 139 */
 140
 141 /* Function vect_determine_vectorization_factor
 142
 143    Determine the vectorization factor (VF). VF is the number of data elements
 144    that are operated upon in parallel in a single iteration of the vectorized
 145    loop. For example, when vectorizing a loop that operates on 4byte elements,
 146    on a target with vector size (VS) 16byte, the VF is set to 4, since 4
 147    elements can fit in a single vector register.
 148
 149    We currently support vectorization of loops in which all types operated upon
 150    are of the same size. Therefore this function currently sets VF according to
 151    the size of the types operated upon, and fails if there are multiple sizes
 152    in the loop.
 153
 154    VF is also the factor by which the loop iterations are strip-mined, e.g.:
 155    original loop:
 156         for (i=0; i<N; i++){
 157           a[i] = b[i] + c[i];
 158         }
 159
 160    vectorized loop:
 161         for (i=0; i<N; i+=VF){
 162           a[i:VF] = b[i:VF] + c[i:VF];
 163         }
 164 */
 165
 166 static bool
 167 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
 168 {
 169   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 170   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
 171   int nbbs = loop->num_nodes;
 172   gimple_stmt_iterator si;
 173   unsigned int vectorization_factor = 0;
 174   tree scalar_type;
 175   gimple phi;
 176   tree vectype;
 177   unsigned int nunits;
 178   stmt_vec_info stmt_info;
 179   int i;
 180   HOST_WIDE_INT dummy;
 181
 182   if (vect_print_dump_info (REPORT_DETAILS))
 183     fprintf (vect_dump, "=== vect_determine_vectorization_factor ===");
 184
 185   for (i = 0; i < nbbs; i++)
 186     {
 187       basic_block bb = bbs[i];
 188
 189       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
 190         {
 191           phi = gsi_stmt (si);
 192           stmt_info = vinfo_for_stmt (phi);
 193           if (vect_print_dump_info (REPORT_DETAILS))
 194             {
 195               fprintf (vect_dump, "==> examining phi: ");
 196               print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
 197             }
 198
 199           gcc_assert (stmt_info);
 200
 201           if (STMT_VINFO_RELEVANT_P (stmt_info))
 202             {
 203               gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
 204               scalar_type = TREE_TYPE (PHI_RESULT (phi));
 205
 206               if (vect_print_dump_info (REPORT_DETAILS))
 207                 {
 208                   fprintf (vect_dump, "get vectype for scalar type:  ");
 209                   print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
 210                 }
 211
 212               vectype = get_vectype_for_scalar_type (scalar_type);
 213               if (!vectype)
 214                 {
 215                   if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
 216                     {
 217                       fprintf (vect_dump,
 218                                "not vectorized: unsupported data-type ");
 219                       print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
 220                     }
 221                   return false;
 222                 }
 223               STMT_VINFO_VECTYPE (stmt_info) = vectype;
 224
 225               if (vect_print_dump_info (REPORT_DETAILS))
 226                 {
 227                   fprintf (vect_dump, "vectype: ");
 228                   print_generic_expr (vect_dump, vectype, TDF_SLIM);
 229                 }
 230
 231               nunits = TYPE_VECTOR_SUBPARTS (vectype);
 232               if (vect_print_dump_info (REPORT_DETAILS))
 233                 fprintf (vect_dump, "nunits = %d", nunits);
 234
 235               if (!vectorization_factor
 236                   || (nunits > vectorization_factor))
 237                 vectorization_factor = nunits;
 238             }
 239         }
 240
 241       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
 242         {
 243           gimple stmt = gsi_stmt (si);
 244           stmt_info = vinfo_for_stmt (stmt);
 245
 246           if (vect_print_dump_info (REPORT_DETAILS))
 247             {
 248               fprintf (vect_dump, "==> examining statement: ");
 249               print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
 250             }
 251
 252           gcc_assert (stmt_info);
 253
 254           /* skip stmts which do not need to be vectorized.  */
 255           if (!STMT_VINFO_RELEVANT_P (stmt_info)
 256               && !STMT_VINFO_LIVE_P (stmt_info))
 257             {
 258               if (vect_print_dump_info (REPORT_DETAILS))
 259                 fprintf (vect_dump, "skip.");
 260               continue;
 261             }
 262
 263           if (gimple_get_lhs (stmt) == NULL_TREE)
 264             {
 265               if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
 266                 {
 267                   fprintf (vect_dump, "not vectorized: irregular stmt.");
 268                   print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
 269                 }
 270               return false;
 271             }
 272
 273           if (VECTOR_MODE_P (TYPE_MODE (gimple_expr_type (stmt))))
 274             {
 275               if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
 276                 {
 277                   fprintf (vect_dump, "not vectorized: vector stmt in loop:");
 278                   print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
 279                 }
 280               return false;
 281             }
 282
 283           if (STMT_VINFO_VECTYPE (stmt_info))
 284             {
 285               /* The only case when a vectype had been already set is for stmts
 286                  that contain a dataref, or for "pattern-stmts" (stmts generated
 287                  by the vectorizer to represent/replace a certain idiom).  */
 288               gcc_assert (STMT_VINFO_DATA_REF (stmt_info)
 289                           || is_pattern_stmt_p (stmt_info));
 290               vectype = STMT_VINFO_VECTYPE (stmt_info);
 291             }
 292           else
 293             {
 294               gcc_assert (!STMT_VINFO_DATA_REF (stmt_info)
 295                           && !is_pattern_stmt_p (stmt_info));
 296
 297               scalar_type = vect_get_smallest_scalar_type (stmt, &dummy,
 298                                                            &dummy);
 299               if (vect_print_dump_info (REPORT_DETAILS))
 300                 {
 301                   fprintf (vect_dump, "get vectype for scalar type:  ");
 302                   print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
 303                 }
 304
 305               vectype = get_vectype_for_scalar_type (scalar_type);
 306               if (!vectype)
 307                 {
 308                   if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
 309                     {
 310                       fprintf (vect_dump,
 311                                "not vectorized: unsupported data-type ");
 312                       print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
 313                     }
 314                   return false;
 315                 }
 316               STMT_VINFO_VECTYPE (stmt_info) = vectype;
 317             }
 318
 319           if (vect_print_dump_info (REPORT_DETAILS))
 320             {
 321               fprintf (vect_dump, "vectype: ");
 322               print_generic_expr (vect_dump, vectype, TDF_SLIM);
 323             }
 324
 325           nunits = TYPE_VECTOR_SUBPARTS (vectype);
 326           if (vect_print_dump_info (REPORT_DETAILS))
 327             fprintf (vect_dump, "nunits = %d", nunits);
 328
 329           if (!vectorization_factor
 330               || (nunits > vectorization_factor))
 331             vectorization_factor = nunits;
 332
 333         }
 334     }
 335
 336   /* TODO: Analyze cost. Decide if worth while to vectorize.  */
 337   if (vect_print_dump_info (REPORT_DETAILS))
 338     fprintf (vect_dump, "vectorization factor = %d", vectorization_factor);
 339   if (vectorization_factor <= 1)
 340     {
 341       if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
 342         fprintf (vect_dump, "not vectorized: unsupported data-type");
 343       return false;
 344     }
 345   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
 346
 347   return true;
 348 }
 349
 350
 351 /* Function vect_is_simple_iv_evolution.
 352
 353    FORNOW: A simple evolution of an induction variables in the loop is
 354    considered a polynomial evolution with constant step.  */
 355
 356 static bool
 357 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
 358                              tree * step)
 359 {
 360   tree init_expr;
 361   tree step_expr;
 362   tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
 363
 364   /* When there is no evolution in this loop, the evolution function
 365      is not "simple".  */
 366   if (evolution_part == NULL_TREE)
 367     return false;
 368
 369   /* When the evolution is a polynomial of degree >= 2
 370      the evolution function is not "simple".  */
 371   if (tree_is_chrec (evolution_part))
 372     return false;
 373
 374   step_expr = evolution_part;
 375   init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
 376
 377   if (vect_print_dump_info (REPORT_DETAILS))
 378     {
 379       fprintf (vect_dump, "step: ");
 380       print_generic_expr (vect_dump, step_expr, TDF_SLIM);
 381       fprintf (vect_dump, ",  init: ");
 382       print_generic_expr (vect_dump, init_expr, TDF_SLIM);
 383     }
 384
 385   *init = init_expr;
 386   *step = step_expr;
 387
 388   if (TREE_CODE (step_expr) != INTEGER_CST)
 389     {
 390       if (vect_print_dump_info (REPORT_DETAILS))
 391         fprintf (vect_dump, "step unknown.");
 392       return false;
 393     }
 394
 395   return true;
 396 }
 397
 398 /* Function vect_analyze_scalar_cycles_1.
 399
 400    Examine the cross iteration def-use cycles of scalar variables
 401    in LOOP. LOOP_VINFO represents the loop that is now being
 402    considered for vectorization (can be LOOP, or an outer-loop
 403    enclosing LOOP).  */
 404
 405 static void
 406 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
 407 {
 408   basic_block bb = loop->header;
 409   tree dumy;
 410   VEC(gimple,heap) *worklist = VEC_alloc (gimple, heap, 64);
 411   gimple_stmt_iterator gsi;
 412   bool double_reduc;
 413
 414   if (vect_print_dump_info (REPORT_DETAILS))
 415     fprintf (vect_dump, "=== vect_analyze_scalar_cycles ===");
 416
 417   /* First - identify all inductions. Reduction detection assumes that all the
 418      inductions have been identified, therefore, this order must not be
 419      changed.  */
 420   for (gsi = gsi_start_phis  (bb); !gsi_end_p (gsi); gsi_next (&gsi))
 421     {
 422       gimple phi = gsi_stmt (gsi);
 423       tree access_fn = NULL;
 424       tree def = PHI_RESULT (phi);
 425       stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
 426
 427       if (vect_print_dump_info (REPORT_DETAILS))
 428         {
 429           fprintf (vect_dump, "Analyze phi: ");
 430           print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
 431         }
 432
 433       /* Skip virtual phi's. The data dependences that are associated with
 434          virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.  */
 435       if (!is_gimple_reg (SSA_NAME_VAR (def)))
 436         continue;
 437
 438       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
 439
 440       /* Analyze the evolution function.  */
 441       access_fn = analyze_scalar_evolution (loop, def);
 442       if (access_fn && vect_print_dump_info (REPORT_DETAILS))
 443         {
 444           fprintf (vect_dump, "Access function of PHI: ");
 445           print_generic_expr (vect_dump, access_fn, TDF_SLIM);
 446         }
 447
 448       if (!access_fn
 449           || !vect_is_simple_iv_evolution (loop->num, access_fn, &dumy, &dumy))
 450         {
 451           VEC_safe_push (gimple, heap, worklist, phi);
 452           continue;
 453         }
 454
 455       if (vect_print_dump_info (REPORT_DETAILS))
 456         fprintf (vect_dump, "Detected induction.");
 457       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
 458     }
 459
 460
 461   /* Second - identify all reductions and nested cycles.  */
 462   while (VEC_length (gimple, worklist) > 0)
 463     {
 464       gimple phi = VEC_pop (gimple, worklist);
 465       tree def = PHI_RESULT (phi);
 466       stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
 467       gimple reduc_stmt;
 468       bool nested_cycle;
 469
 470       if (vect_print_dump_info (REPORT_DETAILS))
 471         {
 472           fprintf (vect_dump, "Analyze phi: ");
 473           print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
 474         }
 475
 476       gcc_assert (is_gimple_reg (SSA_NAME_VAR (def)));
 477       gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
 478
 479       nested_cycle = (loop != LOOP_VINFO_LOOP (loop_vinfo));
 480       reduc_stmt = vect_is_simple_reduction (loop_vinfo, phi, !nested_cycle,
 481                                              &double_reduc);
 482       if (reduc_stmt)
 483         {
 484           if (double_reduc)
 485             {
 486               if (vect_print_dump_info (REPORT_DETAILS))
 487                 fprintf (vect_dump, "Detected double reduction.");
 488
 489               STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
 490               STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 491                                                     vect_double_reduction_def;
 492             }
 493           else
 494             {
 495               if (nested_cycle)
 496                 {
 497                   if (vect_print_dump_info (REPORT_DETAILS))
 498                     fprintf (vect_dump, "Detected vectorizable nested cycle.");
 499
 500                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
 501                   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 502                                                              vect_nested_cycle;
 503                 }
 504               else
 505                 {
 506                   if (vect_print_dump_info (REPORT_DETAILS))
 507                     fprintf (vect_dump, "Detected reduction.");
 508
 509                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
 510                   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 511                                                            vect_reduction_def;
 512                 }
 513             }
 514         }
 515       else
 516         if (vect_print_dump_info (REPORT_DETAILS))
 517           fprintf (vect_dump, "Unknown def-use cycle pattern.");
 518     }
 519
 520   VEC_free (gimple, heap, worklist);
 521 }
 522
 523
 524 /* Function vect_analyze_scalar_cycles.
 525
 526    Examine the cross iteration def-use cycles of scalar variables, by
 527    analyzing the loop-header PHIs of scalar variables; Classify each
 528    cycle as one of the following: invariant, induction, reduction, unknown.
 529    We do that for the loop represented by LOOP_VINFO, and also to its
 530    inner-loop, if exists.
 531    Examples for scalar cycles:
 532
 533    Example1: reduction:
 534
 535               loop1:
 536               for (i=0; i<N; i++)
 537                  sum += a[i];
 538
 539    Example2: induction:
 540
 541               loop2:
 542               for (i=0; i<N; i++)
 543                  a[i] = i;  */
 544
 545 static void
 546 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
 547 {
 548   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 549
 550   vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
 551
 552   /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
 553      Reductions in such inner-loop therefore have different properties than
 554      the reductions in the nest that gets vectorized:
 555      1. When vectorized, they are executed in the same order as in the original
 556         scalar loop, so we can't change the order of computation when
 557         vectorizing them.
 558      2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
 559         current checks are too strict.  */
 560
 561   if (loop->inner)
 562     vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
 563 }
 564
 565 /* Function vect_get_loop_niters.
 566
 567    Determine how many iterations the loop is executed.
 568    If an expression that represents the number of iterations
 569    can be constructed, place it in NUMBER_OF_ITERATIONS.
 570    Return the loop exit condition.  */
 571
 572 static gimple
 573 vect_get_loop_niters (struct loop *loop, tree *number_of_iterations)
 574 {
 575   tree niters;
 576
 577   if (vect_print_dump_info (REPORT_DETAILS))
 578     fprintf (vect_dump, "=== get_loop_niters ===");
 579
 580   niters = number_of_exit_cond_executions (loop);
 581
 582   if (niters != NULL_TREE
 583       && niters != chrec_dont_know)
 584     {
 585       *number_of_iterations = niters;
 586
 587       if (vect_print_dump_info (REPORT_DETAILS))
 588         {
 589           fprintf (vect_dump, "==> get_loop_niters:" );
 590           print_generic_expr (vect_dump, *number_of_iterations, TDF_SLIM);
 591         }
 592     }
 593
 594   return get_loop_exit_condition (loop);
 595 }
 596
 597
 598 /* Function bb_in_loop_p
 599
 600    Used as predicate for dfs order traversal of the loop bbs.  */
 601
 602 static bool
 603 bb_in_loop_p (const_basic_block bb, const void *data)
 604 {
 605   const struct loop *const loop = (const struct loop *)data;
 606   if (flow_bb_inside_loop_p (loop, bb))
 607     return true;
 608   return false;
 609 }
 610
 611
 612 /* Function new_loop_vec_info.
 613
 614    Create and initialize a new loop_vec_info struct for LOOP, as well as
 615    stmt_vec_info structs for all the stmts in LOOP.  */
 616
 617 static loop_vec_info
 618 new_loop_vec_info (struct loop *loop)
 619 {
 620   loop_vec_info res;
 621   basic_block *bbs;
 622   gimple_stmt_iterator si;
 623   unsigned int i, nbbs;
 624
 625   res = (loop_vec_info) xcalloc (1, sizeof (struct _loop_vec_info));
 626   LOOP_VINFO_LOOP (res) = loop;
 627
 628   bbs = get_loop_body (loop);
 629
 630   /* Create/Update stmt_info for all stmts in the loop.  */
 631   for (i = 0; i < loop->num_nodes; i++)
 632     {
 633       basic_block bb = bbs[i];
 634
 635       /* BBs in a nested inner-loop will have been already processed (because
 636          we will have called vect_analyze_loop_form for any nested inner-loop).
 637          Therefore, for stmts in an inner-loop we just want to update the
 638          STMT_VINFO_LOOP_VINFO field of their stmt_info to point to the new
 639          loop_info of the outer-loop we are currently considering to vectorize
 640          (instead of the loop_info of the inner-loop).
 641          For stmts in other BBs we need to create a stmt_info from scratch.  */
 642       if (bb->loop_father != loop)
 643         {
 644           /* Inner-loop bb.  */
 645           gcc_assert (loop->inner && bb->loop_father == loop->inner);
 646           for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
 647             {
 648               gimple phi = gsi_stmt (si);
 649               stmt_vec_info stmt_info = vinfo_for_stmt (phi);
 650               loop_vec_info inner_loop_vinfo =
 651                 STMT_VINFO_LOOP_VINFO (stmt_info);
 652               gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
 653               STMT_VINFO_LOOP_VINFO (stmt_info) = res;
 654             }
 655           for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
 656            {
 657               gimple stmt = gsi_stmt (si);
 658               stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
 659               loop_vec_info inner_loop_vinfo =
 660                  STMT_VINFO_LOOP_VINFO (stmt_info);
 661               gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
 662               STMT_VINFO_LOOP_VINFO (stmt_info) = res;
 663            }
 664         }
 665       else
 666         {
 667           /* bb in current nest.  */
 668           for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
 669             {
 670               gimple phi = gsi_stmt (si);
 671               gimple_set_uid (phi, 0);
 672               set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, res, NULL));
 673             }
 674
 675           for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
 676             {
 677               gimple stmt = gsi_stmt (si);
 678               gimple_set_uid (stmt, 0);
 679               set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, res, NULL));
 680             }
 681         }
 682     }
 683
 684   /* CHECKME: We want to visit all BBs before their successors (except for
 685      latch blocks, for which this assertion wouldn't hold).  In the simple
 686      case of the loop forms we allow, a dfs order of the BBs would the same
 687      as reversed postorder traversal, so we are safe.  */
 688
 689    free (bbs);
 690    bbs = XCNEWVEC (basic_block, loop->num_nodes);
 691    nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
 692                               bbs, loop->num_nodes, loop);
 693    gcc_assert (nbbs == loop->num_nodes);
 694
 695   LOOP_VINFO_BBS (res) = bbs;
 696   LOOP_VINFO_NITERS (res) = NULL;
 697   LOOP_VINFO_NITERS_UNCHANGED (res) = NULL;
 698   LOOP_VINFO_COST_MODEL_MIN_ITERS (res) = 0;
 699   LOOP_VINFO_VECTORIZABLE_P (res) = 0;
 700   LOOP_PEELING_FOR_ALIGNMENT (res) = 0;
 701   LOOP_VINFO_VECT_FACTOR (res) = 0;
 702   LOOP_VINFO_DATAREFS (res) = VEC_alloc (data_reference_p, heap, 10);
 703   LOOP_VINFO_DDRS (res) = VEC_alloc (ddr_p, heap, 10 * 10);
 704   LOOP_VINFO_UNALIGNED_DR (res) = NULL;
 705   LOOP_VINFO_MAY_MISALIGN_STMTS (res) =
 706     VEC_alloc (gimple, heap,
 707                PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS));
 708   LOOP_VINFO_MAY_ALIAS_DDRS (res) =
 709     VEC_alloc (ddr_p, heap,
 710                PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
 711   LOOP_VINFO_STRIDED_STORES (res) = VEC_alloc (gimple, heap, 10);
 712   LOOP_VINFO_SLP_INSTANCES (res) = VEC_alloc (slp_instance, heap, 10);
 713   LOOP_VINFO_SLP_UNROLLING_FACTOR (res) = 1;
 714
 715   return res;
 716 }
 717
 718
 719 /* Function destroy_loop_vec_info.
 720
 721    Free LOOP_VINFO struct, as well as all the stmt_vec_info structs of all the
 722    stmts in the loop.  */
 723
 724 void
 725 destroy_loop_vec_info (loop_vec_info loop_vinfo, bool clean_stmts)
 726 {
 727   struct loop *loop;
 728   basic_block *bbs;
 729   int nbbs;
 730   gimple_stmt_iterator si;
 731   int j;
 732   VEC (slp_instance, heap) *slp_instances;
 733   slp_instance instance;
 734
 735   if (!loop_vinfo)
 736     return;
 737
 738   loop = LOOP_VINFO_LOOP (loop_vinfo);
 739
 740   bbs = LOOP_VINFO_BBS (loop_vinfo);
 741   nbbs = loop->num_nodes;
 742
 743   if (!clean_stmts)
 744     {
 745       free (LOOP_VINFO_BBS (loop_vinfo));
 746       free_data_refs (LOOP_VINFO_DATAREFS (loop_vinfo));
 747       free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
 748       VEC_free (gimple, heap, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));
 749
 750       free (loop_vinfo);
 751       loop->aux = NULL;
 752       return;
 753     }
 754
 755   for (j = 0; j < nbbs; j++)
 756     {
 757       basic_block bb = bbs[j];
 758       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
 759         free_stmt_vec_info (gsi_stmt (si));
 760
 761       for (si = gsi_start_bb (bb); !gsi_end_p (si); )
 762         {
 763           gimple stmt = gsi_stmt (si);
 764           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
 765
 766           if (stmt_info)
 767             {
 768               /* Check if this is a "pattern stmt" (introduced by the
 769                  vectorizer during the pattern recognition pass).  */
 770               bool remove_stmt_p = false;
 771               gimple orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
 772               if (orig_stmt)
 773                 {
 774                   stmt_vec_info orig_stmt_info = vinfo_for_stmt (orig_stmt);
 775                   if (orig_stmt_info
 776                       && STMT_VINFO_IN_PATTERN_P (orig_stmt_info))
 777                     remove_stmt_p = true;
 778                 }
 779
 780               /* Free stmt_vec_info.  */
 781               free_stmt_vec_info (stmt);
 782
 783               /* Remove dead "pattern stmts".  */
 784               if (remove_stmt_p)
 785                 gsi_remove (&si, true);
 786             }
 787           gsi_next (&si);
 788         }
 789     }
 790
 791   free (LOOP_VINFO_BBS (loop_vinfo));
 792   free_data_refs (LOOP_VINFO_DATAREFS (loop_vinfo));
 793   free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
 794   VEC_free (gimple, heap, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));
 795   VEC_free (ddr_p, heap, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo));
 796   slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
 797   for (j = 0; VEC_iterate (slp_instance, slp_instances, j, instance); j++)
 798     vect_free_slp_instance (instance);
 799
 800   VEC_free (slp_instance, heap, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
 801   VEC_free (gimple, heap, LOOP_VINFO_STRIDED_STORES (loop_vinfo));
 802
 803   free (loop_vinfo);
 804   loop->aux = NULL;
 805 }
 806
 807
 808 /* Function vect_analyze_loop_1.
 809
 810    Apply a set of analyses on LOOP, and create a loop_vec_info struct
 811    for it. The different analyses will record information in the
 812    loop_vec_info struct.  This is a subset of the analyses applied in
 813    vect_analyze_loop, to be applied on an inner-loop nested in the loop
 814    that is now considered for (outer-loop) vectorization.  */
 815
 816 static loop_vec_info
 817 vect_analyze_loop_1 (struct loop *loop)
 818 {
 819   loop_vec_info loop_vinfo;
 820
 821   if (vect_print_dump_info (REPORT_DETAILS))
 822     fprintf (vect_dump, "===== analyze_loop_nest_1 =====");
 823
 824   /* Check the CFG characteristics of the loop (nesting, entry/exit, etc.  */
 825
 826   loop_vinfo = vect_analyze_loop_form (loop);
 827   if (!loop_vinfo)
 828     {
 829       if (vect_print_dump_info (REPORT_DETAILS))
 830         fprintf (vect_dump, "bad inner-loop form.");
 831       return NULL;
 832     }
 833
 834   return loop_vinfo;
 835 }
 836
 837
 838 /* Function vect_analyze_loop_form.
 839
 840    Verify that certain CFG restrictions hold, including:
 841    - the loop has a pre-header
 842    - the loop has a single entry and exit
 843    - the loop exit condition is simple enough, and the number of iterations
 844      can be analyzed (a countable loop).  */
 845
 846 loop_vec_info
 847 vect_analyze_loop_form (struct loop *loop)
 848 {
 849   loop_vec_info loop_vinfo;
 850   gimple loop_cond;
 851   tree number_of_iterations = NULL;
 852   loop_vec_info inner_loop_vinfo = NULL;
 853
 854   if (vect_print_dump_info (REPORT_DETAILS))
 855     fprintf (vect_dump, "=== vect_analyze_loop_form ===");
 856
 857   /* Different restrictions apply when we are considering an inner-most loop,
 858      vs. an outer (nested) loop.
 859      (FORNOW. May want to relax some of these restrictions in the future).  */
 860
 861   if (!loop->inner)
 862     {
 863       /* Inner-most loop.  We currently require that the number of BBs is
 864          exactly 2 (the header and latch).  Vectorizable inner-most loops
 865          look like this:
 866
 867                         (pre-header)
 868                            |
 869                           header <--------+
 870                            | |            |
 871                            | +--> latch --+
 872                            |
 873                         (exit-bb)  */
 874
 875       if (loop->num_nodes != 2)
 876         {
 877           if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
 878             fprintf (vect_dump, "not vectorized: control flow in loop.");
 879           return NULL;
 880         }
 881
 882       if (empty_block_p (loop->header))
 883     {
 884           if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
 885             fprintf (vect_dump, "not vectorized: empty loop.");
 886       return NULL;
 887     }
 888     }
 889   else
 890     {
 891       struct loop *innerloop = loop->inner;
 892       edge backedge, entryedge;
 893
 894       /* Nested loop. We currently require that the loop is doubly-nested,
 895          contains a single inner loop, and the number of BBs is exactly 5.
 896          Vectorizable outer-loops look like this:
 897
 898                         (pre-header)
 899                            |
 900                           header <---+
 901                            |         |
 902                           inner-loop |
 903                            |         |
 904                           tail ------+
 905                            |
 906                         (exit-bb)
 907
 908          The inner-loop has the properties expected of inner-most loops
 909          as described above.  */
 910
 911       if ((loop->inner)->inner || (loop->inner)->next)
 912         {
 913           if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
 914             fprintf (vect_dump, "not vectorized: multiple nested loops.");
 915           return NULL;
 916         }
 917
 918       /* Analyze the inner-loop.  */
 919       inner_loop_vinfo = vect_analyze_loop_1 (loop->inner);
 920       if (!inner_loop_vinfo)
 921         {
 922           if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
 923             fprintf (vect_dump, "not vectorized: Bad inner loop.");
 924           return NULL;
 925         }
 926
 927       if (!expr_invariant_in_loop_p (loop,
 928                                         LOOP_VINFO_NITERS (inner_loop_vinfo)))
 929         {
 930           if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
 931             fprintf (vect_dump,
 932                      "not vectorized: inner-loop count not invariant.");
 933           destroy_loop_vec_info (inner_loop_vinfo, true);
 934           return NULL;
 935         }
 936
 937       if (loop->num_nodes != 5)
 938         {
 939           if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
 940             fprintf (vect_dump, "not vectorized: control flow in loop.");
 941           destroy_loop_vec_info (inner_loop_vinfo, true);
 942           return NULL;
 943         }
 944
 945       gcc_assert (EDGE_COUNT (innerloop->header->preds) == 2);
 946       backedge = EDGE_PRED (innerloop->header, 1);
 947       entryedge = EDGE_PRED (innerloop->header, 0);
 948       if (EDGE_PRED (innerloop->header, 0)->src == innerloop->latch)
 949         {
 950           backedge = EDGE_PRED (innerloop->header, 0);
 951           entryedge = EDGE_PRED (innerloop->header, 1);
 952         }
 953
 954       if (entryedge->src != loop->header
 955           || !single_exit (innerloop)
 956           || single_exit (innerloop)->dest !=  EDGE_PRED (loop->latch, 0)->src)
 957         {
 958           if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
 959             fprintf (vect_dump, "not vectorized: unsupported outerloop form.");
 960           destroy_loop_vec_info (inner_loop_vinfo, true);
 961           return NULL;
 962         }
 963
 964       if (vect_print_dump_info (REPORT_DETAILS))
 965         fprintf (vect_dump, "Considering outer-loop vectorization.");
 966     }
 967
 968   if (!single_exit (loop)
 969       || EDGE_COUNT (loop->header->preds) != 2)
 970     {
 971       if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
 972         {
 973           if (!single_exit (loop))
 974             fprintf (vect_dump, "not vectorized: multiple exits.");
 975           else if (EDGE_COUNT (loop->header->preds) != 2)
 976             fprintf (vect_dump, "not vectorized: too many incoming edges.");
 977         }
 978       if (inner_loop_vinfo)
 979         destroy_loop_vec_info (inner_loop_vinfo, true);
 980       return NULL;
 981     }
 982
 983   /* We assume that the loop exit condition is at the end of the loop. i.e,
 984      that the loop is represented as a do-while (with a proper if-guard
 985      before the loop if needed), where the loop header contains all the
 986      executable statements, and the latch is empty.  */
 987   if (!empty_block_p (loop->latch)
 988         || phi_nodes (loop->latch))
 989     {
 990       if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
 991         fprintf (vect_dump, "not vectorized: unexpected loop form.");
 992       if (inner_loop_vinfo)
 993         destroy_loop_vec_info (inner_loop_vinfo, true);
 994       return NULL;
 995     }
 996
 997   /* Make sure there exists a single-predecessor exit bb:  */
 998   if (!single_pred_p (single_exit (loop)->dest))
 999     {
1000       edge e = single_exit (loop);
1001       if (!(e->flags & EDGE_ABNORMAL))
1002         {
1003           split_loop_exit_edge (e);
1004           if (vect_print_dump_info (REPORT_DETAILS))
1005             fprintf (vect_dump, "split exit edge.");
1006         }
1007       else
1008         {
1009           if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1010             fprintf (vect_dump, "not vectorized: abnormal loop exit edge.");
1011           if (inner_loop_vinfo)
1012             destroy_loop_vec_info (inner_loop_vinfo, true);
1013           return NULL;
1014         }
1015     }
1016
1017   loop_cond = vect_get_loop_niters (loop, &number_of_iterations);
1018   if (!loop_cond)
1019     {
1020       if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1021         fprintf (vect_dump, "not vectorized: complicated exit condition.");
1022       if (inner_loop_vinfo)
1023         destroy_loop_vec_info (inner_loop_vinfo, true);
1024       return NULL;
1025     }
1026
1027   if (!number_of_iterations)
1028     {
1029       if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1030         fprintf (vect_dump,
1031                  "not vectorized: number of iterations cannot be computed.");
1032       if (inner_loop_vinfo)
1033         destroy_loop_vec_info (inner_loop_vinfo, true);
1034       return NULL;
1035     }
1036
1037   if (chrec_contains_undetermined (number_of_iterations))
1038     {
1039       if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1040         fprintf (vect_dump, "Infinite number of iterations.");
1041       if (inner_loop_vinfo)
1042         destroy_loop_vec_info (inner_loop_vinfo, true);
1043       return NULL;
1044     }
1045
1046   if (!NITERS_KNOWN_P (number_of_iterations))
1047     {
1048       if (vect_print_dump_info (REPORT_DETAILS))
1049         {
1050           fprintf (vect_dump, "Symbolic number of iterations is ");
1051           print_generic_expr (vect_dump, number_of_iterations, TDF_DETAILS);
1052         }
1053     }
1054   else if (TREE_INT_CST_LOW (number_of_iterations) == 0)
1055     {
1056       if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1057         fprintf (vect_dump, "not vectorized: number of iterations = 0.");
1058       if (inner_loop_vinfo)
1059         destroy_loop_vec_info (inner_loop_vinfo, false);
1060       return NULL;
1061     }
1062
1063   loop_vinfo = new_loop_vec_info (loop);
1064   LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1065   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1066
1067   STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond)) = loop_exit_ctrl_vec_info_type;
1068
1069   /* CHECKME: May want to keep it around it in the future.  */
1070   if (inner_loop_vinfo)
1071     destroy_loop_vec_info (inner_loop_vinfo, false);
1072
1073   gcc_assert (!loop->aux);
1074   loop->aux = loop_vinfo;
1075   return loop_vinfo;
1076 }
1077
1078
1079 /* Function vect_analyze_loop_operations.
1080
1081    Scan the loop stmts and make sure they are all vectorizable.  */
1082
1083 static bool
1084 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1085 {
1086   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1087   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1088   int nbbs = loop->num_nodes;
1089   gimple_stmt_iterator si;
1090   unsigned int vectorization_factor = 0;
1091   int i;
1092   gimple phi;
1093   stmt_vec_info stmt_info;
1094   bool need_to_vectorize = false;
1095   int min_profitable_iters;
1096   int min_scalar_loop_bound;
1097   unsigned int th;
1098   bool only_slp_in_loop = true, ok;
1099
1100   if (vect_print_dump_info (REPORT_DETAILS))
1101     fprintf (vect_dump, "=== vect_analyze_loop_operations ===");
1102
1103   gcc_assert (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
1104   vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1105
1106   for (i = 0; i < nbbs; i++)
1107     {
1108       basic_block bb = bbs[i];
1109
1110       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1111         {
1112           phi = gsi_stmt (si);
1113           ok = true;
1114
1115           stmt_info = vinfo_for_stmt (phi);
1116           if (vect_print_dump_info (REPORT_DETAILS))
1117             {
1118               fprintf (vect_dump, "examining phi: ");
1119               print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
1120             }
1121
1122           if (! is_loop_header_bb_p (bb))
1123             {
1124               /* inner-loop loop-closed exit phi in outer-loop vectorization
1125                  (i.e. a phi in the tail of the outer-loop).
1126                  FORNOW: we currently don't support the case that these phis
1127                  are not used in the outerloop (unless it is double reduction,
1128                  i.e., this phi is vect_reduction_def), cause this case
1129                  requires to actually do something here.  */
1130               if ((!STMT_VINFO_RELEVANT_P (stmt_info)
1131                    || STMT_VINFO_LIVE_P (stmt_info))
1132                   && STMT_VINFO_DEF_TYPE (stmt_info)
1133                      != vect_double_reduction_def)
1134                 {
1135                   if (vect_print_dump_info (REPORT_DETAILS))
1136                     fprintf (vect_dump,
1137                              "Unsupported loop-closed phi in outer-loop.");
1138                   return false;
1139                 }
1140               continue;
1141             }
1142
1143           gcc_assert (stmt_info);
1144
1145           if (STMT_VINFO_LIVE_P (stmt_info))
1146             {
1147               /* FORNOW: not yet supported.  */
1148               if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1149                 fprintf (vect_dump, "not vectorized: value used after loop.");
1150               return false;
1151             }
1152
1153           if (STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1154               && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1155             {
1156               /* A scalar-dependence cycle that we don't support.  */
1157               if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1158                 fprintf (vect_dump, "not vectorized: scalar dependence cycle.");
1159               return false;
1160             }
1161
1162           if (STMT_VINFO_RELEVANT_P (stmt_info))
1163             {
1164               need_to_vectorize = true;
1165               if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
1166                 ok = vectorizable_induction (phi, NULL, NULL);
1167             }
1168
1169           if (!ok)
1170             {
1171               if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1172                 {
1173                   fprintf (vect_dump,
1174                            "not vectorized: relevant phi not supported: ");
1175                   print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
1176                 }
1177               return false;
1178             }
1179         }
1180
1181       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1182         {
1183           gimple stmt = gsi_stmt (si);
1184           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1185
1186           gcc_assert (stmt_info);
1187
1188           if (!vect_analyze_stmt (stmt, &need_to_vectorize, NULL))
1189             return false;
1190
1191           if (STMT_VINFO_RELEVANT_P (stmt_info) && !PURE_SLP_STMT (stmt_info))
1192             /* STMT needs both SLP and loop-based vectorization.  */
1193             only_slp_in_loop = false;
1194         }
1195     } /* bbs */
1196
1197   /* All operations in the loop are either irrelevant (deal with loop
1198      control, or dead), or only used outside the loop and can be moved
1199      out of the loop (e.g. invariants, inductions).  The loop can be
1200      optimized away by scalar optimizations.  We're better off not
1201      touching this loop.  */
1202   if (!need_to_vectorize)
1203     {
1204       if (vect_print_dump_info (REPORT_DETAILS))
1205         fprintf (vect_dump,
1206                  "All the computation can be taken out of the loop.");
1207       if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1208         fprintf (vect_dump,
1209                  "not vectorized: redundant loop. no profit to vectorize.");
1210       return false;
1211     }
1212
1213   /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1214      vectorization factor of the loop is the unrolling factor required by the
1215      SLP instances.  If that unrolling factor is 1, we say, that we perform
1216      pure SLP on loop - cross iteration parallelism is not exploited.  */
1217   if (only_slp_in_loop)
1218     vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1219   else
1220     vectorization_factor = least_common_multiple (vectorization_factor,
1221                                 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1222
1223   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1224
1225   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1226       && vect_print_dump_info (REPORT_DETAILS))
1227     fprintf (vect_dump,
1228         "vectorization_factor = %d, niters = " HOST_WIDE_INT_PRINT_DEC,
1229         vectorization_factor, LOOP_VINFO_INT_NITERS (loop_vinfo));
1230
1231   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1232       && (LOOP_VINFO_INT_NITERS (loop_vinfo) < vectorization_factor))
1233     {
1234       if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1235         fprintf (vect_dump, "not vectorized: iteration count too small.");
1236       if (vect_print_dump_info (REPORT_DETAILS))
1237         fprintf (vect_dump,"not vectorized: iteration count smaller than "
1238                  "vectorization factor.");
1239       return false;
1240     }
1241
1242   /* Analyze cost. Decide if worth while to vectorize.  */
1243
1244   /* Once VF is set, SLP costs should be updated since the number of created
1245      vector stmts depends on VF.  */
1246   vect_update_slp_costs_according_to_vf (loop_vinfo);
1247
1248   min_profitable_iters = vect_estimate_min_profitable_iters (loop_vinfo);
1249   LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo) = min_profitable_iters;
1250
1251   if (min_profitable_iters < 0)
1252     {
1253       if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1254         fprintf (vect_dump, "not vectorized: vectorization not profitable.");
1255       if (vect_print_dump_info (REPORT_DETAILS))
1256         fprintf (vect_dump, "not vectorized: vector version will never be "
1257                  "profitable.");
1258       return false;
1259     }
1260
1261   min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
1262                             * vectorization_factor) - 1);
1263
1264   /* Use the cost model only if it is more conservative than user specified
1265      threshold.  */
1266
1267   th = (unsigned) min_scalar_loop_bound;
1268   if (min_profitable_iters
1269       && (!min_scalar_loop_bound
1270           || min_profitable_iters > min_scalar_loop_bound))
1271     th = (unsigned) min_profitable_iters;
1272
1273   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1274       && LOOP_VINFO_INT_NITERS (loop_vinfo) <= th)
1275     {
1276       if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1277         fprintf (vect_dump, "not vectorized: vectorization not "
1278                  "profitable.");
1279       if (vect_print_dump_info (REPORT_DETAILS))
1280         fprintf (vect_dump, "not vectorized: iteration count smaller than "
1281                  "user specified loop bound parameter or minimum "
1282                  "profitable iterations (whichever is more conservative).");
1283       return false;
1284     }
1285
1286   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1287       || LOOP_VINFO_INT_NITERS (loop_vinfo) % vectorization_factor != 0
1288       || LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
1289     {
1290       if (vect_print_dump_info (REPORT_DETAILS))
1291         fprintf (vect_dump, "epilog loop required.");
1292       if (!vect_can_advance_ivs_p (loop_vinfo))
1293         {
1294           if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1295             fprintf (vect_dump,
1296                      "not vectorized: can't create epilog loop 1.");
1297           return false;
1298         }
1299       if (!slpeel_can_duplicate_loop_p (loop, single_exit (loop)))
1300         {
1301           if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1302             fprintf (vect_dump,
1303                      "not vectorized: can't create epilog loop 2.");
1304           return false;
1305         }
1306     }
1307
1308   return true;
1309 }
1310
1311
1312 /* Function vect_analyze_loop.
1313
1314    Apply a set of analyses on LOOP, and create a loop_vec_info struct
1315    for it. The different analyses will record information in the
1316    loop_vec_info struct.  */
1317 loop_vec_info
1318 vect_analyze_loop (struct loop *loop)
1319 {
1320   bool ok;
1321   loop_vec_info loop_vinfo;
1322
1323   if (vect_print_dump_info (REPORT_DETAILS))
1324     fprintf (vect_dump, "===== analyze_loop_nest =====");
1325
1326   if (loop_outer (loop)
1327       && loop_vec_info_for_loop (loop_outer (loop))
1328       && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
1329     {
1330       if (vect_print_dump_info (REPORT_DETAILS))
1331         fprintf (vect_dump, "outer-loop already vectorized.");
1332       return NULL;
1333     }
1334
1335   /* Check the CFG characteristics of the loop (nesting, entry/exit, etc.  */
1336
1337   loop_vinfo = vect_analyze_loop_form (loop);
1338   if (!loop_vinfo)
1339     {
1340       if (vect_print_dump_info (REPORT_DETAILS))
1341         fprintf (vect_dump, "bad loop form.");
1342       return NULL;
1343     }
1344
1345   /* Find all data references in the loop (which correspond to vdefs/vuses)
1346      and analyze their evolution in the loop.
1347
1348      FORNOW: Handle only simple, array references, which
1349      alignment can be forced, and aligned pointer-references.  */
1350
1351   ok = vect_analyze_data_refs (loop_vinfo, NULL);
1352   if (!ok)
1353     {
1354       if (vect_print_dump_info (REPORT_DETAILS))
1355         fprintf (vect_dump, "bad data references.");
1356       destroy_loop_vec_info (loop_vinfo, true);
1357       return NULL;
1358     }
1359
1360   /* Classify all cross-iteration scalar data-flow cycles.
1361      Cross-iteration cycles caused by virtual phis are analyzed separately.  */
1362
1363   vect_analyze_scalar_cycles (loop_vinfo);
1364
1365   vect_pattern_recog (loop_vinfo);
1366
1367   /* Data-flow analysis to detect stmts that do not need to be vectorized.  */
1368
1369   ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
1370   if (!ok)
1371     {
1372       if (vect_print_dump_info (REPORT_DETAILS))
1373         fprintf (vect_dump, "unexpected pattern.");
1374       destroy_loop_vec_info (loop_vinfo, true);
1375       return NULL;
1376     }
1377
1378   /* Analyze the alignment of the data-refs in the loop.
1379      Fail if a data reference is found that cannot be vectorized.  */
1380
1381   ok = vect_analyze_data_refs_alignment (loop_vinfo, NULL);
1382   if (!ok)
1383     {
1384       if (vect_print_dump_info (REPORT_DETAILS))
1385         fprintf (vect_dump, "bad data alignment.");
1386       destroy_loop_vec_info (loop_vinfo, true);
1387       return NULL;
1388     }
1389
1390   ok = vect_determine_vectorization_factor (loop_vinfo);
1391   if (!ok)
1392     {
1393       if (vect_print_dump_info (REPORT_DETAILS))
1394         fprintf (vect_dump, "can't determine vectorization factor.");
1395       destroy_loop_vec_info (loop_vinfo, true);
1396       return NULL;
1397     }
1398
1399   /* Analyze data dependences between the data-refs in the loop.
1400      FORNOW: fail at the first data dependence that we encounter.  */
1401
1402   ok = vect_analyze_data_ref_dependences (loop_vinfo, NULL);
1403   if (!ok)
1404     {
1405       if (vect_print_dump_info (REPORT_DETAILS))
1406         fprintf (vect_dump, "bad data dependence.");
1407       destroy_loop_vec_info (loop_vinfo, true);
1408       return NULL;
1409     }
1410
1411   /* Analyze the access patterns of the data-refs in the loop (consecutive,
1412      complex, etc.). FORNOW: Only handle consecutive access pattern.  */
1413
1414   ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
1415   if (!ok)
1416     {
1417       if (vect_print_dump_info (REPORT_DETAILS))
1418         fprintf (vect_dump, "bad data access.");
1419       destroy_loop_vec_info (loop_vinfo, true);
1420       return NULL;
1421     }
1422
1423   /* Prune the list of ddrs to be tested at run-time by versioning for alias.
1424      It is important to call pruning after vect_analyze_data_ref_accesses,
1425      since we use grouping information gathered by interleaving analysis.  */
1426   ok = vect_prune_runtime_alias_test_list (loop_vinfo);
1427   if (!ok)
1428     {
1429       if (vect_print_dump_info (REPORT_DETAILS))
1430         fprintf (vect_dump, "too long list of versioning for alias "
1431                             "run-time tests.");
1432       destroy_loop_vec_info (loop_vinfo, true);
1433       return NULL;
1434     }
1435
1436   /* Check the SLP opportunities in the loop, analyze and build SLP trees.  */
1437   ok = vect_analyze_slp (loop_vinfo, NULL);
1438   if (ok)
1439     {
1440       /* Decide which possible SLP instances to SLP.  */
1441       vect_make_slp_decision (loop_vinfo);
1442
1443       /* Find stmts that need to be both vectorized and SLPed.  */
1444       vect_detect_hybrid_slp (loop_vinfo);
1445     }
1446
1447   /* This pass will decide on using loop versioning and/or loop peeling in
1448      order to enhance the alignment of data references in the loop.  */
1449
1450   ok = vect_enhance_data_refs_alignment (loop_vinfo);
1451   if (!ok)
1452     {
1453       if (vect_print_dump_info (REPORT_DETAILS))
1454         fprintf (vect_dump, "bad data alignment.");
1455       destroy_loop_vec_info (loop_vinfo, true);
1456       return NULL;
1457     }
1458
1459   /* Scan all the operations in the loop and make sure they are
1460      vectorizable.  */
1461
1462   ok = vect_analyze_loop_operations (loop_vinfo);
1463   if (!ok)
1464     {
1465       if (vect_print_dump_info (REPORT_DETAILS))
1466         fprintf (vect_dump, "bad operation or unsupported loop bound.");
1467       destroy_loop_vec_info (loop_vinfo, true);
1468       return NULL;
1469     }
1470
1471   LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
1472
1473   return loop_vinfo;
1474 }
1475
1476
1477 /* Function reduction_code_for_scalar_code
1478
1479    Input:
1480    CODE - tree_code of a reduction operations.
1481
1482    Output:
1483    REDUC_CODE - the corresponding tree-code to be used to reduce the
1484       vector of partial results into a single scalar result (which
1485       will also reside in a vector) or ERROR_MARK if the operation is
1486       a supported reduction operation, but does not have such tree-code.
1487
1488    Return FALSE if CODE currently cannot be vectorized as reduction.  */
1489
1490 static bool
1491 reduction_code_for_scalar_code (enum tree_code code,
1492                                 enum tree_code *reduc_code)
1493 {
1494   switch (code)
1495     {
1496       case MAX_EXPR:
1497         *reduc_code = REDUC_MAX_EXPR;
1498         return true;
1499
1500       case MIN_EXPR:
1501         *reduc_code = REDUC_MIN_EXPR;
1502         return true;
1503
1504       case PLUS_EXPR:
1505         *reduc_code = REDUC_PLUS_EXPR;
1506         return true;
1507
1508       case MULT_EXPR:
1509       case MINUS_EXPR:
1510       case BIT_IOR_EXPR:
1511       case BIT_XOR_EXPR:
1512       case BIT_AND_EXPR:
1513         *reduc_code = ERROR_MARK;
1514         return true;
1515
1516       default:
1517        return false;
1518     }
1519 }
1520
1521
1522 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
1523    STMT is printed with a message MSG. */
1524
1525 static void
1526 report_vect_op (gimple stmt, const char *msg)
1527 {
1528   fprintf (vect_dump, "%s", msg);
1529   print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
1530 }
1531
1532
1533 /* Function vect_is_simple_reduction
1534
1535    (1) Detect a cross-iteration def-use cycle that represents a simple
1536    reduction computation. We look for the following pattern:
1537
1538    loop_header:
1539      a1 = phi < a0, a2 >
1540      a3 = ...
1541      a2 = operation (a3, a1)
1542
1543    such that:
1544    1. operation is commutative and associative and it is safe to
1545       change the order of the computation (if CHECK_REDUCTION is true)
1546    2. no uses for a2 in the loop (a2 is used out of the loop)
1547    3. no uses of a1 in the loop besides the reduction operation.
1548
1549    Condition 1 is tested here.
1550    Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
1551
1552    (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
1553    nested cycles, if CHECK_REDUCTION is false.
1554
1555    (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
1556    reductions:
1557
1558      a1 = phi < a0, a2 >
1559      inner loop (def of a3)
1560      a2 = phi < a3 >
1561 */
1562
1563 gimple
1564 vect_is_simple_reduction (loop_vec_info loop_info, gimple phi,
1565                           bool check_reduction, bool *double_reduc)
1566 {
1567   struct loop *loop = (gimple_bb (phi))->loop_father;
1568   struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
1569   edge latch_e = loop_latch_edge (loop);
1570   tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
1571   gimple def_stmt, def1 = NULL, def2 = NULL;
1572   enum tree_code code;
1573   tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
1574   tree type;
1575   int nloop_uses;
1576   tree name;
1577   imm_use_iterator imm_iter;
1578   use_operand_p use_p;
1579   bool phi_def;
1580
1581   *double_reduc = false;
1582
1583   /* If CHECK_REDUCTION is true, we assume inner-most loop vectorization,
1584      otherwise, we assume outer loop vectorization.  */
1585   gcc_assert ((check_reduction && loop == vect_loop)
1586               || (!check_reduction && flow_loop_nested_p (vect_loop, loop)));
1587
1588   name = PHI_RESULT (phi);
1589   nloop_uses = 0;
1590   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
1591     {
1592       gimple use_stmt = USE_STMT (use_p);
1593       if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
1594           && vinfo_for_stmt (use_stmt)
1595           && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
1596         nloop_uses++;
1597       if (nloop_uses > 1)
1598         {
1599           if (vect_print_dump_info (REPORT_DETAILS))
1600             fprintf (vect_dump, "reduction used in loop.");
1601           return NULL;
1602         }
1603     }
1604
1605   if (TREE_CODE (loop_arg) != SSA_NAME)
1606     {
1607       if (vect_print_dump_info (REPORT_DETAILS))
1608         {
1609           fprintf (vect_dump, "reduction: not ssa_name: ");
1610           print_generic_expr (vect_dump, loop_arg, TDF_SLIM);
1611         }
1612       return NULL;
1613     }
1614
1615   def_stmt = SSA_NAME_DEF_STMT (loop_arg);
1616   if (!def_stmt)
1617     {
1618       if (vect_print_dump_info (REPORT_DETAILS))
1619         fprintf (vect_dump, "reduction: no def_stmt.");
1620       return NULL;
1621     }
1622
1623   if (!is_gimple_assign (def_stmt) && gimple_code (def_stmt) != GIMPLE_PHI)
1624     {
1625       if (vect_print_dump_info (REPORT_DETAILS))
1626         print_gimple_stmt (vect_dump, def_stmt, 0, TDF_SLIM);
1627       return NULL;
1628     }
1629
1630   if (is_gimple_assign (def_stmt))
1631     {
1632       name = gimple_assign_lhs (def_stmt);
1633       phi_def = false;
1634     }
1635   else
1636     {
1637       name = PHI_RESULT (def_stmt);
1638       phi_def = true;
1639     }
1640
1641   nloop_uses = 0;
1642   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
1643     {
1644       gimple use_stmt = USE_STMT (use_p);
1645       if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
1646           && vinfo_for_stmt (use_stmt)
1647           && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
1648         nloop_uses++;
1649       if (nloop_uses > 1)
1650         {
1651           if (vect_print_dump_info (REPORT_DETAILS))
1652             fprintf (vect_dump, "reduction used in loop.");
1653           return NULL;
1654         }
1655     }
1656
1657   /* If DEF_STMT is a phi node itself, we expect it to have a single argument
1658      defined in the inner loop.  */
1659   if (phi_def)
1660     {
1661       op1 = PHI_ARG_DEF (def_stmt, 0);
1662
1663       if (gimple_phi_num_args (def_stmt) != 1
1664           || TREE_CODE (op1) != SSA_NAME)
1665         {
1666           if (vect_print_dump_info (REPORT_DETAILS))
1667             fprintf (vect_dump, "unsupported phi node definition.");
1668
1669           return NULL;
1670         }
1671
1672       def1 = SSA_NAME_DEF_STMT (op1);
1673       if (flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
1674           && loop->inner
1675           && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
1676           && is_gimple_assign (def1))
1677         {
1678           if (vect_print_dump_info (REPORT_DETAILS))
1679             report_vect_op (def_stmt, "detected double reduction: ");
1680
1681           *double_reduc = true;
1682           return def_stmt;
1683         }
1684
1685       return NULL;
1686     }
1687
1688   code = gimple_assign_rhs_code (def_stmt);
1689
1690   if (check_reduction
1691       && (!commutative_tree_code (code) || !associative_tree_code (code)))
1692     {
1693       if (vect_print_dump_info (REPORT_DETAILS))
1694         report_vect_op (def_stmt, "reduction: not commutative/associative: ");
1695       return NULL;
1696     }
1697
1698   if (get_gimple_rhs_class (code) != GIMPLE_BINARY_RHS)
1699     {
1700       if (code != COND_EXPR)
1701         {
1702           if (vect_print_dump_info (REPORT_DETAILS))
1703             report_vect_op (def_stmt, "reduction: not binary operation: ");
1704
1705           return NULL;
1706         }
1707
1708       op3 = TREE_OPERAND (TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0), 0);
1709       op4 = TREE_OPERAND (TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0), 1);
1710       op1 = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 1);
1711       op2 = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 2);
1712
1713       if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
1714         {
1715           if (vect_print_dump_info (REPORT_DETAILS))
1716             report_vect_op (def_stmt, "reduction: uses not ssa_names: ");
1717
1718           return NULL;
1719         }
1720     }
1721   else
1722     {
1723       op1 = gimple_assign_rhs1 (def_stmt);
1724       op2 = gimple_assign_rhs2 (def_stmt);
1725
1726       if (TREE_CODE (op1) != SSA_NAME || TREE_CODE (op2) != SSA_NAME)
1727         {
1728           if (vect_print_dump_info (REPORT_DETAILS))
1729             report_vect_op (def_stmt, "reduction: uses not ssa_names: ");
1730
1731           return NULL;
1732         }
1733    }
1734
1735   type = TREE_TYPE (gimple_assign_lhs (def_stmt));
1736   if ((TREE_CODE (op1) == SSA_NAME
1737        && TYPE_MAIN_VARIANT (type) != TYPE_MAIN_VARIANT (TREE_TYPE (op1)))
1738       || (TREE_CODE (op2) == SSA_NAME
1739           && TYPE_MAIN_VARIANT (type) != TYPE_MAIN_VARIANT (TREE_TYPE (op2)))
1740       || (op3 && TREE_CODE (op3) == SSA_NAME
1741           && TYPE_MAIN_VARIANT (type) != TYPE_MAIN_VARIANT (TREE_TYPE (op3)))
1742       || (op4 && TREE_CODE (op4) == SSA_NAME
1743           && TYPE_MAIN_VARIANT (type) != TYPE_MAIN_VARIANT (TREE_TYPE (op4))))
1744     {
1745       if (vect_print_dump_info (REPORT_DETAILS))
1746         {
1747           fprintf (vect_dump, "reduction: multiple types: operation type: ");
1748           print_generic_expr (vect_dump, type, TDF_SLIM);
1749           fprintf (vect_dump, ", operands types: ");
1750           print_generic_expr (vect_dump, TREE_TYPE (op1), TDF_SLIM);
1751           fprintf (vect_dump, ",");
1752           print_generic_expr (vect_dump, TREE_TYPE (op2), TDF_SLIM);
1753           if (op3 && op4)
1754             {
1755               fprintf (vect_dump, ",");
1756               print_generic_expr (vect_dump, TREE_TYPE (op3), TDF_SLIM);
1757               fprintf (vect_dump, ",");
1758               print_generic_expr (vect_dump, TREE_TYPE (op4), TDF_SLIM);
1759             }
1760         }
1761
1762       return NULL;
1763     }
1764
1765   /* Check that it's ok to change the order of the computation.
1766      Generally, when vectorizing a reduction we change the order of the
1767      computation.  This may change the behavior of the program in some
1768      cases, so we need to check that this is ok.  One exception is when
1769      vectorizing an outer-loop: the inner-loop is executed sequentially,
1770      and therefore vectorizing reductions in the inner-loop during
1771      outer-loop vectorization is safe.  */
1772
1773   /* CHECKME: check for !flag_finite_math_only too?  */
1774   if (SCALAR_FLOAT_TYPE_P (type) && !flag_associative_math
1775       && check_reduction)
1776     {
1777       /* Changing the order of operations changes the semantics.  */
1778       if (vect_print_dump_info (REPORT_DETAILS))
1779         report_vect_op (def_stmt, "reduction: unsafe fp math optimization: ");
1780       return NULL;
1781     }
1782   else if (INTEGRAL_TYPE_P (type) && TYPE_OVERFLOW_TRAPS (type)
1783            && check_reduction)
1784     {
1785       /* Changing the order of operations changes the semantics.  */
1786       if (vect_print_dump_info (REPORT_DETAILS))
1787         report_vect_op (def_stmt, "reduction: unsafe int math optimization: ");
1788       return NULL;
1789     }
1790   else if (SAT_FIXED_POINT_TYPE_P (type) && check_reduction)
1791     {
1792       /* Changing the order of operations changes the semantics.  */
1793       if (vect_print_dump_info (REPORT_DETAILS))
1794         report_vect_op (def_stmt,
1795                         "reduction: unsafe fixed-point math optimization: ");
1796       return NULL;
1797     }
1798
1799   /* Reduction is safe. We're dealing with one of the following:
1800      1) integer arithmetic and no trapv
1801      2) floating point arithmetic, and special flags permit this optimization
1802      3) nested cycle (i.e., outer loop vectorization).  */
1803   if (TREE_CODE (op1) == SSA_NAME)
1804     def1 = SSA_NAME_DEF_STMT (op1);
1805
1806   if (TREE_CODE (op2) == SSA_NAME)
1807     def2 = SSA_NAME_DEF_STMT (op2);
1808
1809   if (code != COND_EXPR
1810       && (!def1 || !def2 || gimple_nop_p (def1) || gimple_nop_p (def2)))
1811     {
1812       if (vect_print_dump_info (REPORT_DETAILS))
1813         report_vect_op (def_stmt, "reduction: no defs for operands: ");
1814       return NULL;
1815     }
1816
1817   /* Check that one def is the reduction def, defined by PHI,
1818      the other def is either defined in the loop ("vect_internal_def"),
1819      or it's an induction (defined by a loop-header phi-node).  */
1820
1821   if (def2 && def2 == phi
1822       && (code == COND_EXPR
1823           || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1))
1824               && (is_gimple_assign (def1)
1825                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
1826                       == vect_induction_def
1827                   || (gimple_code (def1) == GIMPLE_PHI
1828                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
1829                           == vect_internal_def
1830                       && !is_loop_header_bb_p (gimple_bb (def1)))))))
1831     {
1832       if (vect_print_dump_info (REPORT_DETAILS))
1833         report_vect_op (def_stmt, "detected reduction: ");
1834       return def_stmt;
1835     }
1836   else if (def1 && def1 == phi
1837            && (code == COND_EXPR
1838                || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2))
1839                    && (is_gimple_assign (def2)
1840                        || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
1841                            == vect_induction_def
1842                        || (gimple_code (def2) == GIMPLE_PHI
1843                            && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
1844                                == vect_internal_def
1845                            && !is_loop_header_bb_p (gimple_bb (def2)))))))
1846     {
1847       if (check_reduction)
1848         {
1849           /* Swap operands (just for simplicity - so that the rest of the code
1850              can assume that the reduction variable is always the last (second)
1851              argument).  */
1852           if (vect_print_dump_info (REPORT_DETAILS))
1853             report_vect_op (def_stmt,
1854                             "detected reduction: need to swap operands: ");
1855
1856           swap_tree_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
1857                               gimple_assign_rhs2_ptr (def_stmt));
1858         }
1859       else
1860         {
1861           if (vect_print_dump_info (REPORT_DETAILS))
1862             report_vect_op (def_stmt, "detected reduction: ");
1863         }
1864
1865       return def_stmt;
1866     }
1867   else
1868     {
1869       if (vect_print_dump_info (REPORT_DETAILS))
1870         report_vect_op (def_stmt, "reduction: unknown pattern: ");
1871
1872       return NULL;
1873     }
1874 }
1875
1876
1877 /* Function vect_estimate_min_profitable_iters
1878
1879    Return the number of iterations required for the vector version of the
1880    loop to be profitable relative to the cost of the scalar version of the
1881    loop.
1882
1883    TODO: Take profile info into account before making vectorization
1884    decisions, if available.  */
1885
1886 int
1887 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo)
1888 {
1889   int i;
1890   int min_profitable_iters;
1891   int peel_iters_prologue;
1892   int peel_iters_epilogue;
1893   int vec_inside_cost = 0;
1894   int vec_outside_cost = 0;
1895   int scalar_single_iter_cost = 0;
1896   int scalar_outside_cost = 0;
1897   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1898   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1899   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1900   int nbbs = loop->num_nodes;
1901   int byte_misalign = LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo);
1902   int peel_guard_costs = 0;
1903   int innerloop_iters = 0, factor;
1904   VEC (slp_instance, heap) *slp_instances;
1905   slp_instance instance;
1906
1907   /* Cost model disabled.  */
1908   if (!flag_vect_cost_model)
1909     {
1910       if (vect_print_dump_info (REPORT_COST))
1911         fprintf (vect_dump, "cost model disabled.");
1912       return 0;
1913     }
1914
1915   /* Requires loop versioning tests to handle misalignment.  */
1916   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
1917     {
1918       /*  FIXME: Make cost depend on complexity of individual check.  */
1919       vec_outside_cost +=
1920         VEC_length (gimple, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));
1921       if (vect_print_dump_info (REPORT_COST))
1922         fprintf (vect_dump, "cost model: Adding cost of checks for loop "
1923                  "versioning to treat misalignment.\n");
1924     }
1925
1926   /* Requires loop versioning with alias checks.  */
1927   if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
1928     {
1929       /*  FIXME: Make cost depend on complexity of individual check.  */
1930       vec_outside_cost +=
1931         VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo));
1932       if (vect_print_dump_info (REPORT_COST))
1933         fprintf (vect_dump, "cost model: Adding cost of checks for loop "
1934                  "versioning aliasing.\n");
1935     }
1936
1937   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
1938       || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
1939     vec_outside_cost += TARG_COND_TAKEN_BRANCH_COST;
1940
1941   /* Count statements in scalar loop.  Using this as scalar cost for a single
1942      iteration for now.
1943
1944      TODO: Add outer loop support.
1945
1946      TODO: Consider assigning different costs to different scalar
1947      statements.  */
1948
1949   /* FORNOW.  */
1950   if (loop->inner)
1951     innerloop_iters = 50; /* FIXME */
1952
1953   for (i = 0; i < nbbs; i++)
1954     {
1955       gimple_stmt_iterator si;
1956       basic_block bb = bbs[i];
1957
1958       if (bb->loop_father == loop->inner)
1959         factor = innerloop_iters;
1960       else
1961         factor = 1;
1962
1963       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1964         {
1965           gimple stmt = gsi_stmt (si);
1966           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1967           /* Skip stmts that are not vectorized inside the loop.  */
1968           if (!STMT_VINFO_RELEVANT_P (stmt_info)
1969               && (!STMT_VINFO_LIVE_P (stmt_info)
1970                   || STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def))
1971             continue;
1972           scalar_single_iter_cost += cost_for_stmt (stmt) * factor;
1973           vec_inside_cost += STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) * factor;
1974           /* FIXME: for stmts in the inner-loop in outer-loop vectorization,
1975              some of the "outside" costs are generated inside the outer-loop.  */
1976           vec_outside_cost += STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info);
1977         }
1978     }
1979
1980   /* Add additional cost for the peeled instructions in prologue and epilogue
1981      loop.
1982
1983      FORNOW: If we don't know the value of peel_iters for prologue or epilogue
1984      at compile-time - we assume it's vf/2 (the worst would be vf-1).
1985
1986      TODO: Build an expression that represents peel_iters for prologue and
1987      epilogue to be used in a run-time test.  */
1988
1989   if (byte_misalign < 0)
1990     {
1991       peel_iters_prologue = vf/2;
1992       if (vect_print_dump_info (REPORT_COST))
1993         fprintf (vect_dump, "cost model: "
1994                  "prologue peel iters set to vf/2.");
1995
1996       /* If peeling for alignment is unknown, loop bound of main loop becomes
1997          unknown.  */
1998       peel_iters_epilogue = vf/2;
1999       if (vect_print_dump_info (REPORT_COST))
2000         fprintf (vect_dump, "cost model: "
2001                  "epilogue peel iters set to vf/2 because "
2002                  "peeling for alignment is unknown .");
2003
2004       /* If peeled iterations are unknown, count a taken branch and a not taken
2005          branch per peeled loop. Even if scalar loop iterations are known,
2006          vector iterations are not known since peeled prologue iterations are
2007          not known. Hence guards remain the same.  */
2008       peel_guard_costs +=  2 * (TARG_COND_TAKEN_BRANCH_COST
2009                               + TARG_COND_NOT_TAKEN_BRANCH_COST);
2010     }
2011   else
2012     {
2013       if (byte_misalign)
2014         {
2015           struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2016           int element_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
2017           tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr)));
2018           int nelements = TYPE_VECTOR_SUBPARTS (vectype);
2019
2020           peel_iters_prologue = nelements - (byte_misalign / element_size);
2021         }
2022       else
2023         peel_iters_prologue = 0;
2024
2025       if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2026         {
2027           peel_iters_epilogue = vf/2;
2028           if (vect_print_dump_info (REPORT_COST))
2029             fprintf (vect_dump, "cost model: "
2030                      "epilogue peel iters set to vf/2 because "
2031                      "loop iterations are unknown .");
2032
2033           /* If peeled iterations are known but number of scalar loop
2034              iterations are unknown, count a taken branch per peeled loop.  */
2035           peel_guard_costs +=  2 * TARG_COND_TAKEN_BRANCH_COST;
2036
2037         }
2038       else
2039         {
2040           int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
2041           peel_iters_prologue = niters < peel_iters_prologue ?
2042                                         niters : peel_iters_prologue;
2043           peel_iters_epilogue = (niters - peel_iters_prologue) % vf;
2044         }
2045     }
2046
2047   vec_outside_cost += (peel_iters_prologue * scalar_single_iter_cost)
2048                       + (peel_iters_epilogue * scalar_single_iter_cost)
2049                       + peel_guard_costs;
2050
2051   /* FORNOW: The scalar outside cost is incremented in one of the
2052      following ways:
2053
2054      1. The vectorizer checks for alignment and aliasing and generates
2055      a condition that allows dynamic vectorization.  A cost model
2056      check is ANDED with the versioning condition.  Hence scalar code
2057      path now has the added cost of the versioning check.
2058
2059        if (cost > th & versioning_check)
2060          jmp to vector code
2061
2062      Hence run-time scalar is incremented by not-taken branch cost.
2063
2064      2. The vectorizer then checks if a prologue is required.  If the
2065      cost model check was not done before during versioning, it has to
2066      be done before the prologue check.
2067
2068        if (cost <= th)
2069          prologue = scalar_iters
2070        if (prologue == 0)
2071          jmp to vector code
2072        else
2073          execute prologue
2074        if (prologue == num_iters)
2075          go to exit
2076
2077      Hence the run-time scalar cost is incremented by a taken branch,
2078      plus a not-taken branch, plus a taken branch cost.
2079
2080      3. The vectorizer then checks if an epilogue is required.  If the
2081      cost model check was not done before during prologue check, it
2082      has to be done with the epilogue check.
2083
2084        if (prologue == 0)
2085          jmp to vector code
2086        else
2087          execute prologue
2088        if (prologue == num_iters)
2089          go to exit
2090        vector code:
2091          if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
2092            jmp to epilogue
2093
2094      Hence the run-time scalar cost should be incremented by 2 taken
2095      branches.
2096
2097      TODO: The back end may reorder the BBS's differently and reverse
2098      conditions/branch directions.  Change the estimates below to
2099      something more reasonable.  */
2100
2101   /* If the number of iterations is known and we do not do versioning, we can
2102      decide whether to vectorize at compile time. Hence the scalar version
2103      do not carry cost model guard costs.  */
2104   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2105       || LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2106       || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2107     {
2108       /* Cost model check occurs at versioning.  */
2109       if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2110           || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2111         scalar_outside_cost += TARG_COND_NOT_TAKEN_BRANCH_COST;
2112       else
2113         {
2114           /* Cost model check occurs at prologue generation.  */
2115           if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2116             scalar_outside_cost += 2 * TARG_COND_TAKEN_BRANCH_COST
2117               + TARG_COND_NOT_TAKEN_BRANCH_COST;
2118           /* Cost model check occurs at epilogue generation.  */
2119           else
2120             scalar_outside_cost += 2 * TARG_COND_TAKEN_BRANCH_COST;
2121         }
2122     }
2123
2124   /* Add SLP costs.  */
2125   slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
2126   for (i = 0; VEC_iterate (slp_instance, slp_instances, i, instance); i++)
2127     {
2128       vec_outside_cost += SLP_INSTANCE_OUTSIDE_OF_LOOP_COST (instance);
2129       vec_inside_cost += SLP_INSTANCE_INSIDE_OF_LOOP_COST (instance);
2130     }
2131
2132   /* Calculate number of iterations required to make the vector version
2133      profitable, relative to the loop bodies only. The following condition
2134      must hold true:
2135      SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
2136      where
2137      SIC = scalar iteration cost, VIC = vector iteration cost,
2138      VOC = vector outside cost, VF = vectorization factor,
2139      PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
2140      SOC = scalar outside cost for run time cost model check.  */
2141
2142   if ((scalar_single_iter_cost * vf) > vec_inside_cost)
2143     {
2144       if (vec_outside_cost <= 0)
2145         min_profitable_iters = 1;
2146       else
2147         {
2148           min_profitable_iters = ((vec_outside_cost - scalar_outside_cost) * vf
2149                                   - vec_inside_cost * peel_iters_prologue
2150                                   - vec_inside_cost * peel_iters_epilogue)
2151                                  / ((scalar_single_iter_cost * vf)
2152                                     - vec_inside_cost);
2153
2154           if ((scalar_single_iter_cost * vf * min_profitable_iters)
2155               <= ((vec_inside_cost * min_profitable_iters)
2156                   + ((vec_outside_cost - scalar_outside_cost) * vf)))
2157             min_profitable_iters++;
2158         }
2159     }
2160   /* vector version will never be profitable.  */
2161   else
2162     {
2163       if (vect_print_dump_info (REPORT_COST))
2164         fprintf (vect_dump, "cost model: vector iteration cost = %d "
2165                  "is divisible by scalar iteration cost = %d by a factor "
2166                  "greater than or equal to the vectorization factor = %d .",
2167                  vec_inside_cost, scalar_single_iter_cost, vf);
2168       return -1;
2169     }
2170
2171   if (vect_print_dump_info (REPORT_COST))
2172     {
2173       fprintf (vect_dump, "Cost model analysis: \n");
2174       fprintf (vect_dump, "  Vector inside of loop cost: %d\n",
2175                vec_inside_cost);
2176       fprintf (vect_dump, "  Vector outside of loop cost: %d\n",
2177                vec_outside_cost);
2178       fprintf (vect_dump, "  Scalar iteration cost: %d\n",
2179                scalar_single_iter_cost);
2180       fprintf (vect_dump, "  Scalar outside cost: %d\n", scalar_outside_cost);
2181       fprintf (vect_dump, "  prologue iterations: %d\n",
2182                peel_iters_prologue);
2183       fprintf (vect_dump, "  epilogue iterations: %d\n",
2184                peel_iters_epilogue);
2185       fprintf (vect_dump, "  Calculated minimum iters for profitability: %d\n",
2186                min_profitable_iters);
2187     }
2188
2189   min_profitable_iters =
2190         min_profitable_iters < vf ? vf : min_profitable_iters;
2191
2192   /* Because the condition we create is:
2193      if (niters <= min_profitable_iters)
2194        then skip the vectorized loop.  */
2195   min_profitable_iters--;
2196
2197   if (vect_print_dump_info (REPORT_COST))
2198     fprintf (vect_dump, "  Profitability threshold = %d\n",
2199              min_profitable_iters);
2200
2201   return min_profitable_iters;
2202 }
2203
2204
2205 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
2206    functions. Design better to avoid maintenance issues.  */
2207
2208 /* Function vect_model_reduction_cost.
2209
2210    Models cost for a reduction operation, including the vector ops
2211    generated within the strip-mine loop, the initial definition before
2212    the loop, and the epilogue code that must be generated.  */
2213
2214 static bool
2215 vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
2216                            int ncopies)
2217 {
2218   int outer_cost = 0;
2219   enum tree_code code;
2220   optab optab;
2221   tree vectype;
2222   gimple stmt, orig_stmt;
2223   tree reduction_op;
2224   enum machine_mode mode;
2225   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2226   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2227
2228
2229   /* Cost of reduction op inside loop.  */
2230   STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) += ncopies * TARG_VEC_STMT_COST;
2231
2232   stmt = STMT_VINFO_STMT (stmt_info);
2233
2234   switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
2235     {
2236     case GIMPLE_SINGLE_RHS:
2237       gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt)) == ternary_op);
2238       reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), 2);
2239       break;
2240     case GIMPLE_UNARY_RHS:
2241       reduction_op = gimple_assign_rhs1 (stmt);
2242       break;
2243     case GIMPLE_BINARY_RHS:
2244       reduction_op = gimple_assign_rhs2 (stmt);
2245       break;
2246     default:
2247       gcc_unreachable ();
2248     }
2249
2250   vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
2251   if (!vectype)
2252     {
2253       if (vect_print_dump_info (REPORT_COST))
2254         {
2255           fprintf (vect_dump, "unsupported data-type ");
2256           print_generic_expr (vect_dump, TREE_TYPE (reduction_op), TDF_SLIM);
2257         }
2258       return false;
2259    }
2260
2261   mode = TYPE_MODE (vectype);
2262   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
2263
2264   if (!orig_stmt)
2265     orig_stmt = STMT_VINFO_STMT (stmt_info);
2266
2267   code = gimple_assign_rhs_code (orig_stmt);
2268
2269   /* Add in cost for initial definition.  */
2270   outer_cost += TARG_SCALAR_TO_VEC_COST;
2271
2272   /* Determine cost of epilogue code.
2273
2274      We have a reduction operator that will reduce the vector in one statement.
2275      Also requires scalar extract.  */
2276
2277   if (!nested_in_vect_loop_p (loop, orig_stmt))
2278     {
2279       if (reduc_code != ERROR_MARK)
2280         outer_cost += TARG_VEC_STMT_COST + TARG_VEC_TO_SCALAR_COST;
2281       else
2282         {
2283           int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
2284           tree bitsize =
2285             TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
2286           int element_bitsize = tree_low_cst (bitsize, 1);
2287           int nelements = vec_size_in_bits / element_bitsize;
2288
2289           optab = optab_for_tree_code (code, vectype, optab_default);
2290
2291           /* We have a whole vector shift available.  */
2292           if (VECTOR_MODE_P (mode)
2293               && optab_handler (optab, mode)->insn_code != CODE_FOR_nothing
2294               && optab_handler (vec_shr_optab, mode)->insn_code != CODE_FOR_nothing)
2295             /* Final reduction via vector shifts and the reduction operator. Also
2296                requires scalar extract.  */
2297             outer_cost += ((exact_log2(nelements) * 2) * TARG_VEC_STMT_COST
2298                                 + TARG_VEC_TO_SCALAR_COST);
2299           else
2300             /* Use extracts and reduction op for final reduction.  For N elements,
2301                we have N extracts and N-1 reduction ops.  */
2302             outer_cost += ((nelements + nelements - 1) * TARG_VEC_STMT_COST);
2303         }
2304     }
2305
2306   STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info) = outer_cost;
2307
2308   if (vect_print_dump_info (REPORT_COST))
2309     fprintf (vect_dump, "vect_model_reduction_cost: inside_cost = %d, "
2310              "outside_cost = %d .", STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info),
2311              STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info));
2312
2313   return true;
2314 }
2315
2316
2317 /* Function vect_model_induction_cost.
2318
2319    Models cost for induction operations.  */
2320
2321 static void
2322 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
2323 {
2324   /* loop cost for vec_loop.  */
2325   STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) = ncopies * TARG_VEC_STMT_COST;
2326   /* prologue cost for vec_init and vec_step.  */
2327   STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info) = 2 * TARG_SCALAR_TO_VEC_COST;
2328
2329   if (vect_print_dump_info (REPORT_COST))
2330     fprintf (vect_dump, "vect_model_induction_cost: inside_cost = %d, "
2331              "outside_cost = %d .", STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info),
2332              STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info));
2333 }
2334
2335
2336 /* Function get_initial_def_for_induction
2337
2338    Input:
2339    STMT - a stmt that performs an induction operation in the loop.
2340    IV_PHI - the initial value of the induction variable
2341
2342    Output:
2343    Return a vector variable, initialized with the first VF values of
2344    the induction variable. E.g., for an iv with IV_PHI='X' and
2345    evolution S, for a vector of 4 units, we want to return:
2346    [X, X + S, X + 2*S, X + 3*S].  */
2347
2348 static tree
2349 get_initial_def_for_induction (gimple iv_phi)
2350 {
2351   stmt_vec_info stmt_vinfo = vinfo_for_stmt (iv_phi);
2352   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
2353   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2354   tree scalar_type = TREE_TYPE (gimple_phi_result (iv_phi));
2355   tree vectype;
2356   int nunits;
2357   edge pe = loop_preheader_edge (loop);
2358   struct loop *iv_loop;
2359   basic_block new_bb;
2360   tree vec, vec_init, vec_step, t;
2361   tree access_fn;
2362   tree new_var;
2363   tree new_name;
2364   gimple init_stmt, induction_phi, new_stmt;
2365   tree induc_def, vec_def, vec_dest;
2366   tree init_expr, step_expr;
2367   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2368   int i;
2369   bool ok;
2370   int ncopies;
2371   tree expr;
2372   stmt_vec_info phi_info = vinfo_for_stmt (iv_phi);
2373   bool nested_in_vect_loop = false;
2374   gimple_seq stmts = NULL;
2375   imm_use_iterator imm_iter;
2376   use_operand_p use_p;
2377   gimple exit_phi;
2378   edge latch_e;
2379   tree loop_arg;
2380   gimple_stmt_iterator si;
2381   basic_block bb = gimple_bb (iv_phi);
2382   tree stepvectype;
2383
2384   vectype = get_vectype_for_scalar_type (scalar_type);
2385   gcc_assert (vectype);
2386   nunits = TYPE_VECTOR_SUBPARTS (vectype);
2387   ncopies = vf / nunits;
2388
2389   gcc_assert (phi_info);
2390   gcc_assert (ncopies >= 1);
2391
2392   /* Find the first insertion point in the BB.  */
2393   si = gsi_after_labels (bb);
2394
2395   if (INTEGRAL_TYPE_P (scalar_type))
2396     step_expr = build_int_cst (scalar_type, 0);
2397   else if (POINTER_TYPE_P (scalar_type))
2398     step_expr = build_int_cst (sizetype, 0);
2399   else
2400     step_expr = build_real (scalar_type, dconst0);
2401
2402   /* Is phi in an inner-loop, while vectorizing an enclosing outer-loop?  */
2403   if (nested_in_vect_loop_p (loop, iv_phi))
2404     {
2405       nested_in_vect_loop = true;
2406       iv_loop = loop->inner;
2407     }
2408   else
2409     iv_loop = loop;
2410   gcc_assert (iv_loop == (gimple_bb (iv_phi))->loop_father);
2411
2412   latch_e = loop_latch_edge (iv_loop);
2413   loop_arg = PHI_ARG_DEF_FROM_EDGE (iv_phi, latch_e);
2414
2415   access_fn = analyze_scalar_evolution (iv_loop, PHI_RESULT (iv_phi));
2416   gcc_assert (access_fn);
2417   ok = vect_is_simple_iv_evolution (iv_loop->num, access_fn,
2418                                     &init_expr, &step_expr);
2419   gcc_assert (ok);
2420   pe = loop_preheader_edge (iv_loop);
2421
2422   /* Create the vector that holds the initial_value of the induction.  */
2423   if (nested_in_vect_loop)
2424     {
2425       /* iv_loop is nested in the loop to be vectorized.  init_expr had already
2426          been created during vectorization of previous stmts; We obtain it from
2427          the STMT_VINFO_VEC_STMT of the defining stmt. */
2428       tree iv_def = PHI_ARG_DEF_FROM_EDGE (iv_phi,
2429                                            loop_preheader_edge (iv_loop));
2430       vec_init = vect_get_vec_def_for_operand (iv_def, iv_phi, NULL);
2431     }
2432   else
2433     {
2434       /* iv_loop is the loop to be vectorized. Create:
2435          vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr)  */
2436       new_var = vect_get_new_vect_var (scalar_type, vect_scalar_var, "var_");
2437       add_referenced_var (new_var);
2438
2439       new_name = force_gimple_operand (init_expr, &stmts, false, new_var);
2440       if (stmts)
2441         {
2442           new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
2443           gcc_assert (!new_bb);
2444         }
2445
2446       t = NULL_TREE;
2447       t = tree_cons (NULL_TREE, init_expr, t);
2448       for (i = 1; i < nunits; i++)
2449         {
2450           /* Create: new_name_i = new_name + step_expr  */
2451           enum tree_code code = POINTER_TYPE_P (scalar_type)
2452                                 ? POINTER_PLUS_EXPR : PLUS_EXPR;
2453           init_stmt = gimple_build_assign_with_ops (code, new_var,
2454                                                     new_name, step_expr);
2455           new_name = make_ssa_name (new_var, init_stmt);
2456           gimple_assign_set_lhs (init_stmt, new_name);
2457
2458           new_bb = gsi_insert_on_edge_immediate (pe, init_stmt);
2459           gcc_assert (!new_bb);
2460
2461           if (vect_print_dump_info (REPORT_DETAILS))
2462             {
2463               fprintf (vect_dump, "created new init_stmt: ");
2464               print_gimple_stmt (vect_dump, init_stmt, 0, TDF_SLIM);
2465             }
2466           t = tree_cons (NULL_TREE, new_name, t);
2467         }
2468       /* Create a vector from [new_name_0, new_name_1, ..., new_name_nunits-1]  */
2469       vec = build_constructor_from_list (vectype, nreverse (t));
2470       vec_init = vect_init_vector (iv_phi, vec, vectype, NULL);
2471     }
2472
2473
2474   /* Create the vector that holds the step of the induction.  */
2475   if (nested_in_vect_loop)
2476     /* iv_loop is nested in the loop to be vectorized. Generate:
2477        vec_step = [S, S, S, S]  */
2478     new_name = step_expr;
2479   else
2480     {
2481       /* iv_loop is the loop to be vectorized. Generate:
2482           vec_step = [VF*S, VF*S, VF*S, VF*S]  */
2483       expr = build_int_cst (TREE_TYPE (step_expr), vf);
2484       new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
2485                               expr, step_expr);
2486     }
2487
2488   t = NULL_TREE;
2489   for (i = 0; i < nunits; i++)
2490     t = tree_cons (NULL_TREE, unshare_expr (new_name), t);
2491   gcc_assert (CONSTANT_CLASS_P (new_name));
2492   stepvectype = get_vectype_for_scalar_type (TREE_TYPE (new_name));
2493   gcc_assert (stepvectype);
2494   vec = build_vector (stepvectype, t);
2495   vec_step = vect_init_vector (iv_phi, vec, stepvectype, NULL);
2496
2497
2498   /* Create the following def-use cycle:
2499      loop prolog:
2500          vec_init = ...
2501          vec_step = ...
2502      loop:
2503          vec_iv = PHI <vec_init, vec_loop>
2504          ...
2505          STMT
2506          ...
2507          vec_loop = vec_iv + vec_step;  */
2508
2509   /* Create the induction-phi that defines the induction-operand.  */
2510   vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
2511   add_referenced_var (vec_dest);
2512   induction_phi = create_phi_node (vec_dest, iv_loop->header);
2513   set_vinfo_for_stmt (induction_phi,
2514                       new_stmt_vec_info (induction_phi, loop_vinfo, NULL));
2515   induc_def = PHI_RESULT (induction_phi);
2516
2517   /* Create the iv update inside the loop  */
2518   new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, vec_dest,
2519                                            induc_def, vec_step);
2520   vec_def = make_ssa_name (vec_dest, new_stmt);
2521   gimple_assign_set_lhs (new_stmt, vec_def);
2522   gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
2523   set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo,
2524                                                    NULL));
2525
2526   /* Set the arguments of the phi node:  */
2527   add_phi_arg (induction_phi, vec_init, pe);
2528   add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop));
2529
2530
2531   /* In case that vectorization factor (VF) is bigger than the number
2532      of elements that we can fit in a vectype (nunits), we have to generate
2533      more than one vector stmt - i.e - we need to "unroll" the
2534      vector stmt by a factor VF/nunits.  For more details see documentation
2535      in vectorizable_operation.  */
2536
2537   if (ncopies > 1)
2538     {
2539       stmt_vec_info prev_stmt_vinfo;
2540       /* FORNOW. This restriction should be relaxed.  */
2541       gcc_assert (!nested_in_vect_loop);
2542
2543       /* Create the vector that holds the step of the induction.  */
2544       expr = build_int_cst (TREE_TYPE (step_expr), nunits);
2545       new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
2546                               expr, step_expr);
2547       t = NULL_TREE;
2548       for (i = 0; i < nunits; i++)
2549         t = tree_cons (NULL_TREE, unshare_expr (new_name), t);
2550       gcc_assert (CONSTANT_CLASS_P (new_name));
2551       vec = build_vector (stepvectype, t);
2552       vec_step = vect_init_vector (iv_phi, vec, stepvectype, NULL);
2553
2554       vec_def = induc_def;
2555       prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
2556       for (i = 1; i < ncopies; i++)
2557         {
2558           /* vec_i = vec_prev + vec_step  */
2559           new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, vec_dest,
2560                                                    vec_def, vec_step);
2561           vec_def = make_ssa_name (vec_dest, new_stmt);
2562           gimple_assign_set_lhs (new_stmt, vec_def);
2563
2564           gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
2565           set_vinfo_for_stmt (new_stmt,
2566                               new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
2567           STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
2568           prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
2569         }
2570     }
2571
2572   if (nested_in_vect_loop)
2573     {
2574       /* Find the loop-closed exit-phi of the induction, and record
2575          the final vector of induction results:  */
2576       exit_phi = NULL;
2577       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
2578         {
2579           if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (USE_STMT (use_p))))
2580             {
2581               exit_phi = USE_STMT (use_p);
2582               break;
2583             }
2584         }
2585       if (exit_phi)
2586         {
2587           stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
2588           /* FORNOW. Currently not supporting the case that an inner-loop induction
2589              is not used in the outer-loop (i.e. only outside the outer-loop).  */
2590           gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
2591                       && !STMT_VINFO_LIVE_P (stmt_vinfo));
2592
2593           STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
2594           if (vect_print_dump_info (REPORT_DETAILS))
2595             {
2596               fprintf (vect_dump, "vector of inductions after inner-loop:");
2597               print_gimple_stmt (vect_dump, new_stmt, 0, TDF_SLIM);
2598             }
2599         }
2600     }
2601
2602
2603   if (vect_print_dump_info (REPORT_DETAILS))
2604     {
2605       fprintf (vect_dump, "transform induction: created def-use cycle: ");
2606       print_gimple_stmt (vect_dump, induction_phi, 0, TDF_SLIM);
2607       fprintf (vect_dump, "\n");
2608       print_gimple_stmt (vect_dump, SSA_NAME_DEF_STMT (vec_def), 0, TDF_SLIM);
2609     }
2610
2611   STMT_VINFO_VEC_STMT (phi_info) = induction_phi;
2612   return induc_def;
2613 }
2614
2615
2616 /* Function get_initial_def_for_reduction
2617
2618    Input:
2619    STMT - a stmt that performs a reduction operation in the loop.
2620    INIT_VAL - the initial value of the reduction variable
2621
2622    Output:
2623    ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
2624         of the reduction (used for adjusting the epilog - see below).
2625    Return a vector variable, initialized according to the operation that STMT
2626         performs. This vector will be used as the initial value of the
2627         vector of partial results.
2628
2629    Option1 (adjust in epilog): Initialize the vector as follows:
2630      add/bit or/xor:    [0,0,...,0,0]
2631      mult/bit and:      [1,1,...,1,1]
2632      min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
2633    and when necessary (e.g. add/mult case) let the caller know
2634    that it needs to adjust the result by init_val.
2635
2636    Option2: Initialize the vector as follows:
2637      add/bit or/xor:    [init_val,0,0,...,0]
2638      mult/bit and:      [init_val,1,1,...,1]
2639      min/max/cond_expr: [init_val,init_val,...,init_val]
2640    and no adjustments are needed.
2641
2642    For example, for the following code:
2643
2644    s = init_val;
2645    for (i=0;i<n;i++)
2646      s = s + a[i];
2647
2648    STMT is 's = s + a[i]', and the reduction variable is 's'.
2649    For a vector of 4 units, we want to return either [0,0,0,init_val],
2650    or [0,0,0,0] and let the caller know that it needs to adjust
2651    the result at the end by 'init_val'.
2652
2653    FORNOW, we are using the 'adjust in epilog' scheme, because this way the
2654    initialization vector is simpler (same element in all entries), if
2655    ADJUSTMENT_DEF is not NULL, and Option2 otherwise.
2656
2657    A cost model should help decide between these two schemes.  */
2658
2659 tree
2660 get_initial_def_for_reduction (gimple stmt, tree init_val,
2661                                tree *adjustment_def)
2662 {
2663   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
2664   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
2665   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2666   tree scalar_type = TREE_TYPE (init_val);
2667   tree vectype = get_vectype_for_scalar_type (scalar_type);
2668   int nunits;
2669   enum tree_code code = gimple_assign_rhs_code (stmt);
2670   tree def_for_init;
2671   tree init_def;
2672   tree t = NULL_TREE;
2673   int i;
2674   bool nested_in_vect_loop = false;
2675   tree init_value;
2676   REAL_VALUE_TYPE real_init_val = dconst0;
2677   int int_init_val = 0;
2678   gimple def_stmt = NULL;
2679
2680   gcc_assert (vectype);
2681   nunits = TYPE_VECTOR_SUBPARTS (vectype);
2682
2683   gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
2684               || SCALAR_FLOAT_TYPE_P (scalar_type));
2685
2686   if (nested_in_vect_loop_p (loop, stmt))
2687     nested_in_vect_loop = true;
2688   else
2689     gcc_assert (loop == (gimple_bb (stmt))->loop_father);
2690
2691   /* In case of double reduction we only create a vector variable to be put
2692      in the reduction phi node. The actual statement creation is done in
2693      vect_create_epilog_for_reduction.  */
2694   if (adjustment_def && nested_in_vect_loop
2695       && TREE_CODE (init_val) == SSA_NAME
2696       && (def_stmt = SSA_NAME_DEF_STMT (init_val))
2697       && gimple_code (def_stmt) == GIMPLE_PHI
2698       && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2699       && vinfo_for_stmt (def_stmt)
2700       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2701           == vect_double_reduction_def)
2702     {
2703       *adjustment_def = NULL;
2704       return vect_create_destination_var (init_val, vectype);
2705     }
2706
2707   if (TREE_CONSTANT (init_val))
2708     {
2709       if (SCALAR_FLOAT_TYPE_P (scalar_type))
2710         init_value = build_real (scalar_type, TREE_REAL_CST (init_val));
2711       else
2712         init_value = build_int_cst (scalar_type, TREE_INT_CST_LOW (init_val));
2713     }
2714   else
2715     init_value = init_val;
2716
2717   switch (code)
2718     {
2719       case WIDEN_SUM_EXPR:
2720       case DOT_PROD_EXPR:
2721       case PLUS_EXPR:
2722       case MINUS_EXPR:
2723       case BIT_IOR_EXPR:
2724       case BIT_XOR_EXPR:
2725       case MULT_EXPR:
2726       case BIT_AND_EXPR:
2727         /* ADJUSMENT_DEF is NULL when called from
2728            vect_create_epilog_for_reduction to vectorize double reduction.  */
2729         if (adjustment_def)
2730           {
2731             if (nested_in_vect_loop)
2732               *adjustment_def = vect_get_vec_def_for_operand (init_val, stmt,
2733                                                               NULL);
2734             else
2735               *adjustment_def = init_val;
2736           }
2737
2738         if (code == MULT_EXPR || code == BIT_AND_EXPR)
2739           {
2740             real_init_val = dconst1;
2741             int_init_val = 1;
2742           }
2743
2744         if (SCALAR_FLOAT_TYPE_P (scalar_type))
2745           def_for_init = build_real (scalar_type, real_init_val);
2746         else
2747           def_for_init = build_int_cst (scalar_type, int_init_val);
2748
2749         /* Create a vector of '0' or '1' except the first element.  */
2750         for (i = nunits - 2; i >= 0; --i)
2751           t = tree_cons (NULL_TREE, def_for_init, t);
2752
2753         /* Option1: the first element is '0' or '1' as well.  */
2754         if (adjustment_def)
2755           {
2756             t = tree_cons (NULL_TREE, def_for_init, t);
2757             init_def = build_vector (vectype, t);
2758             break;
2759           }
2760
2761         /* Option2: the first element is INIT_VAL.  */
2762         t = tree_cons (NULL_TREE, init_value, t);
2763         if (TREE_CONSTANT (init_val))
2764           init_def = build_vector (vectype, t);
2765         else
2766           init_def = build_constructor_from_list (vectype, t);
2767
2768         break;
2769
2770       case MIN_EXPR:
2771       case MAX_EXPR:
2772       case COND_EXPR:
2773         if (adjustment_def)
2774           {
2775             *adjustment_def = NULL_TREE;
2776             init_def = vect_get_vec_def_for_operand (init_val, stmt, NULL);
2777             break;
2778           }
2779
2780         for (i = nunits - 1; i >= 0; --i)
2781           t = tree_cons (NULL_TREE, init_value, t);
2782
2783         if (TREE_CONSTANT (init_val))
2784           init_def = build_vector (vectype, t);
2785         else
2786           init_def = build_constructor_from_list (vectype, t);
2787
2788         break;
2789
2790       default:
2791         gcc_unreachable ();
2792     }
2793
2794   return init_def;
2795 }
2796
2797
2798 /* Function vect_create_epilog_for_reduction
2799
2800    Create code at the loop-epilog to finalize the result of a reduction
2801    computation.
2802
2803    VECT_DEF is a vector of partial results.
2804    REDUC_CODE is the tree-code for the epilog reduction.
2805    NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
2806      number of elements that we can fit in a vectype (nunits). In this case
2807      we have to generate more than one vector stmt - i.e - we need to "unroll"
2808      the vector stmt by a factor VF/nunits.  For more details see documentation
2809      in vectorizable_operation.
2810    STMT is the scalar reduction stmt that is being vectorized.
2811    REDUCTION_PHI is the phi-node that carries the reduction computation.
2812    REDUC_INDEX is the index of the operand in the right hand side of the
2813      statement that is defined by REDUCTION_PHI.
2814    DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
2815
2816    This function:
2817    1. Creates the reduction def-use cycle: sets the arguments for
2818       REDUCTION_PHI:
2819       The loop-entry argument is the vectorized initial-value of the reduction.
2820       The loop-latch argument is VECT_DEF - the vector of partial sums.
2821    2. "Reduces" the vector of partial results VECT_DEF into a single result,
2822       by applying the operation specified by REDUC_CODE if available, or by
2823       other means (whole-vector shifts or a scalar loop).
2824       The function also creates a new phi node at the loop exit to preserve
2825       loop-closed form, as illustrated below.
2826
2827      The flow at the entry to this function:
2828
2829         loop:
2830           vec_def = phi <null, null>            # REDUCTION_PHI
2831           VECT_DEF = vector_stmt                # vectorized form of STMT
2832           s_loop = scalar_stmt                  # (scalar) STMT
2833         loop_exit:
2834           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
2835           use <s_out0>
2836           use <s_out0>
2837
2838      The above is transformed by this function into:
2839
2840         loop:
2841           vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
2842           VECT_DEF = vector_stmt                # vectorized form of STMT
2843           s_loop = scalar_stmt                  # (scalar) STMT
2844         loop_exit:
2845           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
2846           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
2847           v_out2 = reduce <v_out1>
2848           s_out3 = extract_field <v_out2, 0>
2849           s_out4 = adjust_result <s_out3>
2850           use <s_out4>
2851           use <s_out4>
2852 */
2853
2854 static void
2855 vect_create_epilog_for_reduction (tree vect_def, gimple stmt,
2856                                   int ncopies,
2857                                   enum tree_code reduc_code,
2858                                   gimple reduction_phi,
2859                                   int reduc_index,
2860                                   bool double_reduc)
2861 {
2862   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2863   stmt_vec_info prev_phi_info;
2864   tree vectype;
2865   enum machine_mode mode;
2866   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2867   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
2868   basic_block exit_bb;
2869   tree scalar_dest;
2870   tree scalar_type;
2871   gimple new_phi = NULL, phi;
2872   gimple_stmt_iterator exit_gsi;
2873   tree vec_dest;
2874   tree new_temp = NULL_TREE;
2875   tree new_name;
2876   gimple epilog_stmt = NULL;
2877   tree new_scalar_dest, new_dest;
2878   gimple exit_phi;
2879   tree bitsize, bitpos, bytesize;
2880   enum tree_code code = gimple_assign_rhs_code (stmt);
2881   tree adjustment_def;
2882   tree vec_initial_def, def;
2883   tree orig_name;
2884   imm_use_iterator imm_iter;
2885   use_operand_p use_p;
2886   bool extract_scalar_result = false;
2887   tree reduction_op, expr;
2888   gimple orig_stmt;
2889   gimple use_stmt;
2890   bool nested_in_vect_loop = false;
2891   VEC(gimple,heap) *phis = NULL;
2892   enum vect_def_type dt = vect_unknown_def_type;
2893   int j, i;
2894
2895   if (nested_in_vect_loop_p (loop, stmt))
2896     {
2897       outer_loop = loop;
2898       loop = loop->inner;
2899       nested_in_vect_loop = true;
2900     }
2901
2902   switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
2903     {
2904     case GIMPLE_SINGLE_RHS:
2905       gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt))
2906                                        == ternary_op);
2907       reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), reduc_index);
2908       break;
2909     case GIMPLE_UNARY_RHS:
2910       reduction_op = gimple_assign_rhs1 (stmt);
2911       break;
2912     case GIMPLE_BINARY_RHS:
2913       reduction_op = reduc_index ?
2914                      gimple_assign_rhs2 (stmt) : gimple_assign_rhs1 (stmt);
2915       break;
2916     default:
2917       gcc_unreachable ();
2918     }
2919
2920   vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
2921   gcc_assert (vectype);
2922   mode = TYPE_MODE (vectype);
2923
2924   /*** 1. Create the reduction def-use cycle  ***/
2925
2926   /* For the case of reduction, vect_get_vec_def_for_operand returns
2927      the scalar def before the loop, that defines the initial value
2928      of the reduction variable.  */
2929   vec_initial_def = vect_get_vec_def_for_operand (reduction_op, stmt,
2930                                                   &adjustment_def);
2931
2932   phi = reduction_phi;
2933   def = vect_def;
2934   for (j = 0; j < ncopies; j++)
2935     {
2936       /* 1.1 set the loop-entry arg of the reduction-phi:  */
2937       add_phi_arg (phi, vec_initial_def, loop_preheader_edge (loop));
2938
2939       /* 1.2 set the loop-latch arg for the reduction-phi:  */
2940       if (j > 0)
2941         def = vect_get_vec_def_for_stmt_copy (dt, def);
2942       add_phi_arg (phi, def, loop_latch_edge (loop));
2943
2944       if (vect_print_dump_info (REPORT_DETAILS))
2945         {
2946           fprintf (vect_dump, "transform reduction: created def-use cycle: ");
2947           print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
2948           fprintf (vect_dump, "\n");
2949           print_gimple_stmt (vect_dump, SSA_NAME_DEF_STMT (def), 0, TDF_SLIM);
2950         }
2951
2952       phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
2953     }
2954
2955   /*** 2. Create epilog code
2956           The reduction epilog code operates across the elements of the vector
2957           of partial results computed by the vectorized loop.
2958           The reduction epilog code consists of:
2959           step 1: compute the scalar result in a vector (v_out2)
2960           step 2: extract the scalar result (s_out3) from the vector (v_out2)
2961           step 3: adjust the scalar result (s_out3) if needed.
2962
2963           Step 1 can be accomplished using one of the following three schemes:
2964           (scheme 1) using reduc_code, if available.
2965           (scheme 2) using whole-vector shifts, if available.
2966           (scheme 3) using a scalar loop. In this case steps 1+2 above are
2967                      combined.
2968
2969           The overall epilog code looks like this:
2970
2971           s_out0 = phi <s_loop>         # original EXIT_PHI
2972           v_out1 = phi <VECT_DEF>       # NEW_EXIT_PHI
2973           v_out2 = reduce <v_out1>              # step 1
2974           s_out3 = extract_field <v_out2, 0>    # step 2
2975           s_out4 = adjust_result <s_out3>       # step 3
2976
2977           (step 3 is optional, and steps 1 and 2 may be combined).
2978           Lastly, the uses of s_out0 are replaced by s_out4.
2979
2980           ***/
2981
2982   /* 2.1 Create new loop-exit-phi to preserve loop-closed form:
2983         v_out1 = phi <v_loop>  */
2984
2985   exit_bb = single_exit (loop)->dest;
2986   def = vect_def;
2987   prev_phi_info = NULL;
2988   for (j = 0; j < ncopies; j++)
2989     {
2990       phi = create_phi_node (SSA_NAME_VAR (vect_def), exit_bb);
2991       set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo, NULL));
2992       if (j == 0)
2993         new_phi = phi;
2994       else
2995         {
2996           def = vect_get_vec_def_for_stmt_copy (dt, def);
2997           STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
2998         }
2999       SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
3000       prev_phi_info = vinfo_for_stmt (phi);
3001     }
3002
3003   exit_gsi = gsi_after_labels (exit_bb);
3004
3005   /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
3006          (i.e. when reduc_code is not available) and in the final adjustment
3007          code (if needed).  Also get the original scalar reduction variable as
3008          defined in the loop.  In case STMT is a "pattern-stmt" (i.e. - it
3009          represents a reduction pattern), the tree-code and scalar-def are
3010          taken from the original stmt that the pattern-stmt (STMT) replaces.
3011          Otherwise (it is a regular reduction) - the tree-code and scalar-def
3012          are taken from STMT.  */
3013
3014   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3015   if (!orig_stmt)
3016     {
3017       /* Regular reduction  */
3018       orig_stmt = stmt;
3019     }
3020   else
3021     {
3022       /* Reduction pattern  */
3023       stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
3024       gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
3025       gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
3026     }
3027
3028   code = gimple_assign_rhs_code (orig_stmt);
3029   scalar_dest = gimple_assign_lhs (orig_stmt);
3030   scalar_type = TREE_TYPE (scalar_dest);
3031   new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
3032   bitsize = TYPE_SIZE (scalar_type);
3033   bytesize = TYPE_SIZE_UNIT (scalar_type);
3034
3035   /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
3036      partial results are added and not subtracted.  */
3037   if (code == MINUS_EXPR)
3038     code = PLUS_EXPR;
3039
3040   /* In case this is a reduction in an inner-loop while vectorizing an outer
3041      loop - we don't need to extract a single scalar result at the end of the
3042      inner-loop (unless it is double reduction, i.e., the use of reduction is
3043      outside the outer-loop). The final vector of partial results will be used
3044      in the vectorized outer-loop, or reduced to a scalar result at the end of
3045      the outer-loop.  */
3046   if (nested_in_vect_loop && !double_reduc)
3047     goto vect_finalize_reduction;
3048
3049   /* The epilogue is created for the outer-loop, i.e., for the loop being
3050      vectorized.  */
3051   if (double_reduc)
3052     loop = outer_loop;
3053
3054   /* FORNOW */
3055   gcc_assert (ncopies == 1);
3056
3057   /* 2.3 Create the reduction code, using one of the three schemes described
3058          above.  */
3059
3060   if (reduc_code != ERROR_MARK)
3061     {
3062       tree tmp;
3063
3064       /*** Case 1:  Create:
3065            v_out2 = reduc_expr <v_out1>  */
3066
3067       if (vect_print_dump_info (REPORT_DETAILS))
3068         fprintf (vect_dump, "Reduce using direct vector reduction.");
3069
3070       vec_dest = vect_create_destination_var (scalar_dest, vectype);
3071       tmp = build1 (reduc_code, vectype,  PHI_RESULT (new_phi));
3072       epilog_stmt = gimple_build_assign (vec_dest, tmp);
3073       new_temp = make_ssa_name (vec_dest, epilog_stmt);
3074       gimple_assign_set_lhs (epilog_stmt, new_temp);
3075       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3076
3077       extract_scalar_result = true;
3078     }
3079   else
3080     {
3081       enum tree_code shift_code = ERROR_MARK;
3082       bool have_whole_vector_shift = true;
3083       int bit_offset;
3084       int element_bitsize = tree_low_cst (bitsize, 1);
3085       int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
3086       tree vec_temp;
3087
3088       if (optab_handler (vec_shr_optab, mode)->insn_code != CODE_FOR_nothing)
3089         shift_code = VEC_RSHIFT_EXPR;
3090       else
3091         have_whole_vector_shift = false;
3092
3093       /* Regardless of whether we have a whole vector shift, if we're
3094          emulating the operation via tree-vect-generic, we don't want
3095          to use it.  Only the first round of the reduction is likely
3096          to still be profitable via emulation.  */
3097       /* ??? It might be better to emit a reduction tree code here, so that
3098          tree-vect-generic can expand the first round via bit tricks.  */
3099       if (!VECTOR_MODE_P (mode))
3100         have_whole_vector_shift = false;
3101       else
3102         {
3103           optab optab = optab_for_tree_code (code, vectype, optab_default);
3104           if (optab_handler (optab, mode)->insn_code == CODE_FOR_nothing)
3105             have_whole_vector_shift = false;
3106         }
3107
3108       if (have_whole_vector_shift)
3109         {
3110           /*** Case 2: Create:
3111              for (offset = VS/2; offset >= element_size; offset/=2)
3112                 {
3113                   Create:  va' = vec_shift <va, offset>
3114                   Create:  va = vop <va, va'>
3115                 }  */
3116
3117           if (vect_print_dump_info (REPORT_DETAILS))
3118             fprintf (vect_dump, "Reduce using vector shifts");
3119
3120           vec_dest = vect_create_destination_var (scalar_dest, vectype);
3121           new_temp = PHI_RESULT (new_phi);
3122
3123           for (bit_offset = vec_size_in_bits/2;
3124                bit_offset >= element_bitsize;
3125                bit_offset /= 2)
3126             {
3127               tree bitpos = size_int (bit_offset);
3128
3129               epilog_stmt = gimple_build_assign_with_ops (shift_code, vec_dest,
3130                                                           new_temp, bitpos);
3131               new_name = make_ssa_name (vec_dest, epilog_stmt);
3132               gimple_assign_set_lhs (epilog_stmt, new_name);
3133               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3134
3135               epilog_stmt = gimple_build_assign_with_ops (code, vec_dest,
3136                                                           new_name, new_temp);
3137               new_temp = make_ssa_name (vec_dest, epilog_stmt);
3138               gimple_assign_set_lhs (epilog_stmt, new_temp);
3139               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3140             }
3141
3142           extract_scalar_result = true;
3143         }
3144       else
3145         {
3146           tree rhs;
3147
3148           /*** Case 3: Create:
3149              s = extract_field <v_out2, 0>
3150              for (offset = element_size;
3151                   offset < vector_size;
3152                   offset += element_size;)
3153                {
3154                  Create:  s' = extract_field <v_out2, offset>
3155                  Create:  s = op <s, s'>
3156                }  */
3157
3158           if (vect_print_dump_info (REPORT_DETAILS))
3159             fprintf (vect_dump, "Reduce using scalar code. ");
3160
3161           vec_temp = PHI_RESULT (new_phi);
3162           vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
3163           rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
3164                          bitsize_zero_node);
3165           epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
3166           new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
3167           gimple_assign_set_lhs (epilog_stmt, new_temp);
3168           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3169
3170           for (bit_offset = element_bitsize;
3171                bit_offset < vec_size_in_bits;
3172                bit_offset += element_bitsize)
3173             {
3174               tree bitpos = bitsize_int (bit_offset);
3175               tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
3176                                  bitpos);
3177
3178               epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
3179               new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
3180               gimple_assign_set_lhs (epilog_stmt, new_name);
3181               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3182
3183               epilog_stmt = gimple_build_assign_with_ops (code,
3184                                                           new_scalar_dest,
3185                                                           new_name, new_temp);
3186               new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
3187               gimple_assign_set_lhs (epilog_stmt, new_temp);
3188               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3189             }
3190
3191           extract_scalar_result = false;
3192         }
3193     }
3194
3195   /* 2.4  Extract the final scalar result.  Create:
3196          s_out3 = extract_field <v_out2, bitpos>  */
3197
3198   if (extract_scalar_result)
3199     {
3200       tree rhs;
3201
3202       gcc_assert (!nested_in_vect_loop || double_reduc);
3203       if (vect_print_dump_info (REPORT_DETAILS))
3204         fprintf (vect_dump, "extract scalar result");
3205
3206       if (BYTES_BIG_ENDIAN)
3207         bitpos = size_binop (MULT_EXPR,
3208                        bitsize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1),
3209                        TYPE_SIZE (scalar_type));
3210       else
3211         bitpos = bitsize_zero_node;
3212
3213       rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp, bitsize, bitpos);
3214       epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
3215       new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
3216       gimple_assign_set_lhs (epilog_stmt, new_temp);
3217       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3218     }
3219
3220 vect_finalize_reduction:
3221
3222   if (double_reduc)
3223     loop = loop->inner;
3224
3225   /* 2.5 Adjust the final result by the initial value of the reduction
3226          variable. (When such adjustment is not needed, then
3227          'adjustment_def' is zero).  For example, if code is PLUS we create:
3228          new_temp = loop_exit_def + adjustment_def  */
3229
3230   if (adjustment_def)
3231     {
3232       if (nested_in_vect_loop)
3233         {
3234           gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
3235           expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
3236           new_dest = vect_create_destination_var (scalar_dest, vectype);
3237         }
3238       else
3239         {
3240           gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
3241           expr = build2 (code, scalar_type, new_temp, adjustment_def);
3242           new_dest = vect_create_destination_var (scalar_dest, scalar_type);
3243         }
3244
3245       epilog_stmt = gimple_build_assign (new_dest, expr);
3246       new_temp = make_ssa_name (new_dest, epilog_stmt);
3247       gimple_assign_set_lhs (epilog_stmt, new_temp);
3248       SSA_NAME_DEF_STMT (new_temp) = epilog_stmt;
3249       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3250     }
3251
3252
3253   /* 2.6  Handle the loop-exit phi  */
3254
3255   /* Replace uses of s_out0 with uses of s_out3:
3256      Find the loop-closed-use at the loop exit of the original scalar result.
3257      (The reduction result is expected to have two immediate uses - one at the
3258      latch block, and one at the loop exit).  */
3259   phis = VEC_alloc (gimple, heap, 10);
3260   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
3261     {
3262       if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
3263         {
3264           exit_phi = USE_STMT (use_p);
3265           VEC_quick_push (gimple, phis, exit_phi);
3266         }
3267     }
3268
3269   /* We expect to have found an exit_phi because of loop-closed-ssa form.  */
3270   gcc_assert (!VEC_empty (gimple, phis));
3271
3272   for (i = 0; VEC_iterate (gimple, phis, i, exit_phi); i++)
3273     {
3274       if (nested_in_vect_loop)
3275         {
3276           stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
3277           gimple vect_phi;
3278
3279           /* FORNOW. Currently not supporting the case that an inner-loop
3280              reduction is not used in the outer-loop (but only outside the
3281              outer-loop), unless it is double reduction.  */
3282           gcc_assert ((STMT_VINFO_RELEVANT_P (stmt_vinfo)
3283                       && !STMT_VINFO_LIVE_P (stmt_vinfo)) || double_reduc);
3284
3285           epilog_stmt = adjustment_def ? epilog_stmt : new_phi;
3286           STMT_VINFO_VEC_STMT (stmt_vinfo) = epilog_stmt;
3287           set_vinfo_for_stmt (epilog_stmt,
3288                               new_stmt_vec_info (epilog_stmt, loop_vinfo,
3289                                                  NULL));
3290           if (adjustment_def)
3291             STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
3292                 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
3293
3294           if (!double_reduc
3295               || STMT_VINFO_DEF_TYPE (stmt_vinfo) != vect_double_reduction_def)
3296             continue;
3297
3298           /* Handle double reduction:
3299
3300              stmt1: s1 = phi <s0, s2>  - double reduction phi (outer loop)
3301              stmt2:   s3 = phi <s1, s4> - (regular) reduction phi (inner loop)
3302              stmt3:   s4 = use (s3)     - (regular) reduction stmt (inner loop)
3303              stmt4: s2 = phi <s4>      - double reduction stmt (outer loop)
3304
3305              At that point the regular reduction (stmt2 and stmt3) is already
3306              vectorized, as well as the exit phi node, stmt4.
3307              Here we vectorize the phi node of double reduction, stmt1, and
3308              update all relevant statements.  */
3309
3310           /* Go through all the uses of s2 to find double reduction phi node,
3311              i.e., stmt1 above.  */
3312           orig_name = PHI_RESULT (exit_phi);
3313           FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
3314             {
3315               stmt_vec_info use_stmt_vinfo = vinfo_for_stmt (use_stmt);
3316               stmt_vec_info new_phi_vinfo;
3317               tree vect_phi_init, preheader_arg, vect_phi_res, init_def;
3318               basic_block bb = gimple_bb (use_stmt);
3319               gimple use;
3320
3321               /* Check that USE_STMT is really double reduction phi node.  */
3322               if (gimple_code (use_stmt) != GIMPLE_PHI
3323                   || gimple_phi_num_args (use_stmt) != 2
3324                   || !use_stmt_vinfo
3325                   || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
3326                       != vect_double_reduction_def
3327                   || bb->loop_father != outer_loop)
3328                 continue;
3329
3330               /* Create vector phi node for double reduction:
3331                  vs1 = phi <vs0, vs2>
3332                  vs1 was created previously in this function by a call to
3333                  vect_get_vec_def_for_operand and is stored in vec_initial_def;
3334                  vs2 is defined by EPILOG_STMT, the vectorized EXIT_PHI;
3335                  vs0 is created here.  */
3336
3337               /* Create vector phi node.  */
3338               vect_phi = create_phi_node (vec_initial_def, bb);
3339               new_phi_vinfo = new_stmt_vec_info (vect_phi,
3340                                     loop_vec_info_for_loop (outer_loop), NULL);
3341               set_vinfo_for_stmt (vect_phi, new_phi_vinfo);
3342
3343               /* Create vs0 - initial def of the double reduction phi.  */
3344               preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
3345                                              loop_preheader_edge (outer_loop));
3346               init_def = get_initial_def_for_reduction (stmt, preheader_arg,
3347                                                         NULL);
3348               vect_phi_init = vect_init_vector (use_stmt, init_def, vectype,
3349                                                 NULL);
3350
3351               /* Update phi node arguments with vs0 and vs2.  */
3352               add_phi_arg (vect_phi, vect_phi_init,
3353                            loop_preheader_edge (outer_loop));
3354               add_phi_arg (vect_phi, PHI_RESULT (epilog_stmt),
3355                            loop_latch_edge (outer_loop));
3356               if (vect_print_dump_info (REPORT_DETAILS))
3357                 {
3358                   fprintf (vect_dump, "created double reduction phi node: ");
3359                   print_gimple_stmt (vect_dump, vect_phi, 0, TDF_SLIM);
3360                 }
3361
3362               vect_phi_res = PHI_RESULT (vect_phi);
3363
3364               /* Replace the use, i.e., set the correct vs1 in the regular
3365                  reduction phi node. FORNOW, NCOPIES is always 1, so the loop
3366                  is redundant.  */
3367               use = reduction_phi;
3368               for (j = 0; j < ncopies; j++)
3369                 {
3370                   edge pr_edge = loop_preheader_edge (loop);
3371                   SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res);
3372                   use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
3373                 }
3374             }
3375         }
3376
3377       /* Replace the uses:  */
3378       orig_name = PHI_RESULT (exit_phi);
3379       FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
3380         FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
3381           SET_USE (use_p, new_temp);
3382     }
3383
3384   VEC_free (gimple, heap, phis);
3385 }
3386
3387
3388 /* Function vectorizable_reduction.
3389
3390    Check if STMT performs a reduction operation that can be vectorized.
3391    If VEC_STMT is also passed, vectorize the STMT: create a vectorized
3392    stmt to replace it, put it in VEC_STMT, and insert it at GSI.
3393    Return FALSE if not a vectorizable STMT, TRUE otherwise.
        If VEC_STMT is NULL only the analysis is done: on success the stmt is
        marked as reduc_vec_info_type and its cost is modeled, but no code is
        generated.
3394
3395    This function also handles reduction idioms (patterns) that have been
3396    recognized in advance during vect_pattern_recog. In this case, STMT may be
3397    of this form:
3398      X = pattern_expr (arg0, arg1, ..., X)
3399    and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
3400    sequence that had been detected and replaced by the pattern-stmt (STMT).
3401
3402    In some cases of reduction patterns, the type of the reduction variable X is
3403    different than the type of the other arguments of STMT.
3404    In such cases, the vectype that is used when transforming STMT into a vector
3405    stmt is different than the vectype that is used to determine the
3406    vectorization factor, because it consists of a different number of elements
3407    than the actual number of elements that are being operated upon in parallel.
3408
3409    For example, consider an accumulation of shorts into an int accumulator.
3410    On some targets it's possible to vectorize this pattern operating on 8
3411    shorts at a time (hence, the vectype for purposes of determining the
3412    vectorization factor should be V8HI); on the other hand, the vectype that
3413    is used to create the vector form is actually V4SI (the type of the result).
3414
3415    Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
3416    indicates what is the actual level of parallelism (V8HI in the example), so
3417    that the right vectorization factor would be derived. This vectype
3418    corresponds to the type of arguments to the reduction stmt, and should *NOT*
3419    be used to create the vectorized stmt. The right vectype for the vectorized
3420    stmt is obtained from the type of the result X:
3421         get_vectype_for_scalar_type (TREE_TYPE (X))
3422
3423    This means that, contrary to "regular" reductions (or "regular" stmts in
3424    general), the following equation:
3425       STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
3426    does *NOT* necessarily hold for reduction patterns.  */
3427
3428 bool
3429 vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
3430                         gimple *vec_stmt)
3431 {
3432   tree vec_dest;
3433   tree scalar_dest;
3434   tree loop_vec_def0 = NULL_TREE, loop_vec_def1 = NULL_TREE;
3435   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3436   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3437   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3438   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3439   enum tree_code code, orig_code, epilog_reduc_code;
3440   enum machine_mode vec_mode;
3441   int op_type;
3442   optab optab, reduc_optab;
3443   tree new_temp = NULL_TREE;
3444   tree def;
3445   gimple def_stmt;
3446   enum vect_def_type dt;
3447   gimple new_phi = NULL;
3448   tree scalar_type;
3449   bool is_simple_use;
3450   gimple orig_stmt;
3451   stmt_vec_info orig_stmt_info;
3452   tree expr = NULL_TREE;
3453   int i;
3454   int nunits = TYPE_VECTOR_SUBPARTS (vectype);
3455   int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
3456   int epilog_copies;
3457   stmt_vec_info prev_stmt_info, prev_phi_info;
3458   gimple first_phi = NULL;
3459   bool single_defuse_cycle = false;
3460   tree reduc_def = NULL_TREE;
3461   gimple new_stmt = NULL;
3462   int j;
3463   tree ops[3];
3464   bool nested_cycle = false, found_nested_cycle_def = false;
3465   gimple reduc_def_stmt = NULL;
3466   /* The default is that the reduction variable is the last operand of the
          statement.  */
3467   int reduc_index = 2;
3468   bool double_reduc = false, dummy;
3469   basic_block def_bb;
3470   struct loop * def_stmt_loop, *outer_loop = NULL;
3471   tree def_arg;
3472   gimple def_arg_stmt;
3473
       /* If STMT is nested in LOOP (per nested_in_vect_loop_p), operate on the
          inner loop from here on and remember the enclosing loop in
          OUTER_LOOP.  */
3474   if (nested_in_vect_loop_p (loop, stmt))
3475     {
3476       outer_loop = loop;
3477       loop = loop->inner;
3478       nested_cycle = true;
3479     }
3480
3481   gcc_assert (ncopies >= 1);
3482
3483   /* FORNOW: SLP not supported.  */
3484   if (STMT_SLP_TYPE (stmt_info))
3485     return false;
3486
3487   /* 1. Is vectorizable reduction?  */
3488   /* Not supportable if the reduction variable is used in the loop.  */
3489   if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer)
3490     return false;
3491
3492   /* Reductions that are not used even in an enclosing outer-loop,
3493      are expected to be "live" (used out of the loop).  */
3494   if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
3495       && !STMT_VINFO_LIVE_P (stmt_info))
3496     return false;
3497
3498   /* Make sure it was already recognized as a reduction computation.  */
3499   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
3500       && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
3501     return false;
3502
3503   /* 2. Has this been recognized as a reduction pattern?
3504
3505      Check if STMT represents a pattern that has been recognized
3506      in earlier analysis stages.  For stmts that represent a pattern,
3507      the STMT_VINFO_RELATED_STMT field records the last stmt in
3508      the original sequence that constitutes the pattern.  */
3509
3510   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3511   if (orig_stmt)
3512     {
3513       orig_stmt_info = vinfo_for_stmt (orig_stmt);
3514       gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt);
3515       gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
3516       gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
3517     }
3518
3519   /* 3. Check the operands of the operation. The first operands are defined
3520         inside the loop body. The last operand is the reduction variable,
3521         which is defined by the loop-header-phi.  */
3522
3523   gcc_assert (is_gimple_assign (stmt));
3524
3525   /* Flatten RHS: collect the operands of STMT in OPS and its code in CODE.
          Only ternary single-rhs and binary statements are handled; for a
          binary statement OPS[2] is left unset.  */
3526   switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
3527     {
3528     case GIMPLE_SINGLE_RHS:
3529       op_type = TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt));
3530       if (op_type == ternary_op)
3531         {
3532           tree rhs = gimple_assign_rhs1 (stmt);
3533           ops[0] = TREE_OPERAND (rhs, 0);
3534           ops[1] = TREE_OPERAND (rhs, 1);
3535           ops[2] = TREE_OPERAND (rhs, 2);
3536           code = TREE_CODE (rhs);
3537         }
3538       else
3539         return false;
3540       break;
3541
3542     case GIMPLE_BINARY_RHS:
3543       code = gimple_assign_rhs_code (stmt);
3544       op_type = TREE_CODE_LENGTH (code);
3545       gcc_assert (op_type == binary_op);
3546       ops[0] = gimple_assign_rhs1 (stmt);
3547       ops[1] = gimple_assign_rhs2 (stmt);
3548       break;
3549
3550     case GIMPLE_UNARY_RHS:
3551       return false;
3552
3553     default:
3554       gcc_unreachable ();
3555     }
3556
       /* Only pointer, integral and scalar floating-point results are
          handled.  */
3557   scalar_dest = gimple_assign_lhs (stmt);
3558   scalar_type = TREE_TYPE (scalar_dest);
3559   if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
3560       && !SCALAR_FLOAT_TYPE_P (scalar_type))
3561     return false;
3562
3563   /* All uses but the last are expected to be defined in the loop.
3564      The last use is the reduction variable. In case of nested cycle this
3565      assumption is not true: we use reduc_index to record the index of the
3566      reduction variable.  */
3567   for (i = 0; i < op_type-1; i++)
3568     {
3569       /* The condition of COND_EXPR is checked in vectorizable_condition().  */
3570       if (i == 0 && code == COND_EXPR)
3571         continue;
3572
3573       is_simple_use = vect_is_simple_use (ops[i], loop_vinfo, NULL, &def_stmt,
3574                                           &def, &dt);
3575       gcc_assert (is_simple_use);
3576       if (dt != vect_internal_def
3577           && dt != vect_external_def
3578           && dt != vect_constant_def
3579           && dt != vect_induction_def
3580           && !(dt == vect_nested_cycle && nested_cycle))
3581         return false;
3582
3583       if (dt == vect_nested_cycle)
3584         {
3585           found_nested_cycle_def = true;
3586           reduc_def_stmt = def_stmt;
3587           reduc_index = i;
3588         }
3589     }
3590
       /* Here I == OP_TYPE - 1: examine the last operand.  It must be the
          reduction variable, unless a nested-cycle def was already found among
          the preceding operands.  DEF, as set by this call, is used below to
          derive the vectype of a reduction pattern.  */
3591   is_simple_use = vect_is_simple_use (ops[i], loop_vinfo, NULL, &def_stmt,
3592                                       &def, &dt);
3593   gcc_assert (is_simple_use);
3594   gcc_assert (dt == vect_reduction_def
3595               || dt == vect_nested_cycle
3596               || ((dt == vect_internal_def || dt == vect_external_def
3597                    || dt == vect_constant_def || dt == vect_induction_def)
3598                    && nested_cycle && found_nested_cycle_def));
3599   if (!found_nested_cycle_def)
3600     reduc_def_stmt = def_stmt;
3601
       /* The reduction variable must be defined by a phi, and the reduction
          detected from that phi must be STMT (or, for a pattern, the original
          stmt that STMT replaces).  */
3602   gcc_assert (gimple_code (reduc_def_stmt) == GIMPLE_PHI);
3603   if (orig_stmt)
3604     gcc_assert (orig_stmt == vect_is_simple_reduction (loop_vinfo,
3605                                                        reduc_def_stmt,
3606                                                        !nested_cycle,
3607                                                        &dummy));
3608   else
3609     gcc_assert (stmt == vect_is_simple_reduction (loop_vinfo, reduc_def_stmt,
3610                                                   !nested_cycle, &dummy));
3611
       /* Give up if the reduction phi itself is marked live.  */
3612   if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt)))
3613     return false;
3614
3615   vec_mode = TYPE_MODE (vectype);
3616
       /* Support for a COND_EXPR is checked by vectorizable_condition rather
          than through an optab.  */
3617   if (code == COND_EXPR)
3618     {
3619       if (!vectorizable_condition (stmt, gsi, NULL, ops[reduc_index], 0))
3620         {
3621           if (vect_print_dump_info (REPORT_DETAILS))
3622             fprintf (vect_dump, "unsupported condition in reduction");
3623
3624             return false;
3625         }
3626     }
3627   else
3628     {
3629       /* 4. Supportable by target?  */
3630
3631       /* 4.1. check support for the operation in the loop  */
3632       optab = optab_for_tree_code (code, vectype, optab_default);
3633       if (!optab)
3634         {
3635           if (vect_print_dump_info (REPORT_DETAILS))
3636             fprintf (vect_dump, "no optab.");
3637
3638           return false;
3639         }
3640
3641       if (optab_handler (optab, vec_mode)->insn_code == CODE_FOR_nothing)
3642         {
3643           if (vect_print_dump_info (REPORT_DETAILS))
3644             fprintf (vect_dump, "op not supported by target.");
3645
               /* Without a target insn, continue only if the vector mode is
                  exactly one word wide and the vectorization factor reaches
                  the minimum considered worthwhile for CODE.  */
3646           if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
3647               || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
3648                   < vect_min_worthwhile_factor (code))
3649             return false;
3650
3651           if (vect_print_dump_info (REPORT_DETAILS))
3652             fprintf (vect_dump, "proceeding using word mode.");
3653         }
3654
3655       /* Worthwhile without SIMD support?  */
3656       if (!VECTOR_MODE_P (TYPE_MODE (vectype))
3657           && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
3658              < vect_min_worthwhile_factor (code))
3659         {
3660           if (vect_print_dump_info (REPORT_DETAILS))
3661             fprintf (vect_dump, "not worthwhile without SIMD support.");
3662
3663           return false;
3664         }
3665     }
3666
3667   /* 4.2. Check support for the epilog operation.
3668
3669           If STMT represents a reduction pattern, then the type of the
3670           reduction variable may be different than the type of the rest
3671           of the arguments.  For example, consider the case of accumulation
3672           of shorts into an int accumulator; The original code:
3673                         S1: int_a = (int) short_a;
3674           orig_stmt->   S2: int_acc = plus <int_a ,int_acc>;
3675
3676           was replaced with:
3677                         STMT: int_acc = widen_sum <short_a, int_acc>
3678
3679           This means that:
3680           1. The tree-code that is used to create the vector operation in the
3681              epilog code (that reduces the partial results) is not the
3682              tree-code of STMT, but is rather the tree-code of the original
3683              stmt from the pattern that STMT is replacing. I.e, in the example
3684              above we want to use 'widen_sum' in the loop, but 'plus' in the
3685              epilog.
3686           2. The type (mode) we use to check available target support
3687              for the vector operation to be created in the *epilog*, is
3688              determined by the type of the reduction variable (in the example
3689              above we'd check this: plus_optab[vect_int_mode]).
3690              However the type (mode) we use to check available target support
3691              for the vector operation to be created *inside the loop*, is
3692              determined by the type of the other arguments to STMT (in the
3693              example we'd check this: widen_sum_optab[vect_short_mode]).
3694
3695           This is contrary to "regular" reductions, in which the types of all
3696           the arguments are the same as the type of the reduction variable.
3697           For "regular" reductions we can therefore use the same vector type
3698           (and also the same tree-code) when generating the epilog code and
3699           when generating the code inside the loop.  */
3700
3701   if (orig_stmt)
3702     {
3703       /* This is a reduction pattern: get the vectype from the type of the
3704          reduction variable, and get the tree-code from orig_stmt.  */
3705       orig_code = gimple_assign_rhs_code (orig_stmt);
3706       vectype = get_vectype_for_scalar_type (TREE_TYPE (def));
3707       if (!vectype)
3708         {
3709           if (vect_print_dump_info (REPORT_DETAILS))
3710             {
3711               fprintf (vect_dump, "unsupported data-type ");
3712               print_generic_expr (vect_dump, TREE_TYPE (def), TDF_SLIM);
3713             }
3714           return false;
3715         }
3716
3717       vec_mode = TYPE_MODE (vectype);
3718     }
3719   else
3720     {
3721       /* Regular reduction: the same vectype and tree-code as used for
3722          the vector code inside the loop can be used for the epilog code.  */
3723       orig_code = code;
3724     }
3725
       /* Detect a double reduction: the value that enters the reduction phi
          from its loop preheader is an SSA name defined by a phi that lies
          inside OUTER_LOOP and is marked vect_double_reduction_def.  */
3726   if (nested_cycle)
3727     {
3728       def_bb = gimple_bb (reduc_def_stmt);
3729       def_stmt_loop = def_bb->loop_father;
3730       def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
3731                                        loop_preheader_edge (def_stmt_loop));
3732       if (TREE_CODE (def_arg) == SSA_NAME
3733           && (def_arg_stmt = SSA_NAME_DEF_STMT (def_arg))
3734           && gimple_code (def_arg_stmt) == GIMPLE_PHI
3735           && flow_bb_inside_loop_p (outer_loop, gimple_bb (def_arg_stmt))
3736           && vinfo_for_stmt (def_arg_stmt)
3737           && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_arg_stmt))
3738               == vect_double_reduction_def)
3739         double_reduc = true;
3740     }
3741
       /* Look for a target operation that reduces a whole vector in the
          epilog.  If ORIG_CODE has a reduction code but the target lacks the
          optab or the insn, EPILOG_REDUC_CODE is reset to ERROR_MARK and the
          analysis continues.  If ORIG_CODE has no reduction code at all, fail
          unless this is a nested cycle that is not a double reduction.  */
3742   epilog_reduc_code = ERROR_MARK;
3743   if (reduction_code_for_scalar_code (orig_code, &epilog_reduc_code))
3744     {
3745       reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype,
3746                                          optab_default);
3747       if (!reduc_optab)
3748         {
3749           if (vect_print_dump_info (REPORT_DETAILS))
3750             fprintf (vect_dump, "no optab for reduction.");
3751
3752           epilog_reduc_code = ERROR_MARK;
3753         }
3754
3755       if (reduc_optab
3756           && optab_handler (reduc_optab, vec_mode)->insn_code
3757               == CODE_FOR_nothing)
3758         {
3759           if (vect_print_dump_info (REPORT_DETAILS))
3760             fprintf (vect_dump, "reduc op not supported by target.");
3761
3762           epilog_reduc_code = ERROR_MARK;
3763         }
3764     }
3765   else
3766     {
3767       if (!nested_cycle || double_reduc)
3768         {
3769           if (vect_print_dump_info (REPORT_DETAILS))
3770             fprintf (vect_dump, "no reduc code for scalar code.");
3771
3772           return false;
3773         }
3774     }
3775
       /* FORNOW: a double reduction that needs more than one copy of the
          vector stmt is rejected.  */
3776   if (double_reduc && ncopies > 1)
3777     {
3778       if (vect_print_dump_info (REPORT_DETAILS))
3779         fprintf (vect_dump, "multiple types in double reduction");
3780
3781       return false;
3782     }
3783
3784   if (!vec_stmt) /* transformation not required.  */
3785     {
3786       STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
3787       if (!vect_model_reduction_cost (stmt_info, epilog_reduc_code, ncopies))
3788         return false;
3789       return true;
3790     }
3791
3792   /** Transform.  **/
3793
3794   if (vect_print_dump_info (REPORT_DETAILS))
3795     fprintf (vect_dump, "transform reduction.");
3796
3797   /* FORNOW: Multiple types are not supported for condition.  */
3798   if (code == COND_EXPR)
3799     gcc_assert (ncopies == 1);
3800
3801   /* Create the destination vector  */
3802   vec_dest = vect_create_destination_var (scalar_dest, vectype);
3803
3804   /* In case the vectorization factor (VF) is bigger than the number
3805      of elements that we can fit in a vectype (nunits), we have to generate
3806      more than one vector stmt - i.e - we need to "unroll" the
3807      vector stmt by a factor VF/nunits.  For more details see documentation
3808      in vectorizable_operation.  */
3809
3810   /* If the reduction is used in an outer loop we need to generate
3811      VF intermediate results, like so (e.g. for ncopies=2):
3812         r0 = phi (init, r0)
3813         r1 = phi (init, r1)
3814         r0 = x0 + r0;
3815         r1 = x1 + r1;
3816     (i.e. we generate VF results in 2 registers).
3817     In this case we have a separate def-use cycle for each copy, and therefore
3818     for each copy we get the vector def for the reduction variable from the
3819     respective phi node created for this copy.
3820
3821     Otherwise (the reduction is unused in the loop nest), we can combine
3822     together intermediate results, like so (e.g. for ncopies=2):
3823         r = phi (init, r)
3824         r = x0 + r;
3825         r = x1 + r;
3826    (i.e. we generate VF/2 results in a single register).
3827    In this case for each copy we get the vector def for the reduction variable
3828    from the vectorized reduction operation generated in the previous iteration.
3829   */
3830
3831   if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope)
3832     {
3833       single_defuse_cycle = true;
3834       epilog_copies = 1;
3835     }
3836   else
3837     epilog_copies = ncopies;
3838
       /* Generate NCOPIES vector stmts.  A new reduction phi is created for
          every copy, or only for the first one when a single def-use cycle is
          used.  */
3839   prev_stmt_info = NULL;
3840   prev_phi_info = NULL;
3841   for (j = 0; j < ncopies; j++)
3842     {
3843       if (j == 0 || !single_defuse_cycle)
3844         {
3845           /* Create the reduction-phi that defines the reduction-operand.  */
3846           new_phi = create_phi_node (vec_dest, loop->header);
3847           set_vinfo_for_stmt (new_phi, new_stmt_vec_info (new_phi, loop_vinfo,
3848                                                           NULL));
3849           /* Get the vector def for the reduction variable from the phi
3850              node.  */
3851           reduc_def = PHI_RESULT (new_phi);
3852         }
3853
           /* A COND_EXPR reduction is generated by vectorizable_condition; the
              loop is left after the first (and only) copy.  */
3854       if (code == COND_EXPR)
3855         {
3856           first_phi = new_phi;
3857           vectorizable_condition (stmt, gsi, vec_stmt, reduc_def, reduc_index);
3858           /* Multiple types are not supported for condition.  */
3859           break;
3860         }
3861
3862       /* Handle uses.  */
3863       if (j == 0)
3864         {
3865           loop_vec_def0 = vect_get_vec_def_for_operand (ops[!reduc_index],
3866                                                         stmt, NULL);
3867           if (op_type == ternary_op)
3868             {
                   /* NOTE(review): when REDUC_INDEX is 1 the else-branch below
                      picks OPS[1], i.e. the reduction operand itself, rather
                      than OPS[2].  Confirm that a ternary reduction with
                      REDUC_INDEX == 1 cannot reach this point.  */
3869               if (reduc_index == 0)
3870                 loop_vec_def1 = vect_get_vec_def_for_operand (ops[2], stmt,
3871                                                               NULL);
3872               else
3873                 loop_vec_def1 = vect_get_vec_def_for_operand (ops[1], stmt,
3874                                                               NULL);
3875             }
3876
3877           /* Remember the phi created for the first copy; it is passed to
3878              vect_create_epilog_for_reduction below.  */
3879           first_phi = new_phi;
3880         }
3881       else
3882         {
               /* Later copies: derive the operand defs from those of the
                  previous copy.  */
3883           enum vect_def_type dt = vect_unknown_def_type; /* Dummy */
3884           loop_vec_def0 = vect_get_vec_def_for_stmt_copy (dt, loop_vec_def0);
3885           if (op_type == ternary_op)
3886             loop_vec_def1 = vect_get_vec_def_for_stmt_copy (dt, loop_vec_def1);
3887
               /* With a single def-use cycle the reduction operand is the
                  result of the previous copy; otherwise it is the result of
                  the phi created for this copy.  */
3888           if (single_defuse_cycle)
3889             reduc_def = gimple_assign_lhs (new_stmt);
3890           else
3891             reduc_def = PHI_RESULT (new_phi);
3892
3893           STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
3894         }
3895
3896       /* Arguments are ready. Create the new vector stmt.  */
3897       if (op_type == binary_op)
3898         {
3899           if (reduc_index == 0)
3900             expr = build2 (code, vectype, reduc_def, loop_vec_def0);
3901           else
3902             expr = build2 (code, vectype, loop_vec_def0, reduc_def);
3903         }
3904       else
3905         {
3906           if (reduc_index == 0)
3907             expr = build3 (code, vectype, reduc_def, loop_vec_def0,
3908                            loop_vec_def1);
3909           else
3910             {
3911               if (reduc_index == 1)
3912                 expr = build3 (code, vectype, loop_vec_def0, reduc_def,
3913                                loop_vec_def1);
3914               else
3915                 expr = build3 (code, vectype, loop_vec_def0, loop_vec_def1,
3916                                reduc_def);
3917             }
3918         }
3919
3920       new_stmt = gimple_build_assign (vec_dest, expr);
3921       new_temp = make_ssa_name (vec_dest, new_stmt);
3922       gimple_assign_set_lhs (new_stmt, new_temp);
3923       vect_finish_stmt_generation (stmt, new_stmt, gsi);
3924
           /* The first copy is recorded as the vectorized stmt of STMT; each
              later copy is linked to the previous one through
              STMT_VINFO_RELATED_STMT.  */
3925       if (j == 0)
3926         STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
3927       else
3928         STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3929
3930       prev_stmt_info = vinfo_for_stmt (new_stmt);
3931       prev_phi_info = vinfo_for_stmt (new_phi);
3932     }
3933
3934   /* Finalize the reduction-phi (set its arguments) and create the
3935      epilog reduction code.  With separate def-use cycles, or for a
          COND_EXPR, the epilog is given the lhs of *VEC_STMT; otherwise
          NEW_TEMP keeps the lhs of the last copy generated above.  */
3936   if (!single_defuse_cycle || code == COND_EXPR)
3937     new_temp = gimple_assign_lhs (*vec_stmt);
3938
3939   vect_create_epilog_for_reduction (new_temp, stmt, epilog_copies,
3940                                     epilog_reduc_code, first_phi, reduc_index,
3941                                     double_reduc);
3942   return true;
3943 }
3944
3945 /* Function vect_min_worthwhile_factor.
3946
3947    Return the minimum vectorization factor at which it is worthwhile to
3948    vectorize the operation CODE using generic vectors.  Operations that
3949    are not listed below yield INT_MAX.  */
3950 int
3951 vect_min_worthwhile_factor (enum tree_code code)
3952 {
3953   /* Addition, subtraction and negation.  */
3954   if (code == PLUS_EXPR
3955       || code == MINUS_EXPR
3956       || code == NEGATE_EXPR)
3957     return 4;
3958
3959   /* Bitwise and, or, xor and not.  */
3960   if (code == BIT_AND_EXPR
3961       || code == BIT_IOR_EXPR
3962       || code == BIT_XOR_EXPR
3963       || code == BIT_NOT_EXPR)
3964     return 2;
3965
3966   /* Every other operation: INT_MAX, so a caller's "factor < minimum"
3967      comparison holds for any smaller vectorization factor.  */
3968   return INT_MAX;
3969 }
3970
3971
3972 /* Function vectorizable_induction
3973
3974    Check if PHI performs an induction computation that can be vectorized.
3975    If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
3976    phi to replace it, put it in VEC_STMT, and add it to the same basic block.
3977    Return FALSE if not a vectorizable STMT, TRUE otherwise.  */
3978
3979 bool
3980 vectorizable_induction (gimple phi, gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
3981                         gimple *vec_stmt)
3982 {
3983   stmt_vec_info stmt_info = vinfo_for_stmt (phi);
3984   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3985   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3986   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3987   int nunits = TYPE_VECTOR_SUBPARTS (vectype);
3988   int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
3989   tree vec_def;
3990
3991   gcc_assert (ncopies >= 1);
3992   /* FORNOW. This restriction should be relaxed.  */
3993   if (nested_in_vect_loop_p (loop, phi) && ncopies > 1)
3994     {
3995       if (vect_print_dump_info (REPORT_DETAILS))
3996         fprintf (vect_dump, "multiple types in nested loop.");
3997       return false;
3998     }
3999
4000   if (!STMT_VINFO_RELEVANT_P (stmt_info))
4001     return false;
4002
4003   /* FORNOW: SLP not supported.  */
4004   if (STMT_SLP_TYPE (stmt_info))
4005     return false;
4006
4007   gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def);
4008
4009   if (gimple_code (phi) != GIMPLE_PHI)
4010     return false;
4011
4012   if (!vec_stmt) /* transformation not required.  */
4013     {
4014       STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
4015       if (vect_print_dump_info (REPORT_DETAILS))
4016         fprintf (vect_dump, "=== vectorizable_induction ===");
4017       vect_model_induction_cost (stmt_info, ncopies);
4018       return true;
4019     }
4020
4021   /** Transform.  **/
4022
4023   if (vect_print_dump_info (REPORT_DETAILS))
4024     fprintf (vect_dump, "transform induction phi.");
4025
4026   vec_def = get_initial_def_for_induction (phi);
4027   *vec_stmt = SSA_NAME_DEF_STMT (vec_def);
4028   return true;
4029 }
4030
4031 /* Function vectorizable_live_operation.
4032
4033    STMT computes a value that is used outside the loop. Check if
4034    it can be supported.  */
4035
4036 bool
4037 vectorizable_live_operation (gimple stmt,
4038                              gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
4039                              gimple *vec_stmt ATTRIBUTE_UNUSED)
4040 {
4041   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4042   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4043   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4044   int i;
4045   int op_type;
4046   tree op;
4047   tree def;
4048   gimple def_stmt;
4049   enum vect_def_type dt;
4050   enum tree_code code;
4051   enum gimple_rhs_class rhs_class;
4052
4053   gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
4054
4055   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
4056     return false;
4057
4058   if (!is_gimple_assign (stmt))
4059     return false;
4060
4061   if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
4062     return false;
4063
4064   /* FORNOW. CHECKME. */
4065   if (nested_in_vect_loop_p (loop, stmt))
4066     return false;
4067
4068   code = gimple_assign_rhs_code (stmt);
4069   op_type = TREE_CODE_LENGTH (code);
4070   rhs_class = get_gimple_rhs_class (code);
4071   gcc_assert (rhs_class != GIMPLE_UNARY_RHS || op_type == unary_op);
4072   gcc_assert (rhs_class != GIMPLE_BINARY_RHS || op_type == binary_op);
4073
4074   /* FORNOW: support only if all uses are invariant. This means
4075      that the scalar operations can remain in place, unvectorized.
4076      The original last scalar value that they compute will be used.  */
4077
4078   for (i = 0; i < op_type; i++)
4079     {
4080       if (rhs_class == GIMPLE_SINGLE_RHS)
4081         op = TREE_OPERAND (gimple_op (stmt, 1), i);
4082       else
4083         op = gimple_op (stmt, i + 1);
4084       if (op
4085           && !vect_is_simple_use (op, loop_vinfo, NULL, &def_stmt, &def, &dt))
4086         {
4087           if (vect_print_dump_info (REPORT_DETAILS))
4088             fprintf (vect_dump, "use not simple.");
4089           return false;
4090         }
4091
4092       if (dt != vect_external_def && dt != vect_constant_def)
4093         return false;
4094     }
4095
4096   /* No transformation is required for the cases we currently support.  */
4097   return true;
4098 }
4099
4100 /* Function vect_transform_loop.
4101
4102    The analysis phase has determined that the loop is vectorizable.
4103    Vectorize the loop - created vectorized stmts to replace the scalar
4104    stmts in the loop, and update the loop exit condition.  */
4105
4106 void
4107 vect_transform_loop (loop_vec_info loop_vinfo)
4108 {
4109   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4110   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
4111   int nbbs = loop->num_nodes;
4112   gimple_stmt_iterator si;
4113   int i;
4114   tree ratio = NULL;
4115   int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
4116   bool strided_store;
4117   bool slp_scheduled = false;
4118   unsigned int nunits;
4119   tree cond_expr = NULL_TREE;
4120   gimple_seq cond_expr_stmt_list = NULL;
4121   bool do_peeling_for_loop_bound;
4122
4123   if (vect_print_dump_info (REPORT_DETAILS))
4124     fprintf (vect_dump, "=== vec_transform_loop ===");
4125
4126   /* Peel the loop if there are data refs with unknown alignment.
4127      Only one data ref with unknown store is allowed.  */
4128
4129   if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
4130     vect_do_peeling_for_alignment (loop_vinfo);
4131
4132   do_peeling_for_loop_bound
4133     = (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
4134        || (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
4135            && LOOP_VINFO_INT_NITERS (loop_vinfo) % vectorization_factor != 0));
4136
4137   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
4138       || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
4139     vect_loop_versioning (loop_vinfo,
4140                           !do_peeling_for_loop_bound,
4141                           &cond_expr, &cond_expr_stmt_list);
4142
4143   /* If the loop has a symbolic number of iterations 'n' (i.e. it's not a
4144      compile time constant), or it is a constant that doesn't divide by the
4145      vectorization factor, then an epilog loop needs to be created.
4146      We therefore duplicate the loop: the original loop will be vectorized,
4147      and will compute the first (n/VF) iterations. The second copy of the loop
4148      will remain scalar and will compute the remaining (n%VF) iterations.
4149      (VF is the vectorization factor).  */
4150
4151   if (do_peeling_for_loop_bound)
4152     vect_do_peeling_for_loop_bound (loop_vinfo, &ratio,
4153                                     cond_expr, cond_expr_stmt_list);
4154   else
4155     ratio = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
4156                 LOOP_VINFO_INT_NITERS (loop_vinfo) / vectorization_factor);
4157
4158   /* 1) Make sure the loop header has exactly two entries
4159      2) Make sure we have a preheader basic block.  */
4160
4161   gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
4162
4163   split_edge (loop_preheader_edge (loop));
4164
4165   /* FORNOW: the vectorizer supports only loops which body consist
4166      of one basic block (header + empty latch). When the vectorizer will
4167      support more involved loop forms, the order by which the BBs are
4168      traversed need to be reconsidered.  */
4169
4170   for (i = 0; i < nbbs; i++)
4171     {
4172       basic_block bb = bbs[i];
4173       stmt_vec_info stmt_info;
4174       gimple phi;
4175
4176       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
4177         {
4178           phi = gsi_stmt (si);
4179           if (vect_print_dump_info (REPORT_DETAILS))
4180             {
4181               fprintf (vect_dump, "------>vectorizing phi: ");
4182               print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
4183             }
4184           stmt_info = vinfo_for_stmt (phi);
4185           if (!stmt_info)
4186             continue;
4187
4188           if (!STMT_VINFO_RELEVANT_P (stmt_info)
4189               && !STMT_VINFO_LIVE_P (stmt_info))
4190             continue;
4191
4192           if ((TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
4193                 != (unsigned HOST_WIDE_INT) vectorization_factor)
4194               && vect_print_dump_info (REPORT_DETAILS))
4195             fprintf (vect_dump, "multiple-types.");
4196
4197           if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
4198             {
4199               if (vect_print_dump_info (REPORT_DETAILS))
4200                 fprintf (vect_dump, "transform phi.");
4201               vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
4202             }
4203         }
4204
4205       for (si = gsi_start_bb (bb); !gsi_end_p (si);)
4206         {
4207           gimple stmt = gsi_stmt (si);
4208           bool is_store;
4209
4210           if (vect_print_dump_info (REPORT_DETAILS))
4211             {
4212               fprintf (vect_dump, "------>vectorizing statement: ");
4213               print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
4214             }
4215
4216           stmt_info = vinfo_for_stmt (stmt);
4217
4218           /* vector stmts created in the outer-loop during vectorization of
4219              stmts in an inner-loop may not have a stmt_info, and do not
4220              need to be vectorized.  */
4221           if (!stmt_info)
4222             {
4223               gsi_next (&si);
4224               continue;
4225             }
4226
4227           if (!STMT_VINFO_RELEVANT_P (stmt_info)
4228               && !STMT_VINFO_LIVE_P (stmt_info))
4229             {
4230               gsi_next (&si);
4231               continue;
4232             }
4233
4234           gcc_assert (STMT_VINFO_VECTYPE (stmt_info));
4235           nunits =
4236             (unsigned int) TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
4237           if (!STMT_SLP_TYPE (stmt_info)
4238               && nunits != (unsigned int) vectorization_factor
4239               && vect_print_dump_info (REPORT_DETAILS))
4240             /* For SLP VF is set according to unrolling factor, and not to
4241                vector size, hence for SLP this print is not valid.  */
4242             fprintf (vect_dump, "multiple-types.");
4243
4244           /* SLP. Schedule all the SLP instances when the first SLP stmt is
4245              reached.  */
4246           if (STMT_SLP_TYPE (stmt_info))
4247             {
4248               if (!slp_scheduled)
4249                 {
4250                   slp_scheduled = true;
4251
4252                   if (vect_print_dump_info (REPORT_DETAILS))
4253                     fprintf (vect_dump, "=== scheduling SLP instances ===");
4254
4255                   vect_schedule_slp (loop_vinfo, NULL);
4256                 }
4257
4258               /* Hybrid SLP stmts must be vectorized in addition to SLP.  */
4259               if (!vinfo_for_stmt (stmt) || PURE_SLP_STMT (stmt_info))
4260                 {
4261                   gsi_next (&si);
4262                   continue;
4263                 }
4264             }
4265
4266           /* -------- vectorize statement ------------ */
4267           if (vect_print_dump_info (REPORT_DETAILS))
4268             fprintf (vect_dump, "transform statement.");
4269
4270           strided_store = false;
4271           is_store = vect_transform_stmt (stmt, &si, &strided_store, NULL, NULL);
4272           if (is_store)
4273             {
4274               if (STMT_VINFO_STRIDED_ACCESS (stmt_info))
4275                 {
4276                   /* Interleaving. If IS_STORE is TRUE, the vectorization of the
4277                      interleaving chain was completed - free all the stores in
4278                      the chain.  */
4279                   vect_remove_stores (DR_GROUP_FIRST_DR (stmt_info));
4280                   gsi_remove (&si, true);
4281                   continue;
4282                 }
4283               else
4284                 {
4285                   /* Free the attached stmt_vec_info and remove the stmt.  */
4286                   free_stmt_vec_info (stmt);
4287                   gsi_remove (&si, true);
4288                   continue;
4289                 }
4290             }
4291           gsi_next (&si);
4292         }                       /* stmts in BB */
4293     }                           /* BBs in loop */
4294
4295   slpeel_make_loop_iterate_ntimes (loop, ratio);
4296
4297   /* The memory tags and pointers in vectorized statements need to
4298      have their SSA forms updated.  FIXME, why can't this be delayed
4299      until all the loops have been transformed?  */
4300   update_ssa (TODO_update_ssa);
4301
4302   if (vect_print_dump_info (REPORT_VECTORIZED_LOCATIONS))
4303     fprintf (vect_dump, "LOOP VECTORIZED.");
4304   if (loop->inner && vect_print_dump_info (REPORT_VECTORIZED_LOCATIONS))
4305     fprintf (vect_dump, "OUTER LOOP VECTORIZED.");
4306 }