gcc/tree-vect-transform.c

   1 /* Transformation Utilities for Loop Vectorization.
   2    Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008 Free Software Foundation, Inc.
   3    Contributed by Dorit Naishlos <dorit@il.ibm.com>
   4
   5 This file is part of GCC.
   6
   7 GCC is free software; you can redistribute it and/or modify it under
   8 the terms of the GNU General Public License as published by the Free
   9 Software Foundation; either version 3, or (at your option) any later
  10 version.
  11
  12 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  13 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  14 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  15 for more details.
  16
  17 You should have received a copy of the GNU General Public License
  18 along with GCC; see the file COPYING3.  If not see
  19 <http://www.gnu.org/licenses/>.  */
  20
  21 #include "config.h"
  22 #include "system.h"
  23 #include "coretypes.h"
  24 #include "tm.h"
  25 #include "ggc.h"
  26 #include "tree.h"
  27 #include "target.h"
  28 #include "rtl.h"
  29 #include "basic-block.h"
  30 #include "diagnostic.h"
  31 #include "tree-flow.h"
  32 #include "tree-dump.h"
  33 #include "timevar.h"
  34 #include "cfgloop.h"
  35 #include "expr.h"
  36 #include "optabs.h"
  37 #include "params.h"
  38 #include "recog.h"
  39 #include "tree-data-ref.h"
  40 #include "tree-chrec.h"
  41 #include "tree-scalar-evolution.h"
  42 #include "tree-vectorizer.h"
  43 #include "langhooks.h"
  44 #include "tree-pass.h"
  45 #include "toplev.h"
  46 #include "real.h"
  47
  48 /* Utility functions for the code transformation.  */
  49 static bool vect_transform_stmt (tree, block_stmt_iterator *, bool *, slp_tree);
  50 static tree vect_create_destination_var (tree, tree);
  51 static tree vect_create_data_ref_ptr
  52   (tree, struct loop*, tree, tree *, tree *, bool, tree, bool *);
  53 static tree vect_create_addr_base_for_vector_ref
  54   (tree, tree *, tree, struct loop *);
  55 static tree vect_get_new_vect_var (tree, enum vect_var_kind, const char *);
  56 static tree vect_get_vec_def_for_operand (tree, tree, tree *);
  57 static tree vect_init_vector (tree, tree, tree, block_stmt_iterator *);
  58 static void vect_finish_stmt_generation
  59   (tree stmt, tree vec_stmt, block_stmt_iterator *);
  60 static bool vect_is_simple_cond (tree, loop_vec_info);
  61 static void vect_create_epilog_for_reduction (tree, tree, enum tree_code, tree);
  62 static tree get_initial_def_for_reduction (tree, tree, tree *);
  63
  64 /* Utility function dealing with loop peeling (not peeling itself).  */
  65 static void vect_generate_tmps_on_preheader
  66   (loop_vec_info, tree *, tree *, tree *);
  67 static tree vect_build_loop_niters (loop_vec_info);
  68 static void vect_update_ivs_after_vectorizer (loop_vec_info, tree, edge);
  69 static tree vect_gen_niters_for_prolog_loop (loop_vec_info, tree);
  70 static void vect_update_init_of_dr (struct data_reference *, tree niters);
  71 static void vect_update_inits_of_drs (loop_vec_info, tree);
  72 static int vect_min_worthwhile_factor (enum tree_code);
  73
  74
  75 static int
  76 cost_for_stmt (tree stmt)
  77 {
  78   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  79
  80   switch (STMT_VINFO_TYPE (stmt_info))
  81   {
  82   case load_vec_info_type:
  83     return TARG_SCALAR_LOAD_COST;
  84   case store_vec_info_type:
  85     return TARG_SCALAR_STORE_COST;
  86   case op_vec_info_type:
  87   case condition_vec_info_type:
  88   case assignment_vec_info_type:
  89   case reduc_vec_info_type:
  90   case induc_vec_info_type:
  91   case type_promotion_vec_info_type:
  92   case type_demotion_vec_info_type:
  93   case type_conversion_vec_info_type:
  94   case call_vec_info_type:
  95     return TARG_SCALAR_STMT_COST;
  96   case undef_vec_info_type:
  97   default:
  98     gcc_unreachable ();
  99   }
 100 }
 101
 102
 103 /* Function vect_estimate_min_profitable_iters
 104
 105    Return the number of iterations required for the vector version of the
 106    loop to be profitable relative to the cost of the scalar version of the
 107    loop.
 108
 109    TODO: Take profile info into account before making vectorization
 110    decisions, if available.  */
 111
 112 int
 113 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo)
 114 {
 115   int i;
 116   int min_profitable_iters;
 117   int peel_iters_prologue;
 118   int peel_iters_epilogue;
 119   int vec_inside_cost = 0;
 120   int vec_outside_cost = 0;
 121   int scalar_single_iter_cost = 0;
 122   int scalar_outside_cost = 0;
 123   bool runtime_test = false;
 124   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
 125   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 126   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
 127   int nbbs = loop->num_nodes;
 128   int byte_misalign = LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo);
 129   int peel_guard_costs = 0;
 130   int innerloop_iters = 0, factor;
 131   VEC (slp_instance, heap) *slp_instances;
 132   slp_instance instance;
 133
 134   /* Cost model disabled.  */
 135   if (!flag_vect_cost_model)
 136     {
 137       if (vect_print_dump_info (REPORT_COST))
 138         fprintf (vect_dump, "cost model disabled.");
 139       return 0;
 140     }
 141
 142   /* If the number of iterations is unknown, or the
 143      peeling-for-misalignment amount is unknown, we will have to generate
 144      a runtime test to test the loop count against the threshold.    */
 145   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
 146       || (byte_misalign < 0))
 147     runtime_test = true;
 148
 149   /* Requires loop versioning tests to handle misalignment.  */
 150
 151   if (VEC_length (tree, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo)))
 152     {
 153       /*  FIXME: Make cost depend on complexity of individual check.  */
 154       vec_outside_cost +=
 155         VEC_length (tree, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));
 156       if (vect_print_dump_info (REPORT_COST))
 157         fprintf (vect_dump, "cost model: Adding cost of checks for loop "
 158                  "versioning to treat misalignment.\n");
 159     }
 160
 161   if (VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
 162     {
 163       /*  FIXME: Make cost depend on complexity of individual check.  */
 164       vec_outside_cost +=
 165         VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo));
 166       if (vect_print_dump_info (REPORT_COST))
 167         fprintf (vect_dump, "cost model: Adding cost of checks for loop "
 168                  "versioning aliasing.\n");
 169     }
 170
 171   if (VEC_length (tree, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo))
 172       || VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
 173     {
 174       vec_outside_cost += TARG_COND_TAKEN_BRANCH_COST;
 175     }
 176
 177   /* Count statements in scalar loop.  Using this as scalar cost for a single
 178      iteration for now.
 179
 180      TODO: Add outer loop support.
 181
 182      TODO: Consider assigning different costs to different scalar
 183      statements.  */
 184
 185   /* FORNOW.  */
 186   if (loop->inner)
 187     innerloop_iters = 50; /* FIXME */
 188
 189   for (i = 0; i < nbbs; i++)
 190     {
 191       block_stmt_iterator si;
 192       basic_block bb = bbs[i];
 193
 194       if (bb->loop_father == loop->inner)
 195         factor = innerloop_iters;
 196       else
 197         factor = 1;
 198
 199       for (si = bsi_start (bb); !bsi_end_p (si); bsi_next (&si))
 200         {
 201           tree stmt = bsi_stmt (si);
 202           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
 203           /* Skip stmts that are not vectorized inside the loop.  */
 204           if (!STMT_VINFO_RELEVANT_P (stmt_info)
 205               && (!STMT_VINFO_LIVE_P (stmt_info)
 206                   || STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def))
 207             continue;
 208           scalar_single_iter_cost += cost_for_stmt (stmt) * factor;
 209           vec_inside_cost += STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) * factor;
 210           /* FIXME: for stmts in the inner-loop in outer-loop vectorization,
 211              some of the "outside" costs are generated inside the outer-loop.  */
 212           vec_outside_cost += STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info);
 213         }
 214     }
 215
 216   /* Add additional cost for the peeled instructions in prologue and epilogue
 217      loop.
 218
 219      FORNOW: If we dont know the value of peel_iters for prologue or epilogue
 220      at compile-time - we assume it's vf/2 (the worst would be vf-1).
 221
 222      TODO: Build an expression that represents peel_iters for prologue and
 223      epilogue to be used in a run-time test.  */
 224
 225   if (byte_misalign < 0)
 226     {
 227       peel_iters_prologue = vf/2;
 228       if (vect_print_dump_info (REPORT_COST))
 229         fprintf (vect_dump, "cost model: "
 230                  "prologue peel iters set to vf/2.");
 231
 232       /* If peeling for alignment is unknown, loop bound of main loop becomes
 233          unknown.  */
 234       peel_iters_epilogue = vf/2;
 235       if (vect_print_dump_info (REPORT_COST))
 236         fprintf (vect_dump, "cost model: "
 237                  "epilogue peel iters set to vf/2 because "
 238                  "peeling for alignment is unknown .");
 239
 240       /* If peeled iterations are unknown, count a taken branch and a not taken
 241          branch per peeled loop. Even if scalar loop iterations are known,
 242          vector iterations are not known since peeled prologue iterations are
 243          not known. Hence guards remain the same.  */
 244       peel_guard_costs +=  2 * (TARG_COND_TAKEN_BRANCH_COST
 245                                + TARG_COND_NOT_TAKEN_BRANCH_COST);
 246
 247     }
 248   else
 249     {
 250       if (byte_misalign)
 251         {
 252           struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
 253           int element_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
 254           tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr)));
 255           int nelements = TYPE_VECTOR_SUBPARTS (vectype);
 256
 257           peel_iters_prologue = nelements - (byte_misalign / element_size);
 258         }
 259       else
 260         peel_iters_prologue = 0;
 261
 262       if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
 263         {
 264           peel_iters_epilogue = vf/2;
 265           if (vect_print_dump_info (REPORT_COST))
 266             fprintf (vect_dump, "cost model: "
 267                      "epilogue peel iters set to vf/2 because "
 268                      "loop iterations are unknown .");
 269
 270           /* If peeled iterations are known but number of scalar loop
 271              iterations are unknown, count a taken branch per peeled loop.  */
 272           peel_guard_costs +=  2 * TARG_COND_TAKEN_BRANCH_COST;
 273
 274         }
 275       else
 276         {
 277           int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
 278           peel_iters_prologue = niters < peel_iters_prologue ?
 279                                         niters : peel_iters_prologue;
 280           peel_iters_epilogue = (niters - peel_iters_prologue) % vf;
 281         }
 282     }
 283
 284   vec_outside_cost += (peel_iters_prologue * scalar_single_iter_cost)
 285                       + (peel_iters_epilogue * scalar_single_iter_cost)
 286                       + peel_guard_costs;
 287
 288   /* FORNOW: The scalar outside cost is incremented in one of the
 289      following ways:
 290
 291      1. The vectorizer checks for alignment and aliasing and generates
 292      a condition that allows dynamic vectorization.  A cost model
 293      check is ANDED with the versioning condition.  Hence scalar code
 294      path now has the added cost of the versioning check.
 295
 296        if (cost > th & versioning_check)
 297          jmp to vector code
 298
 299      Hence run-time scalar is incremented by not-taken branch cost.
 300
 301      2. The vectorizer then checks if a prologue is required.  If the
 302      cost model check was not done before during versioning, it has to
 303      be done before the prologue check.
 304
 305        if (cost <= th)
 306          prologue = scalar_iters
 307        if (prologue == 0)
 308          jmp to vector code
 309        else
 310          execute prologue
 311        if (prologue == num_iters)
 312          go to exit
 313
 314      Hence the run-time scalar cost is incremented by a taken branch,
 315      plus a not-taken branch, plus a taken branch cost.
 316
 317      3. The vectorizer then checks if an epilogue is required.  If the
 318      cost model check was not done before during prologue check, it
 319      has to be done with the epilogue check.
 320
 321        if (prologue == 0)
 322          jmp to vector code
 323        else
 324          execute prologue
 325        if (prologue == num_iters)
 326          go to exit
 327        vector code:
 328          if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
 329            jmp to epilogue
 330
 331      Hence the run-time scalar cost should be incremented by 2 taken
 332      branches.
 333
 334      TODO: The back end may reorder the BBS's differently and reverse
 335      conditions/branch directions.  Change the stimates below to
 336      something more reasonable.  */
 337
 338   if (runtime_test)
 339     {
 340       /* Cost model check occurs at versioning.  */
 341       if (VEC_length (tree, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo))
 342           || VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
 343         scalar_outside_cost += TARG_COND_NOT_TAKEN_BRANCH_COST;
 344       else
 345         {
 346           /* Cost model occurs at prologue generation.  */
 347           if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
 348             scalar_outside_cost += 2 * TARG_COND_TAKEN_BRANCH_COST
 349               + TARG_COND_NOT_TAKEN_BRANCH_COST;
 350           /* Cost model check occurs at epilogue generation.  */
 351           else
 352             scalar_outside_cost += 2 * TARG_COND_TAKEN_BRANCH_COST;
 353         }
 354     }
 355
 356   /* Add SLP costs.  */
 357   slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
 358   for (i = 0; VEC_iterate (slp_instance, slp_instances, i, instance); i++)
 359     {
 360       vec_outside_cost += SLP_INSTANCE_OUTSIDE_OF_LOOP_COST (instance);
 361       vec_inside_cost += SLP_INSTANCE_INSIDE_OF_LOOP_COST (instance);
 362     }
 363
 364   /* Calculate number of iterations required to make the vector version
 365      profitable, relative to the loop bodies only. The following condition
 366      must hold true:
 367      SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
 368      where
 369      SIC = scalar iteration cost, VIC = vector iteration cost,
 370      VOC = vector outside cost, VF = vectorization factor,
 371      PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
 372      SOC = scalar outside cost for run time cost model check.  */
 373
 374   if ((scalar_single_iter_cost * vf) > vec_inside_cost)
 375     {
 376       if (vec_outside_cost <= 0)
 377         min_profitable_iters = 1;
 378       else
 379         {
 380           min_profitable_iters = ((vec_outside_cost - scalar_outside_cost) * vf
 381                                   - vec_inside_cost * peel_iters_prologue
 382                                   - vec_inside_cost * peel_iters_epilogue)
 383                                  / ((scalar_single_iter_cost * vf)
 384                                     - vec_inside_cost);
 385
 386           if ((scalar_single_iter_cost * vf * min_profitable_iters)
 387               <= ((vec_inside_cost * min_profitable_iters)
 388                   + ((vec_outside_cost - scalar_outside_cost) * vf)))
 389             min_profitable_iters++;
 390         }
 391     }
 392   /* vector version will never be profitable.  */
 393   else
 394     {
 395       if (vect_print_dump_info (REPORT_COST))
 396         fprintf (vect_dump, "cost model: vector iteration cost = %d "
 397                  "is divisible by scalar iteration cost = %d by a factor "
 398                  "greater than or equal to the vectorization factor = %d .",
 399                  vec_inside_cost, scalar_single_iter_cost, vf);
 400       return -1;
 401     }
 402
 403   if (vect_print_dump_info (REPORT_COST))
 404     {
 405       fprintf (vect_dump, "Cost model analysis: \n");
 406       fprintf (vect_dump, "  Vector inside of loop cost: %d\n",
 407                vec_inside_cost);
 408       fprintf (vect_dump, "  Vector outside of loop cost: %d\n",
 409                vec_outside_cost);
 410       fprintf (vect_dump, "  Scalar iteration cost: %d\n",
 411                scalar_single_iter_cost);
 412       fprintf (vect_dump, "  Scalar outside cost: %d\n", scalar_outside_cost);
 413       fprintf (vect_dump, "  prologue iterations: %d\n",
 414                peel_iters_prologue);
 415       fprintf (vect_dump, "  epilogue iterations: %d\n",
 416                peel_iters_epilogue);
 417       fprintf (vect_dump, "  Calculated minimum iters for profitability: %d\n",
 418                min_profitable_iters);
 419     }
 420
 421   min_profitable_iters =
 422         min_profitable_iters < vf ? vf : min_profitable_iters;
 423
 424   /* Because the condition we create is:
 425      if (niters <= min_profitable_iters)
 426        then skip the vectorized loop.  */
 427   min_profitable_iters--;
 428
 429   if (vect_print_dump_info (REPORT_COST))
 430     fprintf (vect_dump, "  Profitability threshold = %d\n",
 431              min_profitable_iters);
 432
 433   return min_profitable_iters;
 434 }
 435
 436
 437 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
 438    functions. Design better to avoid maintenance issues.  */
 439
 440 /* Function vect_model_reduction_cost.
 441
 442    Models cost for a reduction operation, including the vector ops
 443    generated within the strip-mine loop, the initial definition before
 444    the loop, and the epilogue code that must be generated.  */
 445
 446 static bool
 447 vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
 448                            int ncopies)
 449 {
 450   int outer_cost = 0;
 451   enum tree_code code;
 452   optab optab;
 453   tree vectype;
 454   tree orig_stmt;
 455   tree reduction_op;
 456   enum machine_mode mode;
 457   tree operation = GIMPLE_STMT_OPERAND (STMT_VINFO_STMT (stmt_info), 1);
 458   int op_type = TREE_CODE_LENGTH (TREE_CODE (operation));
 459   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
 460   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 461
 462   /* Cost of reduction op inside loop.  */
 463   STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) += ncopies * TARG_VEC_STMT_COST;
 464
 465   reduction_op = TREE_OPERAND (operation, op_type-1);
 466   vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
 467   if (!vectype)
 468     {
 469       if (vect_print_dump_info (REPORT_COST))
 470         {
 471           fprintf (vect_dump, "unsupported data-type ");
 472           print_generic_expr (vect_dump, TREE_TYPE (reduction_op), TDF_SLIM);
 473         }
 474       return false;
 475    }
 476
 477   mode = TYPE_MODE (vectype);
 478   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
 479
 480   if (!orig_stmt)
 481     orig_stmt = STMT_VINFO_STMT (stmt_info);
 482
 483   code = TREE_CODE (GIMPLE_STMT_OPERAND (orig_stmt, 1));
 484
 485   /* Add in cost for initial definition.  */
 486   outer_cost += TARG_SCALAR_TO_VEC_COST;
 487
 488   /* Determine cost of epilogue code.
 489
 490      We have a reduction operator that will reduce the vector in one statement.
 491      Also requires scalar extract.  */
 492
 493   if (!nested_in_vect_loop_p (loop, orig_stmt))
 494     {
 495       if (reduc_code < NUM_TREE_CODES)
 496         outer_cost += TARG_VEC_STMT_COST + TARG_VEC_TO_SCALAR_COST;
 497       else
 498         {
 499           int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
 500           tree bitsize =
 501             TYPE_SIZE (TREE_TYPE ( GIMPLE_STMT_OPERAND (orig_stmt, 0)));
 502           int element_bitsize = tree_low_cst (bitsize, 1);
 503           int nelements = vec_size_in_bits / element_bitsize;
 504
 505           optab = optab_for_tree_code (code, vectype);
 506
 507           /* We have a whole vector shift available.  */
 508           if (VECTOR_MODE_P (mode)
 509               && optab_handler (optab, mode)->insn_code != CODE_FOR_nothing
 510               && optab_handler (vec_shr_optab, mode)->insn_code != CODE_FOR_nothing)
 511             /* Final reduction via vector shifts and the reduction operator. Also
 512                requires scalar extract.  */
 513             outer_cost += ((exact_log2(nelements) * 2) * TARG_VEC_STMT_COST
 514                                 + TARG_VEC_TO_SCALAR_COST);
 515           else
 516             /* Use extracts and reduction op for final reduction.  For N elements,
 517                we have N extracts and N-1 reduction ops.  */
 518             outer_cost += ((nelements + nelements - 1) * TARG_VEC_STMT_COST);
 519         }
 520     }
 521
 522   STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info) = outer_cost;
 523
 524   if (vect_print_dump_info (REPORT_COST))
 525     fprintf (vect_dump, "vect_model_reduction_cost: inside_cost = %d, "
 526              "outside_cost = %d .", STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info),
 527              STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info));
 528
 529   return true;
 530 }
 531
 532
 533 /* Function vect_model_induction_cost.
 534
 535    Models cost for induction operations.  */
 536
 537 static void
 538 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
 539 {
 540   /* loop cost for vec_loop.  */
 541   STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) = ncopies * TARG_VEC_STMT_COST;
 542   /* prologue cost for vec_init and vec_step.  */
 543   STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info) = 2 * TARG_SCALAR_TO_VEC_COST;
 544
 545   if (vect_print_dump_info (REPORT_COST))
 546     fprintf (vect_dump, "vect_model_induction_cost: inside_cost = %d, "
 547              "outside_cost = %d .", STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info),
 548              STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info));
 549 }
 550
 551
 552 /* Function vect_model_simple_cost.
 553
 554    Models cost for simple operations, i.e. those that only emit ncopies of a
 555    single op.  Right now, this does not account for multiple insns that could
 556    be generated for the single vector op.  We will handle that shortly.  */
 557
 558 void
 559 vect_model_simple_cost (stmt_vec_info stmt_info, int ncopies,
 560                         enum vect_def_type *dt, slp_tree slp_node)
 561 {
 562   int i;
 563   int inside_cost = 0, outside_cost = 0;
 564
 565   inside_cost = ncopies * TARG_VEC_STMT_COST;
 566
 567   /* FORNOW: Assuming maximum 2 args per stmts.  */
 568   for (i = 0; i < 2; i++)
 569     {
 570       if (dt[i] == vect_constant_def || dt[i] == vect_invariant_def)
 571         outside_cost += TARG_SCALAR_TO_VEC_COST;
 572     }
 573
 574   if (vect_print_dump_info (REPORT_COST))
 575     fprintf (vect_dump, "vect_model_simple_cost: inside_cost = %d, "
 576              "outside_cost = %d .", inside_cost, outside_cost);
 577
 578   /* Set the costs either in STMT_INFO or SLP_NODE (if exists).  */
 579   stmt_vinfo_set_inside_of_loop_cost (stmt_info, slp_node, inside_cost);
 580   stmt_vinfo_set_outside_of_loop_cost (stmt_info, slp_node, outside_cost);
 581 }
 582
 583
 584 /* Function vect_cost_strided_group_size
 585
 586    For strided load or store, return the group_size only if it is the first
 587    load or store of a group, else return 1.  This ensures that group size is
 588    only returned once per group.  */
 589
 590 static int
 591 vect_cost_strided_group_size (stmt_vec_info stmt_info)
 592 {
 593   tree first_stmt = DR_GROUP_FIRST_DR (stmt_info);
 594
 595   if (first_stmt == STMT_VINFO_STMT (stmt_info))
 596     return DR_GROUP_SIZE (stmt_info);
 597
 598   return 1;
 599 }
 600
 601
 602 /* Function vect_model_store_cost
 603
 604    Models cost for stores.  In the case of strided accesses, one access
 605    has the overhead of the strided access attributed to it.  */
 606
 607 void
 608 vect_model_store_cost (stmt_vec_info stmt_info, int ncopies,
 609                        enum vect_def_type dt, slp_tree slp_node)
 610 {
 611   int group_size;
 612   int inside_cost = 0, outside_cost = 0;
 613
 614   if (dt == vect_constant_def || dt == vect_invariant_def)
 615     outside_cost = TARG_SCALAR_TO_VEC_COST;
 616
 617   /* Strided access?  */
 618   if (DR_GROUP_FIRST_DR (stmt_info))
 619     group_size = vect_cost_strided_group_size (stmt_info);
 620   /* Not a strided access.  */
 621   else
 622     group_size = 1;
 623
 624   /* Is this an access in a group of stores, which provide strided access?
 625      If so, add in the cost of the permutes.  */
 626   if (group_size > 1)
 627     {
 628       /* Uses a high and low interleave operation for each needed permute.  */
 629       inside_cost = ncopies * exact_log2(group_size) * group_size
 630              * TARG_VEC_STMT_COST;
 631
 632       if (vect_print_dump_info (REPORT_COST))
 633         fprintf (vect_dump, "vect_model_store_cost: strided group_size = %d .",
 634                  group_size);
 635
 636     }
 637
 638   /* Costs of the stores.  */
 639   inside_cost += ncopies * TARG_VEC_STORE_COST;
 640
 641   if (vect_print_dump_info (REPORT_COST))
 642     fprintf (vect_dump, "vect_model_store_cost: inside_cost = %d, "
 643              "outside_cost = %d .", inside_cost, outside_cost);
 644
 645   /* Set the costs either in STMT_INFO or SLP_NODE (if exists).  */
 646   stmt_vinfo_set_inside_of_loop_cost (stmt_info, slp_node, inside_cost);
 647   stmt_vinfo_set_outside_of_loop_cost (stmt_info, slp_node, outside_cost);
 648 }
 649
 650
 651 /* Function vect_model_load_cost
 652
 653    Models cost for loads.  In the case of strided accesses, the last access
 654    has the overhead of the strided access attributed to it.  Since unaligned
 655    accesses are supported for loads, we also account for the costs of the
 656    access scheme chosen.  */
 657
 658 void
 659 vect_model_load_cost (stmt_vec_info stmt_info, int ncopies, slp_tree slp_node)
 660
 661 {
 662   int group_size;
 663   int alignment_support_cheme;
 664   tree first_stmt;
 665   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info), *first_dr;
 666   int inside_cost = 0, outside_cost = 0;
 667
 668   /* Strided accesses?  */
 669   first_stmt = DR_GROUP_FIRST_DR (stmt_info);
 670   if (first_stmt && !slp_node)
 671     {
 672       group_size = vect_cost_strided_group_size (stmt_info);
 673       first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
 674     }
 675   /* Not a strided access.  */
 676   else
 677     {
 678       group_size = 1;
 679       first_dr = dr;
 680     }
 681
 682   alignment_support_cheme = vect_supportable_dr_alignment (first_dr);
 683
 684   /* Is this an access in a group of loads providing strided access?
 685      If so, add in the cost of the permutes.  */
 686   if (group_size > 1)
 687     {
 688       /* Uses an even and odd extract operations for each needed permute.  */
 689       inside_cost = ncopies * exact_log2(group_size) * group_size
 690         * TARG_VEC_STMT_COST;
 691
 692       if (vect_print_dump_info (REPORT_COST))
 693         fprintf (vect_dump, "vect_model_load_cost: strided group_size = %d .",
 694                  group_size);
 695
 696     }
 697
 698   /* The loads themselves.  */
 699   switch (alignment_support_cheme)
 700     {
 701     case dr_aligned:
 702       {
 703         inside_cost += ncopies * TARG_VEC_LOAD_COST;
 704
 705         if (vect_print_dump_info (REPORT_COST))
 706           fprintf (vect_dump, "vect_model_load_cost: aligned.");
 707
 708         break;
 709       }
 710     case dr_unaligned_supported:
 711       {
 712         /* Here, we assign an additional cost for the unaligned load.  */
 713         inside_cost += ncopies * TARG_VEC_UNALIGNED_LOAD_COST;
 714
 715         if (vect_print_dump_info (REPORT_COST))
 716           fprintf (vect_dump, "vect_model_load_cost: unaligned supported by "
 717                    "hardware.");
 718
 719         break;
 720       }
 721     case dr_explicit_realign:
 722       {
 723         inside_cost += ncopies * (2*TARG_VEC_LOAD_COST + TARG_VEC_STMT_COST);
 724
 725         /* FIXME: If the misalignment remains fixed across the iterations of
 726            the containing loop, the following cost should be added to the
 727            outside costs.  */
 728         if (targetm.vectorize.builtin_mask_for_load)
 729           inside_cost += TARG_VEC_STMT_COST;
 730
 731         break;
 732       }
 733     case dr_explicit_realign_optimized:
 734       {
 735         if (vect_print_dump_info (REPORT_COST))
 736           fprintf (vect_dump, "vect_model_load_cost: unaligned software "
 737                    "pipelined.");
 738
 739         /* Unaligned software pipeline has a load of an address, an initial
 740            load, and possibly a mask operation to "prime" the loop. However,
 741            if this is an access in a group of loads, which provide strided
 742            access, then the above cost should only be considered for one
 743            access in the group. Inside the loop, there is a load op
 744            and a realignment op.  */
 745
 746         if ((!DR_GROUP_FIRST_DR (stmt_info)) || group_size > 1 || slp_node)
 747           {
 748             outside_cost = 2*TARG_VEC_STMT_COST;
 749             if (targetm.vectorize.builtin_mask_for_load)
 750               outside_cost += TARG_VEC_STMT_COST;
 751           }
 752
 753         inside_cost += ncopies * (TARG_VEC_LOAD_COST + TARG_VEC_STMT_COST);
 754
 755         break;
 756       }
 757
 758     default:
 759       gcc_unreachable ();
 760     }
 761
 762   if (vect_print_dump_info (REPORT_COST))
 763     fprintf (vect_dump, "vect_model_load_cost: inside_cost = %d, "
 764              "outside_cost = %d .", inside_cost, outside_cost);
 765
 766   /* Set the costs either in STMT_INFO or SLP_NODE (if exists).  */
 767   stmt_vinfo_set_inside_of_loop_cost (stmt_info, slp_node, inside_cost);
 768   stmt_vinfo_set_outside_of_loop_cost (stmt_info, slp_node, outside_cost);
 769 }
 770
 771
 772 /* Function vect_get_new_vect_var.
 773
 774    Returns a name for a new variable. The current naming scheme appends the
 775    prefix "vect_" or "vect_p" (depending on the value of VAR_KIND) to
 776    the name of vectorizer generated variables, and appends that to NAME if
 777    provided.  */
 778
 779 static tree
 780 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
 781 {
 782   const char *prefix;
 783   tree new_vect_var;
 784
 785   switch (var_kind)
 786   {
 787   case vect_simple_var:
 788     prefix = "vect_";
 789     break;
 790   case vect_scalar_var:
 791     prefix = "stmp_";
 792     break;
 793   case vect_pointer_var:
 794     prefix = "vect_p";
 795     break;
 796   default:
 797     gcc_unreachable ();
 798   }
 799
 800   if (name)
 801     {
 802       char* tmp = concat (prefix, name, NULL);
 803       new_vect_var = create_tmp_var (type, tmp);
 804       free (tmp);
 805     }
 806   else
 807     new_vect_var = create_tmp_var (type, prefix);
 808
 809   /* Mark vector typed variable as a gimple register variable.  */
 810   if (TREE_CODE (type) == VECTOR_TYPE)
 811     DECL_GIMPLE_REG_P (new_vect_var) = true;
 812
 813   return new_vect_var;
 814 }
 815
 816
 817 /* Function vect_create_addr_base_for_vector_ref.
 818
 819    Create an expression that computes the address of the first memory location
 820    that will be accessed for a data reference.
 821
 822    Input:
 823    STMT: The statement containing the data reference.
 824    NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
 825    OFFSET: Optional. If supplied, it is be added to the initial address.
 826    LOOP:    Specify relative to which loop-nest should the address be computed.
 827             For example, when the dataref is in an inner-loop nested in an
 828             outer-loop that is now being vectorized, LOOP can be either the
 829             outer-loop, or the inner-loop. The first memory location accessed
 830             by the following dataref ('in' points to short):
 831
 832                 for (i=0; i<N; i++)
 833                    for (j=0; j<M; j++)
 834                      s += in[i+j]
 835
 836             is as follows:
 837             if LOOP=i_loop:     &in             (relative to i_loop)
 838             if LOOP=j_loop:     &in+i*2B        (relative to j_loop)
 839
 840    Output:
 841    1. Return an SSA_NAME whose value is the address of the memory location of
 842       the first vector of the data reference.
 843    2. If new_stmt_list is not NULL_TREE after return then the caller must insert
 844       these statement(s) which define the returned SSA_NAME.
 845
 846    FORNOW: We are only handling array accesses with step 1.  */
 847
 848 static tree
 849 vect_create_addr_base_for_vector_ref (tree stmt,
 850                                       tree *new_stmt_list,
 851                                       tree offset,
 852                                       struct loop *loop)
 853 {
 854   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
 855   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
 856   struct loop *containing_loop = (bb_for_stmt (stmt))->loop_father;
 857   tree data_ref_base = unshare_expr (DR_BASE_ADDRESS (dr));
 858   tree base_name;
 859   tree data_ref_base_var;
 860   tree new_base_stmt;
 861   tree vec_stmt;
 862   tree addr_base, addr_expr;
 863   tree dest, new_stmt;
 864   tree base_offset = unshare_expr (DR_OFFSET (dr));
 865   tree init = unshare_expr (DR_INIT (dr));
 866   tree vect_ptr_type, addr_expr2;
 867   tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
 868
 869   gcc_assert (loop);
 870   if (loop != containing_loop)
 871     {
 872       loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
 873       struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 874
 875       gcc_assert (nested_in_vect_loop_p (loop, stmt));
 876
 877       data_ref_base = unshare_expr (STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
 878       base_offset = unshare_expr (STMT_VINFO_DR_OFFSET (stmt_info));
 879       init = unshare_expr (STMT_VINFO_DR_INIT (stmt_info));
 880     }
 881
 882   /* Create data_ref_base */
 883   base_name = build_fold_indirect_ref (data_ref_base);
 884   data_ref_base_var = create_tmp_var (TREE_TYPE (data_ref_base), "batmp");
 885   add_referenced_var (data_ref_base_var);
 886   data_ref_base = force_gimple_operand (data_ref_base, &new_base_stmt,
 887                                         true, data_ref_base_var);
 888   append_to_statement_list_force(new_base_stmt, new_stmt_list);
 889
 890   /* Create base_offset */
 891   base_offset = size_binop (PLUS_EXPR, base_offset, init);
 892   base_offset = fold_convert (sizetype, base_offset);
 893   dest = create_tmp_var (TREE_TYPE (base_offset), "base_off");
 894   add_referenced_var (dest);
 895   base_offset = force_gimple_operand (base_offset, &new_stmt, true, dest);
 896   append_to_statement_list_force (new_stmt, new_stmt_list);
 897
 898   if (offset)
 899     {
 900       tree tmp = create_tmp_var (sizetype, "offset");
 901
 902       add_referenced_var (tmp);
 903       offset = fold_build2 (MULT_EXPR, TREE_TYPE (offset), offset, step);
 904       base_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (base_offset),
 905                                  base_offset, offset);
 906       base_offset = force_gimple_operand (base_offset, &new_stmt, false, tmp);
 907       append_to_statement_list_force (new_stmt, new_stmt_list);
 908     }
 909
 910   /* base + base_offset */
 911   addr_base = fold_build2 (POINTER_PLUS_EXPR, TREE_TYPE (data_ref_base),
 912                            data_ref_base, base_offset);
 913
 914   vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
 915
 916   /* addr_expr = addr_base */
 917   addr_expr = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var,
 918                                      get_name (base_name));
 919   add_referenced_var (addr_expr);
 920   vec_stmt = fold_convert (vect_ptr_type, addr_base);
 921   addr_expr2 = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var,
 922                                      get_name (base_name));
 923   add_referenced_var (addr_expr2);
 924   vec_stmt = force_gimple_operand (vec_stmt, &new_stmt, false, addr_expr2);
 925   append_to_statement_list_force (new_stmt, new_stmt_list);
 926
 927   if (vect_print_dump_info (REPORT_DETAILS))
 928     {
 929       fprintf (vect_dump, "created ");
 930       print_generic_expr (vect_dump, vec_stmt, TDF_SLIM);
 931     }
 932   return vec_stmt;
 933 }
 934
 935
 936 /* Function vect_create_data_ref_ptr.
 937
 938    Create a new pointer to vector type (vp), that points to the first location
 939    accessed in the loop by STMT, along with the def-use update chain to
 940    appropriately advance the pointer through the loop iterations. Also set
 941    aliasing information for the pointer.  This vector pointer is used by the
 942    callers to this function to create a memory reference expression for vector
 943    load/store access.
 944
 945    Input:
 946    1. STMT: a stmt that references memory. Expected to be of the form
 947          GIMPLE_MODIFY_STMT <name, data-ref> or
 948          GIMPLE_MODIFY_STMT <data-ref, name>.
 949    2. AT_LOOP: the loop where the vector memref is to be created.
 950    3. OFFSET (optional): an offset to be added to the initial address accessed
 951         by the data-ref in STMT.
 952    4. ONLY_INIT: indicate if vp is to be updated in the loop, or remain
 953         pointing to the initial address.
 954    5. TYPE: if not NULL indicates the required type of the data-ref
 955
 956    Output:
 957    1. Declare a new ptr to vector_type, and have it point to the base of the
 958       data reference (initial addressed accessed by the data reference).
 959       For example, for vector of type V8HI, the following code is generated:
 960
 961       v8hi *vp;
 962       vp = (v8hi *)initial_address;
 963
 964       if OFFSET is not supplied:
 965          initial_address = &a[init];
 966       if OFFSET is supplied:
 967          initial_address = &a[init + OFFSET];
 968
 969       Return the initial_address in INITIAL_ADDRESS.
 970
 971    2. If ONLY_INIT is true, just return the initial pointer.  Otherwise, also
 972       update the pointer in each iteration of the loop.
 973
 974       Return the increment stmt that updates the pointer in PTR_INCR.
 975
 976    3. Set INV_P to true if the access pattern of the data reference in the
 977       vectorized loop is invariant. Set it to false otherwise.
 978
 979    4. Return the pointer.  */
 980
 981 static tree
 982 vect_create_data_ref_ptr (tree stmt, struct loop *at_loop,
 983                           tree offset, tree *initial_address, tree *ptr_incr,
 984                           bool only_init, tree type, bool *inv_p)
 985 {
 986   tree base_name;
 987   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
 988   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
 989   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 990   bool nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
 991   struct loop *containing_loop = (bb_for_stmt (stmt))->loop_father;
 992   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
 993   tree vect_ptr_type;
 994   tree vect_ptr;
 995   tree tag;
 996   tree new_temp;
 997   tree vec_stmt;
 998   tree new_stmt_list = NULL_TREE;
 999   edge pe;
1000   basic_block new_bb;
1001   tree vect_ptr_init;
1002   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
1003   tree vptr;
1004   block_stmt_iterator incr_bsi;
1005   bool insert_after;
1006   tree indx_before_incr, indx_after_incr;
1007   tree incr;
1008   tree step;
1009
1010   /* Check the step (evolution) of the load in LOOP, and record
1011      whether it's invariant.  */
1012   if (nested_in_vect_loop)
1013     step = STMT_VINFO_DR_STEP (stmt_info);
1014   else
1015     step = DR_STEP (STMT_VINFO_DATA_REF (stmt_info));
1016
1017   if (tree_int_cst_compare (step, size_zero_node) == 0)
1018     *inv_p = true;
1019   else
1020     *inv_p = false;
1021
1022   /* Create an expression for the first address accessed by this load
1023      in LOOP.  */
1024   base_name =  build_fold_indirect_ref (unshare_expr (DR_BASE_ADDRESS (dr)));
1025
1026   if (vect_print_dump_info (REPORT_DETAILS))
1027     {
1028       tree data_ref_base = base_name;
1029       fprintf (vect_dump, "create vector-pointer variable to type: ");
1030       print_generic_expr (vect_dump, vectype, TDF_SLIM);
1031       if (TREE_CODE (data_ref_base) == VAR_DECL)
1032         fprintf (vect_dump, "  vectorizing a one dimensional array ref: ");
1033       else if (TREE_CODE (data_ref_base) == ARRAY_REF)
1034         fprintf (vect_dump, "  vectorizing a multidimensional array ref: ");
1035       else if (TREE_CODE (data_ref_base) == COMPONENT_REF)
1036         fprintf (vect_dump, "  vectorizing a record based array ref: ");
1037       else if (TREE_CODE (data_ref_base) == SSA_NAME)
1038         fprintf (vect_dump, "  vectorizing a pointer ref: ");
1039       print_generic_expr (vect_dump, base_name, TDF_SLIM);
1040     }
1041
1042   /** (1) Create the new vector-pointer variable:  **/
1043   if (type)
1044     vect_ptr_type = build_pointer_type (type);
1045   else
1046     vect_ptr_type = build_pointer_type (vectype);
1047   vect_ptr = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var,
1048                                     get_name (base_name));
1049   add_referenced_var (vect_ptr);
1050
1051   /** (2) Add aliasing information to the new vector-pointer:
1052           (The points-to info (DR_PTR_INFO) may be defined later.)  **/
1053
1054   tag = DR_SYMBOL_TAG (dr);
1055   gcc_assert (tag);
1056
1057   /* If tag is a variable (and NOT_A_TAG) than a new symbol memory
1058      tag must be created with tag added to its may alias list.  */
1059   if (!MTAG_P (tag))
1060     new_type_alias (vect_ptr, tag, DR_REF (dr));
1061   else
1062     set_symbol_mem_tag (vect_ptr, tag);
1063
1064   var_ann (vect_ptr)->subvars = DR_SUBVARS (dr);
1065
1066   /** Note: If the dataref is in an inner-loop nested in LOOP, and we are
1067       vectorizing LOOP (i.e. outer-loop vectorization), we need to create two
1068       def-use update cycles for the pointer: One relative to the outer-loop
1069       (LOOP), which is what steps (3) and (4) below do. The other is relative
1070       to the inner-loop (which is the inner-most loop containing the dataref),
1071       and this is done be step (5) below.
1072
1073       When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
1074       inner-most loop, and so steps (3),(4) work the same, and step (5) is
1075       redundant.  Steps (3),(4) create the following:
1076
1077         vp0 = &base_addr;
1078         LOOP:   vp1 = phi(vp0,vp2)
1079                 ...
1080                 ...
1081                 vp2 = vp1 + step
1082                 goto LOOP
1083
1084       If there is an inner-loop nested in loop, then step (5) will also be
1085       applied, and an additional update in the inner-loop will be created:
1086
1087         vp0 = &base_addr;
1088         LOOP:   vp1 = phi(vp0,vp2)
1089                 ...
1090         inner:     vp3 = phi(vp1,vp4)
1091                    vp4 = vp3 + inner_step
1092                    if () goto inner
1093                 ...
1094                 vp2 = vp1 + step
1095                 if () goto LOOP   */
1096
1097   /** (3) Calculate the initial address the vector-pointer, and set
1098           the vector-pointer to point to it before the loop:  **/
1099
1100   /* Create: (&(base[init_val+offset]) in the loop preheader.  */
1101
1102   new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
1103                                                    offset, loop);
1104   pe = loop_preheader_edge (loop);
1105   new_bb = bsi_insert_on_edge_immediate (pe, new_stmt_list);
1106   gcc_assert (!new_bb);
1107   *initial_address = new_temp;
1108
1109   /* Create: p = (vectype *) initial_base  */
1110   vec_stmt = fold_convert (vect_ptr_type, new_temp);
1111   vec_stmt = build_gimple_modify_stmt (vect_ptr, vec_stmt);
1112   vect_ptr_init = make_ssa_name (vect_ptr, vec_stmt);
1113   GIMPLE_STMT_OPERAND (vec_stmt, 0) = vect_ptr_init;
1114   new_bb = bsi_insert_on_edge_immediate (pe, vec_stmt);
1115   gcc_assert (!new_bb);
1116
1117
1118   /** (4) Handle the updating of the vector-pointer inside the loop.
1119           This is needed when ONLY_INIT is false, and also when AT_LOOP
1120           is the inner-loop nested in LOOP (during outer-loop vectorization).
1121    **/
1122
1123   if (only_init && at_loop == loop) /* No update in loop is required.  */
1124     {
1125       /* Copy the points-to information if it exists. */
1126       if (DR_PTR_INFO (dr))
1127         duplicate_ssa_name_ptr_info (vect_ptr_init, DR_PTR_INFO (dr));
1128       vptr = vect_ptr_init;
1129     }
1130   else
1131     {
1132       /* The step of the vector pointer is the Vector Size.  */
1133       tree step = TYPE_SIZE_UNIT (vectype);
1134       /* One exception to the above is when the scalar step of the load in
1135          LOOP is zero. In this case the step here is also zero.  */
1136       if (*inv_p)
1137         step = size_zero_node;
1138
1139       standard_iv_increment_position (loop, &incr_bsi, &insert_after);
1140
1141       create_iv (vect_ptr_init,
1142                  fold_convert (vect_ptr_type, step),
1143                  NULL_TREE, loop, &incr_bsi, insert_after,
1144                  &indx_before_incr, &indx_after_incr);
1145       incr = bsi_stmt (incr_bsi);
1146       set_stmt_info (stmt_ann (incr),
1147                      new_stmt_vec_info (incr, loop_vinfo));
1148
1149       /* Copy the points-to information if it exists. */
1150       if (DR_PTR_INFO (dr))
1151         {
1152           duplicate_ssa_name_ptr_info (indx_before_incr, DR_PTR_INFO (dr));
1153           duplicate_ssa_name_ptr_info (indx_after_incr, DR_PTR_INFO (dr));
1154         }
1155       merge_alias_info (vect_ptr_init, indx_before_incr);
1156       merge_alias_info (vect_ptr_init, indx_after_incr);
1157       if (ptr_incr)
1158         *ptr_incr = incr;
1159
1160       vptr = indx_before_incr;
1161     }
1162
1163   if (!nested_in_vect_loop || only_init)
1164     return vptr;
1165
1166
1167   /** (5) Handle the updating of the vector-pointer inside the inner-loop
1168           nested in LOOP, if exists: **/
1169
1170   gcc_assert (nested_in_vect_loop);
1171   if (!only_init)
1172     {
1173       standard_iv_increment_position (containing_loop, &incr_bsi,
1174                                       &insert_after);
1175       create_iv (vptr, fold_convert (vect_ptr_type, DR_STEP (dr)), NULL_TREE,
1176                  containing_loop, &incr_bsi, insert_after, &indx_before_incr,
1177                  &indx_after_incr);
1178       incr = bsi_stmt (incr_bsi);
1179       set_stmt_info (stmt_ann (incr), new_stmt_vec_info (incr, loop_vinfo));
1180
1181       /* Copy the points-to information if it exists. */
1182       if (DR_PTR_INFO (dr))
1183         {
1184           duplicate_ssa_name_ptr_info (indx_before_incr, DR_PTR_INFO (dr));
1185           duplicate_ssa_name_ptr_info (indx_after_incr, DR_PTR_INFO (dr));
1186         }
1187       merge_alias_info (vect_ptr_init, indx_before_incr);
1188       merge_alias_info (vect_ptr_init, indx_after_incr);
1189       if (ptr_incr)
1190         *ptr_incr = incr;
1191
1192       return indx_before_incr;
1193     }
1194   else
1195     gcc_unreachable ();
1196 }
1197
1198
1199 /* Function bump_vector_ptr
1200
1201    Increment a pointer (to a vector type) by vector-size. If requested,
1202    i.e. if PTR-INCR is given, then also connect the new increment stmt
1203    to the existing def-use update-chain of the pointer, by modifying
1204    the PTR_INCR as illustrated below:
1205
1206    The pointer def-use update-chain before this function:
1207                         DATAREF_PTR = phi (p_0, p_2)
1208                         ....
1209         PTR_INCR:       p_2 = DATAREF_PTR + step
1210
1211    The pointer def-use update-chain after this function:
1212                         DATAREF_PTR = phi (p_0, p_2)
1213                         ....
1214                         NEW_DATAREF_PTR = DATAREF_PTR + BUMP
1215                         ....
1216         PTR_INCR:       p_2 = NEW_DATAREF_PTR + step
1217
1218    Input:
1219    DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
1220                  in the loop.
1221    PTR_INCR - optional. The stmt that updates the pointer in each iteration of
1222               the loop.  The increment amount across iterations is expected
1223               to be vector_size.
1224    BSI - location where the new update stmt is to be placed.
1225    STMT - the original scalar memory-access stmt that is being vectorized.
1226    BUMP - optional. The offset by which to bump the pointer. If not given,
1227           the offset is assumed to be vector_size.
1228
1229    Output: Return NEW_DATAREF_PTR as illustrated above.
1230
1231 */
1232
1233 static tree
1234 bump_vector_ptr (tree dataref_ptr, tree ptr_incr, block_stmt_iterator *bsi,
1235                  tree stmt, tree bump)
1236 {
1237   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1238   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
1239   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1240   tree vptr_type = TREE_TYPE (dataref_ptr);
1241   tree ptr_var = SSA_NAME_VAR (dataref_ptr);
1242   tree update = TYPE_SIZE_UNIT (vectype);
1243   tree incr_stmt;
1244   ssa_op_iter iter;
1245   use_operand_p use_p;
1246   tree new_dataref_ptr;
1247
1248   if (bump)
1249     update = bump;
1250
1251   incr_stmt = build_gimple_modify_stmt (ptr_var,
1252                                         build2 (POINTER_PLUS_EXPR, vptr_type,
1253                                                 dataref_ptr, update));
1254   new_dataref_ptr = make_ssa_name (ptr_var, incr_stmt);
1255   GIMPLE_STMT_OPERAND (incr_stmt, 0) = new_dataref_ptr;
1256   vect_finish_stmt_generation (stmt, incr_stmt, bsi);
1257
1258   /* Copy the points-to information if it exists. */
1259   if (DR_PTR_INFO (dr))
1260     duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
1261   merge_alias_info (new_dataref_ptr, dataref_ptr);
1262
1263   if (!ptr_incr)
1264     return new_dataref_ptr;
1265
1266   /* Update the vector-pointer's cross-iteration increment.  */
1267   FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
1268     {
1269       tree use = USE_FROM_PTR (use_p);
1270
1271       if (use == dataref_ptr)
1272         SET_USE (use_p, new_dataref_ptr);
1273       else
1274         gcc_assert (tree_int_cst_compare (use, update) == 0);
1275     }
1276
1277   return new_dataref_ptr;
1278 }
1279
1280
1281 /* Function vect_create_destination_var.
1282
1283    Create a new temporary of type VECTYPE.  */
1284
1285 static tree
1286 vect_create_destination_var (tree scalar_dest, tree vectype)
1287 {
1288   tree vec_dest;
1289   const char *new_name;
1290   tree type;
1291   enum vect_var_kind kind;
1292
1293   kind = vectype ? vect_simple_var : vect_scalar_var;
1294   type = vectype ? vectype : TREE_TYPE (scalar_dest);
1295
1296   gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
1297
1298   new_name = get_name (scalar_dest);
1299   if (!new_name)
1300     new_name = "var_";
1301   vec_dest = vect_get_new_vect_var (type, kind, new_name);
1302   add_referenced_var (vec_dest);
1303
1304   return vec_dest;
1305 }
1306
1307
1308 /* Function vect_init_vector.
1309
1310    Insert a new stmt (INIT_STMT) that initializes a new vector variable with
1311    the vector elements of VECTOR_VAR. Place the initialization at BSI if it
1312    is not NULL. Otherwise, place the initialization at the loop preheader.
1313    Return the DEF of INIT_STMT.
1314    It will be used in the vectorization of STMT.  */
1315
1316 static tree
1317 vect_init_vector (tree stmt, tree vector_var, tree vector_type,
1318                   block_stmt_iterator *bsi)
1319 {
1320   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
1321   tree new_var;
1322   tree init_stmt;
1323   tree vec_oprnd;
1324   edge pe;
1325   tree new_temp;
1326   basic_block new_bb;
1327
1328   new_var = vect_get_new_vect_var (vector_type, vect_simple_var, "cst_");
1329   add_referenced_var (new_var);
1330   init_stmt = build_gimple_modify_stmt (new_var, vector_var);
1331   new_temp = make_ssa_name (new_var, init_stmt);
1332   GIMPLE_STMT_OPERAND (init_stmt, 0) = new_temp;
1333
1334   if (bsi)
1335     vect_finish_stmt_generation (stmt, init_stmt, bsi);
1336   else
1337     {
1338       loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
1339       struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1340
1341       if (nested_in_vect_loop_p (loop, stmt))
1342         loop = loop->inner;
1343       pe = loop_preheader_edge (loop);
1344       new_bb = bsi_insert_on_edge_immediate (pe, init_stmt);
1345       gcc_assert (!new_bb);
1346     }
1347
1348   if (vect_print_dump_info (REPORT_DETAILS))
1349     {
1350       fprintf (vect_dump, "created new init_stmt: ");
1351       print_generic_expr (vect_dump, init_stmt, TDF_SLIM);
1352     }
1353
1354   vec_oprnd = GIMPLE_STMT_OPERAND (init_stmt, 0);
1355   return vec_oprnd;
1356 }
1357
1358
1359 /* For constant and loop invariant defs of SLP_NODE this function returns
1360    (vector) defs (VEC_OPRNDS) that will be used in the vectorized stmts.
1361    OP_NUM determines if we gather defs for operand 0 or operand 1 of the scalar
1362    stmts.  */
1363
1364 static void
1365 vect_get_constant_vectors (slp_tree slp_node, VEC(tree,heap) **vec_oprnds,
1366                            unsigned int op_num)
1367 {
1368   VEC (tree, heap) *stmts = SLP_TREE_SCALAR_STMTS (slp_node);
1369   tree stmt = VEC_index (tree, stmts, 0);
1370   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
1371   tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
1372   int nunits = TYPE_VECTOR_SUBPARTS (vectype);
1373   tree vec_cst;
1374   tree t = NULL_TREE;
1375   int j, number_of_places_left_in_vector;
1376   tree vector_type;
1377   tree op, vop, operation;
1378   int group_size = VEC_length (tree, stmts);
1379   unsigned int vec_num, i;
1380   int number_of_copies = 1;
1381   bool is_store = false;
1382   unsigned int number_of_vectors = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
1383   VEC (tree, heap) *voprnds = VEC_alloc (tree, heap, number_of_vectors);
1384   bool constant_p;
1385
1386   if (STMT_VINFO_DATA_REF (stmt_vinfo))
1387     is_store = true;
1388
1389   /* NUMBER_OF_COPIES is the number of times we need to use the same values in
1390      created vectors. It is greater than 1 if unrolling is performed.
1391
1392      For example, we have two scalar operands, s1 and s2 (e.g., group of
1393      strided accesses of size two), while NUINTS is four (i.e., four scalars
1394      of this type can be packed in a vector). The output vector will contain
1395      two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
1396      will be 2).
1397
1398      If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
1399      containing the operands.
1400
1401      For example, NUINTS is four as before, and the group size is 8
1402      (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
1403      {s5, s6, s7, s8}.  */
1404
1405   number_of_copies = least_common_multiple (nunits, group_size) / group_size;
1406
1407   number_of_places_left_in_vector = nunits;
1408   constant_p = true;
1409   for (j = 0; j < number_of_copies; j++)
1410     {
1411       for (i = group_size - 1; VEC_iterate (tree, stmts, i, stmt); i--)
1412         {
1413           operation = GIMPLE_STMT_OPERAND (stmt, 1);
1414           if (is_store)
1415             op = operation;
1416           else
1417             op = TREE_OPERAND (operation, op_num);
1418           if (!CONSTANT_CLASS_P (op))
1419             constant_p = false;
1420
1421           /* Create 'vect_ = {op0,op1,...,opn}'.  */
1422           t = tree_cons (NULL_TREE, op, t);
1423
1424           number_of_places_left_in_vector--;
1425
1426           if (number_of_places_left_in_vector == 0)
1427             {
1428               number_of_places_left_in_vector = nunits;
1429
1430               vector_type = get_vectype_for_scalar_type (TREE_TYPE (op));
1431               gcc_assert (vector_type);
1432               if (constant_p)
1433                 vec_cst = build_vector (vector_type, t);
1434               else
1435                 vec_cst = build_constructor_from_list (vector_type, t);
1436               constant_p = true;
1437               VEC_quick_push (tree, voprnds,
1438                               vect_init_vector (stmt, vec_cst, vector_type,
1439                                                 NULL));
1440               t = NULL_TREE;
1441             }
1442         }
1443     }
1444
1445   /* Since the vectors are created in the reverse order, we should invert
1446      them.  */
1447   vec_num = VEC_length (tree, voprnds);
1448   for (j = vec_num - 1; j >= 0; j--)
1449     {
1450       vop = VEC_index (tree, voprnds, j);
1451       VEC_quick_push (tree, *vec_oprnds, vop);
1452     }
1453
1454   VEC_free (tree, heap, voprnds);
1455
1456   /* In case that VF is greater than the unrolling factor needed for the SLP
1457      group of stmts, NUMBER_OF_VECTORS to be created is greater than
1458      NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
1459      to replicate the vectors.  */
1460   while (number_of_vectors > VEC_length (tree, *vec_oprnds))
1461     {
1462       for (i = 0; VEC_iterate (tree, *vec_oprnds, i, vop) && i < vec_num; i++)
1463         VEC_quick_push (tree, *vec_oprnds, vop);
1464     }
1465 }
1466
1467
1468 /* Get vectorized definitions from SLP_NODE that contains corresponding
1469    vectorized def-stmts.  */
1470
1471 static void
1472 vect_get_slp_vect_defs (slp_tree slp_node, VEC (tree,heap) **vec_oprnds)
1473 {
1474   tree vec_oprnd;
1475   tree vec_def_stmt;
1476   unsigned int i;
1477
1478   gcc_assert (SLP_TREE_VEC_STMTS (slp_node));
1479
1480   for (i = 0;
1481        VEC_iterate (tree, SLP_TREE_VEC_STMTS (slp_node), i, vec_def_stmt);
1482        i++)
1483     {
1484       gcc_assert (vec_def_stmt);
1485       vec_oprnd = GIMPLE_STMT_OPERAND (vec_def_stmt, 0);
1486       VEC_quick_push (tree, *vec_oprnds, vec_oprnd);
1487     }
1488 }
1489
1490
1491 /* Get vectorized definitions for SLP_NODE.
1492    If the scalar definitions are loop invariants or constants, collect them and
1493    call vect_get_constant_vectors() to create vector stmts.
1494    Otherwise, the def-stmts must be already vectorized and the vectorized stmts
1495    must be stored in the LEFT/RIGHT node of SLP_NODE, and we call
1496    vect_get_slp_vect_defs() to retrieve them.
1497    If VEC_OPRNDS1 is NULL, don't get vector defs for the second operand (from
1498    the right node. This is used when the second operand must remain scalar.  */
1499
1500 static void
1501 vect_get_slp_defs (slp_tree slp_node, VEC (tree,heap) **vec_oprnds0,
1502                    VEC (tree,heap) **vec_oprnds1)
1503 {
1504   tree operation, first_stmt;
1505
1506   /* Allocate memory for vectorized defs.  */
1507   *vec_oprnds0 = VEC_alloc (tree, heap,
1508                             SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node));
1509
1510   /* SLP_NODE corresponds either to a group of stores or to a group of
1511      unary/binary operations. We don't call this function for loads.  */
1512   if (SLP_TREE_LEFT (slp_node))
1513     /* The defs are already vectorized.  */
1514     vect_get_slp_vect_defs (SLP_TREE_LEFT (slp_node), vec_oprnds0);
1515   else
1516     /* Build vectors from scalar defs.  */
1517     vect_get_constant_vectors (slp_node, vec_oprnds0, 0);
1518
1519   first_stmt = VEC_index (tree, SLP_TREE_SCALAR_STMTS (slp_node), 0);
1520   if (STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt)))
1521     /* Since we don't call this function with loads, this is a group of
1522        stores.  */
1523     return;
1524
1525   operation = GIMPLE_STMT_OPERAND (first_stmt, 1);
1526   if (TREE_OPERAND_LENGTH (operation) == unary_op || !vec_oprnds1)
1527     return;
1528
1529   *vec_oprnds1 = VEC_alloc (tree, heap,
1530                             SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node));
1531
1532   if (SLP_TREE_RIGHT (slp_node))
1533     /* The defs are already vectorized.  */
1534     vect_get_slp_vect_defs (SLP_TREE_RIGHT (slp_node), vec_oprnds1);
1535   else
1536     /* Build vectors from scalar defs.  */
1537     vect_get_constant_vectors (slp_node, vec_oprnds1, 1);
1538 }
1539
1540
1541 /* Function get_initial_def_for_induction
1542
1543    Input:
1544    STMT - a stmt that performs an induction operation in the loop.
1545    IV_PHI - the initial value of the induction variable
1546
1547    Output:
1548    Return a vector variable, initialized with the first VF values of
1549    the induction variable. E.g., for an iv with IV_PHI='X' and
1550    evolution S, for a vector of 4 units, we want to return:
1551    [X, X + S, X + 2*S, X + 3*S].  */
1552
1553 static tree
1554 get_initial_def_for_induction (tree iv_phi)
1555 {
1556   stmt_vec_info stmt_vinfo = vinfo_for_stmt (iv_phi);
1557   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
1558   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1559   tree scalar_type = TREE_TYPE (PHI_RESULT_TREE (iv_phi));
1560   tree vectype;
1561   int nunits;
1562   edge pe = loop_preheader_edge (loop);
1563   struct loop *iv_loop;
1564   basic_block new_bb;
1565   tree vec, vec_init, vec_step, t;
1566   tree access_fn;
1567   tree new_var;
1568   tree new_name;
1569   tree init_stmt;
1570   tree induction_phi, induc_def, new_stmt, vec_def, vec_dest;
1571   tree init_expr, step_expr;
1572   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1573   int i;
1574   bool ok;
1575   int ncopies;
1576   tree expr;
1577   stmt_vec_info phi_info = vinfo_for_stmt (iv_phi);
1578   bool nested_in_vect_loop = false;
1579   tree stmts;
1580   imm_use_iterator imm_iter;
1581   use_operand_p use_p;
1582   tree exit_phi;
1583   edge latch_e;
1584   tree loop_arg;
1585   block_stmt_iterator si;
1586   basic_block bb = bb_for_stmt (iv_phi);
1587
1588   vectype = get_vectype_for_scalar_type (scalar_type);
1589   gcc_assert (vectype);
1590   nunits = TYPE_VECTOR_SUBPARTS (vectype);
1591   ncopies = vf / nunits;
1592
1593   gcc_assert (phi_info);
1594   gcc_assert (ncopies >= 1);
1595
1596   /* Find the first insertion point in the BB.  */
1597   si = bsi_after_labels (bb);
1598
1599   if (INTEGRAL_TYPE_P (scalar_type))
1600     step_expr = build_int_cst (scalar_type, 0);
1601   else
1602     step_expr = build_real (scalar_type, dconst0);
1603
1604   /* Is phi in an inner-loop, while vectorizing an enclosing outer-loop?  */
1605   if (nested_in_vect_loop_p (loop, iv_phi))
1606     {
1607       nested_in_vect_loop = true;
1608       iv_loop = loop->inner;
1609     }
1610   else
1611     iv_loop = loop;
1612   gcc_assert (iv_loop == (bb_for_stmt (iv_phi))->loop_father);
1613
1614   latch_e = loop_latch_edge (iv_loop);
1615   loop_arg = PHI_ARG_DEF_FROM_EDGE (iv_phi, latch_e);
1616
1617   access_fn = analyze_scalar_evolution (iv_loop, PHI_RESULT (iv_phi));
1618   gcc_assert (access_fn);
1619   ok = vect_is_simple_iv_evolution (iv_loop->num, access_fn,
1620                                   &init_expr, &step_expr);
1621   gcc_assert (ok);
1622   pe = loop_preheader_edge (iv_loop);
1623
1624   /* Create the vector that holds the initial_value of the induction.  */
1625   if (nested_in_vect_loop)
1626     {
1627       /* iv_loop is nested in the loop to be vectorized.  init_expr had already
1628          been created during vectorization of previous stmts; We obtain it from
1629          the STMT_VINFO_VEC_STMT of the defining stmt. */
1630       tree iv_def = PHI_ARG_DEF_FROM_EDGE (iv_phi, loop_preheader_edge (iv_loop));
1631       vec_init = vect_get_vec_def_for_operand (iv_def, iv_phi, NULL);
1632     }
1633   else
1634     {
1635       /* iv_loop is the loop to be vectorized. Create:
1636          vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr)  */
1637       new_var = vect_get_new_vect_var (scalar_type, vect_scalar_var, "var_");
1638       add_referenced_var (new_var);
1639
1640       new_name = force_gimple_operand (init_expr, &stmts, false, new_var);
1641       if (stmts)
1642         {
1643           new_bb = bsi_insert_on_edge_immediate (pe, stmts);
1644           gcc_assert (!new_bb);
1645         }
1646
1647       t = NULL_TREE;
1648       t = tree_cons (NULL_TREE, init_expr, t);
1649       for (i = 1; i < nunits; i++)
1650         {
1651           tree tmp;
1652
1653           /* Create: new_name_i = new_name + step_expr  */
1654           tmp = fold_build2 (PLUS_EXPR, scalar_type, new_name, step_expr);
1655           init_stmt = build_gimple_modify_stmt (new_var, tmp);
1656           new_name = make_ssa_name (new_var, init_stmt);
1657           GIMPLE_STMT_OPERAND (init_stmt, 0) = new_name;
1658
1659           new_bb = bsi_insert_on_edge_immediate (pe, init_stmt);
1660           gcc_assert (!new_bb);
1661
1662           if (vect_print_dump_info (REPORT_DETAILS))
1663             {
1664               fprintf (vect_dump, "created new init_stmt: ");
1665               print_generic_expr (vect_dump, init_stmt, TDF_SLIM);
1666             }
1667           t = tree_cons (NULL_TREE, new_name, t);
1668         }
1669       /* Create a vector from [new_name_0, new_name_1, ..., new_name_nunits-1]  */
1670       vec = build_constructor_from_list (vectype, nreverse (t));
1671       vec_init = vect_init_vector (iv_phi, vec, vectype, NULL);
1672     }
1673
1674
1675   /* Create the vector that holds the step of the induction.  */
1676   if (nested_in_vect_loop)
1677     /* iv_loop is nested in the loop to be vectorized. Generate:
1678        vec_step = [S, S, S, S]  */
1679     new_name = step_expr;
1680   else
1681     {
1682       /* iv_loop is the loop to be vectorized. Generate:
1683           vec_step = [VF*S, VF*S, VF*S, VF*S]  */
1684       expr = build_int_cst (scalar_type, vf);
1685       new_name = fold_build2 (MULT_EXPR, scalar_type, expr, step_expr);
1686     }
1687
1688   t = NULL_TREE;
1689   for (i = 0; i < nunits; i++)
1690     t = tree_cons (NULL_TREE, unshare_expr (new_name), t);
1691   gcc_assert (CONSTANT_CLASS_P (new_name));
1692   vec = build_vector (vectype, t);
1693   vec_step = vect_init_vector (iv_phi, vec, vectype, NULL);
1694
1695
1696   /* Create the following def-use cycle:
1697      loop prolog:
1698          vec_init = ...
1699          vec_step = ...
1700      loop:
1701          vec_iv = PHI <vec_init, vec_loop>
1702          ...
1703          STMT
1704          ...
1705          vec_loop = vec_iv + vec_step;  */
1706
1707   /* Create the induction-phi that defines the induction-operand.  */
1708   vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
1709   add_referenced_var (vec_dest);
1710   induction_phi = create_phi_node (vec_dest, iv_loop->header);
1711   set_stmt_info (get_stmt_ann (induction_phi),
1712                  new_stmt_vec_info (induction_phi, loop_vinfo));
1713   induc_def = PHI_RESULT (induction_phi);
1714
1715   /* Create the iv update inside the loop  */
1716   new_stmt = build_gimple_modify_stmt (NULL_TREE,
1717                                        build2 (PLUS_EXPR, vectype,
1718                                                induc_def, vec_step));
1719   vec_def = make_ssa_name (vec_dest, new_stmt);
1720   GIMPLE_STMT_OPERAND (new_stmt, 0) = vec_def;
1721   bsi_insert_before (&si, new_stmt, BSI_SAME_STMT);
1722   set_stmt_info (get_stmt_ann (new_stmt),
1723                  new_stmt_vec_info (new_stmt, loop_vinfo));
1724
1725   /* Set the arguments of the phi node:  */
1726   add_phi_arg (induction_phi, vec_init, pe);
1727   add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop));
1728
1729
1730   /* In case that vectorization factor (VF) is bigger than the number
1731      of elements that we can fit in a vectype (nunits), we have to generate
1732      more than one vector stmt - i.e - we need to "unroll" the
1733      vector stmt by a factor VF/nunits.  For more details see documentation
1734      in vectorizable_operation.  */
1735
1736   if (ncopies > 1)
1737     {
1738       stmt_vec_info prev_stmt_vinfo;
1739       /* FORNOW. This restriction should be relaxed.  */
1740       gcc_assert (!nested_in_vect_loop);
1741
1742       /* Create the vector that holds the step of the induction.  */
1743       expr = build_int_cst (scalar_type, nunits);
1744       new_name = fold_build2 (MULT_EXPR, scalar_type, expr, step_expr);
1745       t = NULL_TREE;
1746       for (i = 0; i < nunits; i++)
1747         t = tree_cons (NULL_TREE, unshare_expr (new_name), t);
1748       gcc_assert (CONSTANT_CLASS_P (new_name));
1749       vec = build_vector (vectype, t);
1750       vec_step = vect_init_vector (iv_phi, vec, vectype, NULL);
1751
1752       vec_def = induc_def;
1753       prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
1754       for (i = 1; i < ncopies; i++)
1755         {
1756           tree tmp;
1757
1758           /* vec_i = vec_prev + vec_step  */
1759           tmp = build2 (PLUS_EXPR, vectype, vec_def, vec_step);
1760           new_stmt = build_gimple_modify_stmt (NULL_TREE, tmp);
1761           vec_def = make_ssa_name (vec_dest, new_stmt);
1762           GIMPLE_STMT_OPERAND (new_stmt, 0) = vec_def;
1763           bsi_insert_before (&si, new_stmt, BSI_SAME_STMT);
1764           set_stmt_info (get_stmt_ann (new_stmt),
1765                          new_stmt_vec_info (new_stmt, loop_vinfo));
1766           STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
1767           prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
1768         }
1769     }
1770
1771   if (nested_in_vect_loop)
1772     {
1773       /* Find the loop-closed exit-phi of the induction, and record
1774          the final vector of induction results:  */
1775       exit_phi = NULL;
1776       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
1777         {
1778           if (!flow_bb_inside_loop_p (iv_loop, bb_for_stmt (USE_STMT (use_p))))
1779             {
1780               exit_phi = USE_STMT (use_p);
1781               break;
1782             }
1783         }
1784       if (exit_phi)
1785         {
1786           stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
1787           /* FORNOW. Currently not supporting the case that an inner-loop induction
1788              is not used in the outer-loop (i.e. only outside the outer-loop).  */
1789           gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
1790                       && !STMT_VINFO_LIVE_P (stmt_vinfo));
1791
1792           STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
1793           if (vect_print_dump_info (REPORT_DETAILS))
1794             {
1795               fprintf (vect_dump, "vector of inductions after inner-loop:");
1796               print_generic_expr (vect_dump, new_stmt, TDF_SLIM);
1797             }
1798         }
1799     }
1800
1801
1802   if (vect_print_dump_info (REPORT_DETAILS))
1803     {
1804       fprintf (vect_dump, "transform induction: created def-use cycle:");
1805       print_generic_expr (vect_dump, induction_phi, TDF_SLIM);
1806       fprintf (vect_dump, "\n");
1807       print_generic_expr (vect_dump, SSA_NAME_DEF_STMT (vec_def), TDF_SLIM);
1808     }
1809
1810   STMT_VINFO_VEC_STMT (phi_info) = induction_phi;
1811   return induc_def;
1812 }
1813
1814
1815 /* Function vect_get_vec_def_for_operand.
1816
1817    OP is an operand in STMT. This function returns a (vector) def that will be
1818    used in the vectorized stmt for STMT.
1819
1820    In the case that OP is an SSA_NAME which is defined in the loop, then
1821    STMT_VINFO_VEC_STMT of the defining stmt holds the relevant def.
1822
1823    In case OP is an invariant or constant, a new stmt that creates a vector def
1824    needs to be introduced.  */
1825
1826 static tree
1827 vect_get_vec_def_for_operand (tree op, tree stmt, tree *scalar_def)
1828 {
1829   tree vec_oprnd;
1830   tree vec_stmt;
1831   tree def_stmt;
1832   stmt_vec_info def_stmt_info = NULL;
1833   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
1834   tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
1835   int nunits = TYPE_VECTOR_SUBPARTS (vectype);
1836   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
1837   tree vec_inv;
1838   tree vec_cst;
1839   tree t = NULL_TREE;
1840   tree def;
1841   int i;
1842   enum vect_def_type dt;
1843   bool is_simple_use;
1844   tree vector_type;
1845
1846   if (vect_print_dump_info (REPORT_DETAILS))
1847     {
1848       fprintf (vect_dump, "vect_get_vec_def_for_operand: ");
1849       print_generic_expr (vect_dump, op, TDF_SLIM);
1850     }
1851
1852   is_simple_use = vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt);
1853   gcc_assert (is_simple_use);
1854   if (vect_print_dump_info (REPORT_DETAILS))
1855     {
1856       if (def)
1857         {
1858           fprintf (vect_dump, "def =  ");
1859           print_generic_expr (vect_dump, def, TDF_SLIM);
1860         }
1861       if (def_stmt)
1862         {
1863           fprintf (vect_dump, "  def_stmt =  ");
1864           print_generic_expr (vect_dump, def_stmt, TDF_SLIM);
1865         }
1866     }
1867
1868   switch (dt)
1869     {
1870     /* Case 1: operand is a constant.  */
1871     case vect_constant_def:
1872       {
1873         if (scalar_def)
1874           *scalar_def = op;
1875
1876         /* Create 'vect_cst_ = {cst,cst,...,cst}'  */
1877         if (vect_print_dump_info (REPORT_DETAILS))
1878           fprintf (vect_dump, "Create vector_cst. nunits = %d", nunits);
1879
1880         for (i = nunits - 1; i >= 0; --i)
1881           {
1882             t = tree_cons (NULL_TREE, op, t);
1883           }
1884         vector_type = get_vectype_for_scalar_type (TREE_TYPE (op));
1885         gcc_assert (vector_type);
1886         vec_cst = build_vector (vector_type, t);
1887
1888         return vect_init_vector (stmt, vec_cst, vector_type, NULL);
1889       }
1890
1891     /* Case 2: operand is defined outside the loop - loop invariant.  */
1892     case vect_invariant_def:
1893       {
1894         if (scalar_def)
1895           *scalar_def = def;
1896
1897         /* Create 'vec_inv = {inv,inv,..,inv}'  */
1898         if (vect_print_dump_info (REPORT_DETAILS))
1899           fprintf (vect_dump, "Create vector_inv.");
1900
1901         for (i = nunits - 1; i >= 0; --i)
1902           {
1903             t = tree_cons (NULL_TREE, def, t);
1904           }
1905
1906         /* FIXME: use build_constructor directly.  */
1907         vector_type = get_vectype_for_scalar_type (TREE_TYPE (def));
1908         gcc_assert (vector_type);
1909         vec_inv = build_constructor_from_list (vector_type, t);
1910         return vect_init_vector (stmt, vec_inv, vector_type, NULL);
1911       }
1912
1913     /* Case 3: operand is defined inside the loop.  */
1914     case vect_loop_def:
1915       {
1916         if (scalar_def)
1917           *scalar_def = def_stmt;
1918
1919         /* Get the def from the vectorized stmt.  */
1920         def_stmt_info = vinfo_for_stmt (def_stmt);
1921         vec_stmt = STMT_VINFO_VEC_STMT (def_stmt_info);
1922         gcc_assert (vec_stmt);
1923         if (TREE_CODE (vec_stmt) == PHI_NODE)
1924           vec_oprnd = PHI_RESULT (vec_stmt);
1925         else
1926           vec_oprnd = GIMPLE_STMT_OPERAND (vec_stmt, 0);
1927         return vec_oprnd;
1928       }
1929
1930     /* Case 4: operand is defined by a loop header phi - reduction  */
1931     case vect_reduction_def:
1932       {
1933         struct loop *loop;
1934
1935         gcc_assert (TREE_CODE (def_stmt) == PHI_NODE);
1936         loop = (bb_for_stmt (def_stmt))->loop_father;
1937
1938         /* Get the def before the loop  */
1939         op = PHI_ARG_DEF_FROM_EDGE (def_stmt, loop_preheader_edge (loop));
1940         return get_initial_def_for_reduction (stmt, op, scalar_def);
1941      }
1942
1943     /* Case 5: operand is defined by loop-header phi - induction.  */
1944     case vect_induction_def:
1945       {
1946         gcc_assert (TREE_CODE (def_stmt) == PHI_NODE);
1947
1948         /* Get the def from the vectorized stmt.  */
1949         def_stmt_info = vinfo_for_stmt (def_stmt);
1950         vec_stmt = STMT_VINFO_VEC_STMT (def_stmt_info);
1951         gcc_assert (vec_stmt && (TREE_CODE (vec_stmt) == PHI_NODE));
1952         vec_oprnd = PHI_RESULT (vec_stmt);
1953         return vec_oprnd;
1954       }
1955
1956     default:
1957       gcc_unreachable ();
1958     }
1959 }
1960
1961
1962 /* Function vect_get_vec_def_for_stmt_copy
1963
1964    Return a vector-def for an operand. This function is used when the
1965    vectorized stmt to be created (by the caller to this function) is a "copy"
1966    created in case the vectorized result cannot fit in one vector, and several
1967    copies of the vector-stmt are required. In this case the vector-def is
1968    retrieved from the vector stmt recorded in the STMT_VINFO_RELATED_STMT field
1969    of the stmt that defines VEC_OPRND.
1970    DT is the type of the vector def VEC_OPRND.
1971
1972    Context:
1973         In case the vectorization factor (VF) is bigger than the number
1974    of elements that can fit in a vectype (nunits), we have to generate
1975    more than one vector stmt to vectorize the scalar stmt. This situation
1976    arises when there are multiple data-types operated upon in the loop; the
1977    smallest data-type determines the VF, and as a result, when vectorizing
1978    stmts operating on wider types we need to create 'VF/nunits' "copies" of the
1979    vector stmt (each computing a vector of 'nunits' results, and together
1980    computing 'VF' results in each iteration).  This function is called when
1981    vectorizing such a stmt (e.g. vectorizing S2 in the illustration below, in
1982    which VF=16 and nunits=4, so the number of copies required is 4):
1983
1984    scalar stmt:         vectorized into:        STMT_VINFO_RELATED_STMT
1985
1986    S1: x = load         VS1.0:  vx.0 = memref0      VS1.1
1987                         VS1.1:  vx.1 = memref1      VS1.2
1988                         VS1.2:  vx.2 = memref2      VS1.3
1989                         VS1.3:  vx.3 = memref3
1990
1991    S2: z = x + ...      VSnew.0:  vz0 = vx.0 + ...  VSnew.1
1992                         VSnew.1:  vz1 = vx.1 + ...  VSnew.2
1993                         VSnew.2:  vz2 = vx.2 + ...  VSnew.3
1994                         VSnew.3:  vz3 = vx.3 + ...
1995
1996    The vectorization of S1 is explained in vectorizable_load.
1997    The vectorization of S2:
1998         To create the first vector-stmt out of the 4 copies - VSnew.0 -
1999    the function 'vect_get_vec_def_for_operand' is called to
2000    get the relevant vector-def for each operand of S2. For operand x it
2001    returns  the vector-def 'vx.0'.
2002
2003         To create the remaining copies of the vector-stmt (VSnew.j), this
2004    function is called to get the relevant vector-def for each operand.  It is
2005    obtained from the respective VS1.j stmt, which is recorded in the
2006    STMT_VINFO_RELATED_STMT field of the stmt that defines VEC_OPRND.
2007
2008         For example, to obtain the vector-def 'vx.1' in order to create the
2009    vector stmt 'VSnew.1', this function is called with VEC_OPRND='vx.0'.
   Given 'vx.0' we obtain the stmt that defines it ('VS1.0'); from the
2011    STMT_VINFO_RELATED_STMT field of 'VS1.0' we obtain the next copy - 'VS1.1',
2012    and return its def ('vx.1').
2013    Overall, to create the above sequence this function will be called 3 times:
2014         vx.1 = vect_get_vec_def_for_stmt_copy (dt, vx.0);
2015         vx.2 = vect_get_vec_def_for_stmt_copy (dt, vx.1);
2016         vx.3 = vect_get_vec_def_for_stmt_copy (dt, vx.2);  */
2017
2018 static tree
2019 vect_get_vec_def_for_stmt_copy (enum vect_def_type dt, tree vec_oprnd)
2020 {
2021   tree vec_stmt_for_operand;
2022   stmt_vec_info def_stmt_info;
2023
2024   /* Do nothing; can reuse same def.  */
2025   if (dt == vect_invariant_def || dt == vect_constant_def )
2026     return vec_oprnd;
2027
2028   vec_stmt_for_operand = SSA_NAME_DEF_STMT (vec_oprnd);
2029   def_stmt_info = vinfo_for_stmt (vec_stmt_for_operand);
2030   gcc_assert (def_stmt_info);
2031   vec_stmt_for_operand = STMT_VINFO_RELATED_STMT (def_stmt_info);
2032   gcc_assert (vec_stmt_for_operand);
2033   vec_oprnd = GIMPLE_STMT_OPERAND (vec_stmt_for_operand, 0);
2034   return vec_oprnd;
2035 }
2036
2037
2038 /* Get vectorized definitions for the operands to create a copy of an original
2039    stmt. See vect_get_vec_def_for_stmt_copy() for details.  */
2040
2041 static void
2042 vect_get_vec_defs_for_stmt_copy (enum vect_def_type *dt,
2043                                  VEC(tree,heap) **vec_oprnds0,
2044                                  VEC(tree,heap) **vec_oprnds1)
2045 {
2046   tree vec_oprnd = VEC_pop (tree, *vec_oprnds0);
2047
2048   vec_oprnd = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd);
2049   VEC_quick_push (tree, *vec_oprnds0, vec_oprnd);
2050
2051   if (vec_oprnds1 && *vec_oprnds1)
2052     {
2053       vec_oprnd = VEC_pop (tree, *vec_oprnds1);
2054       vec_oprnd = vect_get_vec_def_for_stmt_copy (dt[1], vec_oprnd);
2055       VEC_quick_push (tree, *vec_oprnds1, vec_oprnd);
2056     }
2057 }
2058
2059
2060 /* Get vectorized definitions for OP0 and OP1, or SLP_NODE if it is not NULL.  */
2061
2062 static void
2063 vect_get_vec_defs (tree op0, tree op1, tree stmt, VEC(tree,heap) **vec_oprnds0,
2064                    VEC(tree,heap) **vec_oprnds1, slp_tree slp_node)
2065 {
2066   if (slp_node)
2067     vect_get_slp_defs (slp_node, vec_oprnds0, vec_oprnds1);
2068   else
2069     {
2070       tree vec_oprnd;
2071
2072       *vec_oprnds0 = VEC_alloc (tree, heap, 1);
2073       vec_oprnd = vect_get_vec_def_for_operand (op0, stmt, NULL);
2074       VEC_quick_push (tree, *vec_oprnds0, vec_oprnd);
2075
2076       if (op1)
2077         {
2078           *vec_oprnds1 = VEC_alloc (tree, heap, 1);
2079           vec_oprnd = vect_get_vec_def_for_operand (op1, stmt, NULL);
2080           VEC_quick_push (tree, *vec_oprnds1, vec_oprnd);
2081         }
2082     }
2083 }
2084
2085
2086 /* Function vect_finish_stmt_generation.
2087
2088    Insert a new stmt.  */
2089
2090 static void
2091 vect_finish_stmt_generation (tree stmt, tree vec_stmt,
2092                              block_stmt_iterator *bsi)
2093 {
2094   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2095   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2096
2097   gcc_assert (stmt == bsi_stmt (*bsi));
2098   gcc_assert (TREE_CODE (stmt) != LABEL_EXPR);
2099
2100   bsi_insert_before (bsi, vec_stmt, BSI_SAME_STMT);
2101
2102   set_stmt_info (get_stmt_ann (vec_stmt),
2103                  new_stmt_vec_info (vec_stmt, loop_vinfo));
2104
2105   if (vect_print_dump_info (REPORT_DETAILS))
2106     {
2107       fprintf (vect_dump, "add new stmt: ");
2108       print_generic_expr (vect_dump, vec_stmt, TDF_SLIM);
2109     }
2110
2111   /* Make sure bsi points to the stmt that is being vectorized.  */
2112   gcc_assert (stmt == bsi_stmt (*bsi));
2113
2114   SET_EXPR_LOCATION (vec_stmt, EXPR_LOCATION (stmt));
2115 }
2116
2117
2118 /* Function get_initial_def_for_reduction
2119
2120    Input:
2121    STMT - a stmt that performs a reduction operation in the loop.
2122    INIT_VAL - the initial value of the reduction variable
2123
2124    Output:
2125    ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
2126         of the reduction (used for adjusting the epilog - see below).
2127    Return a vector variable, initialized according to the operation that STMT
2128         performs. This vector will be used as the initial value of the
2129         vector of partial results.
2130
2131    Option1 (adjust in epilog): Initialize the vector as follows:
2132      add:         [0,0,...,0,0]
2133      mult:        [1,1,...,1,1]
2134      min/max:     [init_val,init_val,..,init_val,init_val]
2135      bit and/or:  [init_val,init_val,..,init_val,init_val]
2136    and when necessary (e.g. add/mult case) let the caller know
2137    that it needs to adjust the result by init_val.
2138
2139    Option2: Initialize the vector as follows:
2140      add:         [0,0,...,0,init_val]
2141      mult:        [1,1,...,1,init_val]
2142      min/max:     [init_val,init_val,...,init_val]
2143      bit and/or:  [init_val,init_val,...,init_val]
2144    and no adjustments are needed.
2145
2146    For example, for the following code:
2147
2148    s = init_val;
2149    for (i=0;i<n;i++)
2150      s = s + a[i];
2151
2152    STMT is 's = s + a[i]', and the reduction variable is 's'.
2153    For a vector of 4 units, we want to return either [0,0,0,init_val],
2154    or [0,0,0,0] and let the caller know that it needs to adjust
2155    the result at the end by 'init_val'.
2156
2157    FORNOW, we are using the 'adjust in epilog' scheme, because this way the
2158    initialization vector is simpler (same element in all entries).
2159    A cost model should help decide between these two schemes.  */
2160
2161 static tree
2162 get_initial_def_for_reduction (tree stmt, tree init_val, tree *adjustment_def)
2163 {
2164   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
2165   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
2166   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2167   tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
2168   int nunits =  TYPE_VECTOR_SUBPARTS (vectype);
2169   enum tree_code code = TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 1));
2170   tree type = TREE_TYPE (init_val);
2171   tree vecdef;
2172   tree def_for_init;
2173   tree init_def;
2174   tree t = NULL_TREE;
2175   int i;
2176   tree vector_type;
2177   bool nested_in_vect_loop = false;
2178
2179   gcc_assert (POINTER_TYPE_P (type) || INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type));
2180   if (nested_in_vect_loop_p (loop, stmt))
2181     nested_in_vect_loop = true;
2182   else
2183     gcc_assert (loop == (bb_for_stmt (stmt))->loop_father);
2184
2185   vecdef = vect_get_vec_def_for_operand (init_val, stmt, NULL);
2186
2187   switch (code)
2188   {
2189   case WIDEN_SUM_EXPR:
2190   case DOT_PROD_EXPR:
2191   case PLUS_EXPR:
2192     if (nested_in_vect_loop)
2193       *adjustment_def = vecdef;
2194     else
2195       *adjustment_def = init_val;
2196     /* Create a vector of zeros for init_def.  */
2197     if (SCALAR_FLOAT_TYPE_P (type))
2198       def_for_init = build_real (type, dconst0);
2199     else
2200       def_for_init = build_int_cst (type, 0);
2201     for (i = nunits - 1; i >= 0; --i)
2202       t = tree_cons (NULL_TREE, def_for_init, t);
2203     vector_type = get_vectype_for_scalar_type (TREE_TYPE (def_for_init));
2204     gcc_assert (vector_type);
2205     init_def = build_vector (vector_type, t);
2206     break;
2207
2208   case MIN_EXPR:
2209   case MAX_EXPR:
2210     *adjustment_def = NULL_TREE;
2211     init_def = vecdef;
2212     break;
2213
2214   default:
2215     gcc_unreachable ();
2216   }
2217
2218   return init_def;
2219 }
2220
2221
2222 /* Function vect_create_epilog_for_reduction
2223
2224    Create code at the loop-epilog to finalize the result of a reduction
2225    computation.
2226
2227    VECT_DEF is a vector of partial results.
2228    REDUC_CODE is the tree-code for the epilog reduction.
2229    STMT is the scalar reduction stmt that is being vectorized.
2230    REDUCTION_PHI is the phi-node that carries the reduction computation.
2231
2232    This function:
2233    1. Creates the reduction def-use cycle: sets the arguments for
2234       REDUCTION_PHI:
2235       The loop-entry argument is the vectorized initial-value of the reduction.
2236       The loop-latch argument is VECT_DEF - the vector of partial sums.
2237    2. "Reduces" the vector of partial results VECT_DEF into a single result,
2238       by applying the operation specified by REDUC_CODE if available, or by
2239       other means (whole-vector shifts or a scalar loop).
2240       The function also creates a new phi node at the loop exit to preserve
2241       loop-closed form, as illustrated below.
2242
2243      The flow at the entry to this function:
2244
2245         loop:
2246           vec_def = phi <null, null>            # REDUCTION_PHI
2247           VECT_DEF = vector_stmt                # vectorized form of STMT
2248           s_loop = scalar_stmt                  # (scalar) STMT
2249         loop_exit:
2250           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
2251           use <s_out0>
2252           use <s_out0>
2253
2254      The above is transformed by this function into:
2255
2256         loop:
2257           vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
2258           VECT_DEF = vector_stmt                # vectorized form of STMT
2259           s_loop = scalar_stmt                  # (scalar) STMT
2260         loop_exit:
2261           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
2262           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
2263           v_out2 = reduce <v_out1>
2264           s_out3 = extract_field <v_out2, 0>
2265           s_out4 = adjust_result <s_out3>
2266           use <s_out4>
2267           use <s_out4>
2268 */
2269
2270 static void
2271 vect_create_epilog_for_reduction (tree vect_def, tree stmt,
2272                                   enum tree_code reduc_code, tree reduction_phi)
2273 {
2274   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2275   tree vectype;
2276   enum machine_mode mode;
2277   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2278   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2279   basic_block exit_bb;
2280   tree scalar_dest;
2281   tree scalar_type;
2282   tree new_phi;
2283   block_stmt_iterator exit_bsi;
2284   tree vec_dest;
2285   tree new_temp = NULL_TREE;
2286   tree new_name;
2287   tree epilog_stmt = NULL_TREE;
2288   tree new_scalar_dest, exit_phi, new_dest;
2289   tree bitsize, bitpos, bytesize;
2290   enum tree_code code = TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 1));
2291   tree adjustment_def;
2292   tree vec_initial_def;
2293   tree orig_name;
2294   imm_use_iterator imm_iter;
2295   use_operand_p use_p;
2296   bool extract_scalar_result = false;
2297   tree reduction_op, expr;
2298   tree orig_stmt;
2299   tree use_stmt;
2300   tree operation = GIMPLE_STMT_OPERAND (stmt, 1);
2301   bool nested_in_vect_loop = false;
2302   int op_type;
2303   VEC(tree,heap) *phis = NULL;
2304   int i;
2305
2306   if (nested_in_vect_loop_p (loop, stmt))
2307     {
2308       loop = loop->inner;
2309       nested_in_vect_loop = true;
2310     }
2311
2312   op_type = TREE_OPERAND_LENGTH (operation);
2313   reduction_op = TREE_OPERAND (operation, op_type-1);
2314   vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
2315   gcc_assert (vectype);
2316   mode = TYPE_MODE (vectype);
2317
2318   /*** 1. Create the reduction def-use cycle  ***/
2319
2320   /* 1.1 set the loop-entry arg of the reduction-phi:  */
2321   /* For the case of reduction, vect_get_vec_def_for_operand returns
2322      the scalar def before the loop, that defines the initial value
2323      of the reduction variable.  */
2324   vec_initial_def = vect_get_vec_def_for_operand (reduction_op, stmt,
2325                                                   &adjustment_def);
2326   add_phi_arg (reduction_phi, vec_initial_def, loop_preheader_edge (loop));
2327
2328   /* 1.2 set the loop-latch arg for the reduction-phi:  */
2329   add_phi_arg (reduction_phi, vect_def, loop_latch_edge (loop));
2330
2331   if (vect_print_dump_info (REPORT_DETAILS))
2332     {
2333       fprintf (vect_dump, "transform reduction: created def-use cycle:");
2334       print_generic_expr (vect_dump, reduction_phi, TDF_SLIM);
2335       fprintf (vect_dump, "\n");
2336       print_generic_expr (vect_dump, SSA_NAME_DEF_STMT (vect_def), TDF_SLIM);
2337     }
2338
2339
2340   /*** 2. Create epilog code
2341           The reduction epilog code operates across the elements of the vector
2342           of partial results computed by the vectorized loop.
2343           The reduction epilog code consists of:
2344           step 1: compute the scalar result in a vector (v_out2)
2345           step 2: extract the scalar result (s_out3) from the vector (v_out2)
2346           step 3: adjust the scalar result (s_out3) if needed.
2347
2348           Step 1 can be accomplished using one the following three schemes:
2349           (scheme 1) using reduc_code, if available.
2350           (scheme 2) using whole-vector shifts, if available.
2351           (scheme 3) using a scalar loop. In this case steps 1+2 above are
2352                      combined.
2353
2354           The overall epilog code looks like this:
2355
2356           s_out0 = phi <s_loop>         # original EXIT_PHI
2357           v_out1 = phi <VECT_DEF>       # NEW_EXIT_PHI
2358           v_out2 = reduce <v_out1>              # step 1
2359           s_out3 = extract_field <v_out2, 0>    # step 2
2360           s_out4 = adjust_result <s_out3>       # step 3
2361
2362           (step 3 is optional, and step2 1 and 2 may be combined).
2363           Lastly, the uses of s_out0 are replaced by s_out4.
2364
2365           ***/
2366
2367   /* 2.1 Create new loop-exit-phi to preserve loop-closed form:
2368         v_out1 = phi <v_loop>  */
2369
2370   exit_bb = single_exit (loop)->dest;
2371   new_phi = create_phi_node (SSA_NAME_VAR (vect_def), exit_bb);
2372   SET_PHI_ARG_DEF (new_phi, single_exit (loop)->dest_idx, vect_def);
2373   exit_bsi = bsi_after_labels (exit_bb);
2374
2375   /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
2376          (i.e. when reduc_code is not available) and in the final adjustment
2377          code (if needed).  Also get the original scalar reduction variable as
2378          defined in the loop.  In case STMT is a "pattern-stmt" (i.e. - it
2379          represents a reduction pattern), the tree-code and scalar-def are
2380          taken from the original stmt that the pattern-stmt (STMT) replaces.
2381          Otherwise (it is a regular reduction) - the tree-code and scalar-def
2382          are taken from STMT.  */
2383
2384   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
2385   if (!orig_stmt)
2386     {
2387       /* Regular reduction  */
2388       orig_stmt = stmt;
2389     }
2390   else
2391     {
2392       /* Reduction pattern  */
2393       stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
2394       gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
2395       gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
2396     }
2397   code = TREE_CODE (GIMPLE_STMT_OPERAND (orig_stmt, 1));
2398   scalar_dest = GIMPLE_STMT_OPERAND (orig_stmt, 0);
2399   scalar_type = TREE_TYPE (scalar_dest);
2400   new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
2401   bitsize = TYPE_SIZE (scalar_type);
2402   bytesize = TYPE_SIZE_UNIT (scalar_type);
2403
2404
2405   /* In case this is a reduction in an inner-loop while vectorizing an outer
2406      loop - we don't need to extract a single scalar result at the end of the
2407      inner-loop.  The final vector of partial results will be used in the
2408      vectorized outer-loop, or reduced to a scalar result at the end of the
2409      outer-loop.  */
2410   if (nested_in_vect_loop)
2411     goto vect_finalize_reduction;
2412
2413   /* 2.3 Create the reduction code, using one of the three schemes described
2414          above.  */
2415
2416   if (reduc_code < NUM_TREE_CODES)
2417     {
2418       tree tmp;
2419
2420       /*** Case 1:  Create:
2421            v_out2 = reduc_expr <v_out1>  */
2422
2423       if (vect_print_dump_info (REPORT_DETAILS))
2424         fprintf (vect_dump, "Reduce using direct vector reduction.");
2425
2426       vec_dest = vect_create_destination_var (scalar_dest, vectype);
2427       tmp = build1 (reduc_code, vectype,  PHI_RESULT (new_phi));
2428       epilog_stmt = build_gimple_modify_stmt (vec_dest, tmp);
2429       new_temp = make_ssa_name (vec_dest, epilog_stmt);
2430       GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_temp;
2431       bsi_insert_before (&exit_bsi, epilog_stmt, BSI_SAME_STMT);
2432
2433       extract_scalar_result = true;
2434     }
2435   else
2436     {
2437       enum tree_code shift_code = 0;
2438       bool have_whole_vector_shift = true;
2439       int bit_offset;
2440       int element_bitsize = tree_low_cst (bitsize, 1);
2441       int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
2442       tree vec_temp;
2443
2444       if (optab_handler (vec_shr_optab, mode)->insn_code != CODE_FOR_nothing)
2445         shift_code = VEC_RSHIFT_EXPR;
2446       else
2447         have_whole_vector_shift = false;
2448
2449       /* Regardless of whether we have a whole vector shift, if we're
2450          emulating the operation via tree-vect-generic, we don't want
2451          to use it.  Only the first round of the reduction is likely
2452          to still be profitable via emulation.  */
2453       /* ??? It might be better to emit a reduction tree code here, so that
2454          tree-vect-generic can expand the first round via bit tricks.  */
2455       if (!VECTOR_MODE_P (mode))
2456         have_whole_vector_shift = false;
2457       else
2458         {
2459           optab optab = optab_for_tree_code (code, vectype);
2460           if (optab_handler (optab, mode)->insn_code == CODE_FOR_nothing)
2461             have_whole_vector_shift = false;
2462         }
2463
2464       if (have_whole_vector_shift)
2465         {
2466           /*** Case 2: Create:
2467              for (offset = VS/2; offset >= element_size; offset/=2)
2468                 {
2469                   Create:  va' = vec_shift <va, offset>
2470                   Create:  va = vop <va, va'>
2471                 }  */
2472
2473           if (vect_print_dump_info (REPORT_DETAILS))
2474             fprintf (vect_dump, "Reduce using vector shifts");
2475
2476           vec_dest = vect_create_destination_var (scalar_dest, vectype);
2477           new_temp = PHI_RESULT (new_phi);
2478
2479           for (bit_offset = vec_size_in_bits/2;
2480                bit_offset >= element_bitsize;
2481                bit_offset /= 2)
2482             {
2483               tree bitpos = size_int (bit_offset);
2484               tree tmp = build2 (shift_code, vectype, new_temp, bitpos);
2485               epilog_stmt = build_gimple_modify_stmt (vec_dest, tmp);
2486               new_name = make_ssa_name (vec_dest, epilog_stmt);
2487               GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_name;
2488               bsi_insert_before (&exit_bsi, epilog_stmt, BSI_SAME_STMT);
2489
2490               tmp = build2 (code, vectype, new_name, new_temp);
2491               epilog_stmt = build_gimple_modify_stmt (vec_dest, tmp);
2492               new_temp = make_ssa_name (vec_dest, epilog_stmt);
2493               GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_temp;
2494               bsi_insert_before (&exit_bsi, epilog_stmt, BSI_SAME_STMT);
2495             }
2496
2497           extract_scalar_result = true;
2498         }
2499       else
2500         {
2501           tree rhs;
2502
2503           /*** Case 3: Create:
2504              s = extract_field <v_out2, 0>
2505              for (offset = element_size;
2506                   offset < vector_size;
2507                   offset += element_size;)
2508                {
2509                  Create:  s' = extract_field <v_out2, offset>
2510                  Create:  s = op <s, s'>
2511                }  */
2512
2513           if (vect_print_dump_info (REPORT_DETAILS))
2514             fprintf (vect_dump, "Reduce using scalar code. ");
2515
2516           vec_temp = PHI_RESULT (new_phi);
2517           vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
2518           rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
2519                          bitsize_zero_node);
2520           epilog_stmt = build_gimple_modify_stmt (new_scalar_dest, rhs);
2521           new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
2522           GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_temp;
2523           bsi_insert_before (&exit_bsi, epilog_stmt, BSI_SAME_STMT);
2524
2525           for (bit_offset = element_bitsize;
2526                bit_offset < vec_size_in_bits;
2527                bit_offset += element_bitsize)
2528             {
2529               tree tmp;
2530               tree bitpos = bitsize_int (bit_offset);
2531               tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
2532                                  bitpos);
2533
2534               epilog_stmt = build_gimple_modify_stmt (new_scalar_dest, rhs);
2535               new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
2536               GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_name;
2537               bsi_insert_before (&exit_bsi, epilog_stmt, BSI_SAME_STMT);
2538
2539               tmp = build2 (code, scalar_type, new_name, new_temp);
2540               epilog_stmt = build_gimple_modify_stmt (new_scalar_dest, tmp);
2541               new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
2542               GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_temp;
2543               bsi_insert_before (&exit_bsi, epilog_stmt, BSI_SAME_STMT);
2544             }
2545
2546           extract_scalar_result = false;
2547         }
2548     }
2549
2550   /* 2.4  Extract the final scalar result.  Create:
2551          s_out3 = extract_field <v_out2, bitpos>  */
2552
2553   if (extract_scalar_result)
2554     {
2555       tree rhs;
2556
2557       gcc_assert (!nested_in_vect_loop);
2558       if (vect_print_dump_info (REPORT_DETAILS))
2559         fprintf (vect_dump, "extract scalar result");
2560
2561       if (BYTES_BIG_ENDIAN)
2562         bitpos = size_binop (MULT_EXPR,
2563                        bitsize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1),
2564                        TYPE_SIZE (scalar_type));
2565       else
2566         bitpos = bitsize_zero_node;
2567
2568       rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp, bitsize, bitpos);
2569       epilog_stmt = build_gimple_modify_stmt (new_scalar_dest, rhs);
2570       new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
2571       GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_temp;
2572       bsi_insert_before (&exit_bsi, epilog_stmt, BSI_SAME_STMT);
2573     }
2574
2575 vect_finalize_reduction:
2576
2577   /* 2.5 Adjust the final result by the initial value of the reduction
2578          variable. (When such adjustment is not needed, then
2579          'adjustment_def' is zero).  For example, if code is PLUS we create:
2580          new_temp = loop_exit_def + adjustment_def  */
2581
2582   if (adjustment_def)
2583     {
2584       if (nested_in_vect_loop)
2585         {
2586           gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
2587           expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
2588           new_dest = vect_create_destination_var (scalar_dest, vectype);
2589         }
2590       else
2591         {
2592           gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
2593           expr = build2 (code, scalar_type, new_temp, adjustment_def);
2594           new_dest = vect_create_destination_var (scalar_dest, scalar_type);
2595         }
2596       epilog_stmt = build_gimple_modify_stmt (new_dest, expr);
2597       new_temp = make_ssa_name (new_dest, epilog_stmt);
2598       GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_temp;
2599       bsi_insert_before (&exit_bsi, epilog_stmt, BSI_SAME_STMT);
2600     }
2601
2602
2603   /* 2.6  Handle the loop-exit phi  */
2604
2605   /* Replace uses of s_out0 with uses of s_out3:
2606      Find the loop-closed-use at the loop exit of the original scalar result.
2607      (The reduction result is expected to have two immediate uses - one at the
2608      latch block, and one at the loop exit).  */
2609   phis = VEC_alloc (tree, heap, 10);
2610   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
2611     {
2612       if (!flow_bb_inside_loop_p (loop, bb_for_stmt (USE_STMT (use_p))))
2613         {
2614           exit_phi = USE_STMT (use_p);
2615           VEC_quick_push (tree, phis, exit_phi);
2616         }
2617     }
2618   /* We expect to have found an exit_phi because of loop-closed-ssa form.  */
2619   gcc_assert (!VEC_empty (tree, phis));
2620
2621   for (i = 0; VEC_iterate (tree, phis, i, exit_phi); i++)
2622     {
2623       if (nested_in_vect_loop)
2624         {
2625           stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
2626
2627           /* FORNOW. Currently not supporting the case that an inner-loop reduction
2628              is not used in the outer-loop (but only outside the outer-loop).  */
2629           gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
2630                       && !STMT_VINFO_LIVE_P (stmt_vinfo));
2631
2632           epilog_stmt = adjustment_def ? epilog_stmt :  new_phi;
2633           STMT_VINFO_VEC_STMT (stmt_vinfo) = epilog_stmt;
2634           set_stmt_info (get_stmt_ann (epilog_stmt),
2635           new_stmt_vec_info (epilog_stmt, loop_vinfo));
2636           continue;
2637         }
2638
2639       /* Replace the uses:  */
2640       orig_name = PHI_RESULT (exit_phi);
2641       FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
2642         FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
2643           SET_USE (use_p, new_temp);
2644     }
2645   VEC_free (tree, heap, phis);
2646 }
2647
2648
2649 /* Function vectorizable_reduction.
2650
2651    Check if STMT performs a reduction operation that can be vectorized.
2652    If VEC_STMT is also passed, vectorize the STMT: create a vectorized
2653    stmt to replace it, put it in VEC_STMT, and insert it at BSI.
2654    Return FALSE if not a vectorizable STMT, TRUE otherwise.
2655
2656    This function also handles reduction idioms (patterns) that have been
2657    recognized in advance during vect_pattern_recog. In this case, STMT may be
2658    of this form:
2659      X = pattern_expr (arg0, arg1, ..., X)
2660    and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
2661    sequence that had been detected and replaced by the pattern-stmt (STMT).
2662
2663    In some cases of reduction patterns, the type of the reduction variable X is
2664    different than the type of the other arguments of STMT.
2665    In such cases, the vectype that is used when transforming STMT into a vector
2666    stmt is different than the vectype that is used to determine the
2667    vectorization factor, because it consists of a different number of elements
2668    than the actual number of elements that are being operated upon in parallel.
2669
2670    For example, consider an accumulation of shorts into an int accumulator.
2671    On some targets it's possible to vectorize this pattern operating on 8
2672    shorts at a time (hence, the vectype for purposes of determining the
2673    vectorization factor should be V8HI); on the other hand, the vectype that
2674    is used to create the vector form is actually V4SI (the type of the result).
2675
2676    Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
2677    indicates what is the actual level of parallelism (V8HI in the example), so
2678    that the right vectorization factor would be derived. This vectype
2679    corresponds to the type of arguments to the reduction stmt, and should *NOT*
2680    be used to create the vectorized stmt. The right vectype for the vectorized
2681    stmt is obtained from the type of the result X:
2682         get_vectype_for_scalar_type (TREE_TYPE (X))
2683
2684    This means that, contrary to "regular" reductions (or "regular" stmts in
2685    general), the following equation:
2686       STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
2687    does *NOT* necessarily hold for reduction patterns.  */
2688
2689 bool
2690 vectorizable_reduction (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
2691 {
2692   tree vec_dest;
2693   tree scalar_dest;
2694   tree op;
2695   tree loop_vec_def0 = NULL_TREE, loop_vec_def1 = NULL_TREE;
2696   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2697   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2698   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2699   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2700   tree operation;
2701   enum tree_code code, orig_code, epilog_reduc_code = 0;
2702   enum machine_mode vec_mode;
2703   int op_type;
2704   optab optab, reduc_optab;
2705   tree new_temp = NULL_TREE;
2706   tree def, def_stmt;
2707   enum vect_def_type dt;
2708   tree new_phi;
2709   tree scalar_type;
2710   bool is_simple_use;
2711   tree orig_stmt;
2712   stmt_vec_info orig_stmt_info;
2713   tree expr = NULL_TREE;
2714   int i;
2715   int nunits = TYPE_VECTOR_SUBPARTS (vectype);
2716   int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
2717   stmt_vec_info prev_stmt_info;
2718   tree reduc_def;
2719   tree new_stmt = NULL_TREE;
2720   int j;
2721
2722   if (nested_in_vect_loop_p (loop, stmt))
2723     {
2724       loop = loop->inner;
2725       /* FORNOW. This restriction should be relaxed.  */
2726       if (ncopies > 1)
2727         {
2728           if (vect_print_dump_info (REPORT_DETAILS))
2729             fprintf (vect_dump, "multiple types in nested loop.");
2730           return false;
2731         }
2732     }
2733
2734   gcc_assert (ncopies >= 1);
2735
2736   /* FORNOW: SLP not supported.  */
2737   if (STMT_SLP_TYPE (stmt_info))
2738     return false;
2739
2740   /* 1. Is vectorizable reduction?  */
2741
2742   /* Not supportable if the reduction variable is used in the loop.  */
2743   if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer)
2744     return false;
2745
2746   /* Reductions that are not used even in an enclosing outer-loop,
2747      are expected to be "live" (used out of the loop).  */
2748   if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_loop
2749       && !STMT_VINFO_LIVE_P (stmt_info))
2750     return false;
2751
2752   /* Make sure it was already recognized as a reduction computation.  */
2753   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def)
2754     return false;
2755
2756   /* 2. Has this been recognized as a reduction pattern?
2757
2758      Check if STMT represents a pattern that has been recognized
2759      in earlier analysis stages.  For stmts that represent a pattern,
2760      the STMT_VINFO_RELATED_STMT field records the last stmt in
2761      the original sequence that constitutes the pattern.  */
2762
2763   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
2764   if (orig_stmt)
2765     {
2766       orig_stmt_info = vinfo_for_stmt (orig_stmt);
2767       gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt);
2768       gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
2769       gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
2770     }
2771
2772   /* 3. Check the operands of the operation. The first operands are defined
2773         inside the loop body. The last operand is the reduction variable,
2774         which is defined by the loop-header-phi.  */
2775
2776   gcc_assert (TREE_CODE (stmt) == GIMPLE_MODIFY_STMT);
2777
2778   operation = GIMPLE_STMT_OPERAND (stmt, 1);
2779   code = TREE_CODE (operation);
2780   op_type = TREE_OPERAND_LENGTH (operation);
2781   if (op_type != binary_op && op_type != ternary_op)
2782     return false;
2783   scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
2784   scalar_type = TREE_TYPE (scalar_dest);
2785   if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
2786       && !SCALAR_FLOAT_TYPE_P (scalar_type))
2787     return false;
2788
2789   /* All uses but the last are expected to be defined in the loop.
2790      The last use is the reduction variable.  */
2791   for (i = 0; i < op_type-1; i++)
2792     {
2793       op = TREE_OPERAND (operation, i);
2794       is_simple_use = vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt);
2795       gcc_assert (is_simple_use);
2796       if (dt != vect_loop_def
2797           && dt != vect_invariant_def
2798           && dt != vect_constant_def
2799           && dt != vect_induction_def)
2800         return false;
2801     }
2802
2803   op = TREE_OPERAND (operation, i);
2804   is_simple_use = vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt);
2805   gcc_assert (is_simple_use);
2806   gcc_assert (dt == vect_reduction_def);
2807   gcc_assert (TREE_CODE (def_stmt) == PHI_NODE);
2808   if (orig_stmt)
2809     gcc_assert (orig_stmt == vect_is_simple_reduction (loop_vinfo, def_stmt));
2810   else
2811     gcc_assert (stmt == vect_is_simple_reduction (loop_vinfo, def_stmt));
2812
2813   if (STMT_VINFO_LIVE_P (vinfo_for_stmt (def_stmt)))
2814     return false;
2815
2816   /* 4. Supportable by target?  */
2817
2818   /* 4.1. check support for the operation in the loop  */
2819   optab = optab_for_tree_code (code, vectype);
2820   if (!optab)
2821     {
2822       if (vect_print_dump_info (REPORT_DETAILS))
2823         fprintf (vect_dump, "no optab.");
2824       return false;
2825     }
2826   vec_mode = TYPE_MODE (vectype);
2827   if (optab_handler (optab, vec_mode)->insn_code == CODE_FOR_nothing)
2828     {
2829       if (vect_print_dump_info (REPORT_DETAILS))
2830         fprintf (vect_dump, "op not supported by target.");
2831       if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
2832           || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
2833              < vect_min_worthwhile_factor (code))
2834         return false;
2835       if (vect_print_dump_info (REPORT_DETAILS))
2836         fprintf (vect_dump, "proceeding using word mode.");
2837     }
2838
2839   /* Worthwhile without SIMD support?  */
2840   if (!VECTOR_MODE_P (TYPE_MODE (vectype))
2841       && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
2842          < vect_min_worthwhile_factor (code))
2843     {
2844       if (vect_print_dump_info (REPORT_DETAILS))
2845         fprintf (vect_dump, "not worthwhile without SIMD support.");
2846       return false;
2847     }
2848
2849   /* 4.2. Check support for the epilog operation.
2850
2851           If STMT represents a reduction pattern, then the type of the
2852           reduction variable may be different than the type of the rest
2853           of the arguments.  For example, consider the case of accumulation
2854           of shorts into an int accumulator; The original code:
2855                         S1: int_a = (int) short_a;
2856           orig_stmt->   S2: int_acc = plus <int_a ,int_acc>;
2857
2858           was replaced with:
2859                         STMT: int_acc = widen_sum <short_a, int_acc>
2860
2861           This means that:
2862           1. The tree-code that is used to create the vector operation in the
2863              epilog code (that reduces the partial results) is not the
2864              tree-code of STMT, but is rather the tree-code of the original
2865              stmt from the pattern that STMT is replacing. I.e, in the example
2866              above we want to use 'widen_sum' in the loop, but 'plus' in the
2867              epilog.
2868           2. The type (mode) we use to check available target support
2869              for the vector operation to be created in the *epilog*, is
2870              determined by the type of the reduction variable (in the example
2871              above we'd check this: plus_optab[vect_int_mode]).
2872              However the type (mode) we use to check available target support
2873              for the vector operation to be created *inside the loop*, is
2874              determined by the type of the other arguments to STMT (in the
2875              example we'd check this: widen_sum_optab[vect_short_mode]).
2876
2877           This is contrary to "regular" reductions, in which the types of all
2878           the arguments are the same as the type of the reduction variable.
2879           For "regular" reductions we can therefore use the same vector type
2880           (and also the same tree-code) when generating the epilog code and
2881           when generating the code inside the loop.  */
2882
2883   if (orig_stmt)
2884     {
2885       /* This is a reduction pattern: get the vectype from the type of the
2886          reduction variable, and get the tree-code from orig_stmt.  */
2887       orig_code = TREE_CODE (GIMPLE_STMT_OPERAND (orig_stmt, 1));
2888       vectype = get_vectype_for_scalar_type (TREE_TYPE (def));
2889       if (!vectype)
2890         {
2891           if (vect_print_dump_info (REPORT_DETAILS))
2892             {
2893               fprintf (vect_dump, "unsupported data-type ");
2894               print_generic_expr (vect_dump, TREE_TYPE (def), TDF_SLIM);
2895             }
2896           return false;
2897         }
2898
2899       vec_mode = TYPE_MODE (vectype);
2900     }
2901   else
2902     {
2903       /* Regular reduction: use the same vectype and tree-code as used for
2904          the vector code inside the loop can be used for the epilog code. */
2905       orig_code = code;
2906     }
2907
2908   if (!reduction_code_for_scalar_code (orig_code, &epilog_reduc_code))
2909     return false;
2910   reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype);
2911   if (!reduc_optab)
2912     {
2913       if (vect_print_dump_info (REPORT_DETAILS))
2914         fprintf (vect_dump, "no optab for reduction.");
2915       epilog_reduc_code = NUM_TREE_CODES;
2916     }
2917   if (optab_handler (reduc_optab, vec_mode)->insn_code == CODE_FOR_nothing)
2918     {
2919       if (vect_print_dump_info (REPORT_DETAILS))
2920         fprintf (vect_dump, "reduc op not supported by target.");
2921       epilog_reduc_code = NUM_TREE_CODES;
2922     }
2923
2924   if (!vec_stmt) /* transformation not required.  */
2925     {
2926       STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
2927       if (!vect_model_reduction_cost (stmt_info, epilog_reduc_code, ncopies))
2928         return false;
2929       return true;
2930     }
2931
2932   /** Transform.  **/
2933
2934   if (vect_print_dump_info (REPORT_DETAILS))
2935     fprintf (vect_dump, "transform reduction.");
2936
2937   /* Create the destination vector  */
2938   vec_dest = vect_create_destination_var (scalar_dest, vectype);
2939
2940   /* Create the reduction-phi that defines the reduction-operand.  */
2941   new_phi = create_phi_node (vec_dest, loop->header);
2942
2943   /* In case the vectorization factor (VF) is bigger than the number
2944      of elements that we can fit in a vectype (nunits), we have to generate
2945      more than one vector stmt - i.e - we need to "unroll" the
2946      vector stmt by a factor VF/nunits.  For more details see documentation
2947      in vectorizable_operation.  */
2948
2949   prev_stmt_info = NULL;
2950   for (j = 0; j < ncopies; j++)
2951     {
2952       /* Handle uses.  */
2953       if (j == 0)
2954         {
2955           op = TREE_OPERAND (operation, 0);
2956           loop_vec_def0 = vect_get_vec_def_for_operand (op, stmt, NULL);
2957           if (op_type == ternary_op)
2958             {
2959               op = TREE_OPERAND (operation, 1);
2960               loop_vec_def1 = vect_get_vec_def_for_operand (op, stmt, NULL);
2961             }
2962
2963           /* Get the vector def for the reduction variable from the phi node */
2964           reduc_def = PHI_RESULT (new_phi);
2965         }
2966       else
2967         {
2968           enum vect_def_type dt = vect_unknown_def_type; /* Dummy */
2969           loop_vec_def0 = vect_get_vec_def_for_stmt_copy (dt, loop_vec_def0);
2970           if (op_type == ternary_op)
2971             loop_vec_def1 = vect_get_vec_def_for_stmt_copy (dt, loop_vec_def1);
2972
2973           /* Get the vector def for the reduction variable from the vectorized
2974              reduction operation generated in the previous iteration (j-1)  */
2975           reduc_def = GIMPLE_STMT_OPERAND (new_stmt ,0);
2976         }
2977
2978       /* Arguments are ready. create the new vector stmt.  */
2979       if (op_type == binary_op)
2980         expr = build2 (code, vectype, loop_vec_def0, reduc_def);
2981       else
2982         expr = build3 (code, vectype, loop_vec_def0, loop_vec_def1,
2983                        reduc_def);
2984       new_stmt = build_gimple_modify_stmt (vec_dest, expr);
2985       new_temp = make_ssa_name (vec_dest, new_stmt);
2986       GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
2987       vect_finish_stmt_generation (stmt, new_stmt, bsi);
2988
2989       if (j == 0)
2990         STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
2991       else
2992         STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
2993       prev_stmt_info = vinfo_for_stmt (new_stmt);
2994     }
2995
2996   /* Finalize the reduction-phi (set it's arguments) and create the
2997      epilog reduction code.  */
2998   vect_create_epilog_for_reduction (new_temp, stmt, epilog_reduc_code, new_phi);
2999   return true;
3000 }
3001
3002 /* Checks if CALL can be vectorized in type VECTYPE.  Returns
3003    a function declaration if the target has a vectorized version
3004    of the function, or NULL_TREE if the function cannot be vectorized.  */
3005
3006 tree
3007 vectorizable_function (tree call, tree vectype_out, tree vectype_in)
3008 {
3009   tree fndecl = get_callee_fndecl (call);
3010   enum built_in_function code;
3011
3012   /* We only handle functions that do not read or clobber memory -- i.e.
3013      const or novops ones.  */
3014   if (!(call_expr_flags (call) & (ECF_CONST | ECF_NOVOPS)))
3015     return NULL_TREE;
3016
3017   if (!fndecl
3018       || TREE_CODE (fndecl) != FUNCTION_DECL
3019       || !DECL_BUILT_IN (fndecl))
3020     return NULL_TREE;
3021
3022   code = DECL_FUNCTION_CODE (fndecl);
3023   return targetm.vectorize.builtin_vectorized_function (code, vectype_out,
3024                                                         vectype_in);
3025 }
3026
3027 /* Function vectorizable_call.
3028
3029    Check if STMT performs a function call that can be vectorized.
3030    If VEC_STMT is also passed, vectorize the STMT: create a vectorized
3031    stmt to replace it, put it in VEC_STMT, and insert it at BSI.
3032    Return FALSE if not a vectorizable STMT, TRUE otherwise.  */
3033
3034 bool
3035 vectorizable_call (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
3036 {
3037   tree vec_dest;
3038   tree scalar_dest;
3039   tree operation;
3040   tree op, type;
3041   tree vec_oprnd0 = NULL_TREE, vec_oprnd1 = NULL_TREE;
3042   stmt_vec_info stmt_info = vinfo_for_stmt (stmt), prev_stmt_info;
3043   tree vectype_out, vectype_in;
3044   int nunits_in;
3045   int nunits_out;
3046   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3047   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3048   tree fndecl, rhs, new_temp, def, def_stmt, rhs_type, lhs_type;
3049   enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
3050   tree new_stmt;
3051   int ncopies, j, nargs;
3052   call_expr_arg_iterator iter;
3053   tree vargs;
3054   enum { NARROW, NONE, WIDEN } modifier;
3055
3056   if (!STMT_VINFO_RELEVANT_P (stmt_info))
3057     return false;
3058
3059   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
3060     return false;
3061
3062   /* FORNOW: SLP not supported.  */
3063   if (STMT_SLP_TYPE (stmt_info))
3064     return false;
3065
3066   /* Is STMT a vectorizable call?   */
3067   if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
3068     return false;
3069
3070   if (TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 0)) != SSA_NAME)
3071     return false;
3072
3073   operation = GIMPLE_STMT_OPERAND (stmt, 1);
3074   if (TREE_CODE (operation) != CALL_EXPR)
3075     return false;
3076
3077   /* Process function arguments.  */
3078   rhs_type = NULL_TREE;
3079   nargs = 0;
3080   FOR_EACH_CALL_EXPR_ARG (op, iter, operation)
3081     {
3082       /* Bail out if the function has more than two arguments, we
3083          do not have interesting builtin functions to vectorize with
3084          more than two arguments.  */
3085       if (nargs >= 2)
3086         return false;
3087
3088       /* We can only handle calls with arguments of the same type.  */
3089       if (rhs_type
3090           && rhs_type != TREE_TYPE (op))
3091         {
3092           if (vect_print_dump_info (REPORT_DETAILS))
3093             fprintf (vect_dump, "argument types differ.");
3094           return false;
3095         }
3096       rhs_type = TREE_TYPE (op);
3097
3098       if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt[nargs]))
3099         {
3100           if (vect_print_dump_info (REPORT_DETAILS))
3101             fprintf (vect_dump, "use not simple.");
3102           return false;
3103         }
3104
3105       ++nargs;
3106     }
3107
3108   /* No arguments is also not good.  */
3109   if (nargs == 0)
3110     return false;
3111
3112   vectype_in = get_vectype_for_scalar_type (rhs_type);
3113   if (!vectype_in)
3114     return false;
3115   nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
3116
3117   lhs_type = TREE_TYPE (GIMPLE_STMT_OPERAND (stmt, 0));
3118   vectype_out = get_vectype_for_scalar_type (lhs_type);
3119   if (!vectype_out)
3120     return false;
3121   nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
3122
3123   /* FORNOW */
3124   if (nunits_in == nunits_out / 2)
3125     modifier = NARROW;
3126   else if (nunits_out == nunits_in)
3127     modifier = NONE;
3128   else if (nunits_out == nunits_in / 2)
3129     modifier = WIDEN;
3130   else
3131     return false;
3132
3133   /* For now, we only vectorize functions if a target specific builtin
3134      is available.  TODO -- in some cases, it might be profitable to
3135      insert the calls for pieces of the vector, in order to be able
3136      to vectorize other operations in the loop.  */
3137   fndecl = vectorizable_function (operation, vectype_out, vectype_in);
3138   if (fndecl == NULL_TREE)
3139     {
3140       if (vect_print_dump_info (REPORT_DETAILS))
3141         fprintf (vect_dump, "function is not vectorizable.");
3142
3143       return false;
3144     }
3145
3146   gcc_assert (ZERO_SSA_OPERANDS (stmt, SSA_OP_ALL_VIRTUALS));
3147
3148   if (modifier == NARROW)
3149     ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_out;
3150   else
3151     ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in;
3152
3153   /* Sanity check: make sure that at least one copy of the vectorized stmt
3154      needs to be generated.  */
3155   gcc_assert (ncopies >= 1);
3156
3157   /* FORNOW. This restriction should be relaxed.  */
3158   if (nested_in_vect_loop_p (loop, stmt) && ncopies > 1)
3159     {
3160       if (vect_print_dump_info (REPORT_DETAILS))
3161       fprintf (vect_dump, "multiple types in nested loop.");
3162       return false;
3163     }
3164
3165   if (!vec_stmt) /* transformation not required.  */
3166     {
3167       STMT_VINFO_TYPE (stmt_info) = call_vec_info_type;
3168       if (vect_print_dump_info (REPORT_DETAILS))
3169         fprintf (vect_dump, "=== vectorizable_call ===");
3170       vect_model_simple_cost (stmt_info, ncopies, dt, NULL);
3171       return true;
3172     }
3173
3174   /** Transform.  **/
3175
3176   if (vect_print_dump_info (REPORT_DETAILS))
3177     fprintf (vect_dump, "transform operation.");
3178
3179   /* FORNOW. This restriction should be relaxed.  */
3180   if (nested_in_vect_loop_p (loop, stmt) && ncopies > 1)
3181     {
3182       if (vect_print_dump_info (REPORT_DETAILS))
3183         fprintf (vect_dump, "multiple types in nested loop.");
3184       return false;
3185     }
3186
3187   /* Handle def.  */
3188   scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
3189   vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
3190
3191   prev_stmt_info = NULL;
3192   switch (modifier)
3193     {
3194     case NONE:
3195       for (j = 0; j < ncopies; ++j)
3196         {
3197           /* Build argument list for the vectorized call.  */
3198           /* FIXME: Rewrite this so that it doesn't
3199              construct a temporary list.  */
3200           vargs = NULL_TREE;
3201           nargs = 0;
3202           FOR_EACH_CALL_EXPR_ARG (op, iter, operation)
3203             {
3204               if (j == 0)
3205                 vec_oprnd0
3206                   = vect_get_vec_def_for_operand (op, stmt, NULL);
3207               else
3208                 vec_oprnd0
3209                   = vect_get_vec_def_for_stmt_copy (dt[nargs], vec_oprnd0);
3210
3211               vargs = tree_cons (NULL_TREE, vec_oprnd0, vargs);
3212
3213               ++nargs;
3214             }
3215           vargs = nreverse (vargs);
3216
3217           rhs = build_function_call_expr (fndecl, vargs);
3218           new_stmt = build_gimple_modify_stmt (vec_dest, rhs);
3219           new_temp = make_ssa_name (vec_dest, new_stmt);
3220           GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
3221
3222           vect_finish_stmt_generation (stmt, new_stmt, bsi);
3223
3224           if (j == 0)
3225             STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
3226           else
3227             STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3228
3229           prev_stmt_info = vinfo_for_stmt (new_stmt);
3230         }
3231
3232       break;
3233
3234     case NARROW:
3235       for (j = 0; j < ncopies; ++j)
3236         {
3237           /* Build argument list for the vectorized call.  */
3238           /* FIXME: Rewrite this so that it doesn't
3239              construct a temporary list.  */
3240           vargs = NULL_TREE;
3241           nargs = 0;
3242           FOR_EACH_CALL_EXPR_ARG (op, iter, operation)
3243             {
3244               if (j == 0)
3245                 {
3246                   vec_oprnd0
3247                     = vect_get_vec_def_for_operand (op, stmt, NULL);
3248                   vec_oprnd1
3249                     = vect_get_vec_def_for_stmt_copy (dt[nargs], vec_oprnd0);
3250                 }
3251               else
3252                 {
3253                   vec_oprnd0
3254                     = vect_get_vec_def_for_stmt_copy (dt[nargs], vec_oprnd1);
3255                   vec_oprnd1
3256                     = vect_get_vec_def_for_stmt_copy (dt[nargs], vec_oprnd0);
3257                 }
3258
3259               vargs = tree_cons (NULL_TREE, vec_oprnd0, vargs);
3260               vargs = tree_cons (NULL_TREE, vec_oprnd1, vargs);
3261
3262               ++nargs;
3263             }
3264           vargs = nreverse (vargs);
3265
3266           rhs = build_function_call_expr (fndecl, vargs);
3267           new_stmt = build_gimple_modify_stmt (vec_dest, rhs);
3268           new_temp = make_ssa_name (vec_dest, new_stmt);
3269           GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
3270
3271           vect_finish_stmt_generation (stmt, new_stmt, bsi);
3272
3273           if (j == 0)
3274             STMT_VINFO_VEC_STMT (stmt_info) = new_stmt;
3275           else
3276             STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3277
3278           prev_stmt_info = vinfo_for_stmt (new_stmt);
3279         }
3280
3281       *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
3282
3283       break;
3284
3285     case WIDEN:
3286       /* No current target implements this case.  */
3287       return false;
3288     }
3289
3290   /* The call in STMT might prevent it from being removed in dce.
3291      We however cannot remove it here, due to the way the ssa name
3292      it defines is mapped to the new definition.  So just replace
3293      rhs of the statement with something harmless.  */
3294   type = TREE_TYPE (scalar_dest);
3295   GIMPLE_STMT_OPERAND (stmt, 1) = fold_convert (type, integer_zero_node);
3296   update_stmt (stmt);
3297
3298   return true;
3299 }
3300
3301
3302 /* Function vect_gen_widened_results_half
3303
3304    Create a vector stmt whose code, type, number of arguments, and result
3305    variable are CODE, VECTYPE, OP_TYPE, and VEC_DEST, and its arguments are
3306    VEC_OPRND0 and VEC_OPRND1. The new vector stmt is to be inserted at BSI.
3307    In the case that CODE is a CALL_EXPR, this means that a call to DECL
3308    needs to be created (DECL is a function-decl of a target-builtin).
3309    STMT is the original scalar stmt that we are vectorizing.

        VEC_OPRND1 is used only when OP_TYPE is binary_op; otherwise the new
        stmt takes VEC_OPRND0 as its single argument.  VECTYPE is used only
        when CODE is not a CALL_EXPR.  The lhs of the new stmt is a fresh
        SSA name created from VEC_DEST.  Return the new stmt.  */
3310
3311 static tree
3312 vect_gen_widened_results_half (enum tree_code code, tree vectype, tree decl,
3313                                tree vec_oprnd0, tree vec_oprnd1, int op_type,
3314                                tree vec_dest, block_stmt_iterator *bsi,
3315                                tree stmt)
3316 {
3317   tree expr;
3318   tree new_stmt;
3319   tree new_temp;
3320   tree sym;
3321   ssa_op_iter iter;
3322
3323   /* Generate half of the widened result:  */
3324   if (code == CALL_EXPR)
3325     {
3326       /* Target specific support  */
           /* Call DECL with two arguments for a binary operation, and with
              one argument otherwise.  */
3327       if (op_type == binary_op)
3328         expr = build_call_expr (decl, 2, vec_oprnd0, vec_oprnd1);
3329       else
3330         expr = build_call_expr (decl, 1, vec_oprnd0);
3331     }
3332   else
3333     {
3334       /* Generic support */
           /* The operand count of CODE itself must agree with OP_TYPE.  */
3335       gcc_assert (op_type == TREE_CODE_LENGTH (code));
3336       if (op_type == binary_op)
3337         expr = build2 (code, vectype, vec_oprnd0, vec_oprnd1);
3338       else
3339         expr = build1 (code, vectype, vec_oprnd0);
3340     }
       /* Wrap EXPR in an assignment to VEC_DEST, then replace the lhs with a
          new SSA name defined by that assignment.  */
3341   new_stmt = build_gimple_modify_stmt (vec_dest, expr);
3342   new_temp = make_ssa_name (vec_dest, new_stmt);
3343   GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
3344   vect_finish_stmt_generation (stmt, new_stmt, bsi);
3345
       /* For a call, walk the virtual operands of the new stmt and mark each
          symbol for renaming, using the underlying variable when the operand
          is an SSA_NAME.  */
3346   if (code == CALL_EXPR)
3347     {
3348       FOR_EACH_SSA_TREE_OPERAND (sym, new_stmt, iter, SSA_OP_ALL_VIRTUALS)
3349         {
3350           if (TREE_CODE (sym) == SSA_NAME)
3351             sym = SSA_NAME_VAR (sym);
3352           mark_sym_for_renaming (sym);
3353         }
3354     }
3355
3356   return new_stmt;
3357 }
3358
3359
3360 /* Check if STMT performs a conversion operation, that can be vectorized.
3361    If VEC_STMT is also passed, vectorize the STMT: create a vectorized
3362    stmt to replace it, put it in VEC_STMT, and insert it at BSI.
3363    Return FALSE if not a vectorizable STMT, TRUE otherwise.

        Only FIX_TRUNC_EXPR and FLOAT_EXPR are handled, and exactly one of
        the scalar source and destination types must be integral.  The
        input and output vector types must have the same number of elements
        (NONE), or one must have half as many elements as the other (WIDEN
        when the output vector has fewer elements, NARROW when the input
        vector has fewer).

        When VEC_STMT is NULL only the analysis is done and
        type_conversion_vec_info_type is recorded in the stmt_vec_info of
        STMT.  When SLP_NODE is non-NULL a single copy is generated and, in
        the NONE case, each new stmt is pushed onto
        SLP_TREE_VEC_STMTS (SLP_NODE).  WIDEN and NARROW conversions are
        rejected when STMT_SLP_TYPE is set for STMT.  */
3364
3365 bool
3366 vectorizable_conversion (tree stmt, block_stmt_iterator *bsi,
3367                          tree *vec_stmt, slp_tree slp_node)
3368 {
3369   tree vec_dest;
3370   tree scalar_dest;
3371   tree operation;
3372   tree op0;
3373   tree vec_oprnd0 = NULL_TREE, vec_oprnd1 = NULL_TREE;
3374   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3375   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3376   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3377   enum tree_code code, code1 = ERROR_MARK, code2 = ERROR_MARK;
3378   tree decl1 = NULL_TREE, decl2 = NULL_TREE;
3379   tree new_temp;
3380   tree def, def_stmt;
3381   enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
3382   tree new_stmt = NULL_TREE;
3383   stmt_vec_info prev_stmt_info;
3384   int nunits_in;
3385   int nunits_out;
3386   tree vectype_out, vectype_in;
3387   int ncopies, j;
3388   tree expr;
3389   tree rhs_type, lhs_type;
3390   tree builtin_decl;
3391   enum { NARROW, NONE, WIDEN } modifier;
3392   int i;
3393   VEC(tree,heap) *vec_oprnds0 = NULL;
3394   tree vop0;
3395
3396   /* Is STMT a vectorizable conversion?   */
3397
3398   if (!STMT_VINFO_RELEVANT_P (stmt_info))
3399     return false;
3400
3401   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
3402     return false;
3403
3404   if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
3405     return false;
3406
3407   if (TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 0)) != SSA_NAME)
3408     return false;
3409
3410   operation = GIMPLE_STMT_OPERAND (stmt, 1);
3411   code = TREE_CODE (operation);
3412   if (code != FIX_TRUNC_EXPR && code != FLOAT_EXPR)
3413     return false;
3414
3415   /* Check types of lhs and rhs.  */
3416   op0 = TREE_OPERAND (operation, 0);
3417   rhs_type = TREE_TYPE (op0);
3418   vectype_in = get_vectype_for_scalar_type (rhs_type);
3419   if (!vectype_in)
3420     return false;
3421   nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
3422
3423   scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
3424   lhs_type = TREE_TYPE (scalar_dest);
3425   vectype_out = get_vectype_for_scalar_type (lhs_type);
3426   if (!vectype_out)
3427     return false;
3428   nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
3429
3430   /* FORNOW */
       /* Classify the conversion by comparing the element counts of the
          input and output vector types; any ratio other than 1:1, 1:2 or
          2:1 is rejected.  */
3431   if (nunits_in == nunits_out / 2)
3432     modifier = NARROW;
3433   else if (nunits_out == nunits_in)
3434     modifier = NONE;
3435   else if (nunits_out == nunits_in / 2)
3436     modifier = WIDEN;
3437   else
3438     return false;
3439
3440   if (modifier == NONE)
3441     gcc_assert (STMT_VINFO_VECTYPE (stmt_info) == vectype_out);
3442
3443   /* Bail out if the types are both integral or non-integral.  */
3444   if ((INTEGRAL_TYPE_P (rhs_type) && INTEGRAL_TYPE_P (lhs_type))
3445       || (!INTEGRAL_TYPE_P (rhs_type) && !INTEGRAL_TYPE_P (lhs_type)))
3446     return false;
3447
       /* The number of copies is the vectorization factor divided by the
          element count of the output vector for NARROW, and of the input
          vector otherwise.  */
3448   if (modifier == NARROW)
3449     ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_out;
3450   else
3451     ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in;
3452
3453   /* FORNOW: SLP with multiple types is not supported. The SLP analysis verifies
3454      this, so we can safely override NCOPIES with 1 here.  */
3455   if (slp_node)
3456     ncopies = 1;
3457
3458   /* Sanity check: make sure that at least one copy of the vectorized stmt
3459      needs to be generated.  */
3460   gcc_assert (ncopies >= 1);
3461
3462   /* FORNOW. This restriction should be relaxed.  */
3463   if (nested_in_vect_loop_p (loop, stmt) && ncopies > 1)
3464     {
3465       if (vect_print_dump_info (REPORT_DETAILS))
3466         fprintf (vect_dump, "multiple types in nested loop.");
3467       return false;
3468     }
3469
3470   /* Check the operands of the operation.  */
3471   if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt[0]))
3472     {
3473       if (vect_print_dump_info (REPORT_DETAILS))
3474         fprintf (vect_dump, "use not simple.");
3475       return false;
3476     }
3477
3478   /* Supportable by target?  */
       /* NONE needs a target builtin for the conversion.  WIDEN and NARROW
          ask the corresponding supportable_*_operation helper, which fills
          in DECL1/DECL2 and CODE1/CODE2 (only CODE1 for NARROW).  */
3479   if ((modifier == NONE
3480        && !targetm.vectorize.builtin_conversion (code, vectype_in))
3481       || (modifier == WIDEN
3482           && !supportable_widening_operation (code, stmt, vectype_in,
3483                                               &decl1, &decl2,
3484                                               &code1, &code2))
3485       || (modifier == NARROW
3486           && !supportable_narrowing_operation (code, stmt, vectype_in,
3487                                                &code1)))
3488     {
3489       if (vect_print_dump_info (REPORT_DETAILS))
3490         fprintf (vect_dump, "op not supported by target.");
3491       return false;
3492     }
3493
3494   if (modifier != NONE)
3495     {
           /* Note that the vectype of STMT is overwritten with the input
              vectype before the SLP check below can reject the stmt.  */
3496       STMT_VINFO_VECTYPE (stmt_info) = vectype_in;
3497       /* FORNOW: SLP not supported.  */
3498       if (STMT_SLP_TYPE (stmt_info))
3499         return false;
3500     }
3501
3502   if (!vec_stmt)                /* transformation not required.  */
3503     {
3504       STMT_VINFO_TYPE (stmt_info) = type_conversion_vec_info_type;
3505       return true;
3506     }
3507
3508   /** Transform.  **/
3509   if (vect_print_dump_info (REPORT_DETAILS))
3510     fprintf (vect_dump, "transform conversion.");
3511
3512   /* Handle def.  */
3513   vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
3514
       /* The operand vector is allocated here only for the non-SLP NONE
          case; it is released before returning.  */
3515   if (modifier == NONE && !slp_node)
3516     vec_oprnds0 = VEC_alloc (tree, heap, 1);
3517
3518   prev_stmt_info = NULL;
3519   switch (modifier)
3520     {
3521     case NONE:
3522       for (j = 0; j < ncopies; j++)
3523         {
3524           tree sym;
3525           ssa_op_iter iter;
3526
               /* The first copy gets its vector defs from OP0; later copies
                  derive theirs from the defs of the previous copy.  */
3527           if (j == 0)
3528             vect_get_vec_defs (op0, NULL, stmt, &vec_oprnds0, NULL, slp_node);
3529           else
3530             vect_get_vec_defs_for_stmt_copy (dt, &vec_oprnds0, NULL);
3531
3532           builtin_decl =
3533             targetm.vectorize.builtin_conversion (code, vectype_in);
3534           for (i = 0; VEC_iterate (tree, vec_oprnds0, i, vop0); i++)
3535             {
3536               new_stmt = build_call_expr (builtin_decl, 1, vop0);
3537
3538               /* Arguments are ready. create the new vector stmt.  */
3539               new_stmt = build_gimple_modify_stmt (vec_dest, new_stmt);
3540               new_temp = make_ssa_name (vec_dest, new_stmt);
3541               GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
3542               vect_finish_stmt_generation (stmt, new_stmt, bsi);
                   /* Mark the symbols of the virtual operands of the new
                      call for renaming.  */
3543               FOR_EACH_SSA_TREE_OPERAND (sym, new_stmt, iter,
3544                                          SSA_OP_ALL_VIRTUALS)
3545                 {
3546                   if (TREE_CODE (sym) == SSA_NAME)
3547                     sym = SSA_NAME_VAR (sym);
3548                   mark_sym_for_renaming (sym);
3549                 }
3550               if (slp_node)
3551                 VEC_quick_push (tree, SLP_TREE_VEC_STMTS (slp_node), new_stmt);
3552             }
3553
               /* Record the first copy as the vectorized stmt of STMT and
                  chain each later copy to its predecessor through
                  STMT_VINFO_RELATED_STMT.  */
3554           if (j == 0)
3555             STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
3556           else
3557             STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3558           prev_stmt_info = vinfo_for_stmt (new_stmt);
3559         }
3560       break;
3561
3562     case WIDEN:
3563       /* In case the vectorization factor (VF) is bigger than the number
3564          of elements that we can fit in a vectype (nunits), we have to
3565          generate more than one vector stmt - i.e - we need to "unroll"
3566          the vector stmt by a factor VF/nunits.  */
           /* NOTE(review): unlike the NONE and NARROW cases, this case never
              stores into *VEC_STMT; confirm that the caller does not rely on
              it for widening conversions.  */
3567       for (j = 0; j < ncopies; j++)
3568         {
3569           if (j == 0)
3570             vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
3571           else
3572             vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd0);
3573
3574           STMT_VINFO_VECTYPE (stmt_info) = vectype_in;
3575
3576           /* Generate first half of the widened result:  */
3577           new_stmt
3578             = vect_gen_widened_results_half (code1, vectype_out, decl1,
3579                                              vec_oprnd0, vec_oprnd1,
3580                                              unary_op, vec_dest, bsi, stmt);
3581           if (j == 0)
3582             STMT_VINFO_VEC_STMT (stmt_info) = new_stmt;
3583           else
3584             STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3585           prev_stmt_info = vinfo_for_stmt (new_stmt);
3586
3587           /* Generate second half of the widened result:  */
3588           new_stmt
3589             = vect_gen_widened_results_half (code2, vectype_out, decl2,
3590                                              vec_oprnd0, vec_oprnd1,
3591                                              unary_op, vec_dest, bsi, stmt);
3592           STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3593           prev_stmt_info = vinfo_for_stmt (new_stmt);
3594         }
3595       break;
3596
3597     case NARROW:
3598       /* In case the vectorization factor (VF) is bigger than the number
3599          of elements that we can fit in a vectype (nunits), we have to
3600          generate more than one vector stmt - i.e - we need to "unroll"
3601          the vector stmt by a factor VF/nunits.  */
3602       for (j = 0; j < ncopies; j++)
3603         {
3604           /* Handle uses.  */
               /* Each narrowed result consumes two consecutive input vector
                  defs, so two defs are fetched per copy.  */
3605           if (j == 0)
3606             {
3607               vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
3608               vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd0);
3609             }
3610           else
3611             {
3612               vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd1);
3613               vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd0);
3614             }
3615
3616           /* Arguments are ready. Create the new vector stmt.  */
3617           expr = build2 (code1, vectype_out, vec_oprnd0, vec_oprnd1);
3618           new_stmt = build_gimple_modify_stmt (vec_dest, expr);
3619           new_temp = make_ssa_name (vec_dest, new_stmt);
3620           GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
3621           vect_finish_stmt_generation (stmt, new_stmt, bsi);
3622
3623           if (j == 0)
3624             STMT_VINFO_VEC_STMT (stmt_info) = new_stmt;
3625           else
3626             STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3627
3628           prev_stmt_info = vinfo_for_stmt (new_stmt);
3629         }
3630
3631       *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
3632     }
       /* VEC_OPRNDS0 is only populated in the NONE case (allocated above or
          filled in by vect_get_vec_defs).  Release it so that the heap
          vector is not leaked, as vectorizable_assignment does for its
          operand vector.  */
       if (vec_oprnds0)
         VEC_free (tree, heap, vec_oprnds0);
3633
3634   return true;
3635 }
3636
3637
3638 /* Function vectorizable_assignment.
3639
3640    Check if STMT performs an assignment (copy) that can be vectorized.
3641    If VEC_STMT is also passed, vectorize the STMT: create a vectorized
3642    stmt to replace it, put it in VEC_STMT, and insert it at BSI.
3643    Return FALSE if not a vectorizable STMT, TRUE otherwise.

        Only the single-copy case is handled: FALSE is returned when more
        than one copy (vectorization factor / number of vector elements)
        would be needed.  When VEC_STMT is NULL only the analysis is done:
        assignment_vec_info_type is recorded in the stmt_vec_info of STMT
        and the cost is modelled.  When SLP_NODE is non-NULL every new stmt
        is also pushed onto SLP_TREE_VEC_STMTS (SLP_NODE).  */
3644
3645 bool
3646 vectorizable_assignment (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt,
3647                          slp_tree slp_node)
3648 {
3649   tree vec_dest;
3650   tree scalar_dest;
3651   tree op;
3652   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3653   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3654   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3655   tree new_temp;
3656   tree def, def_stmt;
3657   enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
3658   int nunits = TYPE_VECTOR_SUBPARTS (vectype);
3659   int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
3660   int i;
3661   VEC(tree,heap) *vec_oprnds = NULL;
3662   tree vop;
3663
3664   gcc_assert (ncopies >= 1);
3665   if (ncopies > 1)
3666     return false; /* FORNOW */
3667
3668   if (!STMT_VINFO_RELEVANT_P (stmt_info))
3669     return false;
3670
3671   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
3672     return false;
3673
3674   /* Is vectorizable assignment?  */
3675   if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
3676     return false;
3677
3678   scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
3679   if (TREE_CODE (scalar_dest) != SSA_NAME)
3680     return false;
3681
       /* The whole rhs is treated as the single operand; its def type is
          recorded in DT[0].  */
3682   op = GIMPLE_STMT_OPERAND (stmt, 1);
3683   if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt[0]))
3684     {
3685       if (vect_print_dump_info (REPORT_DETAILS))
3686         fprintf (vect_dump, "use not simple.");
3687       return false;
3688     }
3689
3690   if (!vec_stmt) /* transformation not required.  */
3691     {
3692       STMT_VINFO_TYPE (stmt_info) = assignment_vec_info_type;
3693       if (vect_print_dump_info (REPORT_DETAILS))
3694         fprintf (vect_dump, "=== vectorizable_assignment ===");
3695       vect_model_simple_cost (stmt_info, ncopies, dt, NULL);
3696       return true;
3697     }
3698
3699   /** Transform.  **/
3700   if (vect_print_dump_info (REPORT_DETAILS))
3701     fprintf (vect_dump, "transform assignment.");
3702
3703   /* Handle def.  */
3704   vec_dest = vect_create_destination_var (scalar_dest, vectype);
3705
3706   /* Handle use.  */
3707   vect_get_vec_defs (op, NULL, stmt, &vec_oprnds, NULL, slp_node);
3708
3709   /* Arguments are ready. create the new vector stmt.  */
       /* One copy stmt is created per vector operand.  *VEC_STMT and
          STMT_VINFO_VEC_STMT are overwritten on every iteration, so they end
          up referring to the last stmt created.  */
3710   for (i = 0; VEC_iterate (tree, vec_oprnds, i, vop); i++)
3711     {
3712       *vec_stmt = build_gimple_modify_stmt (vec_dest, vop);
3713       new_temp = make_ssa_name (vec_dest, *vec_stmt);
3714       GIMPLE_STMT_OPERAND (*vec_stmt, 0) = new_temp;
3715       vect_finish_stmt_generation (stmt, *vec_stmt, bsi);
3716       STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt;
3717
3718       if (slp_node)
3719         VEC_quick_push (tree, SLP_TREE_VEC_STMTS (slp_node), *vec_stmt);
3720    }
3721
       /* Release the operand vector filled in by vect_get_vec_defs.  */
3722   VEC_free (tree, heap, vec_oprnds);
3723   return true;
3724 }
3725
3726
3727 /* Function vect_min_worthwhile_factor.
3728
3729    For a loop where we could vectorize the operation indicated by CODE,
3730    return the minimum vectorization factor that makes it worthwhile
3731    to use generic vectors.

        Additive codes (PLUS_EXPR, MINUS_EXPR, NEGATE_EXPR) yield 4 and
        bitwise codes (BIT_AND_EXPR, BIT_IOR_EXPR, BIT_XOR_EXPR,
        BIT_NOT_EXPR) yield 2.  Every other code yields INT_MAX, so a caller
        testing "vectorization factor < result" finds any factor below
        INT_MAX insufficient.  */
3732 static int
3733 vect_min_worthwhile_factor (enum tree_code code)
3734 {
3735   switch (code)
3736     {
3737     case PLUS_EXPR:
3738     case MINUS_EXPR:
3739     case NEGATE_EXPR:
3740       return 4;
3741
3742     case BIT_AND_EXPR:
3743     case BIT_IOR_EXPR:
3744     case BIT_XOR_EXPR:
3745     case BIT_NOT_EXPR:
3746       return 2;
3747
         /* No finite factor is given for any other operation.  */
3748     default:
3749       return INT_MAX;
3750     }
3751 }
3752
3753
3754 /* Function vectorizable_induction
3755
3756    Check if PHI performs an induction computation that can be vectorized.
3757    If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
3758    phi to replace it, put it in VEC_STMT, and add it to the same basic block.
3759    Return FALSE if not a vectorizable STMT, TRUE otherwise.

        BSI is unused.  FALSE is returned when PHI is not relevant, when
        STMT_SLP_TYPE is set for it, or when it is not a PHI_NODE.  When
        VEC_STMT is NULL only the analysis is done: induc_vec_info_type is
        recorded in the stmt_vec_info of PHI and the induction cost is
        modelled.  */
3760
3761 bool
3762 vectorizable_induction (tree phi, block_stmt_iterator *bsi ATTRIBUTE_UNUSED,
3763                         tree *vec_stmt)
3764 {
3765   stmt_vec_info stmt_info = vinfo_for_stmt (phi);
3766   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3767   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3768   int nunits = TYPE_VECTOR_SUBPARTS (vectype);
3769   int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
3770   tree vec_def;
3771
3772   gcc_assert (ncopies >= 1);
3773
3774   if (!STMT_VINFO_RELEVANT_P (stmt_info))
3775     return false;
3776
3777   /* FORNOW: SLP not supported.  */
3778   if (STMT_SLP_TYPE (stmt_info))
3779     return false;
3780
       /* A relevant, non-SLP stmt reaching this point is required to have
          been classified as an induction.  */
3781   gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def);
3782
3783   if (TREE_CODE (phi) != PHI_NODE)
3784     return false;
3785
3786   if (!vec_stmt) /* transformation not required.  */
3787     {
3788       STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
3789       if (vect_print_dump_info (REPORT_DETAILS))
3790         fprintf (vect_dump, "=== vectorizable_induction ===");
3791       vect_model_induction_cost (stmt_info, ncopies);
3792       return true;
3793     }
3794
3795   /** Transform.  **/
3796
3797   if (vect_print_dump_info (REPORT_DETAILS))
3798     fprintf (vect_dump, "transform induction phi.");
3799
       /* get_initial_def_for_induction returns a vector def; the stmt that
          defines it is what gets stored in *VEC_STMT.  */
3800   vec_def = get_initial_def_for_induction (phi);
3801   *vec_stmt = SSA_NAME_DEF_STMT (vec_def);
3802   return true;
3803 }
3804
3805
3806 /* Function vectorizable_operation.
3807
3808    Check if STMT performs a binary or unary operation that can be vectorized.
3809    If VEC_STMT is also passed, vectorize the STMT: create a vectorized
3810    stmt to replace it, put it in VEC_STMT, and insert it at BSI.
3811    Return FALSE if not a vectorizable STMT, TRUE otherwise.  */
3812
3813 bool
3814 vectorizable_operation (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt,
3815                         slp_tree slp_node)
3816 {
3817   tree vec_dest;
3818   tree scalar_dest;
3819   tree operation;
3820   tree op0, op1 = NULL;
3821   tree vec_oprnd1 = NULL_TREE;
3822   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3823   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3824   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3825   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3826   enum tree_code code;
3827   enum machine_mode vec_mode;
3828   tree new_temp;
3829   int op_type;
3830   optab optab;
3831   int icode;
3832   enum machine_mode optab_op2_mode;
3833   tree def, def_stmt;
3834   enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
3835   tree new_stmt = NULL_TREE;
3836   stmt_vec_info prev_stmt_info;
3837   int nunits_in = TYPE_VECTOR_SUBPARTS (vectype);
3838   int nunits_out;
3839   tree vectype_out;
3840   int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in;
3841   int j, i;
3842   VEC(tree,heap) *vec_oprnds0 = NULL, *vec_oprnds1 = NULL;
3843   tree vop0, vop1;
3844   unsigned int k;
3845   bool scalar_shift_arg = false;
3846
3847   /* FORNOW: SLP with multiple types is not supported. The SLP analysis verifies
3848      this, so we can safely override NCOPIES with 1 here.  */
3849   if (slp_node)
3850     ncopies = 1;
3851   gcc_assert (ncopies >= 1);
3852   /* FORNOW. This restriction should be relaxed.  */
3853   if (nested_in_vect_loop_p (loop, stmt) && ncopies > 1)
3854     {
3855       if (vect_print_dump_info (REPORT_DETAILS))
3856         fprintf (vect_dump, "multiple types in nested loop.");
3857       return false;
3858     }
3859
3860   if (!STMT_VINFO_RELEVANT_P (stmt_info))
3861     return false;
3862
3863   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
3864     return false;
3865
3866   /* Is STMT a vectorizable binary/unary operation?   */
3867   if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
3868     return false;
3869
3870   if (TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 0)) != SSA_NAME)
3871     return false;
3872
3873   scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
3874   vectype_out = get_vectype_for_scalar_type (TREE_TYPE (scalar_dest));
3875   if (!vectype_out)
3876     return false;
3877   nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
3878   if (nunits_out != nunits_in)
3879     return false;
3880
3881   operation = GIMPLE_STMT_OPERAND (stmt, 1);
3882   code = TREE_CODE (operation);
3883
3884   /* For pointer addition, we should use the normal plus for
3885      the vector addition.  */
3886   if (code == POINTER_PLUS_EXPR)
3887     code = PLUS_EXPR;
3888
3889   optab = optab_for_tree_code (code, vectype);
3890
3891   /* Support only unary or binary operations.  */
3892   op_type = TREE_OPERAND_LENGTH (operation);
3893   if (op_type != unary_op && op_type != binary_op)
3894     {
3895       if (vect_print_dump_info (REPORT_DETAILS))
3896         fprintf (vect_dump, "num. args = %d (not unary/binary op).", op_type);
3897       return false;
3898     }
3899
3900   op0 = TREE_OPERAND (operation, 0);
3901   if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt[0]))
3902     {
3903       if (vect_print_dump_info (REPORT_DETAILS))
3904         fprintf (vect_dump, "use not simple.");
3905       return false;
3906     }
3907
3908   if (op_type == binary_op)
3909     {
3910       op1 = TREE_OPERAND (operation, 1);
3911       if (!vect_is_simple_use (op1, loop_vinfo, &def_stmt, &def, &dt[1]))
3912         {
3913           if (vect_print_dump_info (REPORT_DETAILS))
3914             fprintf (vect_dump, "use not simple.");
3915           return false;
3916         }
3917     }
3918
3919   /* Supportable by target?  */
3920   if (!optab)
3921     {
3922       if (vect_print_dump_info (REPORT_DETAILS))
3923         fprintf (vect_dump, "no optab.");
3924       return false;
3925     }
3926   vec_mode = TYPE_MODE (vectype);
3927   icode = (int) optab_handler (optab, vec_mode)->insn_code;
3928   if (icode == CODE_FOR_nothing)
3929     {
3930       if (vect_print_dump_info (REPORT_DETAILS))
3931         fprintf (vect_dump, "op not supported by target.");
3932       /* Check only during analysis.  */
3933       if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
3934           || (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
3935               < vect_min_worthwhile_factor (code)
3936               && !vec_stmt))
3937         return false;
3938       if (vect_print_dump_info (REPORT_DETAILS))
3939         fprintf (vect_dump, "proceeding using word mode.");
3940     }
3941
3942   /* Worthwhile without SIMD support? Check only during analysis.  */
3943   if (!VECTOR_MODE_P (TYPE_MODE (vectype))
3944       && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
3945          < vect_min_worthwhile_factor (code)
3946       && !vec_stmt)
3947     {
3948       if (vect_print_dump_info (REPORT_DETAILS))
3949         fprintf (vect_dump, "not worthwhile without SIMD support.");
3950       return false;
3951     }
3952
3953   if (code == LSHIFT_EXPR || code == RSHIFT_EXPR)
3954     {
3955       /* FORNOW: not yet supported.  */
3956       if (!VECTOR_MODE_P (vec_mode))
3957         return false;
3958
3959       /* Invariant argument is needed for a vector shift
3960          by a scalar shift operand.  */
3961       optab_op2_mode = insn_data[icode].operand[2].mode;
3962       if (!VECTOR_MODE_P (optab_op2_mode))
3963         {
3964           if (dt[1] != vect_constant_def && dt[1] != vect_invariant_def)
3965             {
3966               if (vect_print_dump_info (REPORT_DETAILS))
3967                 fprintf (vect_dump, "operand mode requires invariant"
3968                                     " argument.");
3969               return false;
3970             }
3971
3972           scalar_shift_arg = true;
3973         }
3974     }
3975
3976   if (!vec_stmt) /* transformation not required.  */
3977     {
3978       STMT_VINFO_TYPE (stmt_info) = op_vec_info_type;
3979       if (vect_print_dump_info (REPORT_DETAILS))
3980         fprintf (vect_dump, "=== vectorizable_operation ===");
3981       vect_model_simple_cost (stmt_info, ncopies, dt, NULL);
3982       return true;
3983     }
3984
3985   /** Transform.  **/
3986
3987   if (vect_print_dump_info (REPORT_DETAILS))
3988     fprintf (vect_dump, "transform binary/unary operation.");
3989
3990   /* Handle def.  */
3991   vec_dest = vect_create_destination_var (scalar_dest, vectype);
3992
3993   /* Allocate VECs for vector operands. In case of SLP, vector operands are
3994      created in the previous stages of the recursion, so no allocation is
3995      needed, except for the case of shift with scalar shift argument. In that
3996      case we store the scalar operand in VEC_OPRNDS1 for every vector stmt to
3997      be created to vectorize the SLP group, i.e., SLP_NODE->VEC_STMTS_SIZE.
3998      In case of loop-based vectorization we allocate VECs of size 1. We
3999      allocate VEC_OPRNDS1 only in case of binary operation.  */
4000   if (!slp_node)
4001     {
4002       vec_oprnds0 = VEC_alloc (tree, heap, 1);
4003       if (op_type == binary_op)
4004         vec_oprnds1 = VEC_alloc (tree, heap, 1);
4005     }
4006   else if (scalar_shift_arg)
4007     vec_oprnds1 = VEC_alloc (tree, heap, slp_node->vec_stmts_size);
4008
4009   /* In case the vectorization factor (VF) is bigger than the number
4010      of elements that we can fit in a vectype (nunits), we have to generate
4011      more than one vector stmt - i.e - we need to "unroll" the
4012      vector stmt by a factor VF/nunits. In doing so, we record a pointer
4013      from one copy of the vector stmt to the next, in the field
4014      STMT_VINFO_RELATED_STMT. This is necessary in order to allow following
4015      stages to find the correct vector defs to be used when vectorizing
4016      stmts that use the defs of the current stmt. The example below illustrates
4017      the vectorization process when VF=16 and nunits=4 (i.e - we need to create
4018      4 vectorized stmts):
4019
4020      before vectorization:
4021                                 RELATED_STMT    VEC_STMT
4022         S1:     x = memref      -               -
4023         S2:     z = x + 1       -               -
4024
4025      step 1: vectorize stmt S1 (done in vectorizable_load. See more details
4026              there):
4027                                 RELATED_STMT    VEC_STMT
4028         VS1_0:  vx0 = memref0   VS1_1           -
4029         VS1_1:  vx1 = memref1   VS1_2           -
4030         VS1_2:  vx2 = memref2   VS1_3           -
4031         VS1_3:  vx3 = memref3   -               -
4032         S1:     x = load        -               VS1_0
4033         S2:     z = x + 1       -               -
4034
4035      step2: vectorize stmt S2 (done here):
4036         To vectorize stmt S2 we first need to find the relevant vector
4037         def for the first operand 'x'. This is, as usual, obtained from
4038         the vector stmt recorded in the STMT_VINFO_VEC_STMT of the stmt
4039         that defines 'x' (S1). This way we find the stmt VS1_0, and the
4040         relevant vector def 'vx0'. Having found 'vx0' we can generate
4041         the vector stmt VS2_0, and as usual, record it in the
4042         STMT_VINFO_VEC_STMT of stmt S2.
4043         When creating the second copy (VS2_1), we obtain the relevant vector
4044         def from the vector stmt recorded in the STMT_VINFO_RELATED_STMT of
4045         stmt VS1_0. This way we find the stmt VS1_1 and the relevant
4046         vector def 'vx1'. Using 'vx1' we create stmt VS2_1 and record a
4047         pointer to it in the STMT_VINFO_RELATED_STMT of the vector stmt VS2_0.
4048         Similarly when creating stmts VS2_2 and VS2_3. This is the resulting
4049         chain of stmts and pointers:
4050                                 RELATED_STMT    VEC_STMT
4051         VS1_0:  vx0 = memref0   VS1_1           -
4052         VS1_1:  vx1 = memref1   VS1_2           -
4053         VS1_2:  vx2 = memref2   VS1_3           -
4054         VS1_3:  vx3 = memref3   -               -
4055         S1:     x = load        -               VS1_0
4056         VS2_0:  vz0 = vx0 + v1  VS2_1           -
4057         VS2_1:  vz1 = vx1 + v1  VS2_2           -
4058         VS2_2:  vz2 = vx2 + v1  VS2_3           -
4059         VS2_3:  vz3 = vx3 + v1  -               -
4060         S2:     z = x + 1       -               VS2_0  */
4061
4062   prev_stmt_info = NULL;
4063   for (j = 0; j < ncopies; j++)
4064     {
4065       /* Handle uses.  */
4066       if (j == 0)
4067         {
4068           if (op_type == binary_op
4069               && (code == LSHIFT_EXPR || code == RSHIFT_EXPR))
4070             {
4071               /* Vector shl and shr insn patterns can be defined with scalar
4072                  operand 2 (shift operand). In this case, use constant or loop
4073                  invariant op1 directly, without extending it to vector mode
4074                  first.  */
4075               optab_op2_mode = insn_data[icode].operand[2].mode;
4076               if (!VECTOR_MODE_P (optab_op2_mode))
4077                 {
4078                   if (vect_print_dump_info (REPORT_DETAILS))
4079                     fprintf (vect_dump, "operand 1 using scalar mode.");
4080                   vec_oprnd1 = op1;
4081                   VEC_quick_push (tree, vec_oprnds1, vec_oprnd1);
4082                   if (slp_node)
4083                     {
4084                       /* Store vec_oprnd1 for every vector stmt to be created
4085                          for SLP_NODE. We check during the analysis that all the
4086                          shift arguments are the same.
4087                          TODO: Allow different constants for different vector
4088                          stmts generated for an SLP instance.  */
4089                       for (k = 0; k < slp_node->vec_stmts_size - 1; k++)
4090                         VEC_quick_push (tree, vec_oprnds1, vec_oprnd1);
4091                     }
4092                 }
4093             }
4094
4095           /* vec_oprnd1 is available if operand 1 should be of a scalar-type
4096              (a special case for certain kind of vector shifts); otherwise,
4097              operand 1 should be of a vector type (the usual case).  */
4098           if (op_type == binary_op && !vec_oprnd1)
4099             vect_get_vec_defs (op0, op1, stmt, &vec_oprnds0, &vec_oprnds1,
4100                                slp_node);
4101           else
4102             vect_get_vec_defs (op0, NULL_TREE, stmt, &vec_oprnds0, NULL,
4103                                slp_node);
4104         }
4105       else
4106         vect_get_vec_defs_for_stmt_copy (dt, &vec_oprnds0, &vec_oprnds1);
4107
4108       /* Arguments are ready. Create the new vector stmt.  */
4109       for (i = 0; VEC_iterate (tree, vec_oprnds0, i, vop0); i++)
4110         {
4111           if (op_type == binary_op)
4112             {
4113               vop1 = VEC_index (tree, vec_oprnds1, i);
4114               new_stmt = build_gimple_modify_stmt (vec_dest,
4115                                          build2 (code, vectype, vop0, vop1));
4116             }
4117           else
4118             new_stmt = build_gimple_modify_stmt (vec_dest,
4119                                     build1 (code, vectype, vop0));
4120
4121           new_temp = make_ssa_name (vec_dest, new_stmt);
4122           GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
4123           vect_finish_stmt_generation (stmt, new_stmt, bsi);
4124           if (slp_node)
4125             VEC_quick_push (tree, SLP_TREE_VEC_STMTS (slp_node), new_stmt);
4126         }
4127
4128       if (j == 0)
4129         STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
4130       else
4131         STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
4132       prev_stmt_info = vinfo_for_stmt (new_stmt);
4133     }
4134
4135   VEC_free (tree, heap, vec_oprnds0);
4136   if (vec_oprnds1)
4137     VEC_free (tree, heap, vec_oprnds1);
4138
4139   return true;
4140 }
4141
4142
4143 /* Function vectorizable_type_demotion
4144
4145    Check if STMT performs a binary or unary operation that involves
4146    type demotion, and if it can be vectorized.
4147    If VEC_STMT is also passed, vectorize the STMT: create a vectorized
4148    stmt to replace it, put it in VEC_STMT, and insert it at BSI.
4149    Return FALSE if not a vectorizable STMT, TRUE otherwise.  */
4150
4151 bool
4152 vectorizable_type_demotion (tree stmt, block_stmt_iterator *bsi,
4153                             tree *vec_stmt)
4154 {
4155   tree vec_dest;
4156   tree scalar_dest;
4157   tree operation;
4158   tree op0;
4159   tree vec_oprnd0=NULL, vec_oprnd1=NULL;
4160   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4161   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4162   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4163   enum tree_code code, code1 = ERROR_MARK;
4164   tree new_temp;
4165   tree def, def_stmt;
4166   enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
4167   tree new_stmt;
4168   stmt_vec_info prev_stmt_info;
4169   int nunits_in;
4170   int nunits_out;
4171   tree vectype_out;
4172   int ncopies;
4173   int j;
4174   tree expr;
4175   tree vectype_in;
4176
4177   if (!STMT_VINFO_RELEVANT_P (stmt_info))
4178     return false;
4179
4180   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
4181     return false;
4182
4183   /* Is STMT a vectorizable type-demotion operation?  */
4184   if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
4185     return false;
4186
4187   if (TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 0)) != SSA_NAME)
4188     return false;
4189
4190   operation = GIMPLE_STMT_OPERAND (stmt, 1);
4191   code = TREE_CODE (operation);
4192   if (code != NOP_EXPR && code != CONVERT_EXPR)
4193     return false;
4194
4195   op0 = TREE_OPERAND (operation, 0);
4196   vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op0));
4197   if (!vectype_in)
4198     return false;
4199   nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
4200
4201   scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
4202   vectype_out = get_vectype_for_scalar_type (TREE_TYPE (scalar_dest));
4203   if (!vectype_out)
4204     return false;
4205   nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
4206   if (nunits_in != nunits_out / 2) /* FORNOW */
4207     return false;
4208
4209   ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_out;
4210   gcc_assert (ncopies >= 1);
4211   /* FORNOW. This restriction should be relaxed.  */
4212   if (nested_in_vect_loop_p (loop, stmt) && ncopies > 1)
4213     {
4214       if (vect_print_dump_info (REPORT_DETAILS))
4215         fprintf (vect_dump, "multiple types in nested loop.");
4216       return false;
4217     }
4218
4219   if (! ((INTEGRAL_TYPE_P (TREE_TYPE (scalar_dest))
4220           && INTEGRAL_TYPE_P (TREE_TYPE (op0)))
4221          || (SCALAR_FLOAT_TYPE_P (TREE_TYPE (scalar_dest))
4222              && SCALAR_FLOAT_TYPE_P (TREE_TYPE (op0))
4223              && (code == NOP_EXPR || code == CONVERT_EXPR))))
4224     return false;
4225
4226   /* Check the operands of the operation.  */
4227   if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt[0]))
4228     {
4229       if (vect_print_dump_info (REPORT_DETAILS))
4230         fprintf (vect_dump, "use not simple.");
4231       return false;
4232     }
4233
4234   /* Supportable by target?  */
4235   if (!supportable_narrowing_operation (code, stmt, vectype_in, &code1))
4236     return false;
4237
4238   STMT_VINFO_VECTYPE (stmt_info) = vectype_in;
4239
4240   if (!vec_stmt) /* transformation not required.  */
4241     {
4242       STMT_VINFO_TYPE (stmt_info) = type_demotion_vec_info_type;
4243       if (vect_print_dump_info (REPORT_DETAILS))
4244         fprintf (vect_dump, "=== vectorizable_demotion ===");
4245       vect_model_simple_cost (stmt_info, ncopies, dt, NULL);
4246       return true;
4247     }
4248
4249   /** Transform.  **/
4250   if (vect_print_dump_info (REPORT_DETAILS))
4251     fprintf (vect_dump, "transform type demotion operation. ncopies = %d.",
4252              ncopies);
4253
4254   /* Handle def.  */
4255   vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
4256
4257   /* In case the vectorization factor (VF) is bigger than the number
4258      of elements that we can fit in a vectype (nunits), we have to generate
4259      more than one vector stmt - i.e - we need to "unroll" the
4260      vector stmt by a factor VF/nunits.   */
4261   prev_stmt_info = NULL;
4262   for (j = 0; j < ncopies; j++)
4263     {
4264       /* Handle uses.  */
4265       if (j == 0)
4266         {
4267           vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
4268           vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd0);
4269         }
4270       else
4271         {
4272           vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd1);
4273           vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd0);
4274         }
4275
4276       /* Arguments are ready. Create the new vector stmt.  */
4277       expr = build2 (code1, vectype_out, vec_oprnd0, vec_oprnd1);
4278       new_stmt = build_gimple_modify_stmt (vec_dest, expr);
4279       new_temp = make_ssa_name (vec_dest, new_stmt);
4280       GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
4281       vect_finish_stmt_generation (stmt, new_stmt, bsi);
4282
4283       if (j == 0)
4284         STMT_VINFO_VEC_STMT (stmt_info) = new_stmt;
4285       else
4286         STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
4287
4288       prev_stmt_info = vinfo_for_stmt (new_stmt);
4289     }
4290
4291   *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
4292   return true;
4293 }
4294
4295
4296 /* Function vectorizable_type_promotion
4297
4298    Check if STMT performs a binary or unary operation that involves
4299    type promotion, and if it can be vectorized.
4300    If VEC_STMT is also passed, vectorize the STMT: create a vectorized
4301    stmt to replace it, put it in VEC_STMT, and insert it at BSI.
4302    Return FALSE if not a vectorizable STMT, TRUE otherwise.  */
4303
4304 bool
4305 vectorizable_type_promotion (tree stmt, block_stmt_iterator *bsi,
4306                              tree *vec_stmt)
4307 {
4308   tree vec_dest;
4309   tree scalar_dest;
4310   tree operation;
4311   tree op0, op1 = NULL;
4312   tree vec_oprnd0=NULL, vec_oprnd1=NULL;
4313   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4314   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4315   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4316   enum tree_code code, code1 = ERROR_MARK, code2 = ERROR_MARK;
4317   tree decl1 = NULL_TREE, decl2 = NULL_TREE;
4318   int op_type;
4319   tree def, def_stmt;
4320   enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
4321   tree new_stmt;
4322   stmt_vec_info prev_stmt_info;
4323   int nunits_in;
4324   int nunits_out;
4325   tree vectype_out;
4326   int ncopies;
4327   int j;
4328   tree vectype_in;
4329
4330   if (!STMT_VINFO_RELEVANT_P (stmt_info))
4331     return false;
4332
4333   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
4334     return false;
4335
4336   /* Is STMT a vectorizable type-promotion operation?  */
4337   if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
4338     return false;
4339
4340   if (TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 0)) != SSA_NAME)
4341     return false;
4342
4343   operation = GIMPLE_STMT_OPERAND (stmt, 1);
4344   code = TREE_CODE (operation);
4345   if (code != NOP_EXPR && code != CONVERT_EXPR
4346       && code != WIDEN_MULT_EXPR)
4347     return false;
4348
4349   op0 = TREE_OPERAND (operation, 0);
4350   vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op0));
4351   if (!vectype_in)
4352     return false;
4353   nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
4354
4355   scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
4356   vectype_out = get_vectype_for_scalar_type (TREE_TYPE (scalar_dest));
4357   if (!vectype_out)
4358     return false;
4359   nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
4360   if (nunits_out != nunits_in / 2) /* FORNOW */
4361     return false;
4362
4363   ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in;
4364   gcc_assert (ncopies >= 1);
4365   /* FORNOW. This restriction should be relaxed.  */
4366   if (nested_in_vect_loop_p (loop, stmt) && ncopies > 1)
4367     {
4368       if (vect_print_dump_info (REPORT_DETAILS))
4369         fprintf (vect_dump, "multiple types in nested loop.");
4370       return false;
4371     }
4372
4373   if (! ((INTEGRAL_TYPE_P (TREE_TYPE (scalar_dest))
4374           && INTEGRAL_TYPE_P (TREE_TYPE (op0)))
4375          || (SCALAR_FLOAT_TYPE_P (TREE_TYPE (scalar_dest))
4376              && SCALAR_FLOAT_TYPE_P (TREE_TYPE (op0))
4377              && (code == CONVERT_EXPR || code == NOP_EXPR))))
4378     return false;
4379
4380   /* Check the operands of the operation.  */
4381   if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt[0]))
4382     {
4383       if (vect_print_dump_info (REPORT_DETAILS))
4384         fprintf (vect_dump, "use not simple.");
4385       return false;
4386     }
4387
4388   op_type = TREE_CODE_LENGTH (code);
4389   if (op_type == binary_op)
4390     {
4391       op1 = TREE_OPERAND (operation, 1);
4392       if (!vect_is_simple_use (op1, loop_vinfo, &def_stmt, &def, &dt[1]))
4393         {
4394           if (vect_print_dump_info (REPORT_DETAILS))
4395             fprintf (vect_dump, "use not simple.");
4396           return false;
4397         }
4398     }
4399
4400   /* Supportable by target?  */
4401   if (!supportable_widening_operation (code, stmt, vectype_in,
4402                                        &decl1, &decl2, &code1, &code2))
4403     return false;
4404
4405   STMT_VINFO_VECTYPE (stmt_info) = vectype_in;
4406
4407   if (!vec_stmt) /* transformation not required.  */
4408     {
4409       STMT_VINFO_TYPE (stmt_info) = type_promotion_vec_info_type;
4410       if (vect_print_dump_info (REPORT_DETAILS))
4411         fprintf (vect_dump, "=== vectorizable_promotion ===");
4412       vect_model_simple_cost (stmt_info, 2*ncopies, dt, NULL);
4413       return true;
4414     }
4415
4416   /** Transform.  **/
4417
4418   if (vect_print_dump_info (REPORT_DETAILS))
4419     fprintf (vect_dump, "transform type promotion operation. ncopies = %d.",
4420                         ncopies);
4421
4422   /* Handle def.  */
4423   vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
4424
4425   /* In case the vectorization factor (VF) is bigger than the number
4426      of elements that we can fit in a vectype (nunits), we have to generate
4427      more than one vector stmt - i.e - we need to "unroll" the
4428      vector stmt by a factor VF/nunits.   */
4429
4430   prev_stmt_info = NULL;
4431   for (j = 0; j < ncopies; j++)
4432     {
4433       /* Handle uses.  */
4434       if (j == 0)
4435         {
4436           vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
4437           if (op_type == binary_op)
4438             vec_oprnd1 = vect_get_vec_def_for_operand (op1, stmt, NULL);
4439         }
4440       else
4441         {
4442           vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd0);
4443           if (op_type == binary_op)
4444             vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt[1], vec_oprnd1);
4445         }
4446
4447       /* Arguments are ready. Create the new vector stmt.  We are creating
4448          two vector defs because the widened result does not fit in one vector.
4449          The vectorized stmt can be expressed as a call to a taregt builtin,
4450          or a using a tree-code.  */
4451       /* Generate first half of the widened result:  */
4452       new_stmt = vect_gen_widened_results_half (code1, vectype_out, decl1,
4453                         vec_oprnd0, vec_oprnd1, op_type, vec_dest, bsi, stmt);
4454       if (j == 0)
4455         STMT_VINFO_VEC_STMT (stmt_info) = new_stmt;
4456       else
4457         STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
4458       prev_stmt_info = vinfo_for_stmt (new_stmt);
4459
4460       /* Generate second half of the widened result:  */
4461       new_stmt = vect_gen_widened_results_half (code2, vectype_out, decl2,
4462                         vec_oprnd0, vec_oprnd1, op_type, vec_dest, bsi, stmt);
4463       STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
4464       prev_stmt_info = vinfo_for_stmt (new_stmt);
4465
4466     }
4467
4468   *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
4469   return true;
4470 }
4471
4472
4473 /* Function vect_strided_store_supported.
4474
4475    Returns TRUE is INTERLEAVE_HIGH and INTERLEAVE_LOW operations are supported,
4476    and FALSE otherwise.  */
4477
4478 static bool
4479 vect_strided_store_supported (tree vectype)
4480 {
4481   optab interleave_high_optab, interleave_low_optab;
4482   int mode;
4483
4484   mode = (int) TYPE_MODE (vectype);
4485
4486   /* Check that the operation is supported.  */
4487   interleave_high_optab = optab_for_tree_code (VEC_INTERLEAVE_HIGH_EXPR,
4488                                                vectype);
4489   interleave_low_optab = optab_for_tree_code (VEC_INTERLEAVE_LOW_EXPR,
4490                                               vectype);
4491   if (!interleave_high_optab || !interleave_low_optab)
4492     {
4493       if (vect_print_dump_info (REPORT_DETAILS))
4494         fprintf (vect_dump, "no optab for interleave.");
4495       return false;
4496     }
4497
4498   if (optab_handler (interleave_high_optab, mode)->insn_code
4499       == CODE_FOR_nothing
4500       || optab_handler (interleave_low_optab, mode)->insn_code
4501       == CODE_FOR_nothing)
4502     {
4503       if (vect_print_dump_info (REPORT_DETAILS))
4504         fprintf (vect_dump, "interleave op not supported by target.");
4505       return false;
4506     }
4507
4508   return true;
4509 }
4510
4511
4512 /* Function vect_permute_store_chain.
4513
4514    Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
4515    a power of 2, generate interleave_high/low stmts to reorder the data
4516    correctly for the stores. Return the final references for stores in
4517    RESULT_CHAIN.
4518
4519    E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4520    The input is 4 vectors each containing 8 elements. We assign a number to each
4521    element, the input sequence is:
4522
4523    1st vec:   0  1  2  3  4  5  6  7
4524    2nd vec:   8  9 10 11 12 13 14 15
4525    3rd vec:  16 17 18 19 20 21 22 23
4526    4th vec:  24 25 26 27 28 29 30 31
4527
4528    The output sequence should be:
4529
4530    1st vec:  0  8 16 24  1  9 17 25
4531    2nd vec:  2 10 18 26  3 11 19 27
4532    3rd vec:  4 12 20 28  5 13 21 30
4533    4th vec:  6 14 22 30  7 15 23 31
4534
4535    i.e., we interleave the contents of the four vectors in their order.
4536
4537    We use interleave_high/low instructions to create such output. The input of
4538    each interleave_high/low operation is two vectors:
4539    1st vec    2nd vec
4540    0 1 2 3    4 5 6 7
4541    the even elements of the result vector are obtained left-to-right from the
4542    high/low elements of the first vector. The odd elements of the result are
4543    obtained left-to-right from the high/low elements of the second vector.
4544    The output of interleave_high will be:   0 4 1 5
4545    and of interleave_low:                   2 6 3 7
4546
4547
4548    The permutation is done in log LENGTH stages. In each stage interleave_high
4549    and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
4550    where the first argument is taken from the first half of DR_CHAIN and the
4551    second argument from it's second half.
4552    In our example,
4553
4554    I1: interleave_high (1st vec, 3rd vec)
4555    I2: interleave_low (1st vec, 3rd vec)
4556    I3: interleave_high (2nd vec, 4th vec)
4557    I4: interleave_low (2nd vec, 4th vec)
4558
4559    The output for the first stage is:
4560
4561    I1:  0 16  1 17  2 18  3 19
4562    I2:  4 20  5 21  6 22  7 23
4563    I3:  8 24  9 25 10 26 11 27
4564    I4: 12 28 13 29 14 30 15 31
4565
4566    The output of the second stage, i.e. the final result is:
4567
4568    I1:  0  8 16 24  1  9 17 25
4569    I2:  2 10 18 26  3 11 19 27
4570    I3:  4 12 20 28  5 13 21 30
4571    I4:  6 14 22 30  7 15 23 31.  */
4572
4573 static bool
4574 vect_permute_store_chain (VEC(tree,heap) *dr_chain,
4575                           unsigned int length,
4576                           tree stmt,
4577                           block_stmt_iterator *bsi,
4578                           VEC(tree,heap) **result_chain)
4579 {
4580   tree perm_dest, perm_stmt, vect1, vect2, high, low;
4581   tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
4582   tree scalar_dest, tmp;
4583   int i;
4584   unsigned int j;
4585   VEC(tree,heap) *first, *second;
4586
4587   scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
4588   first = VEC_alloc (tree, heap, length/2);
4589   second = VEC_alloc (tree, heap, length/2);
4590
4591   /* Check that the operation is supported.  */
4592   if (!vect_strided_store_supported (vectype))
4593     return false;
4594
4595   *result_chain = VEC_copy (tree, heap, dr_chain);
4596
4597   for (i = 0; i < exact_log2 (length); i++)
4598     {
4599       for (j = 0; j < length/2; j++)
4600         {
4601           vect1 = VEC_index (tree, dr_chain, j);
4602           vect2 = VEC_index (tree, dr_chain, j+length/2);
4603
4604           /* Create interleaving stmt:
4605              in the case of big endian:
4606                                 high = interleave_high (vect1, vect2)
4607              and in the case of little endian:
4608                                 high = interleave_low (vect1, vect2).  */
4609           perm_dest = create_tmp_var (vectype, "vect_inter_high");
4610           DECL_GIMPLE_REG_P (perm_dest) = 1;
4611           add_referenced_var (perm_dest);
4612           if (BYTES_BIG_ENDIAN)
4613             tmp = build2 (VEC_INTERLEAVE_HIGH_EXPR, vectype, vect1, vect2);
4614           else
4615             tmp = build2 (VEC_INTERLEAVE_LOW_EXPR, vectype, vect1, vect2);
4616           perm_stmt = build_gimple_modify_stmt (perm_dest, tmp);
4617           high = make_ssa_name (perm_dest, perm_stmt);
4618           GIMPLE_STMT_OPERAND (perm_stmt, 0) = high;
4619           vect_finish_stmt_generation (stmt, perm_stmt, bsi);
4620           VEC_replace (tree, *result_chain, 2*j, high);
4621
4622           /* Create interleaving stmt:
4623              in the case of big endian:
4624                                low  = interleave_low (vect1, vect2)
4625              and in the case of little endian:
4626                                low  = interleave_high (vect1, vect2).  */
4627           perm_dest = create_tmp_var (vectype, "vect_inter_low");
4628           DECL_GIMPLE_REG_P (perm_dest) = 1;
4629           add_referenced_var (perm_dest);
4630           if (BYTES_BIG_ENDIAN)
4631             tmp = build2 (VEC_INTERLEAVE_LOW_EXPR, vectype, vect1, vect2);
4632           else
4633             tmp = build2 (VEC_INTERLEAVE_HIGH_EXPR, vectype, vect1, vect2);
4634           perm_stmt = build_gimple_modify_stmt (perm_dest, tmp);
4635           low = make_ssa_name (perm_dest, perm_stmt);
4636           GIMPLE_STMT_OPERAND (perm_stmt, 0) = low;
4637           vect_finish_stmt_generation (stmt, perm_stmt, bsi);
4638           VEC_replace (tree, *result_chain, 2*j+1, low);
4639         }
4640       dr_chain = VEC_copy (tree, heap, *result_chain);
4641     }
4642   return true;
4643 }
4644
4645
4646 /* Function vectorizable_store.
4647
4648    Check if STMT defines a non scalar data-ref (array/pointer/structure) that
4649    can be vectorized.
4650    If VEC_STMT is also passed, vectorize the STMT: create a vectorized
4651    stmt to replace it, put it in VEC_STMT, and insert it at BSI.
4652    Return FALSE if not a vectorizable STMT, TRUE otherwise.  */
4653
4654 bool
4655 vectorizable_store (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt,
4656                     slp_tree slp_node)
4657 {
4658   tree scalar_dest;
4659   tree data_ref;
4660   tree op;
4661   tree vec_oprnd = NULL_TREE;
4662   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4663   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info), *first_dr = NULL;
4664   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4665   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4666   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4667   enum machine_mode vec_mode;
4668   tree dummy;
4669   enum dr_alignment_support alignment_support_scheme;
4670   tree def, def_stmt;
4671   enum vect_def_type dt;
4672   stmt_vec_info prev_stmt_info = NULL;
4673   tree dataref_ptr = NULL_TREE;
4674   int nunits = TYPE_VECTOR_SUBPARTS (vectype);
4675   int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
4676   int j;
4677   tree next_stmt, first_stmt = NULL_TREE;
4678   bool strided_store = false;
4679   unsigned int group_size, i;
4680   VEC(tree,heap) *dr_chain = NULL, *oprnds = NULL, *result_chain = NULL;
4681   bool inv_p;
4682   VEC(tree,heap) *vec_oprnds = NULL;
4683   bool slp = (slp_node != NULL);
4684   stmt_vec_info first_stmt_vinfo;
4685   unsigned int vec_num;
4686
4687    /* FORNOW: SLP with multiple types is not supported. The SLP analysis verifies
4688       this, so we can safely override NCOPIES with 1 here.  */
4689   if (slp)
4690     ncopies = 1;
4691
4692   gcc_assert (ncopies >= 1);
4693
4694   /* FORNOW. This restriction should be relaxed.  */
4695   if (nested_in_vect_loop_p (loop, stmt) && ncopies > 1)
4696     {
4697       if (vect_print_dump_info (REPORT_DETAILS))
4698         fprintf (vect_dump, "multiple types in nested loop.");
4699       return false;
4700     }
4701
4702   if (!STMT_VINFO_RELEVANT_P (stmt_info))
4703     return false;
4704
4705   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
4706     return false;
4707
4708   /* Is vectorizable store? */
4709
4710   if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
4711     return false;
4712
4713   scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
4714   if (TREE_CODE (scalar_dest) != ARRAY_REF
4715       && TREE_CODE (scalar_dest) != INDIRECT_REF
4716       && !STMT_VINFO_STRIDED_ACCESS (stmt_info))
4717     return false;
4718
4719   op = GIMPLE_STMT_OPERAND (stmt, 1);
4720   if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt))
4721     {
4722       if (vect_print_dump_info (REPORT_DETAILS))
4723         fprintf (vect_dump, "use not simple.");
4724       return false;
4725     }
4726
4727   vec_mode = TYPE_MODE (vectype);
4728   /* FORNOW. In some cases can vectorize even if data-type not supported
4729      (e.g. - array initialization with 0).  */
4730   if (optab_handler (mov_optab, (int)vec_mode)->insn_code == CODE_FOR_nothing)
4731     return false;
4732
4733   if (!STMT_VINFO_DATA_REF (stmt_info))
4734     return false;
4735
4736   if (STMT_VINFO_STRIDED_ACCESS (stmt_info))
4737     {
4738       strided_store = true;
4739       first_stmt = DR_GROUP_FIRST_DR (stmt_info);
4740       if (!vect_strided_store_supported (vectype)
4741           && !PURE_SLP_STMT (stmt_info) && !slp)
4742         return false;
4743
4744       if (first_stmt == stmt)
4745         {
4746           /* STMT is the leader of the group. Check the operands of all the
4747              stmts of the group.  */
4748           next_stmt = DR_GROUP_NEXT_DR (stmt_info);
4749           while (next_stmt)
4750             {
4751               op = GIMPLE_STMT_OPERAND (next_stmt, 1);
4752               if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt))
4753                 {
4754                   if (vect_print_dump_info (REPORT_DETAILS))
4755                     fprintf (vect_dump, "use not simple.");
4756                   return false;
4757                 }
4758               next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt));
4759             }
4760         }
4761     }
4762
4763   if (!vec_stmt) /* transformation not required.  */
4764     {
4765       STMT_VINFO_TYPE (stmt_info) = store_vec_info_type;
4766       if (!PURE_SLP_STMT (stmt_info))
4767         vect_model_store_cost (stmt_info, ncopies, dt, NULL);
4768       return true;
4769     }
4770
4771   /** Transform.  **/
4772
4773   if (strided_store)
4774     {
4775       first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
4776       group_size = DR_GROUP_SIZE (vinfo_for_stmt (first_stmt));
4777
4778       DR_GROUP_STORE_COUNT (vinfo_for_stmt (first_stmt))++;
4779
4780       /* FORNOW */
4781       gcc_assert (!nested_in_vect_loop_p (loop, stmt));
4782
4783       /* We vectorize all the stmts of the interleaving group when we
4784          reach the last stmt in the group.  */
4785       if (DR_GROUP_STORE_COUNT (vinfo_for_stmt (first_stmt))
4786           < DR_GROUP_SIZE (vinfo_for_stmt (first_stmt))
4787           && !slp)
4788         {
4789           *vec_stmt = NULL_TREE;
4790           return true;
4791         }
4792
4793       if (slp)
4794         strided_store = false;
4795
4796       /* VEC_NUM is the number of vect stmts to be created for this group.  */
4797       if (slp && SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node) < group_size)
4798         vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
4799       else
4800         vec_num = group_size;
4801     }
4802   else
4803     {
4804       first_stmt = stmt;
4805       first_dr = dr;
4806       group_size = vec_num = 1;
4807       first_stmt_vinfo = stmt_info;
4808     }
4809
4810   if (vect_print_dump_info (REPORT_DETAILS))
4811     fprintf (vect_dump, "transform store. ncopies = %d",ncopies);
4812
4813   dr_chain = VEC_alloc (tree, heap, group_size);
4814   oprnds = VEC_alloc (tree, heap, group_size);
4815
4816   alignment_support_scheme = vect_supportable_dr_alignment (first_dr);
4817   gcc_assert (alignment_support_scheme);
4818   gcc_assert (alignment_support_scheme == dr_aligned);  /* FORNOW */
4819
4820   /* In case the vectorization factor (VF) is bigger than the number
4821      of elements that we can fit in a vectype (nunits), we have to generate
4822      more than one vector stmt - i.e - we need to "unroll" the
4823      vector stmt by a factor VF/nunits.  For more details see documentation in
4824      vect_get_vec_def_for_copy_stmt.  */
4825
4826   /* In case of interleaving (non-unit strided access):
4827
4828         S1:  &base + 2 = x2
4829         S2:  &base = x0
4830         S3:  &base + 1 = x1
4831         S4:  &base + 3 = x3
4832
4833      We create vectorized stores starting from base address (the access of the
4834      first stmt in the chain (S2 in the above example), when the last store stmt
4835      of the chain (S4) is reached:
4836
4837         VS1: &base = vx2
4838         VS2: &base + vec_size*1 = vx0
4839         VS3: &base + vec_size*2 = vx1
4840         VS4: &base + vec_size*3 = vx3
4841
4842      Then permutation statements are generated:
4843
4844         VS5: vx5 = VEC_INTERLEAVE_HIGH_EXPR < vx0, vx3 >
4845         VS6: vx6 = VEC_INTERLEAVE_LOW_EXPR < vx0, vx3 >
4846         ...
4847
4848      And they are put in STMT_VINFO_VEC_STMT of the corresponding scalar stmts
4849      (the order of the data-refs in the output of vect_permute_store_chain
4850      corresponds to the order of scalar stmts in the interleaving chain - see
4851      the documentation of vect_permute_store_chain()).
4852
4853      In case of both multiple types and interleaving, above vector stores and
4854      permutation stmts are created for every copy. The result vector stmts are
4855      put in STMT_VINFO_VEC_STMT for the first copy and in the corresponding
4856      STMT_VINFO_RELATED_STMT for the next copies.
4857   */
4858
4859   prev_stmt_info = NULL;
4860   for (j = 0; j < ncopies; j++)
4861     {
4862       tree new_stmt;
4863       tree ptr_incr;
4864
4865       if (j == 0)
4866         {
4867           if (slp)
4868             {
4869               /* Get vectorized arguments for SLP_NODE.  */
4870               vect_get_slp_defs (slp_node, &vec_oprnds, NULL);
4871
4872               vec_oprnd = VEC_index (tree, vec_oprnds, 0);
4873             }
4874           else
4875             {
4876               /* For interleaved stores we collect vectorized defs for all the
4877                  stores in the group in DR_CHAIN and OPRNDS. DR_CHAIN is then
4878                  used as an input to vect_permute_store_chain(), and OPRNDS as
4879                  an input to vect_get_vec_def_for_stmt_copy() for the next copy.
4880
4881                  If the store is not strided, GROUP_SIZE is 1, and DR_CHAIN and
4882                  OPRNDS are of size 1.  */
4883               next_stmt = first_stmt;
4884               for (i = 0; i < group_size; i++)
4885                 {
4886                   /* Since gaps are not supported for interleaved stores,
4887                      GROUP_SIZE is the exact number of stmts in the chain.
4888                      Therefore, NEXT_STMT can't be NULL_TREE.  In case that
4889                      there is no interleaving, GROUP_SIZE is 1, and only one
4890                      iteration of the loop will be executed.  */
4891                   gcc_assert (next_stmt);
4892                   op = GIMPLE_STMT_OPERAND (next_stmt, 1);
4893
4894                   vec_oprnd = vect_get_vec_def_for_operand (op, next_stmt,
4895                                                             NULL);
4896                   VEC_quick_push(tree, dr_chain, vec_oprnd);
4897                   VEC_quick_push(tree, oprnds, vec_oprnd);
4898                   next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt));
4899                 }
4900             }
4901           dataref_ptr = vect_create_data_ref_ptr (first_stmt, NULL, NULL_TREE,
4902                                                   &dummy, &ptr_incr, false,
4903                                                   TREE_TYPE (vec_oprnd), &inv_p);
4904           gcc_assert (!inv_p);
4905         }
4906       else
4907         {
4908           /* FORNOW SLP doesn't work for multiple types.  */
4909           gcc_assert (!slp);
4910
4911           /* For interleaved stores we created vectorized defs for all the
4912              defs stored in OPRNDS in the previous iteration (previous copy).
4913              DR_CHAIN is then used as an input to vect_permute_store_chain(),
4914              and OPRNDS as an input to vect_get_vec_def_for_stmt_copy() for the
4915              next copy.
4916              If the store is not strided, GROUP_SIZE is 1, and DR_CHAIN and
4917              OPRNDS are of size 1.  */
4918           for (i = 0; i < group_size; i++)
4919             {
4920               op = VEC_index (tree, oprnds, i);
4921               vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt);
4922               vec_oprnd = vect_get_vec_def_for_stmt_copy (dt, op);
4923               VEC_replace(tree, dr_chain, i, vec_oprnd);
4924               VEC_replace(tree, oprnds, i, vec_oprnd);
4925             }
4926           dataref_ptr =
4927                 bump_vector_ptr (dataref_ptr, ptr_incr, bsi, stmt, NULL_TREE);
4928         }
4929
4930       if (strided_store)
4931         {
4932           result_chain = VEC_alloc (tree, heap, group_size);
4933           /* Permute.  */
4934           if (!vect_permute_store_chain (dr_chain, group_size, stmt, bsi,
4935                                          &result_chain))
4936             return false;
4937         }
4938
4939       next_stmt = first_stmt;
4940       for (i = 0; i < vec_num; i++)
4941         {
4942           if (i > 0)
4943             /* Bump the vector pointer.  */
4944             dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, bsi, stmt,
4945                                            NULL_TREE);
4946
4947           if (slp)
4948             vec_oprnd = VEC_index (tree, vec_oprnds, i);
4949           else if (strided_store)
4950             /* For strided stores vectorized defs are interleaved in
4951                vect_permute_store_chain().  */
4952             vec_oprnd = VEC_index (tree, result_chain, i);
4953
4954           data_ref = build_fold_indirect_ref (dataref_ptr);
4955           /* Arguments are ready. Create the new vector stmt.  */
4956           new_stmt = build_gimple_modify_stmt (data_ref, vec_oprnd);
4957           vect_finish_stmt_generation (stmt, new_stmt, bsi);
4958           mark_symbols_for_renaming (new_stmt);
4959
4960           if (j == 0)
4961             STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt =  new_stmt;
4962           else
4963             STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
4964
4965           prev_stmt_info = vinfo_for_stmt (new_stmt);
4966           next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt));
4967           if (!next_stmt)
4968             break;
4969         }
4970     }
4971
4972   return true;
4973 }
4974
4975
4976 /* Function vect_setup_realignment
4977
4978    This function is called when vectorizing an unaligned load using
4979    the dr_explicit_realign[_optimized] scheme.
4980    This function generates the following code at the loop prolog:
4981
4982       p = initial_addr;
4983    x  msq_init = *(floor(p));   # prolog load
4984       realignment_token = call target_builtin;
4985     loop:
4986    x  msq = phi (msq_init, ---)
4987
4988    The stmts marked with x are generated only for the case of
4989    dr_explicit_realign_optimized.
4990
4991    The code above sets up a new (vector) pointer, pointing to the first
4992    location accessed by STMT, and a "floor-aligned" load using that pointer.
4993    It also generates code to compute the "realignment-token" (if the relevant
4994    target hook was defined), and creates a phi-node at the loop-header bb
4995    whose arguments are the result of the prolog-load (created by this
4996    function) and the result of a load that takes place in the loop (to be
4997    created by the caller to this function).
4998
4999    For the case of dr_explicit_realign_optimized:
5000    The caller to this function uses the phi-result (msq) to create the
5001    realignment code inside the loop, and sets up the missing phi argument,
5002    as follows:
5003     loop:
5004       msq = phi (msq_init, lsq)
5005       lsq = *(floor(p'));        # load in loop
5006       result = realign_load (msq, lsq, realignment_token);
5007
5008    For the case of dr_explicit_realign:
5009     loop:
5010       msq = *(floor(p));        # load in loop
5011       p' = p + (VS-1);
5012       lsq = *(floor(p'));       # load in loop
5013       result = realign_load (msq, lsq, realignment_token);
5014
5015    Input:
5016    STMT - (scalar) load stmt to be vectorized. This load accesses
5017           a memory location that may be unaligned.
5018    BSI - place where new code is to be inserted.
5019    ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
5020                               is used.
5021
5022    Output:
5023    REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
5024                        target hook, if defined.
5025    Return value - the result of the loop-header phi node.  */
5026
5027 static tree
5028 vect_setup_realignment (tree stmt, block_stmt_iterator *bsi,
5029                         tree *realignment_token,
5030                         enum dr_alignment_support alignment_support_scheme,
5031                         tree init_addr,
5032                         struct loop **at_loop)
5033 {
5034   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5035   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5036   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5037   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5038   edge pe;
5039   tree scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
5040   tree vec_dest;
5041   tree inc;
5042   tree ptr;
5043   tree data_ref;
5044   tree new_stmt;
5045   basic_block new_bb;
5046   tree msq_init = NULL_TREE;
5047   tree new_temp;
5048   tree phi_stmt;
5049   tree msq = NULL_TREE;
5050   tree stmts = NULL_TREE;
5051   bool inv_p;
5052   bool compute_in_loop = false;
5053   bool nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
5054   struct loop *containing_loop = (bb_for_stmt (stmt))->loop_father;
5055   struct loop *loop_for_initial_load;
5056
5057   gcc_assert (alignment_support_scheme == dr_explicit_realign
5058               || alignment_support_scheme == dr_explicit_realign_optimized);
5059
5060   /* We need to generate three things:
5061      1. the misalignment computation
5062      2. the extra vector load (for the optimized realignment scheme).
5063      3. the phi node for the two vectors from which the realignment is
5064       done (for the optimized realignment scheme).
5065    */
5066
5067   /* 1. Determine where to generate the misalignment computation.
5068
5069      If INIT_ADDR is NULL_TREE, this indicates that the misalignment
5070      calculation will be generated by this function, outside the loop (in the
5071      preheader).  Otherwise, INIT_ADDR had already been computed for us by the
5072      caller, inside the loop.
5073
5074      Background: If the misalignment remains fixed throughout the iterations of
5075      the loop, then both realignment schemes are applicable, and also the
5076      misalignment computation can be done outside LOOP.  This is because we are
5077      vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
5078      are a multiple of VS (the Vector Size), and therefore the misalignment in
5079      different vectorized LOOP iterations is always the same.
5080      The problem arises only if the memory access is in an inner-loop nested
5081      inside LOOP, which is now being vectorized using outer-loop vectorization.
5082      This is the only case when the misalignment of the memory access may not
5083      remain fixed throughout the iterations of the inner-loop (as explained in
5084      detail in vect_supportable_dr_alignment).  In this case, not only is the
5085      optimized realignment scheme not applicable, but also the misalignment
5086      computation (and generation of the realignment token that is passed to
5087      REALIGN_LOAD) have to be done inside the loop.
5088
5089      In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
5090      or not, which in turn determines if the misalignment is computed inside
5091      the inner-loop, or outside LOOP.  */
5092
5093   if (init_addr != NULL_TREE)
5094     {
5095       compute_in_loop = true;
5096       gcc_assert (alignment_support_scheme == dr_explicit_realign);
5097     }
5098
5099
5100   /* 2. Determine where to generate the extra vector load.
5101
5102      For the optimized realignment scheme, instead of generating two vector
5103      loads in each iteration, we generate a single extra vector load in the
5104      preheader of the loop, and in each iteration reuse the result of the
5105      vector load from the previous iteration.  In case the memory access is in
5106      an inner-loop nested inside LOOP, which is now being vectorized using
5107      outer-loop vectorization, we need to determine whether this initial vector
5108      load should be generated at the preheader of the inner-loop, or can be
5109      generated at the preheader of LOOP.  If the memory access has no evolution
5110      in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
5111      to be generated inside LOOP (in the preheader of the inner-loop).  */
5112
5113   if (nested_in_vect_loop)
5114     {
5115       tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
5116       bool invariant_in_outerloop =
5117             (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
5118       loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
5119     }
5120   else
5121     loop_for_initial_load = loop;
5122   if (at_loop)
5123     *at_loop = loop_for_initial_load;
5124
5125   /* 3. For the case of the optimized realignment, create the first vector
5126       load at the loop preheader.  */
5127
5128   if (alignment_support_scheme == dr_explicit_realign_optimized)
5129     {
5130       /* Create msq_init = *(floor(p1)) in the loop preheader  */
5131
5132       gcc_assert (!compute_in_loop);
5133       pe = loop_preheader_edge (loop_for_initial_load);
5134       vec_dest = vect_create_destination_var (scalar_dest, vectype);
5135       ptr = vect_create_data_ref_ptr (stmt, loop_for_initial_load, NULL_TREE,
5136                                 &init_addr, &inc, true, NULL_TREE, &inv_p);
5137       data_ref = build1 (ALIGN_INDIRECT_REF, vectype, ptr);
5138       new_stmt = build_gimple_modify_stmt (vec_dest, data_ref);
5139       new_temp = make_ssa_name (vec_dest, new_stmt);
5140       GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
5141       mark_symbols_for_renaming (new_stmt);
5142       new_bb = bsi_insert_on_edge_immediate (pe, new_stmt);
5143       gcc_assert (!new_bb);
5144       msq_init = GIMPLE_STMT_OPERAND (new_stmt, 0);
5145     }
5146
5147   /* 4. Create realignment token using a target builtin, if available.
5148       It is done either inside the containing loop, or before LOOP (as
5149       determined above).  */
5150
5151   if (targetm.vectorize.builtin_mask_for_load)
5152     {
5153       tree builtin_decl;
5154
5155       /* Compute INIT_ADDR - the initial addressed accessed by this memref.  */
5156       if (compute_in_loop)
5157         gcc_assert (init_addr); /* already computed by the caller.  */
5158       else
5159         {
5160           /* Generate the INIT_ADDR computation outside LOOP.  */
5161           init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
5162                                                         NULL_TREE, loop);
5163           pe = loop_preheader_edge (loop);
5164           new_bb = bsi_insert_on_edge_immediate (pe, stmts);
5165           gcc_assert (!new_bb);
5166         }
5167
5168       builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5169       new_stmt = build_call_expr (builtin_decl, 1, init_addr);
5170       vec_dest = vect_create_destination_var (scalar_dest,
5171                                               TREE_TYPE (new_stmt));
5172       new_stmt = build_gimple_modify_stmt (vec_dest, new_stmt);
5173       new_temp = make_ssa_name (vec_dest, new_stmt);
5174       GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
5175
5176       if (compute_in_loop)
5177         bsi_insert_before (bsi, new_stmt, BSI_SAME_STMT);
5178       else
5179         {
5180           /* Generate the misalignment computation outside LOOP.  */
5181           pe = loop_preheader_edge (loop);
5182           new_bb = bsi_insert_on_edge_immediate (pe, new_stmt);
5183           gcc_assert (!new_bb);
5184         }
5185
5186       *realignment_token = GIMPLE_STMT_OPERAND (new_stmt, 0);
5187
5188       /* The result of the CALL_EXPR to this builtin is determined from
5189          the value of the parameter and no global variables are touched
5190          which makes the builtin a "const" function.  Requiring the
5191          builtin to have the "const" attribute makes it unnecessary
5192          to call mark_call_clobbered.  */
5193       gcc_assert (TREE_READONLY (builtin_decl));
5194     }
5195
5196   if (alignment_support_scheme == dr_explicit_realign)
5197     return msq;
5198
5199   gcc_assert (!compute_in_loop);
5200   gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5201
5202
5203   /* 5. Create msq = phi <msq_init, lsq> in loop  */
5204
5205   pe = loop_preheader_edge (containing_loop);
5206   vec_dest = vect_create_destination_var (scalar_dest, vectype);
5207   msq = make_ssa_name (vec_dest, NULL_TREE);
5208   phi_stmt = create_phi_node (msq, containing_loop->header);
5209   SSA_NAME_DEF_STMT (msq) = phi_stmt;
5210   add_phi_arg (phi_stmt, msq_init, pe);
5211
5212   return msq;
5213 }
5214
5215
5216 /* Function vect_strided_load_supported.
5217
5218    Returns TRUE is EXTRACT_EVEN and EXTRACT_ODD operations are supported,
5219    and FALSE otherwise.  */
5220
5221 static bool
5222 vect_strided_load_supported (tree vectype)
5223 {
5224   optab perm_even_optab, perm_odd_optab;
5225   int mode;
5226
5227   mode = (int) TYPE_MODE (vectype);
5228
5229   perm_even_optab = optab_for_tree_code (VEC_EXTRACT_EVEN_EXPR, vectype);
5230   if (!perm_even_optab)
5231     {
5232       if (vect_print_dump_info (REPORT_DETAILS))
5233         fprintf (vect_dump, "no optab for perm_even.");
5234       return false;
5235     }
5236
5237   if (optab_handler (perm_even_optab, mode)->insn_code == CODE_FOR_nothing)
5238     {
5239       if (vect_print_dump_info (REPORT_DETAILS))
5240         fprintf (vect_dump, "perm_even op not supported by target.");
5241       return false;
5242     }
5243
5244   perm_odd_optab = optab_for_tree_code (VEC_EXTRACT_ODD_EXPR, vectype);
5245   if (!perm_odd_optab)
5246     {
5247       if (vect_print_dump_info (REPORT_DETAILS))
5248         fprintf (vect_dump, "no optab for perm_odd.");
5249       return false;
5250     }
5251
5252   if (optab_handler (perm_odd_optab, mode)->insn_code == CODE_FOR_nothing)
5253     {
5254       if (vect_print_dump_info (REPORT_DETAILS))
5255         fprintf (vect_dump, "perm_odd op not supported by target.");
5256       return false;
5257     }
5258   return true;
5259 }
5260
5261
5262 /* Function vect_permute_load_chain.
5263
5264    Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5265    a power of 2, generate extract_even/odd stmts to reorder the input data
5266    correctly. Return the final references for loads in RESULT_CHAIN.
5267
5268    E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5269    The input is 4 vectors each containing 8 elements. We assign a number to each
5270    element, the input sequence is:
5271
5272    1st vec:   0  1  2  3  4  5  6  7
5273    2nd vec:   8  9 10 11 12 13 14 15
5274    3rd vec:  16 17 18 19 20 21 22 23
5275    4th vec:  24 25 26 27 28 29 30 31
5276
5277    The output sequence should be:
5278
5279    1st vec:  0 4  8 12 16 20 24 28
5280    2nd vec:  1 5  9 13 17 21 25 29
5281    3rd vec:  2 6 10 14 18 22 26 30
5282    4th vec:  3 7 11 15 19 23 27 31
5283
5284    i.e., the first output vector should contain the first elements of each
5285    interleaving group, etc.
5286
5287    We use extract_even/odd instructions to create such output. The input of each
5288    extract_even/odd operation is two vectors
5289    1st vec    2nd vec
5290    0 1 2 3    4 5 6 7
5291
5292    and the output is the vector of extracted even/odd elements. The output of
5293    extract_even will be:   0 2 4 6
5294    and of extract_odd:     1 3 5 7
5295
5296
5297    The permutation is done in log LENGTH stages. In each stage extract_even and
5298    extract_odd stmts are created for each pair of vectors in DR_CHAIN in their
5299    order. In our example,
5300
5301    E1: extract_even (1st vec, 2nd vec)
5302    E2: extract_odd (1st vec, 2nd vec)
5303    E3: extract_even (3rd vec, 4th vec)
5304    E4: extract_odd (3rd vec, 4th vec)
5305
5306    The output for the first stage will be:
5307
5308    E1:  0  2  4  6  8 10 12 14
5309    E2:  1  3  5  7  9 11 13 15
5310    E3: 16 18 20 22 24 26 28 30
5311    E4: 17 19 21 23 25 27 29 31
5312
5313    In order to proceed and create the correct sequence for the next stage (or
5314    for the correct output, if the second stage is the last one, as in our
5315    example), we first put the output of extract_even operation and then the
5316    output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5317    The input for the second stage is:
5318
5319    1st vec (E1):  0  2  4  6  8 10 12 14
5320    2nd vec (E3): 16 18 20 22 24 26 28 30
5321    3rd vec (E2):  1  3  5  7  9 11 13 15
5322    4th vec (E4): 17 19 21 23 25 27 29 31
5323
5324    The output of the second stage:
5325
5326    E1: 0 4  8 12 16 20 24 28
5327    E2: 2 6 10 14 18 22 26 30
5328    E3: 1 5  9 13 17 21 25 29
5329    E4: 3 7 11 15 19 23 27 31
5330
5331    And RESULT_CHAIN after reordering:
5332
5333    1st vec (E1):  0 4  8 12 16 20 24 28
5334    2nd vec (E3):  1 5  9 13 17 21 25 29
5335    3rd vec (E2):  2 6 10 14 18 22 26 30
5336    4th vec (E4):  3 7 11 15 19 23 27 31.  */
5337
5338 static bool
5339 vect_permute_load_chain (VEC(tree,heap) *dr_chain,
5340                          unsigned int length,
5341                          tree stmt,
5342                          block_stmt_iterator *bsi,
5343                          VEC(tree,heap) **result_chain)
5344 {
5345   tree perm_dest, perm_stmt, data_ref, first_vect, second_vect;
5346   tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5347   tree tmp;
5348   int i;
5349   unsigned int j;
5350
5351   /* Check that the operation is supported.  */
5352   if (!vect_strided_load_supported (vectype))
5353     return false;
5354
5355   *result_chain = VEC_copy (tree, heap, dr_chain);
5356   for (i = 0; i < exact_log2 (length); i++)
5357     {
5358       for (j = 0; j < length; j +=2)
5359         {
5360           first_vect = VEC_index (tree, dr_chain, j);
5361           second_vect = VEC_index (tree, dr_chain, j+1);
5362
5363           /* data_ref = permute_even (first_data_ref, second_data_ref);  */
5364           perm_dest = create_tmp_var (vectype, "vect_perm_even");
5365           DECL_GIMPLE_REG_P (perm_dest) = 1;
5366           add_referenced_var (perm_dest);
5367
5368           tmp = build2 (VEC_EXTRACT_EVEN_EXPR, vectype,
5369                         first_vect, second_vect);
5370           perm_stmt = build_gimple_modify_stmt (perm_dest, tmp);
5371
5372           data_ref = make_ssa_name (perm_dest, perm_stmt);
5373           GIMPLE_STMT_OPERAND (perm_stmt, 0) = data_ref;
5374           vect_finish_stmt_generation (stmt, perm_stmt, bsi);
5375           mark_symbols_for_renaming (perm_stmt);
5376
5377           VEC_replace (tree, *result_chain, j/2, data_ref);
5378
5379           /* data_ref = permute_odd (first_data_ref, second_data_ref);  */
5380           perm_dest = create_tmp_var (vectype, "vect_perm_odd");
5381           DECL_GIMPLE_REG_P (perm_dest) = 1;
5382           add_referenced_var (perm_dest);
5383
5384           tmp = build2 (VEC_EXTRACT_ODD_EXPR, vectype,
5385                         first_vect, second_vect);
5386           perm_stmt = build_gimple_modify_stmt (perm_dest, tmp);
5387           data_ref = make_ssa_name (perm_dest, perm_stmt);
5388           GIMPLE_STMT_OPERAND (perm_stmt, 0) = data_ref;
5389           vect_finish_stmt_generation (stmt, perm_stmt, bsi);
5390           mark_symbols_for_renaming (perm_stmt);
5391
5392           VEC_replace (tree, *result_chain, j/2+length/2, data_ref);
5393         }
5394       dr_chain = VEC_copy (tree, heap, *result_chain);
5395     }
5396   return true;
5397 }
5398
5399
5400 /* Function vect_transform_strided_load.
5401
5402    Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
5403    to perform their permutation and ascribe the result vectorized statements to
5404    the scalar statements.
5405 */
5406
5407 static bool
5408 vect_transform_strided_load (tree stmt, VEC(tree,heap) *dr_chain, int size,
5409                              block_stmt_iterator *bsi)
5410 {
5411   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5412   tree first_stmt = DR_GROUP_FIRST_DR (stmt_info);
5413   tree next_stmt, new_stmt;
5414   VEC(tree,heap) *result_chain = NULL;
5415   unsigned int i, gap_count;
5416   tree tmp_data_ref;
5417
5418   /* DR_CHAIN contains input data-refs that are a part of the interleaving.
5419      RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
5420      vectors, that are ready for vector computation.  */
5421   result_chain = VEC_alloc (tree, heap, size);
5422   /* Permute.  */
5423   if (!vect_permute_load_chain (dr_chain, size, stmt, bsi, &result_chain))
5424     return false;
5425
5426   /* Put a permuted data-ref in the VECTORIZED_STMT field.
5427      Since we scan the chain starting from it's first node, their order
5428      corresponds the order of data-refs in RESULT_CHAIN.  */
5429   next_stmt = first_stmt;
5430   gap_count = 1;
5431   for (i = 0; VEC_iterate (tree, result_chain, i, tmp_data_ref); i++)
5432     {
5433       if (!next_stmt)
5434         break;
5435
5436       /* Skip the gaps. Loads created for the gaps will be removed by dead
5437        code elimination pass later.
5438        DR_GROUP_GAP is the number of steps in elements from the previous
5439        access (if there is no gap DR_GROUP_GAP is 1). We skip loads that
5440        correspond to the gaps.
5441       */
5442       if (gap_count < DR_GROUP_GAP (vinfo_for_stmt (next_stmt)))
5443       {
5444         gap_count++;
5445         continue;
5446       }
5447
5448       while (next_stmt)
5449         {
5450           new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
5451           /* We assume that if VEC_STMT is not NULL, this is a case of multiple
5452              copies, and we put the new vector statement in the first available
5453              RELATED_STMT.  */
5454           if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
5455             STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
5456           else
5457             {
5458               tree prev_stmt = STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
5459               tree rel_stmt = STMT_VINFO_RELATED_STMT (
5460                                                        vinfo_for_stmt (prev_stmt));
5461               while (rel_stmt)
5462                 {
5463                   prev_stmt = rel_stmt;
5464                   rel_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
5465                 }
5466               STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) = new_stmt;
5467             }
5468           next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt));
5469           gap_count = 1;
5470           /* If NEXT_STMT accesses the same DR as the previous statement,
5471              put the same TMP_DATA_REF as its vectorized statement; otherwise
5472              get the next data-ref from RESULT_CHAIN.  */
5473           if (!next_stmt || !DR_GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5474             break;
5475         }
5476     }
5477   return true;
5478 }
5479
5480
5481 /* vectorizable_load.
5482
5483    Check if STMT reads a non scalar data-ref (array/pointer/structure) that
5484    can be vectorized.
5485    If VEC_STMT is also passed, vectorize the STMT: create a vectorized
5486    stmt to replace it, put it in VEC_STMT, and insert it at BSI.
5487    Return FALSE if not a vectorizable STMT, TRUE otherwise.  */
5488
5489 bool
5490 vectorizable_load (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt,
5491                    slp_tree slp_node)
5492 {
5493   tree scalar_dest;
5494   tree vec_dest = NULL;
5495   tree data_ref = NULL;
5496   tree op;
5497   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5498   stmt_vec_info prev_stmt_info;
5499   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5500   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5501   struct loop *containing_loop = (bb_for_stmt (stmt))->loop_father;
5502   bool nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
5503   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info), *first_dr;
5504   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5505   tree new_temp;
5506   int mode;
5507   tree new_stmt = NULL_TREE;
5508   tree dummy;
5509   enum dr_alignment_support alignment_support_scheme;
5510   tree dataref_ptr = NULL_TREE;
5511   tree ptr_incr;
5512   int nunits = TYPE_VECTOR_SUBPARTS (vectype);
5513   int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
5514   int i, j, group_size;
5515   tree msq = NULL_TREE, lsq;
5516   tree offset = NULL_TREE;
5517   tree realignment_token = NULL_TREE;
5518   tree phi = NULL_TREE;
5519   VEC(tree,heap) *dr_chain = NULL;
5520   bool strided_load = false;
5521   tree first_stmt;
5522   tree scalar_type;
5523   bool inv_p;
5524   bool compute_in_loop = false;
5525   struct loop *at_loop;
5526   int vec_num;
5527   bool slp = (slp_node != NULL);
5528
5529   /* FORNOW: SLP with multiple types is not supported. The SLP analysis verifies
5530       this, so we can safely override NCOPIES with 1 here.  */
5531   if (slp)
5532     ncopies = 1;
5533
5534   gcc_assert (ncopies >= 1);
5535
5536   /* FORNOW. This restriction should be relaxed.  */
5537   if (nested_in_vect_loop && ncopies > 1)
5538     {
5539       if (vect_print_dump_info (REPORT_DETAILS))
5540         fprintf (vect_dump, "multiple types in nested loop.");
5541       return false;
5542     }
5543
5544   if (!STMT_VINFO_RELEVANT_P (stmt_info))
5545     return false;
5546
5547   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
5548     return false;
5549
5550   /* Is vectorizable load? */
5551   if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
5552     return false;
5553
5554   scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
5555   if (TREE_CODE (scalar_dest) != SSA_NAME)
5556     return false;
5557
5558   op = GIMPLE_STMT_OPERAND (stmt, 1);
5559   if (TREE_CODE (op) != ARRAY_REF
5560       && TREE_CODE (op) != INDIRECT_REF
5561       && !STMT_VINFO_STRIDED_ACCESS (stmt_info))
5562     return false;
5563
5564   if (!STMT_VINFO_DATA_REF (stmt_info))
5565     return false;
5566
5567   scalar_type = TREE_TYPE (DR_REF (dr));
5568   mode = (int) TYPE_MODE (vectype);
5569
5570   /* FORNOW. In some cases can vectorize even if data-type not supported
5571     (e.g. - data copies).  */
5572   if (optab_handler (mov_optab, mode)->insn_code == CODE_FOR_nothing)
5573     {
5574       if (vect_print_dump_info (REPORT_DETAILS))
5575         fprintf (vect_dump, "Aligned load, but unsupported type.");
5576       return false;
5577     }
5578
5579   /* Check if the load is a part of an interleaving chain.  */
5580   if (STMT_VINFO_STRIDED_ACCESS (stmt_info))
5581     {
5582       strided_load = true;
5583       /* FORNOW */
5584       gcc_assert (! nested_in_vect_loop);
5585
5586       /* Check if interleaving is supported.  */
5587       if (!vect_strided_load_supported (vectype)
5588           && !PURE_SLP_STMT (stmt_info) && !slp)
5589         return false;
5590     }
5591
5592   if (!vec_stmt) /* transformation not required.  */
5593     {
5594       STMT_VINFO_TYPE (stmt_info) = load_vec_info_type;
5595       vect_model_load_cost (stmt_info, ncopies, NULL);
5596       return true;
5597     }
5598
5599   if (vect_print_dump_info (REPORT_DETAILS))
5600     fprintf (vect_dump, "transform load.");
5601
5602   /** Transform.  **/
5603
5604   if (strided_load)
5605     {
5606       first_stmt = DR_GROUP_FIRST_DR (stmt_info);
5607       /* Check if the chain of loads is already vectorized.  */
5608       if (STMT_VINFO_VEC_STMT (vinfo_for_stmt (first_stmt)))
5609         {
5610           *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
5611           return true;
5612         }
5613       first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
5614       group_size = DR_GROUP_SIZE (vinfo_for_stmt (first_stmt));
5615       dr_chain = VEC_alloc (tree, heap, group_size);
5616
5617       /* VEC_NUM is the number of vect stmts to be created for this group.  */
5618       if (slp)
5619         {
5620           strided_load = false;
5621           vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
5622         }
5623       else
5624         vec_num = group_size;
5625     }
5626   else
5627     {
5628       first_stmt = stmt;
5629       first_dr = dr;
5630       group_size = vec_num = 1;
5631     }
5632
5633   alignment_support_scheme = vect_supportable_dr_alignment (first_dr);
5634   gcc_assert (alignment_support_scheme);
5635
5636   /* In case the vectorization factor (VF) is bigger than the number
5637      of elements that we can fit in a vectype (nunits), we have to generate
5638      more than one vector stmt - i.e - we need to "unroll" the
5639      vector stmt by a factor VF/nunits. In doing so, we record a pointer
5640      from one copy of the vector stmt to the next, in the field
5641      STMT_VINFO_RELATED_STMT. This is necessary in order to allow following
5642      stages to find the correct vector defs to be used when vectorizing
5643      stmts that use the defs of the current stmt. The example below illustrates
5644      the vectorization process when VF=16 and nunits=4 (i.e - we need to create
5645      4 vectorized stmts):
5646
5647      before vectorization:
5648                                 RELATED_STMT    VEC_STMT
5649         S1:     x = memref      -               -
5650         S2:     z = x + 1       -               -
5651
5652      step 1: vectorize stmt S1:
5653         We first create the vector stmt VS1_0, and, as usual, record a
5654         pointer to it in the STMT_VINFO_VEC_STMT of the scalar stmt S1.
5655         Next, we create the vector stmt VS1_1, and record a pointer to
5656         it in the STMT_VINFO_RELATED_STMT of the vector stmt VS1_0.
5657         Similarly, for VS1_2 and VS1_3. This is the resulting chain of
5658         stmts and pointers:
5659                                 RELATED_STMT    VEC_STMT
5660         VS1_0:  vx0 = memref0   VS1_1           -
5661         VS1_1:  vx1 = memref1   VS1_2           -
5662         VS1_2:  vx2 = memref2   VS1_3           -
5663         VS1_3:  vx3 = memref3   -               -
5664         S1:     x = load        -               VS1_0
5665         S2:     z = x + 1       -               -
5666
5667      See in documentation in vect_get_vec_def_for_stmt_copy for how the
5668      information we recorded in RELATED_STMT field is used to vectorize
5669      stmt S2.  */
5670
5671   /* In case of interleaving (non-unit strided access):
5672
5673      S1:  x2 = &base + 2
5674      S2:  x0 = &base
5675      S3:  x1 = &base + 1
5676      S4:  x3 = &base + 3
5677
5678      Vectorized loads are created in the order of memory accesses
5679      starting from the access of the first stmt of the chain:
5680
5681      VS1: vx0 = &base
5682      VS2: vx1 = &base + vec_size*1
5683      VS3: vx3 = &base + vec_size*2
5684      VS4: vx4 = &base + vec_size*3
5685
5686      Then permutation statements are generated:
5687
5688      VS5: vx5 = VEC_EXTRACT_EVEN_EXPR < vx0, vx1 >
5689      VS6: vx6 = VEC_EXTRACT_ODD_EXPR < vx0, vx1 >
5690        ...
5691
5692      And they are put in STMT_VINFO_VEC_STMT of the corresponding scalar stmts
5693      (the order of the data-refs in the output of vect_permute_load_chain
5694      corresponds to the order of scalar stmts in the interleaving chain - see
5695      the documentation of vect_permute_load_chain()).
5696      The generation of permutation stmts and recording them in
5697      STMT_VINFO_VEC_STMT is done in vect_transform_strided_load().
5698
5699      In case of both multiple types and interleaving, the vector loads and
5700      permutation stmts above are created for every copy. The result vector stmts
5701      are put in STMT_VINFO_VEC_STMT for the first copy and in the corresponding
5702      STMT_VINFO_RELATED_STMT for the next copies.  */
5703
5704   /* If the data reference is aligned (dr_aligned) or potentially unaligned
5705      on a target that supports unaligned accesses (dr_unaligned_supported)
5706      we generate the following code:
5707          p = initial_addr;
5708          indx = 0;
5709          loop {
5710            p = p + indx * vectype_size;
5711            vec_dest = *(p);
5712            indx = indx + 1;
5713          }
5714
5715      Otherwise, the data reference is potentially unaligned on a target that
5716      does not support unaligned accesses (dr_explicit_realign_optimized) -
5717      then generate the following code, in which the data in each iteration is
5718      obtained by two vector loads, one from the previous iteration, and one
5719      from the current iteration:
5720          p1 = initial_addr;
5721          msq_init = *(floor(p1))
5722          p2 = initial_addr + VS - 1;
5723          realignment_token = call target_builtin;
5724          indx = 0;
5725          loop {
5726            p2 = p2 + indx * vectype_size
5727            lsq = *(floor(p2))
5728            vec_dest = realign_load (msq, lsq, realignment_token)
5729            indx = indx + 1;
5730            msq = lsq;
5731          }   */
5732
5733   /* If the misalignment remains the same throughout the execution of the
5734      loop, we can create the init_addr and permutation mask at the loop
5735      preheader. Otherwise, it needs to be created inside the loop.
5736      This can only occur when vectorizing memory accesses in the inner-loop
5737      nested within an outer-loop that is being vectorized.  */
5738
5739   if (nested_in_vect_loop_p (loop, stmt)
5740       && (TREE_INT_CST_LOW (DR_STEP (dr)) % UNITS_PER_SIMD_WORD != 0))
5741     {
5742       gcc_assert (alignment_support_scheme != dr_explicit_realign_optimized);
5743       compute_in_loop = true;
5744     }
5745
5746   if ((alignment_support_scheme == dr_explicit_realign_optimized
5747        || alignment_support_scheme == dr_explicit_realign)
5748       && !compute_in_loop)
5749     {
5750       msq = vect_setup_realignment (first_stmt, bsi, &realignment_token,
5751                                     alignment_support_scheme, NULL_TREE,
5752                                     &at_loop);
5753       if (alignment_support_scheme == dr_explicit_realign_optimized)
5754         {
5755           phi = SSA_NAME_DEF_STMT (msq);
5756           offset = size_int (TYPE_VECTOR_SUBPARTS (vectype) - 1);
5757         }
5758     }
5759   else
5760     at_loop = loop;
5761
5762   prev_stmt_info = NULL;
5763   for (j = 0; j < ncopies; j++)
5764     {
5765       /* 1. Create the vector pointer update chain.  */
5766       if (j == 0)
5767         dataref_ptr = vect_create_data_ref_ptr (first_stmt,
5768                                                 at_loop, offset,
5769                                                 &dummy, &ptr_incr, false,
5770                                                 NULL_TREE, &inv_p);
5771       else
5772         dataref_ptr =
5773                 bump_vector_ptr (dataref_ptr, ptr_incr, bsi, stmt, NULL_TREE);
5774
5775       for (i = 0; i < vec_num; i++)
5776         {
5777           if (i > 0)
5778             dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, bsi, stmt,
5779                                            NULL_TREE);
5780
5781           /* 2. Create the vector-load in the loop.  */
5782           switch (alignment_support_scheme)
5783             {
5784             case dr_aligned:
5785               gcc_assert (aligned_access_p (first_dr));
5786               data_ref = build_fold_indirect_ref (dataref_ptr);
5787               break;
5788             case dr_unaligned_supported:
5789               {
5790                 int mis = DR_MISALIGNMENT (first_dr);
5791                 tree tmis = (mis == -1 ? size_zero_node : size_int (mis));
5792
5793                 tmis = size_binop (MULT_EXPR, tmis, size_int(BITS_PER_UNIT));
5794                 data_ref =
5795                   build2 (MISALIGNED_INDIRECT_REF, vectype, dataref_ptr, tmis);
5796                 break;
5797               }
5798             case dr_explicit_realign:
5799               {
5800                 tree ptr, bump;
5801                 tree vs_minus_1 = size_int (TYPE_VECTOR_SUBPARTS (vectype) - 1);
5802
5803                 if (compute_in_loop)
5804                   msq = vect_setup_realignment (first_stmt, bsi,
5805                                                 &realignment_token,
5806                                                 dr_explicit_realign,
5807                                                 dataref_ptr, NULL);
5808
5809                 data_ref = build1 (ALIGN_INDIRECT_REF, vectype, dataref_ptr);
5810                 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5811                 new_stmt = build_gimple_modify_stmt (vec_dest, data_ref);
5812                 new_temp = make_ssa_name (vec_dest, new_stmt);
5813                 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
5814                 vect_finish_stmt_generation (stmt, new_stmt, bsi);
5815                 copy_virtual_operands (new_stmt, stmt);
5816                 mark_symbols_for_renaming (new_stmt);
5817                 msq = new_temp;
5818
5819                 bump = size_binop (MULT_EXPR, vs_minus_1,
5820                                    TYPE_SIZE_UNIT (scalar_type));
5821                 ptr = bump_vector_ptr (dataref_ptr, NULL_TREE, bsi, stmt, bump);
5822                 data_ref = build1 (ALIGN_INDIRECT_REF, vectype, ptr);
5823                 break;
5824               }
5825             case dr_explicit_realign_optimized:
5826               data_ref = build1 (ALIGN_INDIRECT_REF, vectype, dataref_ptr);
5827               break;
5828             default:
5829               gcc_unreachable ();
5830             }
5831           vec_dest = vect_create_destination_var (scalar_dest, vectype);
5832           new_stmt = build_gimple_modify_stmt (vec_dest, data_ref);
5833           new_temp = make_ssa_name (vec_dest, new_stmt);
5834           GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
5835           vect_finish_stmt_generation (stmt, new_stmt, bsi);
5836           mark_symbols_for_renaming (new_stmt);
5837
5838           /* 3. Handle explicit realignment if necessary/supported. Create in
5839                 loop: vec_dest = realign_load (msq, lsq, realignment_token)  */
5840           if (alignment_support_scheme == dr_explicit_realign_optimized
5841               || alignment_support_scheme == dr_explicit_realign)
5842             {
5843               lsq = GIMPLE_STMT_OPERAND (new_stmt, 0);
5844               if (!realignment_token)
5845                 realignment_token = dataref_ptr;
5846               vec_dest = vect_create_destination_var (scalar_dest, vectype);
5847               new_stmt = build3 (REALIGN_LOAD_EXPR, vectype, msq, lsq,
5848                                  realignment_token);
5849               new_stmt = build_gimple_modify_stmt (vec_dest, new_stmt);
5850               new_temp = make_ssa_name (vec_dest, new_stmt);
5851               GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
5852               vect_finish_stmt_generation (stmt, new_stmt, bsi);
5853
5854               if (alignment_support_scheme == dr_explicit_realign_optimized)
5855                 {
5856                   if (i == vec_num - 1 && j == ncopies - 1)
5857                     add_phi_arg (phi, lsq, loop_latch_edge (containing_loop));
5858                   msq = lsq;
5859                 }
5860             }
5861
5862           /* 4. Handle invariant-load.  */
5863           if (inv_p)
5864             {
5865               gcc_assert (!strided_load);
5866               gcc_assert (nested_in_vect_loop_p (loop, stmt));
5867               if (j == 0)
5868                 {
5869                   int k;
5870                   tree t = NULL_TREE;
5871                   tree vec_inv, bitpos, bitsize = TYPE_SIZE (scalar_type);
5872
5873                   /* CHECKME: bitpos depends on endianess?  */
5874                   bitpos = bitsize_zero_node;
5875                   vec_inv = build3 (BIT_FIELD_REF, scalar_type, new_temp,
5876                                                             bitsize, bitpos);
5877                   vec_dest =
5878                         vect_create_destination_var (scalar_dest, NULL_TREE);
5879                   new_stmt = build_gimple_modify_stmt (vec_dest, vec_inv);
5880                   new_temp = make_ssa_name (vec_dest, new_stmt);
5881                   GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
5882                   vect_finish_stmt_generation (stmt, new_stmt, bsi);
5883
5884                   for (k = nunits - 1; k >= 0; --k)
5885                     t = tree_cons (NULL_TREE, new_temp, t);
5886                   /* FIXME: use build_constructor directly.  */
5887                   vec_inv = build_constructor_from_list (vectype, t);
5888                   new_temp = vect_init_vector (stmt, vec_inv, vectype, bsi);
5889                   new_stmt = SSA_NAME_DEF_STMT (new_temp);
5890                 }
5891               else
5892                 gcc_unreachable (); /* FORNOW. */
5893             }
5894
5895           /* Collect vector loads and later create their permutation in
5896              vect_transform_strided_load ().  */
5897           if (strided_load)
5898             VEC_quick_push (tree, dr_chain, new_temp);
5899
5900          /* Store vector loads in the corresponding SLP_NODE.  */
5901           if (slp)
5902             VEC_quick_push (tree, SLP_TREE_VEC_STMTS (slp_node), new_stmt);
5903         }
5904
5905       /* FORNOW: SLP with multiple types is unsupported.  */
5906       if (slp)
5907         return true;
5908
5909       if (strided_load)
5910         {
5911           if (!vect_transform_strided_load (stmt, dr_chain, group_size, bsi))
5912             return false;
5913           *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
5914           dr_chain = VEC_alloc (tree, heap, group_size);
5915         }
5916       else
5917         {
5918           if (j == 0)
5919             STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
5920           else
5921             STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
5922           prev_stmt_info = vinfo_for_stmt (new_stmt);
5923         }
5924     }
5925
5926   return true;
5927 }
5928
5929
5930 /* Function vectorizable_live_operation.
5931
5932    STMT computes a value that is used outside the loop. Check if
5933    it can be supported.  */
5934
5935 bool
5936 vectorizable_live_operation (tree stmt,
5937                              block_stmt_iterator *bsi ATTRIBUTE_UNUSED,
5938                              tree *vec_stmt ATTRIBUTE_UNUSED)
5939 {
5940   tree operation;
5941   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5942   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5943   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5944   int i;
5945   int op_type;
5946   tree op;
5947   tree def, def_stmt;
5948   enum vect_def_type dt;
5949
5950   gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
5951
5952   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
5953     return false;
5954
5955   if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
5956     return false;
5957
5958   if (TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 0)) != SSA_NAME)
5959     return false;
5960
5961   /* FORNOW. CHECKME. */
5962   if (nested_in_vect_loop_p (loop, stmt))
5963     return false;
5964
5965   operation = GIMPLE_STMT_OPERAND (stmt, 1);
5966   op_type = TREE_OPERAND_LENGTH (operation);
5967
5968   /* FORNOW: support only if all uses are invariant. This means
5969      that the scalar operations can remain in place, unvectorized.
5970      The original last scalar value that they compute will be used.  */
5971
5972   for (i = 0; i < op_type; i++)
5973     {
5974       op = TREE_OPERAND (operation, i);
5975       if (op && !vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt))
5976         {
5977           if (vect_print_dump_info (REPORT_DETAILS))
5978             fprintf (vect_dump, "use not simple.");
5979           return false;
5980         }
5981
5982       if (dt != vect_invariant_def && dt != vect_constant_def)
5983         return false;
5984     }
5985
5986   /* No transformation is required for the cases we currently support.  */
5987   return true;
5988 }
5989
5990
5991 /* Function vect_is_simple_cond.
5992
5993    Input:
5994    LOOP - the loop that is being vectorized.
5995    COND - Condition that is checked for simple use.
5996
5997    Returns whether a COND can be vectorized.  Checks whether
5998    condition operands are supportable using vec_is_simple_use.  */
5999
6000 static bool
6001 vect_is_simple_cond (tree cond, loop_vec_info loop_vinfo)
6002 {
6003   tree lhs, rhs;
6004   tree def;
6005   enum vect_def_type dt;
6006
6007   if (!COMPARISON_CLASS_P (cond))
6008     return false;
6009
6010   lhs = TREE_OPERAND (cond, 0);
6011   rhs = TREE_OPERAND (cond, 1);
6012
6013   if (TREE_CODE (lhs) == SSA_NAME)
6014     {
6015       tree lhs_def_stmt = SSA_NAME_DEF_STMT (lhs);
6016       if (!vect_is_simple_use (lhs, loop_vinfo, &lhs_def_stmt, &def, &dt))
6017         return false;
6018     }
6019   else if (TREE_CODE (lhs) != INTEGER_CST && TREE_CODE (lhs) != REAL_CST
6020            && TREE_CODE (lhs) != FIXED_CST)
6021     return false;
6022
6023   if (TREE_CODE (rhs) == SSA_NAME)
6024     {
6025       tree rhs_def_stmt = SSA_NAME_DEF_STMT (rhs);
6026       if (!vect_is_simple_use (rhs, loop_vinfo, &rhs_def_stmt, &def, &dt))
6027         return false;
6028     }
6029   else if (TREE_CODE (rhs) != INTEGER_CST  && TREE_CODE (rhs) != REAL_CST
6030            && TREE_CODE (rhs) != FIXED_CST)
6031     return false;
6032
6033   return true;
6034 }
6035
6036 /* vectorizable_condition.
6037
6038    Check if STMT is conditional modify expression that can be vectorized.
6039    If VEC_STMT is also passed, vectorize the STMT: create a vectorized
6040    stmt using VEC_COND_EXPR  to replace it, put it in VEC_STMT, and insert it
6041    at BSI.
6042
6043    Return FALSE if not a vectorizable STMT, TRUE otherwise.  */
6044
6045 bool
6046 vectorizable_condition (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
6047 {
6048   tree scalar_dest = NULL_TREE;
6049   tree vec_dest = NULL_TREE;
6050   tree op = NULL_TREE;
6051   tree cond_expr, then_clause, else_clause;
6052   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6053   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6054   tree vec_cond_lhs, vec_cond_rhs, vec_then_clause, vec_else_clause;
6055   tree vec_compare, vec_cond_expr;
6056   tree new_temp;
6057   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6058   enum machine_mode vec_mode;
6059   tree def;
6060   enum vect_def_type dt;
6061   int nunits = TYPE_VECTOR_SUBPARTS (vectype);
6062   int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
6063
6064   gcc_assert (ncopies >= 1);
6065   if (ncopies > 1)
6066     return false; /* FORNOW */
6067
6068   if (!STMT_VINFO_RELEVANT_P (stmt_info))
6069     return false;
6070
6071   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
6072     return false;
6073
6074   /* FORNOW: SLP not supported.  */
6075   if (STMT_SLP_TYPE (stmt_info))
6076     return false;
6077
6078   /* FORNOW: not yet supported.  */
6079   if (STMT_VINFO_LIVE_P (stmt_info))
6080     {
6081       if (vect_print_dump_info (REPORT_DETAILS))
6082         fprintf (vect_dump, "value used after loop.");
6083       return false;
6084     }
6085
6086   /* Is vectorizable conditional operation?  */
6087   if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
6088     return false;
6089
6090   op = GIMPLE_STMT_OPERAND (stmt, 1);
6091
6092   if (TREE_CODE (op) != COND_EXPR)
6093     return false;
6094
6095   cond_expr = TREE_OPERAND (op, 0);
6096   then_clause = TREE_OPERAND (op, 1);
6097   else_clause = TREE_OPERAND (op, 2);
6098
6099   if (!vect_is_simple_cond (cond_expr, loop_vinfo))
6100     return false;
6101
6102   /* We do not handle two different vector types for the condition
6103      and the values.  */
6104   if (TREE_TYPE (TREE_OPERAND (cond_expr, 0)) != TREE_TYPE (vectype))
6105     return false;
6106
6107   if (TREE_CODE (then_clause) == SSA_NAME)
6108     {
6109       tree then_def_stmt = SSA_NAME_DEF_STMT (then_clause);
6110       if (!vect_is_simple_use (then_clause, loop_vinfo,
6111                                &then_def_stmt, &def, &dt))
6112         return false;
6113     }
6114   else if (TREE_CODE (then_clause) != INTEGER_CST
6115            && TREE_CODE (then_clause) != REAL_CST
6116            && TREE_CODE (then_clause) != FIXED_CST)
6117     return false;
6118
6119   if (TREE_CODE (else_clause) == SSA_NAME)
6120     {
6121       tree else_def_stmt = SSA_NAME_DEF_STMT (else_clause);
6122       if (!vect_is_simple_use (else_clause, loop_vinfo,
6123                                &else_def_stmt, &def, &dt))
6124         return false;
6125     }
6126   else if (TREE_CODE (else_clause) != INTEGER_CST
6127            && TREE_CODE (else_clause) != REAL_CST
6128            && TREE_CODE (else_clause) != FIXED_CST)
6129     return false;
6130
6131
6132   vec_mode = TYPE_MODE (vectype);
6133
6134   if (!vec_stmt)
6135     {
6136       STMT_VINFO_TYPE (stmt_info) = condition_vec_info_type;
6137       return expand_vec_cond_expr_p (op, vec_mode);
6138     }
6139
6140   /* Transform */
6141
6142   /* Handle def.  */
6143   scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
6144   vec_dest = vect_create_destination_var (scalar_dest, vectype);
6145
6146   /* Handle cond expr.  */
6147   vec_cond_lhs =
6148     vect_get_vec_def_for_operand (TREE_OPERAND (cond_expr, 0), stmt, NULL);
6149   vec_cond_rhs =
6150     vect_get_vec_def_for_operand (TREE_OPERAND (cond_expr, 1), stmt, NULL);
6151   vec_then_clause = vect_get_vec_def_for_operand (then_clause, stmt, NULL);
6152   vec_else_clause = vect_get_vec_def_for_operand (else_clause, stmt, NULL);
6153
6154   /* Arguments are ready. create the new vector stmt.  */
6155   vec_compare = build2 (TREE_CODE (cond_expr), vectype,
6156                         vec_cond_lhs, vec_cond_rhs);
6157   vec_cond_expr = build3 (VEC_COND_EXPR, vectype,
6158                           vec_compare, vec_then_clause, vec_else_clause);
6159
6160   *vec_stmt = build_gimple_modify_stmt (vec_dest, vec_cond_expr);
6161   new_temp = make_ssa_name (vec_dest, *vec_stmt);
6162   GIMPLE_STMT_OPERAND (*vec_stmt, 0) = new_temp;
6163   vect_finish_stmt_generation (stmt, *vec_stmt, bsi);
6164
6165   return true;
6166 }
6167
6168
6169 /* Function vect_transform_stmt.
6170
6171    Create a vectorized stmt to replace STMT, and insert it at BSI.  */
6172
6173 static bool
6174 vect_transform_stmt (tree stmt, block_stmt_iterator *bsi, bool *strided_store,
6175                      slp_tree slp_node)
6176 {
6177   bool is_store = false;
6178   tree vec_stmt = NULL_TREE;
6179   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6180   tree orig_stmt_in_pattern;
6181   bool done;
6182
6183   switch (STMT_VINFO_TYPE (stmt_info))
6184     {
6185     case type_demotion_vec_info_type:
6186       gcc_assert (!slp_node);
6187       done = vectorizable_type_demotion (stmt, bsi, &vec_stmt);
6188       gcc_assert (done);
6189       break;
6190
6191     case type_promotion_vec_info_type:
6192       gcc_assert (!slp_node);
6193       done = vectorizable_type_promotion (stmt, bsi, &vec_stmt);
6194       gcc_assert (done);
6195       break;
6196
6197     case type_conversion_vec_info_type:
6198       done = vectorizable_conversion (stmt, bsi, &vec_stmt, slp_node);
6199       gcc_assert (done);
6200       break;
6201
6202     case induc_vec_info_type:
6203       gcc_assert (!slp_node);
6204       done = vectorizable_induction (stmt, bsi, &vec_stmt);
6205       gcc_assert (done);
6206       break;
6207
6208     case op_vec_info_type:
6209       done = vectorizable_operation (stmt, bsi, &vec_stmt, slp_node);
6210       gcc_assert (done);
6211       break;
6212
6213     case assignment_vec_info_type:
6214       done = vectorizable_assignment (stmt, bsi, &vec_stmt, slp_node);
6215       gcc_assert (done);
6216       break;
6217
6218     case load_vec_info_type:
6219       done = vectorizable_load (stmt, bsi, &vec_stmt, slp_node);
6220       gcc_assert (done);
6221       break;
6222
6223     case store_vec_info_type:
6224       done = vectorizable_store (stmt, bsi, &vec_stmt, slp_node);
6225       gcc_assert (done);
6226       if (STMT_VINFO_STRIDED_ACCESS (stmt_info))
6227         {
6228           /* In case of interleaving, the whole chain is vectorized when the
6229              last store in the chain is reached. Store stmts before the last
6230              one are skipped, and there vec_stmt_info shouldn't be freed
6231              meanwhile.  */
6232           *strided_store = true;
6233           if (STMT_VINFO_VEC_STMT (stmt_info))
6234             is_store = true;
6235           }
6236       else
6237         is_store = true;
6238       break;
6239
6240     case condition_vec_info_type:
6241       gcc_assert (!slp_node);
6242       done = vectorizable_condition (stmt, bsi, &vec_stmt);
6243       gcc_assert (done);
6244       break;
6245
6246     case call_vec_info_type:
6247       gcc_assert (!slp_node);
6248       done = vectorizable_call (stmt, bsi, &vec_stmt);
6249       break;
6250
6251     case reduc_vec_info_type:
6252       gcc_assert (!slp_node);
6253       done = vectorizable_reduction (stmt, bsi, &vec_stmt);
6254       gcc_assert (done);
6255       break;
6256
6257     default:
6258       if (!STMT_VINFO_LIVE_P (stmt_info))
6259         {
6260           if (vect_print_dump_info (REPORT_DETAILS))
6261             fprintf (vect_dump, "stmt not supported.");
6262           gcc_unreachable ();
6263         }
6264     }
6265
6266   if (STMT_VINFO_LIVE_P (stmt_info)
6267       && STMT_VINFO_TYPE (stmt_info) != reduc_vec_info_type)
6268     {
6269       done = vectorizable_live_operation (stmt, bsi, &vec_stmt);
6270       gcc_assert (done);
6271     }
6272
6273   if (vec_stmt)
6274     {
6275       STMT_VINFO_VEC_STMT (stmt_info) = vec_stmt;
6276       orig_stmt_in_pattern = STMT_VINFO_RELATED_STMT (stmt_info);
6277       if (orig_stmt_in_pattern)
6278         {
6279           stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt_in_pattern);
6280           /* STMT was inserted by the vectorizer to replace a computation idiom.
6281              ORIG_STMT_IN_PATTERN is a stmt in the original sequence that
6282              computed this idiom.  We need to record a pointer to VEC_STMT in
6283              the stmt_info of ORIG_STMT_IN_PATTERN.  See more details in the
6284              documentation of vect_pattern_recog.  */
6285           if (STMT_VINFO_IN_PATTERN_P (stmt_vinfo))
6286             {
6287               gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
6288               STMT_VINFO_VEC_STMT (stmt_vinfo) = vec_stmt;
6289             }
6290         }
6291     }
6292
6293   return is_store;
6294 }
6295
6296
6297 /* This function builds ni_name = number of iterations loop executes
6298    on the loop preheader.  */
6299
6300 static tree
6301 vect_build_loop_niters (loop_vec_info loop_vinfo)
6302 {
6303   tree ni_name, stmt, var;
6304   edge pe;
6305   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6306   tree ni = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo));
6307
6308   var = create_tmp_var (TREE_TYPE (ni), "niters");
6309   add_referenced_var (var);
6310   ni_name = force_gimple_operand (ni, &stmt, false, var);
6311
6312   pe = loop_preheader_edge (loop);
6313   if (stmt)
6314     {
6315       basic_block new_bb = bsi_insert_on_edge_immediate (pe, stmt);
6316       gcc_assert (!new_bb);
6317     }
6318
6319   return ni_name;
6320 }
6321
6322
6323 /* This function generates the following statements:
6324
6325  ni_name = number of iterations loop executes
6326  ratio = ni_name / vf
6327  ratio_mult_vf_name = ratio * vf
6328
6329  and places them at the loop preheader edge.  */
6330
6331 static void
6332 vect_generate_tmps_on_preheader (loop_vec_info loop_vinfo,
6333                                  tree *ni_name_ptr,
6334                                  tree *ratio_mult_vf_name_ptr,
6335                                  tree *ratio_name_ptr)
6336 {
6337
6338   edge pe;
6339   basic_block new_bb;
6340   tree stmt, ni_name;
6341   tree var;
6342   tree ratio_name;
6343   tree ratio_mult_vf_name;
6344   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6345   tree ni = LOOP_VINFO_NITERS (loop_vinfo);
6346   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
6347   tree log_vf;
6348
6349   pe = loop_preheader_edge (loop);
6350
6351   /* Generate temporary variable that contains
6352      number of iterations loop executes.  */
6353
6354   ni_name = vect_build_loop_niters (loop_vinfo);
6355   log_vf = build_int_cst (TREE_TYPE (ni), exact_log2 (vf));
6356
6357   /* Create: ratio = ni >> log2(vf) */
6358
6359   ratio_name = fold_build2 (RSHIFT_EXPR, TREE_TYPE (ni_name), ni_name, log_vf);
6360   if (!is_gimple_val (ratio_name))
6361     {
6362       var = create_tmp_var (TREE_TYPE (ni), "bnd");
6363       add_referenced_var (var);
6364
6365       ratio_name = force_gimple_operand (ratio_name, &stmt, true, var);
6366       pe = loop_preheader_edge (loop);
6367       new_bb = bsi_insert_on_edge_immediate (pe, stmt);
6368       gcc_assert (!new_bb);
6369     }
6370
6371   /* Create: ratio_mult_vf = ratio << log2 (vf).  */
6372
6373   ratio_mult_vf_name = fold_build2 (LSHIFT_EXPR, TREE_TYPE (ratio_name),
6374                                     ratio_name, log_vf);
6375   if (!is_gimple_val (ratio_mult_vf_name))
6376     {
6377       var = create_tmp_var (TREE_TYPE (ni), "ratio_mult_vf");
6378       add_referenced_var (var);
6379
6380       ratio_mult_vf_name = force_gimple_operand (ratio_mult_vf_name, &stmt,
6381                                                  true, var);
6382       pe = loop_preheader_edge (loop);
6383       new_bb = bsi_insert_on_edge_immediate (pe, stmt);
6384       gcc_assert (!new_bb);
6385     }
6386
6387   *ni_name_ptr = ni_name;
6388   *ratio_mult_vf_name_ptr = ratio_mult_vf_name;
6389   *ratio_name_ptr = ratio_name;
6390
6391   return;
6392 }
6393
6394
6395 /*   Function vect_update_ivs_after_vectorizer.
6396
6397      "Advance" the induction variables of LOOP to the value they should take
6398      after the execution of LOOP.  This is currently necessary because the
6399      vectorizer does not handle induction variables that are used after the
6400      loop.  Such a situation occurs when the last iterations of LOOP are
6401      peeled, because:
6402      1. We introduced new uses after LOOP for IVs that were not originally used
6403         after LOOP: the IVs of LOOP are now used by an epilog loop.
6404      2. LOOP is going to be vectorized; this means that it will iterate N/VF
6405         times, whereas the loop IVs should be bumped N times.
6406
6407      Input:
6408      - LOOP - a loop that is going to be vectorized. The last few iterations
6409               of LOOP were peeled.
6410      - NITERS - the number of iterations that LOOP executes (before it is
6411                 vectorized). i.e, the number of times the ivs should be bumped.
6412      - UPDATE_E - a successor edge of LOOP->exit that is on the (only) path
6413                   coming out from LOOP on which there are uses of the LOOP ivs
6414                   (this is the path from LOOP->exit to epilog_loop->preheader).
6415
6416                   The new definitions of the ivs are placed in LOOP->exit.
6417                   The phi args associated with the edge UPDATE_E in the bb
6418                   UPDATE_E->dest are updated accordingly.
6419
6420      Assumption 1: Like the rest of the vectorizer, this function assumes
6421      a single loop exit that has a single predecessor.
6422
6423      Assumption 2: The phi nodes in the LOOP header and in update_bb are
6424      organized in the same order.
6425
6426      Assumption 3: The access function of the ivs is simple enough (see
6427      vect_can_advance_ivs_p).  This assumption will be relaxed in the future.
6428
6429      Assumption 4: Exactly one of the successors of LOOP exit-bb is on a path
6430      coming out of LOOP on which the ivs of LOOP are used (this is the path
6431      that leads to the epilog loop; other paths skip the epilog loop).  This
6432      path starts with the edge UPDATE_E, and its destination (denoted update_bb)
6433      needs to have its phis updated.
6434  */
6435
6436 static void
6437 vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo, tree niters,
6438                                   edge update_e)
6439 {
6440   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6441   basic_block exit_bb = single_exit (loop)->dest;
6442   tree phi, phi1;
6443   basic_block update_bb = update_e->dest;
6444
6445   /* gcc_assert (vect_can_advance_ivs_p (loop_vinfo)); */
6446
6447   /* Make sure there exists a single-predecessor exit bb:  */
6448   gcc_assert (single_pred_p (exit_bb));
6449
6450   for (phi = phi_nodes (loop->header), phi1 = phi_nodes (update_bb);
6451        phi && phi1;
6452        phi = PHI_CHAIN (phi), phi1 = PHI_CHAIN (phi1))
6453     {
6454       tree access_fn = NULL;
6455       tree evolution_part;
6456       tree init_expr;
6457       tree step_expr;
6458       tree var, ni, ni_name;
6459       block_stmt_iterator last_bsi;
6460
6461       if (vect_print_dump_info (REPORT_DETAILS))
6462         {
6463           fprintf (vect_dump, "vect_update_ivs_after_vectorizer: phi: ");
6464           print_generic_expr (vect_dump, phi, TDF_SLIM);
6465         }
6466
6467       /* Skip virtual phi's.  */
6468       if (!is_gimple_reg (SSA_NAME_VAR (PHI_RESULT (phi))))
6469         {
6470           if (vect_print_dump_info (REPORT_DETAILS))
6471             fprintf (vect_dump, "virtual phi. skip.");
6472           continue;
6473         }
6474
6475       /* Skip reduction phis.  */
6476       if (STMT_VINFO_DEF_TYPE (vinfo_for_stmt (phi)) == vect_reduction_def)
6477         {
6478           if (vect_print_dump_info (REPORT_DETAILS))
6479             fprintf (vect_dump, "reduc phi. skip.");
6480           continue;
6481         }
6482
6483       access_fn = analyze_scalar_evolution (loop, PHI_RESULT (phi));
6484       gcc_assert (access_fn);
6485       evolution_part =
6486          unshare_expr (evolution_part_in_loop_num (access_fn, loop->num));
6487       gcc_assert (evolution_part != NULL_TREE);
6488
6489       /* FORNOW: We do not support IVs whose evolution function is a polynomial
6490          of degree >= 2 or exponential.  */
6491       gcc_assert (!tree_is_chrec (evolution_part));
6492
6493       step_expr = evolution_part;
6494       init_expr = unshare_expr (initial_condition_in_loop_num (access_fn,
6495                                                                loop->num));
6496
6497       if (POINTER_TYPE_P (TREE_TYPE (init_expr)))
6498         ni = fold_build2 (POINTER_PLUS_EXPR, TREE_TYPE (init_expr),
6499                           init_expr,
6500                           fold_convert (sizetype,
6501                                         fold_build2 (MULT_EXPR, TREE_TYPE (niters),
6502                                                      niters, step_expr)));
6503       else
6504         ni = fold_build2 (PLUS_EXPR, TREE_TYPE (init_expr),
6505                           fold_build2 (MULT_EXPR, TREE_TYPE (init_expr),
6506                                        fold_convert (TREE_TYPE (init_expr),
6507                                                      niters),
6508                                        step_expr),
6509                           init_expr);
6510
6511
6512
6513       var = create_tmp_var (TREE_TYPE (init_expr), "tmp");
6514       add_referenced_var (var);
6515
6516       last_bsi = bsi_last (exit_bb);
6517       ni_name = force_gimple_operand_bsi (&last_bsi, ni, false, var,
6518                                           true, BSI_SAME_STMT);
6519
6520       /* Fix phi expressions in the successor bb.  */
6521       SET_PHI_ARG_DEF (phi1, update_e->dest_idx, ni_name);
6522     }
6523 }
6524
6525 /* Return the more conservative threshold between the
6526    min_profitable_iters returned by the cost model and the user
6527    specified threshold, if provided.  */
6528
6529 static unsigned int
6530 conservative_cost_threshold (loop_vec_info loop_vinfo,
6531                              int min_profitable_iters)
6532 {
6533   unsigned int th;
6534   int min_scalar_loop_bound;
6535
6536   min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
6537                             * LOOP_VINFO_VECT_FACTOR (loop_vinfo)) - 1);
6538
6539   /* Use the cost model only if it is more conservative than user specified
6540      threshold.  */
6541   th = (unsigned) min_scalar_loop_bound;
6542   if (min_profitable_iters
6543       && (!min_scalar_loop_bound
6544           || min_profitable_iters > min_scalar_loop_bound))
6545     th = (unsigned) min_profitable_iters;
6546
6547   if (th && vect_print_dump_info (REPORT_COST))
6548     fprintf (vect_dump, "Vectorization may not be profitable.");
6549
6550   return th;
6551 }
6552
6553 /* Function vect_do_peeling_for_loop_bound
6554
6555    Peel the last iterations of the loop represented by LOOP_VINFO.
6556    The peeled iterations form a new epilog loop.  Given that the loop now
6557    iterates NITERS times, the new epilog loop iterates
6558    NITERS % VECTORIZATION_FACTOR times.
6559
6560    The original loop will later be made to iterate
6561    NITERS / VECTORIZATION_FACTOR times (this value is placed into RATIO).  */
6562
6563 static void
6564 vect_do_peeling_for_loop_bound (loop_vec_info loop_vinfo, tree *ratio)
6565 {
6566   tree ni_name, ratio_mult_vf_name;
6567   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6568   struct loop *new_loop;
6569   edge update_e;
6570   basic_block preheader;
6571   int loop_num;
6572   bool check_profitability = false;
6573   unsigned int th = 0;
6574   int min_profitable_iters;
6575
6576   if (vect_print_dump_info (REPORT_DETAILS))
6577     fprintf (vect_dump, "=== vect_do_peeling_for_loop_bound ===");
6578
6579   initialize_original_copy_tables ();
6580
6581   /* Generate the following variables on the preheader of original loop:
6582
6583      ni_name = number of iteration the original loop executes
6584      ratio = ni_name / vf
6585      ratio_mult_vf_name = ratio * vf  */
6586   vect_generate_tmps_on_preheader (loop_vinfo, &ni_name,
6587                                    &ratio_mult_vf_name, ratio);
6588
6589   loop_num  = loop->num;
6590
6591   /* If cost model check not done during versioning and
6592      peeling for alignment.  */
6593   if (!VEC_length (tree, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo))
6594       && !VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo))
6595       && !LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
6596     {
6597       check_profitability = true;
6598
6599       /* Get profitability threshold for vectorized loop.  */
6600       min_profitable_iters = LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo);
6601
6602       th = conservative_cost_threshold (loop_vinfo,
6603                                         min_profitable_iters);
6604     }
6605
6606   new_loop = slpeel_tree_peel_loop_to_edge (loop, single_exit (loop),
6607                                             ratio_mult_vf_name, ni_name, false,
6608                                             th, check_profitability);
6609   gcc_assert (new_loop);
6610   gcc_assert (loop_num == loop->num);
6611 #ifdef ENABLE_CHECKING
6612   slpeel_verify_cfg_after_peeling (loop, new_loop);
6613 #endif
6614
6615   /* A guard that controls whether the new_loop is to be executed or skipped
6616      is placed in LOOP->exit.  LOOP->exit therefore has two successors - one
6617      is the preheader of NEW_LOOP, where the IVs from LOOP are used.  The other
6618      is a bb after NEW_LOOP, where these IVs are not used.  Find the edge that
6619      is on the path where the LOOP IVs are used and need to be updated.  */
6620
6621   preheader = loop_preheader_edge (new_loop)->src;
6622   if (EDGE_PRED (preheader, 0)->src == single_exit (loop)->dest)
6623     update_e = EDGE_PRED (preheader, 0);
6624   else
6625     update_e = EDGE_PRED (preheader, 1);
6626
6627   /* Update IVs of original loop as if they were advanced
6628      by ratio_mult_vf_name steps.  */
6629   vect_update_ivs_after_vectorizer (loop_vinfo, ratio_mult_vf_name, update_e);
6630
6631   /* After peeling we have to reset scalar evolution analyzer.  */
6632   scev_reset ();
6633
6634   free_original_copy_tables ();
6635 }
6636
6637
6638 /* Function vect_gen_niters_for_prolog_loop
6639
6640    Set the number of iterations for the loop represented by LOOP_VINFO
6641    to the minimum between LOOP_NITERS (the original iteration count of the loop)
6642    and the misalignment of DR - the data reference recorded in
6643    LOOP_VINFO_UNALIGNED_DR (LOOP_VINFO).  As a result, after the execution of
6644    this loop, the data reference DR will refer to an aligned location.
6645
6646    The following computation is generated:
6647
6648    If the misalignment of DR is known at compile time:
6649      addr_mis = int mis = DR_MISALIGNMENT (dr);
6650    Else, compute address misalignment in bytes:
6651      addr_mis = addr & (vectype_size - 1)
6652
6653    prolog_niters = min ( LOOP_NITERS , (VF - addr_mis/elem_size)&(VF-1) )
6654
6655    (elem_size = element type size; an element is the scalar element
6656         whose type is the inner type of the vectype)
6657
6658    For interleaving,
6659
6660    prolog_niters = min ( LOOP_NITERS ,
6661                         (VF/group_size - addr_mis/elem_size)&(VF/group_size-1) )
6662          where group_size is the size of the interleaved group.
6663
6664    The above formulas assume that VF == number of elements in the vector. This
6665    may not hold when there are multiple-types in the loop.
6666    In this case, for some data-references in the loop the VF does not represent
6667    the number of elements that fit in the vector.  Therefore, instead of VF we
6668    use TYPE_VECTOR_SUBPARTS.  */
6669
6670 static tree
6671 vect_gen_niters_for_prolog_loop (loop_vec_info loop_vinfo, tree loop_niters)
6672 {
6673   struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
6674   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6675   tree var, stmt;
6676   tree iters, iters_name;
6677   edge pe;
6678   basic_block new_bb;
6679   tree dr_stmt = DR_STMT (dr);
6680   stmt_vec_info stmt_info = vinfo_for_stmt (dr_stmt);
6681   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6682   int vectype_align = TYPE_ALIGN (vectype) / BITS_PER_UNIT;
6683   tree niters_type = TREE_TYPE (loop_niters);
6684   int group_size = 1;
6685   int element_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
6686   int nelements = TYPE_VECTOR_SUBPARTS (vectype);
6687
6688   if (STMT_VINFO_STRIDED_ACCESS (stmt_info))
6689     {
6690       /* For interleaved access element size must be multiplied by the size of
6691          the interleaved group.  */
6692       group_size = DR_GROUP_SIZE (vinfo_for_stmt (
6693                                                DR_GROUP_FIRST_DR (stmt_info)));
6694       element_size *= group_size;
6695     }
6696
6697   pe = loop_preheader_edge (loop);
6698
6699   if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
6700     {
6701       int byte_misalign = LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo);
6702       int elem_misalign = byte_misalign / element_size;
6703
6704       if (vect_print_dump_info (REPORT_DETAILS))
6705         fprintf (vect_dump, "known alignment = %d.", byte_misalign);
6706       iters = build_int_cst (niters_type,
6707                              (nelements - elem_misalign)&(nelements/group_size-1));
6708     }
6709   else
6710     {
6711       tree new_stmts = NULL_TREE;
6712       tree start_addr = vect_create_addr_base_for_vector_ref (dr_stmt,
6713                                                 &new_stmts, NULL_TREE, loop);
6714       tree ptr_type = TREE_TYPE (start_addr);
6715       tree size = TYPE_SIZE (ptr_type);
6716       tree type = lang_hooks.types.type_for_size (tree_low_cst (size, 1), 1);
6717       tree vectype_size_minus_1 = build_int_cst (type, vectype_align - 1);
6718       tree elem_size_log =
6719         build_int_cst (type, exact_log2 (vectype_align/nelements));
6720       tree nelements_minus_1 = build_int_cst (type, nelements - 1);
6721       tree nelements_tree = build_int_cst (type, nelements);
6722       tree byte_misalign;
6723       tree elem_misalign;
6724
6725       new_bb = bsi_insert_on_edge_immediate (pe, new_stmts);
6726       gcc_assert (!new_bb);
6727
6728       /* Create:  byte_misalign = addr & (vectype_size - 1)  */
6729       byte_misalign =
6730         fold_build2 (BIT_AND_EXPR, type, fold_convert (type, start_addr), vectype_size_minus_1);
6731
6732       /* Create:  elem_misalign = byte_misalign / element_size  */
6733       elem_misalign =
6734         fold_build2 (RSHIFT_EXPR, type, byte_misalign, elem_size_log);
6735
6736       /* Create:  (niters_type) (nelements - elem_misalign)&(nelements - 1)  */
6737       iters = fold_build2 (MINUS_EXPR, type, nelements_tree, elem_misalign);
6738       iters = fold_build2 (BIT_AND_EXPR, type, iters, nelements_minus_1);
6739       iters = fold_convert (niters_type, iters);
6740     }
6741
6742   /* Create:  prolog_loop_niters = min (iters, loop_niters) */
6743   /* If the loop bound is known at compile time we already verified that it is
6744      greater than vf; since the misalignment ('iters') is at most vf, there's
6745      no need to generate the MIN_EXPR in this case.  */
6746   if (TREE_CODE (loop_niters) != INTEGER_CST)
6747     iters = fold_build2 (MIN_EXPR, niters_type, iters, loop_niters);
6748
6749   if (vect_print_dump_info (REPORT_DETAILS))
6750     {
6751       fprintf (vect_dump, "niters for prolog loop: ");
6752       print_generic_expr (vect_dump, iters, TDF_SLIM);
6753     }
6754
6755   var = create_tmp_var (niters_type, "prolog_loop_niters");
6756   add_referenced_var (var);
6757   iters_name = force_gimple_operand (iters, &stmt, false, var);
6758
6759   /* Insert stmt on loop preheader edge.  */
6760   if (stmt)
6761     {
6762       basic_block new_bb = bsi_insert_on_edge_immediate (pe, stmt);
6763       gcc_assert (!new_bb);
6764     }
6765
6766   return iters_name;
6767 }
6768
6769
6770 /* Function vect_update_init_of_dr
6771
6772    NITERS iterations were peeled from LOOP.  DR represents a data reference
6773    in LOOP.  This function updates the information recorded in DR to
6774    account for the fact that the first NITERS iterations had already been
6775    executed.  Specifically, it updates the OFFSET field of DR.  */
6776
6777 static void
6778 vect_update_init_of_dr (struct data_reference *dr, tree niters)
6779 {
6780   tree offset = DR_OFFSET (dr);
6781
6782   niters = fold_build2 (MULT_EXPR, TREE_TYPE (niters), niters, DR_STEP (dr));
6783   offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset), offset, niters);
6784   DR_OFFSET (dr) = offset;
6785 }
6786
6787
6788 /* Function vect_update_inits_of_drs
6789
6790    NITERS iterations were peeled from the loop represented by LOOP_VINFO.
6791    This function updates the information recorded for the data references in
6792    the loop to account for the fact that the first NITERS iterations had
6793    already been executed.  Specifically, it updates the initial_condition of
6794    the access_function of all the data_references in the loop.  */
6795
6796 static void
6797 vect_update_inits_of_drs (loop_vec_info loop_vinfo, tree niters)
6798 {
6799   unsigned int i;
6800   VEC (data_reference_p, heap) *datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
6801   struct data_reference *dr;
6802
6803   if (vect_print_dump_info (REPORT_DETAILS))
6804     fprintf (vect_dump, "=== vect_update_inits_of_dr ===");
6805
6806   for (i = 0; VEC_iterate (data_reference_p, datarefs, i, dr); i++)
6807     vect_update_init_of_dr (dr, niters);
6808 }
6809
6810
6811 /* Function vect_do_peeling_for_alignment
6812
6813    Peel the first 'niters' iterations of the loop represented by LOOP_VINFO.
6814    'niters' is set to the misalignment of one of the data references in the
6815    loop, thereby forcing it to refer to an aligned location at the beginning
6816    of the execution of this loop.  The data reference for which we are
6817    peeling is recorded in LOOP_VINFO_UNALIGNED_DR.  */
6818
6819 static void
6820 vect_do_peeling_for_alignment (loop_vec_info loop_vinfo)
6821 {
6822   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6823   tree niters_of_prolog_loop, ni_name;
6824   tree n_iters;
6825   struct loop *new_loop;
6826   bool check_profitability = false;
6827   unsigned int th = 0;
6828   int min_profitable_iters;
6829
6830   if (vect_print_dump_info (REPORT_DETAILS))
6831     fprintf (vect_dump, "=== vect_do_peeling_for_alignment ===");
6832
6833   initialize_original_copy_tables ();
6834
6835   ni_name = vect_build_loop_niters (loop_vinfo);
6836   niters_of_prolog_loop = vect_gen_niters_for_prolog_loop (loop_vinfo, ni_name);
6837
6838
6839   /* If cost model check not done during versioning.  */
6840   if (!VEC_length (tree, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo))
6841       && !VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
6842     {
6843       check_profitability = true;
6844
6845       /* Get profitability threshold for vectorized loop.  */
6846       min_profitable_iters = LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo);
6847
6848       th = conservative_cost_threshold (loop_vinfo,
6849                                         min_profitable_iters);
6850     }
6851
6852   /* Peel the prolog loop and iterate it niters_of_prolog_loop.  */
6853   new_loop =
6854     slpeel_tree_peel_loop_to_edge (loop, loop_preheader_edge (loop),
6855                                    niters_of_prolog_loop, ni_name, true,
6856                                    th, check_profitability);
6857
6858   gcc_assert (new_loop);
6859 #ifdef ENABLE_CHECKING
6860   slpeel_verify_cfg_after_peeling (new_loop, loop);
6861 #endif
6862
6863   /* Update number of times loop executes.  */
6864   n_iters = LOOP_VINFO_NITERS (loop_vinfo);
6865   LOOP_VINFO_NITERS (loop_vinfo) = fold_build2 (MINUS_EXPR,
6866                 TREE_TYPE (n_iters), n_iters, niters_of_prolog_loop);
6867
6868   /* Update the init conditions of the access functions of all data refs.  */
6869   vect_update_inits_of_drs (loop_vinfo, niters_of_prolog_loop);
6870
6871   /* After peeling we have to reset scalar evolution analyzer.  */
6872   scev_reset ();
6873
6874   free_original_copy_tables ();
6875 }
6876
6877
6878 /* Function vect_create_cond_for_align_checks.
6879
6880    Create a conditional expression that represents the alignment checks for
6881    all of data references (array element references) whose alignment must be
6882    checked at runtime.
6883
6884    Input:
6885    COND_EXPR  - input conditional expression.  New conditions will be chained
6886                 with logical AND operation.
6887    LOOP_VINFO - two fields of the loop information are used.
6888                 LOOP_VINFO_PTR_MASK is the mask used to check the alignment.
6889                 LOOP_VINFO_MAY_MISALIGN_STMTS contains the refs to be checked.
6890
6891    Output:
6892    COND_EXPR_STMT_LIST - statements needed to construct the conditional
6893                          expression.
6894    The returned value is the conditional expression to be used in the if
6895    statement that controls which version of the loop gets executed at runtime.
6896
6897    The algorithm makes two assumptions:
6898      1) The number of bytes "n" in a vector is a power of 2.
6899      2) An address "a" is aligned if a%n is zero and that this
6900         test can be done as a&(n-1) == 0.  For example, for 16
6901         byte vectors the test is a&0xf == 0.  */
6902
6903 static void
6904 vect_create_cond_for_align_checks (loop_vec_info loop_vinfo,
6905                                    tree *cond_expr,
6906                                    tree *cond_expr_stmt_list)
6907 {
6908   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6909   VEC(tree,heap) *may_misalign_stmts
6910     = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
6911   tree ref_stmt, tmp;
6912   int mask = LOOP_VINFO_PTR_MASK (loop_vinfo);
6913   tree mask_cst;
6914   unsigned int i;
6915   tree psize;
6916   tree int_ptrsize_type;
6917   char tmp_name[20];
6918   tree or_tmp_name = NULL_TREE;
6919   tree and_tmp, and_tmp_name, and_stmt;
6920   tree ptrsize_zero;
6921   tree part_cond_expr;
6922
6923   /* Check that mask is one less than a power of 2, i.e., mask is
6924      all zeros followed by all ones.  */
6925   gcc_assert ((mask != 0) && ((mask & (mask+1)) == 0));
6926
6927   /* CHECKME: what is the best integer or unsigned type to use to hold a
6928      cast from a pointer value?  */
6929   psize = TYPE_SIZE (ptr_type_node);
6930   int_ptrsize_type
6931     = lang_hooks.types.type_for_size (tree_low_cst (psize, 1), 0);
6932
6933   /* Create expression (mask & (dr_1 || ... || dr_n)) where dr_i is the address
6934      of the first vector of the i'th data reference. */
6935
6936   for (i = 0; VEC_iterate (tree, may_misalign_stmts, i, ref_stmt); i++)
6937     {
6938       tree new_stmt_list = NULL_TREE;
6939       tree addr_base;
6940       tree addr_tmp, addr_tmp_name, addr_stmt;
6941       tree or_tmp, new_or_tmp_name, or_stmt;
6942
6943       /* create: addr_tmp = (int)(address_of_first_vector) */
6944       addr_base = vect_create_addr_base_for_vector_ref (ref_stmt,
6945                                         &new_stmt_list, NULL_TREE, loop);
6946
6947       if (new_stmt_list != NULL_TREE)
6948         append_to_statement_list_force (new_stmt_list, cond_expr_stmt_list);
6949
6950       sprintf (tmp_name, "%s%d", "addr2int", i);
6951       addr_tmp = create_tmp_var (int_ptrsize_type, tmp_name);
6952       add_referenced_var (addr_tmp);
6953       addr_tmp_name = make_ssa_name (addr_tmp, NULL_TREE);
6954       addr_stmt = fold_convert (int_ptrsize_type, addr_base);
6955       addr_stmt = build_gimple_modify_stmt (addr_tmp_name, addr_stmt);
6956       SSA_NAME_DEF_STMT (addr_tmp_name) = addr_stmt;
6957       append_to_statement_list_force (addr_stmt, cond_expr_stmt_list);
6958
6959       /* The addresses are OR together.  */
6960
6961       if (or_tmp_name != NULL_TREE)
6962         {
6963           /* create: or_tmp = or_tmp | addr_tmp */
6964           sprintf (tmp_name, "%s%d", "orptrs", i);
6965           or_tmp = create_tmp_var (int_ptrsize_type, tmp_name);
6966           add_referenced_var (or_tmp);
6967           new_or_tmp_name = make_ssa_name (or_tmp, NULL_TREE);
6968           tmp = build2 (BIT_IOR_EXPR, int_ptrsize_type,
6969                         or_tmp_name, addr_tmp_name);
6970           or_stmt = build_gimple_modify_stmt (new_or_tmp_name, tmp);
6971           SSA_NAME_DEF_STMT (new_or_tmp_name) = or_stmt;
6972           append_to_statement_list_force (or_stmt, cond_expr_stmt_list);
6973           or_tmp_name = new_or_tmp_name;
6974         }
6975       else
6976         or_tmp_name = addr_tmp_name;
6977
6978     } /* end for i */
6979
6980   mask_cst = build_int_cst (int_ptrsize_type, mask);
6981
6982   /* create: and_tmp = or_tmp & mask  */
6983   and_tmp = create_tmp_var (int_ptrsize_type, "andmask" );
6984   add_referenced_var (and_tmp);
6985   and_tmp_name = make_ssa_name (and_tmp, NULL_TREE);
6986
6987   tmp = build2 (BIT_AND_EXPR, int_ptrsize_type, or_tmp_name, mask_cst);
6988   and_stmt = build_gimple_modify_stmt (and_tmp_name, tmp);
6989   SSA_NAME_DEF_STMT (and_tmp_name) = and_stmt;
6990   append_to_statement_list_force (and_stmt, cond_expr_stmt_list);
6991
6992   /* Make and_tmp the left operand of the conditional test against zero.
6993      if and_tmp has a nonzero bit then some address is unaligned.  */
6994   ptrsize_zero = build_int_cst (int_ptrsize_type, 0);
6995   part_cond_expr = fold_build2 (EQ_EXPR, boolean_type_node,
6996                                 and_tmp_name, ptrsize_zero);
6997   if (*cond_expr)
6998     *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
6999                               *cond_expr, part_cond_expr);
7000   else
7001     *cond_expr = part_cond_expr;
7002 }
7003
7004 /* Function vect_vfa_segment_size.
7005
7006    Create an expression that computes the size of segment
7007    that will be accessed for a data reference.  The functions takes into
7008    account that realignment loads may access one more vector.
7009
7010    Input:
7011      DR: The data reference.
7012      VECT_FACTOR: vectorization factor.
7013
7014    Return an expression whose value is the size of segment which will be
7015    accessed by DR.  */
7016
7017 static tree
7018 vect_vfa_segment_size (struct data_reference *dr, tree vect_factor)
7019 {
7020   tree segment_length = fold_build2 (MULT_EXPR, integer_type_node,
7021                                      DR_STEP (dr), vect_factor);
7022
7023   if (vect_supportable_dr_alignment (dr) == dr_explicit_realign_optimized)
7024     {
7025       tree vector_size = TYPE_SIZE_UNIT
7026                           (STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr))));
7027
7028       segment_length = fold_build2 (PLUS_EXPR, integer_type_node,
7029                                     segment_length, vector_size);
7030     }
7031   return fold_convert (sizetype, segment_length);
7032 }
7033
7034 /* Function vect_create_cond_for_alias_checks.
7035
7036    Create a conditional expression that represents the run-time checks for
7037    overlapping of address ranges represented by a list of data references
7038    relations passed as input.
7039
7040    Input:
7041    COND_EXPR  - input conditional expression.  New conditions will be chained
7042                 with logical AND operation.
7043    LOOP_VINFO - field LOOP_VINFO_MAY_ALIAS_STMTS contains the list of ddrs
7044                 to be checked.
7045
7046    Output:
7047    COND_EXPR - conditional expression.
7048    COND_EXPR_STMT_LIST - statements needed to construct the conditional
7049                          expression.
7050
7051
7052    The returned value is the conditional expression to be used in the if
7053    statement that controls which version of the loop gets executed at runtime.
7054 */
7055
7056 static void
7057 vect_create_cond_for_alias_checks (loop_vec_info loop_vinfo,
7058                                    tree * cond_expr,
7059                                    tree * cond_expr_stmt_list)
7060 {
7061   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7062   VEC (ddr_p, heap) * may_alias_ddrs =
7063     LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
7064   tree vect_factor =
7065     build_int_cst (integer_type_node, LOOP_VINFO_VECT_FACTOR (loop_vinfo));
7066
7067   ddr_p ddr;
7068   unsigned int i;
7069   tree part_cond_expr;
7070
7071   /* Create expression
7072      ((store_ptr_0 + store_segment_length_0) < load_ptr_0)
7073      || (load_ptr_0 + load_segment_length_0) < store_ptr_0))
7074      &&
7075      ...
7076      &&
7077      ((store_ptr_n + store_segment_length_n) < load_ptr_n)
7078      || (load_ptr_n + load_segment_length_n) < store_ptr_n))  */
7079
7080   if (VEC_empty (ddr_p, may_alias_ddrs))
7081     return;
7082
7083   for (i = 0; VEC_iterate (ddr_p, may_alias_ddrs, i, ddr); i++)
7084     {
7085       struct data_reference *dr_a, *dr_b;
7086       tree dr_group_first_a, dr_group_first_b;
7087       tree addr_base_a, addr_base_b;
7088       tree segment_length_a, segment_length_b;
7089       tree stmt_a, stmt_b;
7090
7091       dr_a = DDR_A (ddr);
7092       stmt_a = DR_STMT (DDR_A (ddr));
7093       dr_group_first_a = DR_GROUP_FIRST_DR (vinfo_for_stmt (stmt_a));
7094       if (dr_group_first_a)
7095         {
7096           stmt_a = dr_group_first_a;
7097           dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
7098         }
7099
7100       dr_b = DDR_B (ddr);
7101       stmt_b = DR_STMT (DDR_B (ddr));
7102       dr_group_first_b = DR_GROUP_FIRST_DR (vinfo_for_stmt (stmt_b));
7103       if (dr_group_first_b)
7104         {
7105           stmt_b = dr_group_first_b;
7106           dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
7107         }
7108
7109       addr_base_a =
7110         vect_create_addr_base_for_vector_ref (stmt_a, cond_expr_stmt_list,
7111                                               NULL_TREE, loop);
7112       addr_base_b =
7113         vect_create_addr_base_for_vector_ref (stmt_b, cond_expr_stmt_list,
7114                                               NULL_TREE, loop);
7115
7116       segment_length_a = vect_vfa_segment_size (dr_a, vect_factor);
7117       segment_length_b = vect_vfa_segment_size (dr_b, vect_factor);
7118
7119       if (vect_print_dump_info (REPORT_DR_DETAILS))
7120         {
7121           fprintf (vect_dump,
7122                    "create runtime check for data references ");
7123           print_generic_expr (vect_dump, DR_REF (dr_a), TDF_SLIM);
7124           fprintf (vect_dump, " and ");
7125           print_generic_expr (vect_dump, DR_REF (dr_b), TDF_SLIM);
7126         }
7127
7128
7129       part_cond_expr =
7130         fold_build2 (TRUTH_OR_EXPR, boolean_type_node,
7131           fold_build2 (LT_EXPR, boolean_type_node,
7132             fold_build2 (POINTER_PLUS_EXPR, TREE_TYPE (addr_base_a),
7133               addr_base_a,
7134               segment_length_a),
7135             addr_base_b),
7136           fold_build2 (LT_EXPR, boolean_type_node,
7137             fold_build2 (POINTER_PLUS_EXPR, TREE_TYPE (addr_base_b),
7138               addr_base_b,
7139               segment_length_b),
7140             addr_base_a));
7141
7142       if (*cond_expr)
7143         *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
7144                                   *cond_expr, part_cond_expr);
7145       else
7146         *cond_expr = part_cond_expr;
7147     }
7148     if (vect_print_dump_info (REPORT_VECTORIZED_LOOPS))
7149       fprintf (vect_dump, "created %u versioning for alias checks.\n",
7150                VEC_length (ddr_p, may_alias_ddrs));
7151
7152 }
7153
7154 /* Function vect_loop_versioning.
7155
7156    If the loop has data references that may or may not be aligned or/and
7157    has data reference relations whose independence was not proven then
7158    two versions of the loop need to be generated, one which is vectorized
7159    and one which isn't.  A test is then generated to control which of the
7160    loops is executed.  The test checks for the alignment of all of the
7161    data references that may or may not be aligned.  An additional
7162    sequence of runtime tests is generated for each pairs of DDRs whose
7163    independence was not proven.  The vectorized version of loop is
7164    executed only if both alias and alignment tests are passed.
7165
7166    The test generated to check which version of loop is executed
7167    is modified to also check for profitability as indicated by the
7168    cost model initially.  */
7169
7170 static void
7171 vect_loop_versioning (loop_vec_info loop_vinfo)
7172 {
7173   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7174   struct loop *nloop;
7175   tree cond_expr = NULL_TREE;
7176   tree cond_expr_stmt_list = NULL_TREE;
7177   basic_block condition_bb;
7178   block_stmt_iterator cond_exp_bsi;
7179   basic_block merge_bb;
7180   basic_block new_exit_bb;
7181   edge new_exit_e, e;
7182   tree orig_phi, new_phi, arg;
7183   unsigned prob = 4 * REG_BR_PROB_BASE / 5;
7184   tree gimplify_stmt_list;
7185   tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
7186   int min_profitable_iters = 0;
7187   unsigned int th;
7188
7189   /* Get profitability threshold for vectorized loop.  */
7190   min_profitable_iters = LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo);
7191
7192   th = conservative_cost_threshold (loop_vinfo,
7193                                     min_profitable_iters);
7194
7195   cond_expr =
7196     build2 (GT_EXPR, boolean_type_node, scalar_loop_iters,
7197             build_int_cst (TREE_TYPE (scalar_loop_iters), th));
7198
7199   cond_expr = force_gimple_operand (cond_expr, &cond_expr_stmt_list,
7200                                     false, NULL_TREE);
7201
7202   if (VEC_length (tree, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo)))
7203       vect_create_cond_for_align_checks (loop_vinfo, &cond_expr,
7204                                          &cond_expr_stmt_list);
7205
7206   if (VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
7207     vect_create_cond_for_alias_checks (loop_vinfo, &cond_expr,
7208                                        &cond_expr_stmt_list);
7209
7210   cond_expr =
7211     fold_build2 (NE_EXPR, boolean_type_node, cond_expr, integer_zero_node);
7212   cond_expr =
7213     force_gimple_operand (cond_expr, &gimplify_stmt_list, true,
7214                           NULL_TREE);
7215   append_to_statement_list (gimplify_stmt_list, &cond_expr_stmt_list);
7216
7217   initialize_original_copy_tables ();
7218   nloop = loop_version (loop, cond_expr, &condition_bb,
7219                         prob, prob, REG_BR_PROB_BASE - prob, true);
7220   free_original_copy_tables();
7221
7222   /* Loop versioning violates an assumption we try to maintain during
7223      vectorization - that the loop exit block has a single predecessor.
7224      After versioning, the exit block of both loop versions is the same
7225      basic block (i.e. it has two predecessors). Just in order to simplify
7226      following transformations in the vectorizer, we fix this situation
7227      here by adding a new (empty) block on the exit-edge of the loop,
7228      with the proper loop-exit phis to maintain loop-closed-form.  */
7229
7230   merge_bb = single_exit (loop)->dest;
7231   gcc_assert (EDGE_COUNT (merge_bb->preds) == 2);
7232   new_exit_bb = split_edge (single_exit (loop));
7233   new_exit_e = single_exit (loop);
7234   e = EDGE_SUCC (new_exit_bb, 0);
7235
7236   for (orig_phi = phi_nodes (merge_bb); orig_phi;
7237         orig_phi = PHI_CHAIN (orig_phi))
7238     {
7239       new_phi = create_phi_node (SSA_NAME_VAR (PHI_RESULT (orig_phi)),
7240                                   new_exit_bb);
7241       arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, e);
7242       add_phi_arg (new_phi, arg, new_exit_e);
7243       SET_PHI_ARG_DEF (orig_phi, e->dest_idx, PHI_RESULT (new_phi));
7244     }
7245
7246   /* End loop-exit-fixes after versioning.  */
7247
7248   update_ssa (TODO_update_ssa);
7249   if (cond_expr_stmt_list)
7250     {
7251       cond_exp_bsi = bsi_last (condition_bb);
7252       bsi_insert_before (&cond_exp_bsi, cond_expr_stmt_list, BSI_SAME_STMT);
7253     }
7254 }
7255
7256 /* Remove a group of stores (for SLP or interleaving), free their
7257    stmt_vec_info.  */
7258
7259 static void
7260 vect_remove_stores (tree first_stmt)
7261 {
7262   stmt_ann_t ann;
7263   tree next = first_stmt;
7264   tree tmp;
7265   stmt_vec_info next_stmt_info;
7266   block_stmt_iterator next_si;
7267
7268   while (next)
7269     {
7270       /* Free the attached stmt_vec_info and remove the stmt.  */
7271       next_si = bsi_for_stmt (next);
7272       bsi_remove (&next_si, true);
7273       next_stmt_info = vinfo_for_stmt (next);
7274       ann = stmt_ann (next);
7275       tmp = DR_GROUP_NEXT_DR (next_stmt_info);
7276       free (next_stmt_info);
7277       set_stmt_info (ann, NULL);
7278       next = tmp;
7279     }
7280 }
7281
7282
7283 /* Vectorize SLP instance tree in postorder.  */
7284
7285 static bool
7286 vect_schedule_slp_instance (slp_tree node, unsigned int vec_stmts_size)
7287 {
7288   tree stmt;
7289   bool strided_store, is_store;
7290   block_stmt_iterator si;
7291   stmt_vec_info stmt_info;
7292
7293   if (!node)
7294     return false;
7295
7296   vect_schedule_slp_instance (SLP_TREE_LEFT (node), vec_stmts_size);
7297   vect_schedule_slp_instance (SLP_TREE_RIGHT (node), vec_stmts_size);
7298
7299   stmt = VEC_index(tree, SLP_TREE_SCALAR_STMTS (node), 0);
7300   stmt_info = vinfo_for_stmt (stmt);
7301   SLP_TREE_VEC_STMTS (node) = VEC_alloc (tree, heap, vec_stmts_size);
7302   SLP_TREE_NUMBER_OF_VEC_STMTS (node) = vec_stmts_size;
7303
7304   if (vect_print_dump_info (REPORT_DETAILS))
7305     {
7306       fprintf (vect_dump, "------>vectorizing SLP node starting from: ");
7307       print_generic_expr (vect_dump, stmt, TDF_SLIM);
7308     }
7309
7310   si = bsi_for_stmt (stmt);
7311   is_store = vect_transform_stmt (stmt, &si, &strided_store, node);
7312   if (is_store)
7313     {
7314       if (DR_GROUP_FIRST_DR (stmt_info))
7315         /* If IS_STORE is TRUE, the vectorization of the
7316            interleaving chain was completed - free all the stores in
7317            the chain.  */
7318         vect_remove_stores (DR_GROUP_FIRST_DR (stmt_info));
7319       else
7320         /* FORNOW: SLP originates only from strided stores.  */
7321         gcc_unreachable ();
7322
7323       return true;
7324     }
7325
7326   /* FORNOW: SLP originates only from strided stores.  */
7327   return false;
7328 }
7329
7330
7331 static bool
7332 vect_schedule_slp (loop_vec_info loop_vinfo, unsigned int nunits)
7333 {
7334   VEC (slp_instance, heap) *slp_instances =
7335     LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
7336   slp_instance instance;
7337   unsigned int vec_stmts_size;
7338   unsigned int group_size, i;
7339   unsigned int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
7340   bool is_store = false;
7341
7342   for (i = 0; VEC_iterate (slp_instance, slp_instances, i, instance); i++)
7343     {
7344       group_size = SLP_INSTANCE_GROUP_SIZE (instance);
7345       /* For each SLP instance calculate number of vector stmts to be created
7346          for the scalar stmts in each node of the SLP tree. Number of vector
7347          elements in one vector iteration is the number of scalar elements in
7348          one scalar iteration (GROUP_SIZE) multiplied by VF divided by vector
7349          size.  */
7350       vec_stmts_size = vectorization_factor * group_size / nunits;
7351
7352       /* Schedule the tree of INSTANCE.  */
7353       is_store = vect_schedule_slp_instance (SLP_INSTANCE_TREE (instance),
7354                                              vec_stmts_size);
7355
7356       if (vect_print_dump_info (REPORT_VECTORIZED_LOOPS)
7357           || vect_print_dump_info (REPORT_UNVECTORIZED_LOOPS))
7358         fprintf (vect_dump, "vectorizing stmts using SLP.");
7359     }
7360
7361   return is_store;
7362 }
7363
7364 /* Function vect_transform_loop.
7365
7366    The analysis phase has determined that the loop is vectorizable.
7367    Vectorize the loop - created vectorized stmts to replace the scalar
7368    stmts in the loop, and update the loop exit condition.  */
7369
7370 void
7371 vect_transform_loop (loop_vec_info loop_vinfo)
7372 {
7373   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7374   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
7375   int nbbs = loop->num_nodes;
7376   block_stmt_iterator si, next_si;
7377   int i;
7378   tree ratio = NULL;
7379   int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
7380   bool strided_store;
7381   bool slp_scheduled = false;
7382   unsigned int nunits;
7383
7384   if (vect_print_dump_info (REPORT_DETAILS))
7385     fprintf (vect_dump, "=== vec_transform_loop ===");
7386
7387   if (VEC_length (tree, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo))
7388       || VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
7389     vect_loop_versioning (loop_vinfo);
7390
7391   /* CHECKME: we wouldn't need this if we called update_ssa once
7392      for all loops.  */
7393   bitmap_zero (vect_memsyms_to_rename);
7394
7395   /* Peel the loop if there are data refs with unknown alignment.
7396      Only one data ref with unknown store is allowed.  */
7397
7398   if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
7399     vect_do_peeling_for_alignment (loop_vinfo);
7400
7401   /* If the loop has a symbolic number of iterations 'n' (i.e. it's not a
7402      compile time constant), or it is a constant that doesn't divide by the
7403      vectorization factor, then an epilog loop needs to be created.
7404      We therefore duplicate the loop: the original loop will be vectorized,
7405      and will compute the first (n/VF) iterations. The second copy of the loop
7406      will remain scalar and will compute the remaining (n%VF) iterations.
7407      (VF is the vectorization factor).  */
7408
7409   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
7410       || (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
7411           && LOOP_VINFO_INT_NITERS (loop_vinfo) % vectorization_factor != 0))
7412     vect_do_peeling_for_loop_bound (loop_vinfo, &ratio);
7413   else
7414     ratio = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
7415                 LOOP_VINFO_INT_NITERS (loop_vinfo) / vectorization_factor);
7416
7417   /* 1) Make sure the loop header has exactly two entries
7418      2) Make sure we have a preheader basic block.  */
7419
7420   gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
7421
7422   split_edge (loop_preheader_edge (loop));
7423
7424   /* FORNOW: the vectorizer supports only loops which body consist
7425      of one basic block (header + empty latch). When the vectorizer will
7426      support more involved loop forms, the order by which the BBs are
7427      traversed need to be reconsidered.  */
7428
7429   for (i = 0; i < nbbs; i++)
7430     {
7431       basic_block bb = bbs[i];
7432       stmt_vec_info stmt_info;
7433       tree phi;
7434
7435       for (phi = phi_nodes (bb); phi; phi = PHI_CHAIN (phi))
7436         {
7437           if (vect_print_dump_info (REPORT_DETAILS))
7438             {
7439               fprintf (vect_dump, "------>vectorizing phi: ");
7440               print_generic_expr (vect_dump, phi, TDF_SLIM);
7441             }
7442           stmt_info = vinfo_for_stmt (phi);
7443           if (!stmt_info)
7444             continue;
7445
7446           if (!STMT_VINFO_RELEVANT_P (stmt_info)
7447               && !STMT_VINFO_LIVE_P (stmt_info))
7448             continue;
7449
7450           if ((TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
7451                 != (unsigned HOST_WIDE_INT) vectorization_factor)
7452               && vect_print_dump_info (REPORT_DETAILS))
7453             fprintf (vect_dump, "multiple-types.");
7454
7455           if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
7456             {
7457               if (vect_print_dump_info (REPORT_DETAILS))
7458                 fprintf (vect_dump, "transform phi.");
7459               vect_transform_stmt (phi, NULL, NULL, NULL);
7460             }
7461         }
7462
7463       for (si = bsi_start (bb); !bsi_end_p (si);)
7464         {
7465           tree stmt = bsi_stmt (si);
7466           bool is_store;
7467
7468           if (vect_print_dump_info (REPORT_DETAILS))
7469             {
7470               fprintf (vect_dump, "------>vectorizing statement: ");
7471               print_generic_expr (vect_dump, stmt, TDF_SLIM);
7472             }
7473
7474           stmt_info = vinfo_for_stmt (stmt);
7475
7476           /* vector stmts created in the outer-loop during vectorization of
7477              stmts in an inner-loop may not have a stmt_info, and do not
7478              need to be vectorized.  */
7479           if (!stmt_info)
7480             {
7481               bsi_next (&si);
7482               continue;
7483             }
7484
7485           if (!STMT_VINFO_RELEVANT_P (stmt_info)
7486               && !STMT_VINFO_LIVE_P (stmt_info))
7487             {
7488               bsi_next (&si);
7489               continue;
7490             }
7491
7492           gcc_assert (STMT_VINFO_VECTYPE (stmt_info));
7493           nunits =
7494             (unsigned int) TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
7495           if (!STMT_SLP_TYPE (stmt_info)
7496               && nunits != (unsigned int) vectorization_factor
7497               && vect_print_dump_info (REPORT_DETAILS))
7498             /* For SLP VF is set according to unrolling factor, and not to
7499                vector size, hence for SLP this print is not valid.  */
7500             fprintf (vect_dump, "multiple-types.");
7501
7502           /* SLP. Schedule all the SLP instances when the first SLP stmt is
7503              reached.  */
7504           if (STMT_SLP_TYPE (stmt_info))
7505             {
7506               if (!slp_scheduled)
7507                 {
7508                   slp_scheduled = true;
7509
7510                   if (vect_print_dump_info (REPORT_DETAILS))
7511                     fprintf (vect_dump, "=== scheduling SLP instances ===");
7512
7513                   is_store = vect_schedule_slp (loop_vinfo, nunits);
7514
7515                   /* IS_STORE is true if STMT is a store. Stores cannot be of
7516                      hybrid SLP type. They are removed in
7517                      vect_schedule_slp_instance and their vinfo is destroyed. */
7518                   if (is_store)
7519                     {
7520                       bsi_next (&si);
7521                       continue;
7522                     }
7523                 }
7524
7525               /* Hybrid SLP stmts must be vectorized in addition to SLP.  */
7526               if (PURE_SLP_STMT (stmt_info))
7527                 {
7528                   bsi_next (&si);
7529                   continue;
7530                 }
7531             }
7532
7533           /* -------- vectorize statement ------------ */
7534           if (vect_print_dump_info (REPORT_DETAILS))
7535             fprintf (vect_dump, "transform statement.");
7536
7537           strided_store = false;
7538           is_store = vect_transform_stmt (stmt, &si, &strided_store, NULL);
7539           if (is_store)
7540             {
7541               stmt_ann_t ann;
7542               if (STMT_VINFO_STRIDED_ACCESS (stmt_info))
7543                 {
7544                   /* Interleaving. If IS_STORE is TRUE, the vectorization of the
7545                      interleaving chain was completed - free all the stores in
7546                      the chain.  */
7547                   tree next = DR_GROUP_FIRST_DR (stmt_info);
7548                   tree tmp;
7549                   stmt_vec_info next_stmt_info;
7550
7551                   while (next)
7552                     {
7553                       next_si = bsi_for_stmt (next);
7554                       next_stmt_info = vinfo_for_stmt (next);
7555                       /* Free the attached stmt_vec_info and remove the stmt.  */
7556                       ann = stmt_ann (next);
7557                       tmp = DR_GROUP_NEXT_DR (next_stmt_info);
7558                       free (next_stmt_info);
7559                       set_stmt_info (ann, NULL);
7560                       bsi_remove (&next_si, true);
7561                       next = tmp;
7562                     }
7563                   bsi_remove (&si, true);
7564                   continue;
7565                 }
7566               else
7567                 {
7568                   /* Free the attached stmt_vec_info and remove the stmt.  */
7569                   ann = stmt_ann (stmt);
7570                   free (stmt_info);
7571                   set_stmt_info (ann, NULL);
7572                   bsi_remove (&si, true);
7573                   continue;
7574                 }
7575             }
7576           bsi_next (&si);
7577         }                       /* stmts in BB */
7578     }                           /* BBs in loop */
7579
7580   slpeel_make_loop_iterate_ntimes (loop, ratio);
7581
7582   mark_set_for_renaming (vect_memsyms_to_rename);
7583
7584   /* The memory tags and pointers in vectorized statements need to
7585      have their SSA forms updated.  FIXME, why can't this be delayed
7586      until all the loops have been transformed?  */
7587   update_ssa (TODO_update_ssa);
7588
7589   if (vect_print_dump_info (REPORT_VECTORIZED_LOOPS))
7590     fprintf (vect_dump, "LOOP VECTORIZED.");
7591   if (loop->inner && vect_print_dump_info (REPORT_VECTORIZED_LOOPS))
7592     fprintf (vect_dump, "OUTER LOOP VECTORIZED.");
7593 }