int vec_inside_cost = 0;
int vec_outside_cost = 0;
int scalar_single_iter_cost = 0;
+ int scalar_outside_cost = 0;
+ bool runtime_test = false;
int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
int nbbs = loop->num_nodes;
- int byte_misalign;
+ int byte_misalign = LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo);
+ int peel_guard_costs = 0;
int innerloop_iters = 0, factor;
VEC (slp_instance, heap) *slp_instances;
slp_instance instance;
/* Cost model disabled. */
if (!flag_vect_cost_model)
{
- if (vect_print_dump_info (REPORT_DETAILS))
+ if (vect_print_dump_info (REPORT_COST))
fprintf (vect_dump, "cost model disabled.");
return 0;
}
- /* Requires loop versioning tests to handle misalignment.
- FIXME: Make cost depend on number of stmts in may_misalign list. */
+ /* If the number of iterations is unknown, or the
+ peeling-for-misalignment amount is unknown, we will have to generate
+ a runtime test to test the loop count against the threshold. */
+ if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
+ || (byte_misalign < 0))
+ runtime_test = true;
+
+ /* Requires loop versioning tests to handle misalignment. */
if (VEC_length (tree, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo)))
{
- vec_outside_cost += TARG_COND_BRANCH_COST;
- if (vect_print_dump_info (REPORT_DETAILS))
+ /* FIXME: Make cost depend on complexity of individual check. */
+ vec_outside_cost +=
+ VEC_length (tree, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));
+ if (vect_print_dump_info (REPORT_COST))
+ fprintf (vect_dump, "cost model: Adding cost of checks for loop "
+ "versioning to treat misalignment.\n");
+ }
+
+ if (VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
+ {
+ /* FIXME: Make cost depend on complexity of individual check. */
+ vec_outside_cost +=
+ VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo));
+ if (vect_print_dump_info (REPORT_COST))
fprintf (vect_dump, "cost model: Adding cost of checks for loop "
- "versioning.\n");
+ "versioning aliasing.\n");
+ }
+
+ if (VEC_length (tree, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo))
+ || VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
+ {
+ vec_outside_cost += TARG_COND_TAKEN_BRANCH_COST;
}
/* Count statements in scalar loop. Using this as scalar cost for a single
factor = 1;
for (si = bsi_start (bb); !bsi_end_p (si); bsi_next (&si))
- {
- tree stmt = bsi_stmt (si);
- stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
- if (!STMT_VINFO_RELEVANT_P (stmt_info)
- && !STMT_VINFO_LIVE_P (stmt_info))
- continue;
- scalar_single_iter_cost += cost_for_stmt (stmt) * factor;
- vec_inside_cost += STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) * factor;
+ {
+ tree stmt = bsi_stmt (si);
+ stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
+ /* Skip stmts that are not vectorized inside the loop. */
+ if (!STMT_VINFO_RELEVANT_P (stmt_info)
+ && STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def)
+ continue;
+ scalar_single_iter_cost += cost_for_stmt (stmt) * factor;
+ vec_inside_cost += STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) * factor;
/* FIXME: for stmts in the inner-loop in outer-loop vectorization,
some of the "outside" costs are generated inside the outer-loop. */
- vec_outside_cost += STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info);
- }
+ vec_outside_cost += STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info);
+ }
}
/* Add additional cost for the peeled instructions in prologue and epilogue
loop.
FORNOW: If we dont know the value of peel_iters for prologue or epilogue
- at compile-time - we assume it's (vf-1)/2 (the worst would be vf-1).
+ at compile-time - we assume it's vf/2 (the worst would be vf-1).
TODO: Build an expression that represents peel_iters for prologue and
epilogue to be used in a run-time test. */
- byte_misalign = LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo);
-
if (byte_misalign < 0)
{
- peel_iters_prologue = (vf - 1)/2;
- if (vect_print_dump_info (REPORT_DETAILS))
+ peel_iters_prologue = vf/2;
+ if (vect_print_dump_info (REPORT_COST))
fprintf (vect_dump, "cost model: "
- "prologue peel iters set to (vf-1)/2.");
+ "prologue peel iters set to vf/2.");
/* If peeling for alignment is unknown, loop bound of main loop becomes
unknown. */
- peel_iters_epilogue = (vf - 1)/2;
- if (vect_print_dump_info (REPORT_DETAILS))
+ peel_iters_epilogue = vf/2;
+ if (vect_print_dump_info (REPORT_COST))
fprintf (vect_dump, "cost model: "
- "epilogue peel iters set to (vf-1)/2 because "
+ "epilogue peel iters set to vf/2 because "
"peeling for alignment is unknown .");
+
+ /* If peeled iterations are unknown, count a taken branch and a not taken
+ branch per peeled loop. Even if scalar loop iterations are known,
+ vector iterations are not known since peeled prologue iterations are
+ not known. Hence guards remain the same. */
+ peel_guard_costs += 2 * (TARG_COND_TAKEN_BRANCH_COST
+ + TARG_COND_NOT_TAKEN_BRANCH_COST);
+
}
else
{
if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
{
- peel_iters_epilogue = (vf - 1)/2;
- if (vect_print_dump_info (REPORT_DETAILS))
+ peel_iters_epilogue = vf/2;
+ if (vect_print_dump_info (REPORT_COST))
fprintf (vect_dump, "cost model: "
- "epilogue peel iters set to (vf-1)/2 because "
+ "epilogue peel iters set to vf/2 because "
"loop iterations are unknown .");
+
+ /* If peeled iterations are known but number of scalar loop
+ iterations are unknown, count a taken branch per peeled loop. */
+ peel_guard_costs += 2 * TARG_COND_TAKEN_BRANCH_COST;
+
}
else
{
}
}
- /* Requires a prologue loop when peeling to handle misalignment. Add cost of
- two guards, one for the peeled loop and one for the vector loop. */
-
- if (peel_iters_prologue)
- {
- vec_outside_cost += 2 * TARG_COND_BRANCH_COST;
- if (vect_print_dump_info (REPORT_DETAILS))
- fprintf (vect_dump, "cost model: Adding cost of checks for "
- "prologue.\n");
- }
-
- /* Requires an epilogue loop to finish up remaining iterations after vector
- loop. Add cost of two guards, one for the peeled loop and one for the
- vector loop. */
-
- if (peel_iters_epilogue
- || !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
- || LOOP_VINFO_INT_NITERS (loop_vinfo) % vf)
- {
- vec_outside_cost += 2 * TARG_COND_BRANCH_COST;
- if (vect_print_dump_info (REPORT_DETAILS))
- fprintf (vect_dump, "cost model : Adding cost of checks for "
- "epilogue.\n");
- }
-
vec_outside_cost += (peel_iters_prologue * scalar_single_iter_cost)
- + (peel_iters_epilogue * scalar_single_iter_cost);
-
- /* Allow targets add additional (outside-of-loop) costs. FORNOW, the only
- information we provide for the target is whether testing against the
- threshold involves a runtime test. */
- if (targetm.vectorize.builtin_vectorization_cost)
- {
- bool runtime_test = false;
-
- /* If the number of iterations is unknown, or the
- peeling-for-misalignment amount is unknown, we eill have to generate
- a runtime test to test the loop count against the threshold. */
- if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
- || (byte_misalign < 0))
- runtime_test = true;
- vec_outside_cost +=
- targetm.vectorize.builtin_vectorization_cost (runtime_test);
- if (vect_print_dump_info (REPORT_DETAILS))
- fprintf (vect_dump, "cost model : Adding target out-of-loop cost = %d",
- targetm.vectorize.builtin_vectorization_cost (runtime_test));
+ + (peel_iters_epilogue * scalar_single_iter_cost)
+ + peel_guard_costs;
+
+ /* FORNOW: The scalar outside cost is incremented in one of the
+ following ways:
+
+ 1. The vectorizer checks for alignment and aliasing and generates
+ a condition that allows dynamic vectorization. A cost model
+ check is ANDED with the versioning condition. Hence scalar code
+ path now has the added cost of the versioning check.
+
+ if (cost > th & versioning_check)
+ jmp to vector code
+
+ Hence run-time scalar is incremented by not-taken branch cost.
+
+ 2. The vectorizer then checks if a prologue is required. If the
+ cost model check was not done before during versioning, it has to
+ be done before the prologue check.
+
+ if (cost <= th)
+ prologue = scalar_iters
+ if (prologue == 0)
+ jmp to vector code
+ else
+ execute prologue
+ if (prologue == num_iters)
+ go to exit
+
+ Hence the run-time scalar cost is incremented by a taken branch,
+ plus a not-taken branch, plus a taken branch cost.
+
+ 3. The vectorizer then checks if an epilogue is required. If the
+ cost model check was not done before during prologue check, it
+ has to be done with the epilogue check.
+
+ if (prologue == 0)
+ jmp to vector code
+ else
+ execute prologue
+ if (prologue == num_iters)
+ go to exit
+ vector code:
+ if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
+ jmp to epilogue
+
+ Hence the run-time scalar cost should be incremented by 2 taken
+ branches.
+
+ TODO: The back end may reorder the BBs differently and reverse
+ conditions/branch directions. Change the estimates below to
+ something more reasonable. */
+
+ if (runtime_test)
+ {
+ /* Cost model check occurs at versioning. */
+ if (VEC_length (tree, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo))
+ || VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
+ scalar_outside_cost += TARG_COND_NOT_TAKEN_BRANCH_COST;
+ else
+ {
+ /* Cost model occurs at prologue generation. */
+ if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
+ scalar_outside_cost += 2 * TARG_COND_TAKEN_BRANCH_COST
+ + TARG_COND_NOT_TAKEN_BRANCH_COST;
+ /* Cost model check occurs at epilogue generation. */
+ else
+ scalar_outside_cost += 2 * TARG_COND_TAKEN_BRANCH_COST;
+ }
}
/* Add SLP costs. */
/* Calculate number of iterations required to make the vector version
profitable, relative to the loop bodies only. The following condition
- must hold true: ((SIC*VF)-VIC)*niters > VOC*VF, where
+ must hold true:
+ SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
+ where
SIC = scalar iteration cost, VIC = vector iteration cost,
- VOC = vector outside cost and VF = vectorization factor. */
+ VOC = vector outside cost, VF = vectorization factor,
+ PL_ITERS = prologue iterations, EP_ITERS = epilogue iterations,
+ SOC = scalar outside cost for run time cost model check. */
if ((scalar_single_iter_cost * vf) > vec_inside_cost)
{
- if (vec_outside_cost == 0)
+ if (vec_outside_cost <= 0)
min_profitable_iters = 1;
else
{
- min_profitable_iters = (vec_outside_cost * vf)
+ min_profitable_iters = ((vec_outside_cost - scalar_outside_cost) * vf
+ - vec_inside_cost * peel_iters_prologue
+ - vec_inside_cost * peel_iters_epilogue)
/ ((scalar_single_iter_cost * vf)
- vec_inside_cost);
if ((scalar_single_iter_cost * vf * min_profitable_iters)
<= ((vec_inside_cost * min_profitable_iters)
- + (vec_outside_cost * vf)))
+ + ((vec_outside_cost - scalar_outside_cost) * vf)))
min_profitable_iters++;
}
}
/* vector version will never be profitable. */
else
{
- if (vect_print_dump_info (REPORT_DETAILS))
+ if (vect_print_dump_info (REPORT_COST))
fprintf (vect_dump, "cost model: vector iteration cost = %d "
"is divisible by scalar iteration cost = %d by a factor "
"greater than or equal to the vectorization factor = %d .",
return -1;
}
- if (vect_print_dump_info (REPORT_DETAILS))
+ if (vect_print_dump_info (REPORT_COST))
{
fprintf (vect_dump, "Cost model analysis: \n");
fprintf (vect_dump, " Vector inside of loop cost: %d\n",
vec_inside_cost);
fprintf (vect_dump, " Vector outside of loop cost: %d\n",
vec_outside_cost);
- fprintf (vect_dump, " Scalar cost: %d\n", scalar_single_iter_cost);
+ fprintf (vect_dump, " Scalar iteration cost: %d\n",
+ scalar_single_iter_cost);
+ fprintf (vect_dump, " Scalar outside cost: %d\n", scalar_outside_cost);
fprintf (vect_dump, " prologue iterations: %d\n",
peel_iters_prologue);
fprintf (vect_dump, " epilogue iterations: %d\n",
peel_iters_epilogue);
fprintf (vect_dump, " Calculated minimum iters for profitability: %d\n",
min_profitable_iters);
- fprintf (vect_dump, " Actual minimum iters for profitability: %d\n",
- min_profitable_iters < vf ? vf : min_profitable_iters);
}
min_profitable_iters =
if (niters <= min_profitable_iters)
then skip the vectorized loop. */
min_profitable_iters--;
+
+ if (vect_print_dump_info (REPORT_COST))
+ fprintf (vect_dump, " Profitability threshold = %d\n",
+ min_profitable_iters);
+
return min_profitable_iters;
}
generated within the strip-mine loop, the initial definition before
the loop, and the epilogue code that must be generated. */
-static void
+static bool
vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
int ncopies)
{
reduction_op = TREE_OPERAND (operation, op_type-1);
vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
+ if (!vectype)
+ {
+ if (vect_print_dump_info (REPORT_COST))
+ {
+ fprintf (vect_dump, "unsupported data-type ");
+ print_generic_expr (vect_dump, TREE_TYPE (reduction_op), TDF_SLIM);
+ }
+ return false;
+ }
+
mode = TYPE_MODE (vectype);
orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info) = outer_cost;
- if (vect_print_dump_info (REPORT_DETAILS))
+ if (vect_print_dump_info (REPORT_COST))
fprintf (vect_dump, "vect_model_reduction_cost: inside_cost = %d, "
"outside_cost = %d .", STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info),
STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info));
+
+ return true;
}
/* prologue cost for vec_init and vec_step. */
STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info) = 2 * TARG_SCALAR_TO_VEC_COST;
- if (vect_print_dump_info (REPORT_DETAILS))
+ if (vect_print_dump_info (REPORT_COST))
fprintf (vect_dump, "vect_model_induction_cost: inside_cost = %d, "
"outside_cost = %d .", STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info),
STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info));
}
-/* Return addresses of the cost fields of SLP_NODE if it's not NULL, and of
- the stmt otherwise. */
-
-static inline void
-vect_get_cost_fields (stmt_vec_info stmt_info, slp_tree slp_node,
- int **inside_cost_field, int **outside_cost_field)
-{
- if (slp_node)
- {
- *inside_cost_field = &(SLP_TREE_INSIDE_OF_LOOP_COST (slp_node));
- *outside_cost_field = &(SLP_TREE_OUTSIDE_OF_LOOP_COST (slp_node));
- }
- else
- {
- *inside_cost_field = &(STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info));
- *outside_cost_field = &(STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info));
- }
-}
-
-
/* Function vect_model_simple_cost.
Models cost for simple operations, i.e. those that only emit ncopies of a
enum vect_def_type *dt, slp_tree slp_node)
{
int i;
- int *inside_cost_field, *outside_cost_field;
+ int inside_cost = 0, outside_cost = 0;
- /* Take addresses of relevant fields to update in the function. */
- vect_get_cost_fields (stmt_info, slp_node, &inside_cost_field,
- &outside_cost_field);
-
- *inside_cost_field = ncopies * TARG_VEC_STMT_COST;
+ inside_cost = ncopies * TARG_VEC_STMT_COST;
/* FORNOW: Assuming maximum 2 args per stmts. */
for (i = 0; i < 2; i++)
{
if (dt[i] == vect_constant_def || dt[i] == vect_invariant_def)
- *outside_cost_field += TARG_SCALAR_TO_VEC_COST;
+ outside_cost += TARG_SCALAR_TO_VEC_COST;
}
- if (vect_print_dump_info (REPORT_DETAILS))
+ if (vect_print_dump_info (REPORT_COST))
fprintf (vect_dump, "vect_model_simple_cost: inside_cost = %d, "
- "outside_cost = %d .", *inside_cost_field, *outside_cost_field);
+ "outside_cost = %d .", inside_cost, outside_cost);
+
+ /* Set the costs either in STMT_INFO or SLP_NODE (if exists). */
+ stmt_vinfo_set_inside_of_loop_cost (stmt_info, slp_node, inside_cost);
+ stmt_vinfo_set_outside_of_loop_cost (stmt_info, slp_node, outside_cost);
}
vect_model_store_cost (stmt_vec_info stmt_info, int ncopies,
enum vect_def_type dt, slp_tree slp_node)
{
- int cost = 0;
int group_size;
- int *inside_cost_field, *outside_cost_field;
-
- /* Take addresses of relevant fields to update in the function. */
- vect_get_cost_fields (stmt_info, slp_node, &inside_cost_field,
- &outside_cost_field);
+ int inside_cost = 0, outside_cost = 0;
if (dt == vect_constant_def || dt == vect_invariant_def)
- *outside_cost_field = TARG_SCALAR_TO_VEC_COST;
+ outside_cost = TARG_SCALAR_TO_VEC_COST;
/* Strided access? */
if (DR_GROUP_FIRST_DR (stmt_info))
if (group_size > 1)
{
/* Uses a high and low interleave operation for each needed permute. */
- cost = ncopies * exact_log2(group_size) * group_size
+ inside_cost = ncopies * exact_log2(group_size) * group_size
* TARG_VEC_STMT_COST;
- if (vect_print_dump_info (REPORT_DETAILS))
+ if (vect_print_dump_info (REPORT_COST))
fprintf (vect_dump, "vect_model_store_cost: strided group_size = %d .",
group_size);
}
/* Costs of the stores. */
- cost += ncopies * TARG_VEC_STORE_COST;
+ inside_cost += ncopies * TARG_VEC_STORE_COST;
- *inside_cost_field = cost;
-
- if (vect_print_dump_info (REPORT_DETAILS))
+ if (vect_print_dump_info (REPORT_COST))
fprintf (vect_dump, "vect_model_store_cost: inside_cost = %d, "
- "outside_cost = %d .", *inside_cost_field, *outside_cost_field);
+ "outside_cost = %d .", inside_cost, outside_cost);
+
+ /* Set the costs either in STMT_INFO or SLP_NODE (if exists). */
+ stmt_vinfo_set_inside_of_loop_cost (stmt_info, slp_node, inside_cost);
+ stmt_vinfo_set_outside_of_loop_cost (stmt_info, slp_node, outside_cost);
}
vect_model_load_cost (stmt_vec_info stmt_info, int ncopies, slp_tree slp_node)
{
- int inner_cost = 0;
int group_size;
int alignment_support_cheme;
tree first_stmt;
struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info), *first_dr;
- int *inside_cost_field, *outside_cost_field;
-
- /* Take addresses of relevant fields to update in the function. */
- vect_get_cost_fields (stmt_info, slp_node, &inside_cost_field,
- &outside_cost_field);
+ int inside_cost = 0, outside_cost = 0;
/* Strided accesses? */
first_stmt = DR_GROUP_FIRST_DR (stmt_info);
if (group_size > 1)
{
/* Uses an even and odd extract operations for each needed permute. */
- inner_cost = ncopies * exact_log2(group_size) * group_size
- * TARG_VEC_STMT_COST;
+ inside_cost = ncopies * exact_log2(group_size) * group_size
+ * TARG_VEC_STMT_COST;
- if (vect_print_dump_info (REPORT_DETAILS))
+ if (vect_print_dump_info (REPORT_COST))
fprintf (vect_dump, "vect_model_load_cost: strided group_size = %d .",
group_size);
{
case dr_aligned:
{
- inner_cost += ncopies * TARG_VEC_LOAD_COST;
+ inside_cost += ncopies * TARG_VEC_LOAD_COST;
- if (vect_print_dump_info (REPORT_DETAILS))
+ if (vect_print_dump_info (REPORT_COST))
fprintf (vect_dump, "vect_model_load_cost: aligned.");
break;
case dr_unaligned_supported:
{
/* Here, we assign an additional cost for the unaligned load. */
- inner_cost += ncopies * TARG_VEC_UNALIGNED_LOAD_COST;
+ inside_cost += ncopies * TARG_VEC_UNALIGNED_LOAD_COST;
- if (vect_print_dump_info (REPORT_DETAILS))
+ if (vect_print_dump_info (REPORT_COST))
fprintf (vect_dump, "vect_model_load_cost: unaligned supported by "
"hardware.");
}
case dr_explicit_realign:
{
- inner_cost += ncopies * (2*TARG_VEC_LOAD_COST + TARG_VEC_STMT_COST);
+ inside_cost += ncopies * (2*TARG_VEC_LOAD_COST + TARG_VEC_STMT_COST);
/* FIXME: If the misalignment remains fixed across the iterations of
the containing loop, the following cost should be added to the
outside costs. */
if (targetm.vectorize.builtin_mask_for_load)
- inner_cost += TARG_VEC_STMT_COST;
+ inside_cost += TARG_VEC_STMT_COST;
break;
}
case dr_explicit_realign_optimized:
{
- int outer_cost = 0;
-
- if (vect_print_dump_info (REPORT_DETAILS))
+ if (vect_print_dump_info (REPORT_COST))
fprintf (vect_dump, "vect_model_load_cost: unaligned software "
"pipelined.");
if ((!DR_GROUP_FIRST_DR (stmt_info)) || group_size > 1 || slp_node)
{
- outer_cost = 2*TARG_VEC_STMT_COST;
+ outside_cost = 2*TARG_VEC_STMT_COST;
if (targetm.vectorize.builtin_mask_for_load)
- outer_cost += TARG_VEC_STMT_COST;
+ outside_cost += TARG_VEC_STMT_COST;
}
-
- *outside_cost_field = outer_cost;
- inner_cost += ncopies * (TARG_VEC_LOAD_COST + TARG_VEC_STMT_COST);
+ inside_cost += ncopies * (TARG_VEC_LOAD_COST + TARG_VEC_STMT_COST);
break;
}
default:
gcc_unreachable ();
}
-
- *inside_cost_field = inner_cost;
-
- if (vect_print_dump_info (REPORT_DETAILS))
+
+ if (vect_print_dump_info (REPORT_COST))
fprintf (vect_dump, "vect_model_load_cost: inside_cost = %d, "
- "outside_cost = %d .", *inside_cost_field, *outside_cost_field);
+ "outside_cost = %d .", inside_cost, outside_cost);
+ /* Set the costs either in STMT_INFO or SLP_NODE (if exists). */
+ stmt_vinfo_set_inside_of_loop_cost (stmt_info, slp_node, inside_cost);
+ stmt_vinfo_set_outside_of_loop_cost (stmt_info, slp_node, outside_cost);
}
bool is_store = false;
unsigned int number_of_vectors = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
VEC (tree, heap) *voprnds = VEC_alloc (tree, heap, number_of_vectors);
+ bool constant_p;
if (STMT_VINFO_DATA_REF (stmt_vinfo))
is_store = true;
number_of_copies = least_common_multiple (nunits, group_size) / group_size;
number_of_places_left_in_vector = nunits;
+ constant_p = true;
for (j = 0; j < number_of_copies; j++)
{
for (i = group_size - 1; VEC_iterate (tree, stmts, i, stmt); i--)
op = operation;
else
op = TREE_OPERAND (operation, op_num);
+ if (!CONSTANT_CLASS_P (op))
+ constant_p = false;
/* Create 'vect_ = {op0,op1,...,opn}'. */
t = tree_cons (NULL_TREE, op, t);
number_of_places_left_in_vector = nunits;
vector_type = get_vectype_for_scalar_type (TREE_TYPE (op));
- vec_cst = build_constructor_from_list (vector_type, t);
+ gcc_assert (vector_type);
+ if (constant_p)
+ vec_cst = build_vector (vector_type, t);
+ else
+ vec_cst = build_constructor_from_list (vector_type, t);
+ constant_p = true;
VEC_quick_push (tree, voprnds,
vect_init_vector (stmt, vec_cst, vector_type,
NULL));
}
-/* Get vectorized defintions from SLP_NODE that contains corresponding
+/* Get vectorized definitions from SLP_NODE that contains corresponding
vectorized def-stmts. */
static void
call vect_get_constant_vectors() to create vector stmts.
Otherwise, the def-stmts must be already vectorized and the vectorized stmts
must be stored in the LEFT/RIGHT node of SLP_NODE, and we call
- vect_get_slp_vect_defs() to retrieve them. */
+ vect_get_slp_vect_defs() to retrieve them.
+ If VEC_OPRNDS1 is NULL, don't get vector defs for the second operand (from
+ the right node). This is used when the second operand must remain scalar. */
static void
vect_get_slp_defs (slp_tree slp_node, VEC (tree,heap) **vec_oprnds0,
return;
operation = GIMPLE_STMT_OPERAND (first_stmt, 1);
- if (TREE_OPERAND_LENGTH (operation) == unary_op)
+ if (TREE_OPERAND_LENGTH (operation) == unary_op || !vec_oprnds1)
return;
*vec_oprnds1 = VEC_alloc (tree, heap,
loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
tree scalar_type = TREE_TYPE (PHI_RESULT_TREE (iv_phi));
- tree vectype = get_vectype_for_scalar_type (scalar_type);
- int nunits = TYPE_VECTOR_SUBPARTS (vectype);
+ tree vectype;
+ int nunits;
edge pe = loop_preheader_edge (loop);
struct loop *iv_loop;
basic_block new_bb;
int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
int i;
bool ok;
- int ncopies = vf / nunits;
+ int ncopies;
tree expr;
stmt_vec_info phi_info = vinfo_for_stmt (iv_phi);
bool nested_in_vect_loop = false;
block_stmt_iterator si;
basic_block bb = bb_for_stmt (iv_phi);
+ vectype = get_vectype_for_scalar_type (scalar_type);
+ gcc_assert (vectype);
+ nunits = TYPE_VECTOR_SUBPARTS (vectype);
+ ncopies = vf / nunits;
+
gcc_assert (phi_info);
gcc_assert (ncopies >= 1);
t = NULL_TREE;
for (i = 0; i < nunits; i++)
t = tree_cons (NULL_TREE, unshare_expr (new_name), t);
- vec = build_constructor_from_list (vectype, t);
+ gcc_assert (CONSTANT_CLASS_P (new_name));
+ vec = build_vector (vectype, t);
vec_step = vect_init_vector (iv_phi, vec, vectype, NULL);
t = NULL_TREE;
for (i = 0; i < nunits; i++)
t = tree_cons (NULL_TREE, unshare_expr (new_name), t);
- vec = build_constructor_from_list (vectype, t);
+ gcc_assert (CONSTANT_CLASS_P (new_name));
+ vec = build_vector (vectype, t);
vec_step = vect_init_vector (iv_phi, vec, vectype, NULL);
vec_def = induc_def;
t = tree_cons (NULL_TREE, op, t);
}
vector_type = get_vectype_for_scalar_type (TREE_TYPE (op));
+ gcc_assert (vector_type);
vec_cst = build_vector (vector_type, t);
return vect_init_vector (stmt, vec_cst, vector_type, NULL);
/* FIXME: use build_constructor directly. */
vector_type = get_vectype_for_scalar_type (TREE_TYPE (def));
+ gcc_assert (vector_type);
vec_inv = build_constructor_from_list (vector_type, t);
return vect_init_vector (stmt, vec_inv, vector_type, NULL);
}
vec_oprnd = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd);
VEC_quick_push (tree, *vec_oprnds0, vec_oprnd);
- if (vec_oprnds1)
+ if (vec_oprnds1 && *vec_oprnds1)
{
vec_oprnd = VEC_pop (tree, *vec_oprnds1);
vec_oprnd = vect_get_vec_def_for_stmt_copy (dt[1], vec_oprnd);
tree vector_type;
bool nested_in_vect_loop = false;
- gcc_assert (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type));
+ gcc_assert (POINTER_TYPE_P (type) || INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type));
if (nested_in_vect_loop_p (loop, stmt))
nested_in_vect_loop = true;
else
case WIDEN_SUM_EXPR:
case DOT_PROD_EXPR:
case PLUS_EXPR:
- if (nested_in_vect_loop)
- *adjustment_def = vecdef;
- else
- *adjustment_def = init_val;
- /* Create a vector of zeros for init_def. */
- if (INTEGRAL_TYPE_P (type))
- def_for_init = build_int_cst (type, 0);
+ if (nested_in_vect_loop)
+ *adjustment_def = vecdef;
else
+ *adjustment_def = init_val;
+ /* Create a vector of zeros for init_def. */
+ if (SCALAR_FLOAT_TYPE_P (type))
def_for_init = build_real (type, dconst0);
- for (i = nunits - 1; i >= 0; --i)
- t = tree_cons (NULL_TREE, def_for_init, t);
+ else
+ def_for_init = build_int_cst (type, 0);
+ for (i = nunits - 1; i >= 0; --i)
+ t = tree_cons (NULL_TREE, def_for_init, t);
vector_type = get_vectype_for_scalar_type (TREE_TYPE (def_for_init));
+ gcc_assert (vector_type);
init_def = build_vector (vector_type, t);
break;
op_type = TREE_OPERAND_LENGTH (operation);
reduction_op = TREE_OPERAND (operation, op_type-1);
vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
+ gcc_assert (vectype);
mode = TYPE_MODE (vectype);
/*** 1. Create the reduction def-use cycle ***/
return false;
scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
scalar_type = TREE_TYPE (scalar_dest);
+ if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
+ && !SCALAR_FLOAT_TYPE_P (scalar_type))
+ return false;
/* All uses but the last are expected to be defined in the loop.
The last use is the reduction variable. */
reduction variable, and get the tree-code from orig_stmt. */
orig_code = TREE_CODE (GIMPLE_STMT_OPERAND (orig_stmt, 1));
vectype = get_vectype_for_scalar_type (TREE_TYPE (def));
+ if (!vectype)
+ {
+ if (vect_print_dump_info (REPORT_DETAILS))
+ {
+ fprintf (vect_dump, "unsupported data-type ");
+ print_generic_expr (vect_dump, TREE_TYPE (def), TDF_SLIM);
+ }
+ return false;
+ }
+
vec_mode = TYPE_MODE (vectype);
}
else
if (!vec_stmt) /* transformation not required. */
{
STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
- vect_model_reduction_cost (stmt_info, epilog_reduc_code, ncopies);
+ if (!vect_model_reduction_cost (stmt_info, epilog_reduc_code, ncopies))
+ return false;
return true;
}
if (STMT_SLP_TYPE (stmt_info))
return false;
- /* FORNOW: not yet supported. */
- if (STMT_VINFO_LIVE_P (stmt_info))
- {
- if (vect_print_dump_info (REPORT_DETAILS))
- fprintf (vect_dump, "value used after loop.");
- return false;
- }
-
/* Is STMT a vectorizable call? */
if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
return false;
return false;
vectype_in = get_vectype_for_scalar_type (rhs_type);
+ if (!vectype_in)
+ return false;
nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
lhs_type = TREE_TYPE (GIMPLE_STMT_OPERAND (stmt, 0));
vectype_out = get_vectype_for_scalar_type (lhs_type);
+ if (!vectype_out)
+ return false;
nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
/* FORNOW */
if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
return false;
- if (STMT_VINFO_LIVE_P (stmt_info))
- {
- /* FORNOW: not yet supported. */
- if (vect_print_dump_info (REPORT_DETAILS))
- fprintf (vect_dump, "value used after loop.");
- return false;
- }
-
if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
return false;
op0 = TREE_OPERAND (operation, 0);
rhs_type = TREE_TYPE (op0);
vectype_in = get_vectype_for_scalar_type (rhs_type);
+ if (!vectype_in)
+ return false;
nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
lhs_type = TREE_TYPE (scalar_dest);
vectype_out = get_vectype_for_scalar_type (lhs_type);
+ if (!vectype_out)
+ return false;
nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
/* FORNOW */
if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
return false;
- /* FORNOW: not yet supported. */
- if (STMT_VINFO_LIVE_P (stmt_info))
- {
- if (vect_print_dump_info (REPORT_DETAILS))
- fprintf (vect_dump, "value used after loop.");
- return false;
- }
-
/* Is vectorizable assignment? */
if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
return false;
gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def);
- if (STMT_VINFO_LIVE_P (stmt_info))
- {
- /* FORNOW: not yet supported. */
- if (vect_print_dump_info (REPORT_DETAILS))
- fprintf (vect_dump, "value used after loop.");
- return false;
- }
-
if (TREE_CODE (phi) != PHI_NODE)
return false;
int j, i;
VEC(tree,heap) *vec_oprnds0 = NULL, *vec_oprnds1 = NULL;
tree vop0, vop1;
+ unsigned int k;
+ bool scalar_shift_arg = false;
/* FORNOW: SLP with multiple types is not supported. The SLP analysis verifies
this, so we can safely override NCOPIES with 1 here. */
if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
return false;
- /* FORNOW: not yet supported. */
- if (STMT_VINFO_LIVE_P (stmt_info))
- {
- if (vect_print_dump_info (REPORT_DETAILS))
- fprintf (vect_dump, "value used after loop.");
- return false;
- }
-
/* Is STMT a vectorizable binary/unary operation? */
if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
return false;
scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
vectype_out = get_vectype_for_scalar_type (TREE_TYPE (scalar_dest));
+ if (!vectype_out)
+ return false;
nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
if (nunits_out != nunits_in)
return false;
{
if (vect_print_dump_info (REPORT_DETAILS))
fprintf (vect_dump, "op not supported by target.");
+ /* Check only during analysis. */
if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
- || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
- < vect_min_worthwhile_factor (code))
+ || (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
+ < vect_min_worthwhile_factor (code)
+ && !vec_stmt))
return false;
if (vect_print_dump_info (REPORT_DETAILS))
fprintf (vect_dump, "proceeding using word mode.");
}
- /* Worthwhile without SIMD support? */
+ /* Worthwhile without SIMD support? Check only during analysis. */
if (!VECTOR_MODE_P (TYPE_MODE (vectype))
&& LOOP_VINFO_VECT_FACTOR (loop_vinfo)
- < vect_min_worthwhile_factor (code))
+ < vect_min_worthwhile_factor (code)
+ && !vec_stmt)
{
if (vect_print_dump_info (REPORT_DETAILS))
fprintf (vect_dump, "not worthwhile without SIMD support.");
/* Invariant argument is needed for a vector shift
by a scalar shift operand. */
optab_op2_mode = insn_data[icode].operand[2].mode;
- if (! (VECTOR_MODE_P (optab_op2_mode)
- || dt[1] == vect_constant_def
- || dt[1] == vect_invariant_def))
+ if (!VECTOR_MODE_P (optab_op2_mode))
{
- if (vect_print_dump_info (REPORT_DETAILS))
- fprintf (vect_dump, "operand mode requires invariant argument.");
- return false;
- }
+ if (dt[1] != vect_constant_def && dt[1] != vect_invariant_def)
+ {
+ if (vect_print_dump_info (REPORT_DETAILS))
+ fprintf (vect_dump, "operand mode requires invariant"
+ " argument.");
+ return false;
+ }
+
+ scalar_shift_arg = true;
+ }
}
if (!vec_stmt) /* transformation not required. */
/* Handle def. */
vec_dest = vect_create_destination_var (scalar_dest, vectype);
+ /* Allocate VECs for vector operands. In case of SLP, vector operands are
+ created in the previous stages of the recursion, so no allocation is
+ needed, except for the case of shift with scalar shift argument. In that
+ case we store the scalar operand in VEC_OPRNDS1 for every vector stmt to
+ be created to vectorize the SLP group, i.e., SLP_NODE->VEC_STMTS_SIZE.
+ In case of loop-based vectorization we allocate VECs of size 1. We
+ allocate VEC_OPRNDS1 only in case of binary operation. */
if (!slp_node)
{
vec_oprnds0 = VEC_alloc (tree, heap, 1);
if (op_type == binary_op)
- vec_oprnds1 = VEC_alloc (tree, heap, 1);
+ vec_oprnds1 = VEC_alloc (tree, heap, 1);
}
+ else if (scalar_shift_arg)
+ vec_oprnds1 = VEC_alloc (tree, heap, slp_node->vec_stmts_size);
/* In case the vectorization factor (VF) is bigger than the number
of elements that we can fit in a vectype (nunits), we have to generate
if (j == 0)
{
if (op_type == binary_op
- && (code == LSHIFT_EXPR || code == RSHIFT_EXPR)
- && !slp_node)
+ && (code == LSHIFT_EXPR || code == RSHIFT_EXPR))
{
/* Vector shl and shr insn patterns can be defined with scalar
operand 2 (shift operand). In this case, use constant or loop
fprintf (vect_dump, "operand 1 using scalar mode.");
vec_oprnd1 = op1;
VEC_quick_push (tree, vec_oprnds1, vec_oprnd1);
+ if (slp_node)
+ {
+ /* Store vec_oprnd1 for every vector stmt to be created
+ for SLP_NODE. We check during the analysis that all the
+ shift arguments are the same.
+ TODO: Allow different constants for different vector
+ stmts generated for an SLP instance. */
+ for (k = 0; k < slp_node->vec_stmts_size - 1; k++)
+ VEC_quick_push (tree, vec_oprnds1, vec_oprnd1);
+ }
}
}
+ /* vec_oprnd1 is available if operand 1 should be of a scalar-type
+ (a special case for certain kind of vector shifts); otherwise,
+ operand 1 should be of a vector type (the usual case). */
if (op_type == binary_op && !vec_oprnd1)
vect_get_vec_defs (op0, op1, stmt, &vec_oprnds0, &vec_oprnds1,
slp_node);
else
- vect_get_vec_defs (op0, NULL_TREE, stmt, &vec_oprnds0, &vec_oprnds1,
+ vect_get_vec_defs (op0, NULL_TREE, stmt, &vec_oprnds0, NULL,
slp_node);
}
else
if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
return false;
- /* FORNOW: not yet supported. */
- if (STMT_VINFO_LIVE_P (stmt_info))
- {
- if (vect_print_dump_info (REPORT_DETAILS))
- fprintf (vect_dump, "value used after loop.");
- return false;
- }
-
/* Is STMT a vectorizable type-demotion operation? */
if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
return false;
op0 = TREE_OPERAND (operation, 0);
vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op0));
+ if (!vectype_in)
+ return false;
nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
vectype_out = get_vectype_for_scalar_type (TREE_TYPE (scalar_dest));
+ if (!vectype_out)
+ return false;
nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
if (nunits_in != nunits_out / 2) /* FORNOW */
return false;
if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
return false;
- /* FORNOW: not yet supported. */
- if (STMT_VINFO_LIVE_P (stmt_info))
- {
- if (vect_print_dump_info (REPORT_DETAILS))
- fprintf (vect_dump, "value used after loop.");
- return false;
- }
-
/* Is STMT a vectorizable type-promotion operation? */
if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
return false;
op0 = TREE_OPERAND (operation, 0);
vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op0));
+ if (!vectype_in)
+ return false;
nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
vectype_out = get_vectype_for_scalar_type (TREE_TYPE (scalar_dest));
+ if (!vectype_out)
+ return false;
nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
if (nunits_out != nunits_in / 2) /* FORNOW */
return false;
int nunits = TYPE_VECTOR_SUBPARTS (vectype);
int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
int j;
- tree next_stmt, first_stmt;
+ tree next_stmt, first_stmt = NULL_TREE;
bool strided_store = false;
unsigned int group_size, i;
VEC(tree,heap) *dr_chain = NULL, *oprnds = NULL, *result_chain = NULL;
if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
return false;
- if (STMT_VINFO_LIVE_P (stmt_info))
- {
- if (vect_print_dump_info (REPORT_DETAILS))
- fprintf (vect_dump, "value used after loop.");
- return false;
- }
-
/* Is vectorizable store? */
if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
if (STMT_VINFO_STRIDED_ACCESS (stmt_info))
{
strided_store = true;
+ first_stmt = DR_GROUP_FIRST_DR (stmt_info);
if (!vect_strided_store_supported (vectype)
&& !PURE_SLP_STMT (stmt_info) && !slp)
- return false;
+ return false;
+
+ if (first_stmt == stmt)
+ {
+ /* STMT is the leader of the group. Check the operands of all the
+ stmts of the group. */
+ next_stmt = DR_GROUP_NEXT_DR (stmt_info);
+ while (next_stmt)
+ {
+ op = GIMPLE_STMT_OPERAND (next_stmt, 1);
+ if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt))
+ {
+ if (vect_print_dump_info (REPORT_DETAILS))
+ fprintf (vect_dump, "use not simple.");
+ return false;
+ }
+ next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt));
+ }
+ }
}
if (!vec_stmt) /* transformation not required. */
if (strided_store)
{
- first_stmt = DR_GROUP_FIRST_DR (stmt_info);
first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
group_size = DR_GROUP_SIZE (vinfo_for_stmt (first_stmt));
OPRNDS are of size 1. */
for (i = 0; i < group_size; i++)
{
- vec_oprnd = vect_get_vec_def_for_stmt_copy (dt,
- VEC_index (tree, oprnds, i));
+ op = VEC_index (tree, oprnds, i);
+ vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt);
+ vec_oprnd = vect_get_vec_def_for_stmt_copy (dt, op);
VEC_replace(tree, dr_chain, i, vec_oprnd);
VEC_replace(tree, oprnds, i, vec_oprnd);
}
new_stmt = build_gimple_modify_stmt (vec_dest, data_ref);
new_temp = make_ssa_name (vec_dest, new_stmt);
GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
+ mark_symbols_for_renaming (new_stmt);
new_bb = bsi_insert_on_edge_immediate (pe, new_stmt);
gcc_assert (!new_bb);
msq_init = GIMPLE_STMT_OPERAND (new_stmt, 0);
if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
return false;
- /* FORNOW: not yet supported. */
- if (STMT_VINFO_LIVE_P (stmt_info))
- {
- if (vect_print_dump_info (REPORT_DETAILS))
- fprintf (vect_dump, "value used after loop.");
- return false;
- }
-
/* Is vectorizable load? */
if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
return false;
for (i = 0; i < op_type; i++)
{
op = TREE_OPERAND (operation, i);
- if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt))
+ if (op && !vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt))
{
if (vect_print_dump_info (REPORT_DETAILS))
fprintf (vect_dump, "use not simple.");
}
}
+/* Return the more conservative threshold between the
+ min_profitable_iters returned by the cost model and the user
+ specified threshold, if provided. */
+
+static unsigned int
+conservative_cost_threshold (loop_vec_info loop_vinfo,
+ int min_profitable_iters)
+{
+ unsigned int th;
+ int min_scalar_loop_bound;
+
+ /* Scale the user-specified minimum bound by the vectorization factor
+ to express it in scalar iterations; the trailing -1 presumably
+ converts an iteration count into a "niters > th" guard value --
+ TODO confirm against the callers that build the GT_EXPR test. */
+ min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
+ * LOOP_VINFO_VECT_FACTOR (loop_vinfo)) - 1);
+
+ /* Use the cost model only if it is more conservative than user specified
+ threshold. */
+ th = (unsigned) min_scalar_loop_bound;
+ if (min_profitable_iters
+ && (!min_scalar_loop_bound
+ || min_profitable_iters > min_scalar_loop_bound))
+ th = (unsigned) min_profitable_iters;
+
+ /* A nonzero threshold means a runtime iteration-count guard will be
+ emitted, i.e. vectorization is only conditionally profitable. */
+ if (th && vect_print_dump_info (REPORT_COST))
+ fprintf (vect_dump, "Vectorization may not be profitable.");
+
+ return th;
+}
/* Function vect_do_peeling_for_loop_bound
edge update_e;
basic_block preheader;
int loop_num;
- unsigned int th;
- int min_scalar_loop_bound;
+ bool check_profitability = false;
+ unsigned int th = 0;
int min_profitable_iters;
if (vect_print_dump_info (REPORT_DETAILS))
loop_num = loop->num;
- /* Analyze cost to set threshhold for vectorized loop. */
- min_profitable_iters = LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo);
- min_scalar_loop_bound = (PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND))
- * LOOP_VINFO_VECT_FACTOR (loop_vinfo);
-
- /* Use the cost model only if it is more conservative than user specified
- threshold. */
+ /* If cost model check not done during versioning and
+ peeling for alignment. */
+ if (!VEC_length (tree, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo))
+ && !VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo))
+ && !LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
+ {
+ check_profitability = true;
- th = (unsigned) min_scalar_loop_bound;
- if (min_profitable_iters
- && (!min_scalar_loop_bound
- || min_profitable_iters > min_scalar_loop_bound))
- th = (unsigned) min_profitable_iters;
+ /* Get profitability threshold for vectorized loop. */
+ min_profitable_iters = LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo);
- if (min_profitable_iters
- && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
- && vect_print_dump_info (REPORT_DETAILS))
- fprintf (vect_dump, "vectorization may not be profitable.");
+ th = conservative_cost_threshold (loop_vinfo,
+ min_profitable_iters);
+ }
new_loop = slpeel_tree_peel_loop_to_edge (loop, single_exit (loop),
ratio_mult_vf_name, ni_name, false,
- th);
+ th, check_profitability);
gcc_assert (new_loop);
gcc_assert (loop_num == loop->num);
#ifdef ENABLE_CHECKING
tree niters_of_prolog_loop, ni_name;
tree n_iters;
struct loop *new_loop;
+ bool check_profitability = false;
+ unsigned int th = 0;
+ int min_profitable_iters;
if (vect_print_dump_info (REPORT_DETAILS))
fprintf (vect_dump, "=== vect_do_peeling_for_alignment ===");
ni_name = vect_build_loop_niters (loop_vinfo);
niters_of_prolog_loop = vect_gen_niters_for_prolog_loop (loop_vinfo, ni_name);
+
+ /* If cost model check not done during versioning. */
+ if (!VEC_length (tree, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo))
+ && !VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
+ {
+ check_profitability = true;
+
+ /* Get profitability threshold for vectorized loop. */
+ min_profitable_iters = LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo);
+
+ th = conservative_cost_threshold (loop_vinfo,
+ min_profitable_iters);
+ }
+
/* Peel the prolog loop and iterate it niters_of_prolog_loop. */
- new_loop =
- slpeel_tree_peel_loop_to_edge (loop, loop_preheader_edge (loop),
- niters_of_prolog_loop, ni_name, true, 0);
+ new_loop =
+ slpeel_tree_peel_loop_to_edge (loop, loop_preheader_edge (loop),
+ niters_of_prolog_loop, ni_name, true,
+ th, check_profitability);
+
gcc_assert (new_loop);
#ifdef ENABLE_CHECKING
slpeel_verify_cfg_after_peeling (new_loop, loop);
checked at runtime.
Input:
+ COND_EXPR - input conditional expression. New conditions will be chained
+ with logical AND operation.
LOOP_VINFO - two fields of the loop information are used.
LOOP_VINFO_PTR_MASK is the mask used to check the alignment.
LOOP_VINFO_MAY_MISALIGN_STMTS contains the refs to be checked.
test can be done as a&(n-1) == 0. For example, for 16
byte vectors the test is a&0xf == 0. */
-static tree
+static void
vect_create_cond_for_align_checks (loop_vec_info loop_vinfo,
+ tree *cond_expr,
tree *cond_expr_stmt_list)
{
struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
tree or_tmp_name = NULL_TREE;
tree and_tmp, and_tmp_name, and_stmt;
tree ptrsize_zero;
+ tree part_cond_expr;
/* Check that mask is one less than a power of 2, i.e., mask is
all zeros followed by all ones. */
/* Make and_tmp the left operand of the conditional test against zero.
if and_tmp has a nonzero bit then some address is unaligned. */
ptrsize_zero = build_int_cst (int_ptrsize_type, 0);
- return build2 (EQ_EXPR, boolean_type_node,
- and_tmp_name, ptrsize_zero);
+ part_cond_expr = fold_build2 (EQ_EXPR, boolean_type_node,
+ and_tmp_name, ptrsize_zero);
+ if (*cond_expr)
+ *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
+ *cond_expr, part_cond_expr);
+ else
+ *cond_expr = part_cond_expr;
}
/* Function vect_vfa_segment_size.
static tree
vect_vfa_segment_size (struct data_reference *dr, tree vect_factor)
{
- tree segment_length;
+ tree segment_length = fold_build2 (MULT_EXPR, integer_type_node,
+ DR_STEP (dr), vect_factor);
if (vect_supportable_dr_alignment (dr) == dr_explicit_realign_optimized)
{
- tree vector_size =
- build_int_cst (integer_type_node,
- GET_MODE_SIZE (TYPE_MODE (STMT_VINFO_VECTYPE
- (vinfo_for_stmt (DR_STMT (dr))))));
+ tree vector_size = TYPE_SIZE_UNIT
+ (STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr))));
- segment_length =
- fold_convert (sizetype,
- fold_build2 (PLUS_EXPR, integer_type_node,
- fold_build2 (MULT_EXPR, integer_type_node, DR_STEP (dr),
- vect_factor),
- vector_size));
+ segment_length = fold_build2 (PLUS_EXPR, integer_type_node,
+ segment_length, vector_size);
}
- else
- {
- segment_length =
- fold_convert (sizetype,
- fold_build2 (MULT_EXPR, integer_type_node, DR_STEP (dr),
- vect_factor));
- }
-
- return segment_length;
+ return fold_convert (sizetype, segment_length);
}
/* Function vect_create_cond_for_alias_checks.
Input:
COND_EXPR - input conditional expression. New conditions will be chained
- with logical and operation.
+ with logical AND operation.
LOOP_VINFO - field LOOP_VINFO_MAY_ALIAS_STMTS contains the list of ddrs
to be checked.
COND_EXPR - conditional expression.
COND_EXPR_STMT_LIST - statements needed to construct the conditional
expression.
+
The returned value is the conditional expression to be used in the if
statement that controls which version of the loop gets executed at runtime.
*/
for (i = 0; VEC_iterate (ddr_p, may_alias_ddrs, i, ddr); i++)
{
- tree stmt_a = DR_STMT (DDR_A (ddr));
- tree stmt_b = DR_STMT (DDR_B (ddr));
+ struct data_reference *dr_a, *dr_b;
+ tree dr_group_first_a, dr_group_first_b;
+ tree addr_base_a, addr_base_b;
+ tree segment_length_a, segment_length_b;
+ tree stmt_a, stmt_b;
+
+ dr_a = DDR_A (ddr);
+ stmt_a = DR_STMT (DDR_A (ddr));
+ dr_group_first_a = DR_GROUP_FIRST_DR (vinfo_for_stmt (stmt_a));
+ if (dr_group_first_a)
+ {
+ stmt_a = dr_group_first_a;
+ dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
+ }
- tree addr_base_a =
+ dr_b = DDR_B (ddr);
+ stmt_b = DR_STMT (DDR_B (ddr));
+ dr_group_first_b = DR_GROUP_FIRST_DR (vinfo_for_stmt (stmt_b));
+ if (dr_group_first_b)
+ {
+ stmt_b = dr_group_first_b;
+ dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
+ }
+
+ addr_base_a =
vect_create_addr_base_for_vector_ref (stmt_a, cond_expr_stmt_list,
NULL_TREE, loop);
- tree addr_base_b =
+ addr_base_b =
vect_create_addr_base_for_vector_ref (stmt_b, cond_expr_stmt_list,
NULL_TREE, loop);
- tree segment_length_a = vect_vfa_segment_size (DDR_A (ddr), vect_factor);
- tree segment_length_b = vect_vfa_segment_size (DDR_B (ddr), vect_factor);
+ segment_length_a = vect_vfa_segment_size (dr_a, vect_factor);
+ segment_length_b = vect_vfa_segment_size (dr_b, vect_factor);
if (vect_print_dump_info (REPORT_DR_DETAILS))
{
fprintf (vect_dump,
"create runtime check for data references ");
- print_generic_expr (vect_dump, DR_REF (DDR_A (ddr)), TDF_SLIM);
+ print_generic_expr (vect_dump, DR_REF (dr_a), TDF_SLIM);
fprintf (vect_dump, " and ");
- print_generic_expr (vect_dump, DR_REF (DDR_B (ddr)), TDF_SLIM);
+ print_generic_expr (vect_dump, DR_REF (dr_b), TDF_SLIM);
}
}
+/* Function vect_loop_versioning.
+
+ If the loop has data references that may or may not be aligned or/and
+ has data reference relations whose independence was not proven then
+ two versions of the loop need to be generated, one which is vectorized
+ and one which isn't. A test is then generated to control which of the
+ loops is executed. The test checks for the alignment of all of the
+ data references that may or may not be aligned. An additional
+ sequence of runtime tests is generated for each pairs of DDRs whose
+ independence was not proven. The vectorized version of loop is
+ executed only if both alias and alignment tests are passed.
+
+ The test generated to check which version of loop is executed
+ is modified to also check for profitability as indicated by the
+ cost model initially. */
+
+static void
+vect_loop_versioning (loop_vec_info loop_vinfo)
+{
+ struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
+ struct loop *nloop;
+ tree cond_expr = NULL_TREE;
+ tree cond_expr_stmt_list = NULL_TREE;
+ basic_block condition_bb;
+ block_stmt_iterator cond_exp_bsi;
+ basic_block merge_bb;
+ basic_block new_exit_bb;
+ edge new_exit_e, e;
+ tree orig_phi, new_phi, arg;
+ unsigned prob = 4 * REG_BR_PROB_BASE / 5;
+ tree gimplify_stmt_list;
+ tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
+ int min_profitable_iters = 0;
+ unsigned int th;
+
+ /* Get profitability threshold for vectorized loop. */
+ min_profitable_iters = LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo);
+
+ th = conservative_cost_threshold (loop_vinfo,
+ min_profitable_iters);
+
+ /* Seed the versioning condition with the profitability test: the
+ vectorized version runs only if the scalar iteration count
+ exceeds TH. */
+ cond_expr =
+ build2 (GT_EXPR, boolean_type_node, scalar_loop_iters,
+ build_int_cst (TREE_TYPE (scalar_loop_iters), th));
+
+ cond_expr = force_gimple_operand (cond_expr, &cond_expr_stmt_list,
+ false, NULL_TREE);
+
+ /* Chain (logical AND) the alignment and alias runtime checks, if any,
+ onto the same condition; each helper appends its setup statements to
+ COND_EXPR_STMT_LIST. */
+ if (VEC_length (tree, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo)))
+ vect_create_cond_for_align_checks (loop_vinfo, &cond_expr,
+ &cond_expr_stmt_list);
+
+ if (VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
+ vect_create_cond_for_alias_checks (loop_vinfo, &cond_expr,
+ &cond_expr_stmt_list);
+
+ /* Normalize to a boolean NE test and gimplify the combined
+ condition. */
+ cond_expr =
+ fold_build2 (NE_EXPR, boolean_type_node, cond_expr, integer_zero_node);
+ cond_expr =
+ force_gimple_operand (cond_expr, &gimplify_stmt_list, true,
+ NULL_TREE);
+ append_to_statement_list (gimplify_stmt_list, &cond_expr_stmt_list);
+
+ initialize_original_copy_tables ();
+ nloop = loop_version (loop, cond_expr, &condition_bb,
+ prob, prob, REG_BR_PROB_BASE - prob, true);
+ free_original_copy_tables();
+
+ /* Loop versioning violates an assumption we try to maintain during
+ vectorization - that the loop exit block has a single predecessor.
+ After versioning, the exit block of both loop versions is the same
+ basic block (i.e. it has two predecessors). Just in order to simplify
+ following transformations in the vectorizer, we fix this situation
+ here by adding a new (empty) block on the exit-edge of the loop,
+ with the proper loop-exit phis to maintain loop-closed-form. */
+
+ merge_bb = single_exit (loop)->dest;
+ gcc_assert (EDGE_COUNT (merge_bb->preds) == 2);
+ new_exit_bb = split_edge (single_exit (loop));
+ new_exit_e = single_exit (loop);
+ e = EDGE_SUCC (new_exit_bb, 0);
+
+ /* Re-route each exit phi of MERGE_BB through a new phi in the fresh
+ exit block, preserving loop-closed SSA form. */
+ for (orig_phi = phi_nodes (merge_bb); orig_phi;
+ orig_phi = PHI_CHAIN (orig_phi))
+ {
+ new_phi = create_phi_node (SSA_NAME_VAR (PHI_RESULT (orig_phi)),
+ new_exit_bb);
+ arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, e);
+ add_phi_arg (new_phi, arg, new_exit_e);
+ SET_PHI_ARG_DEF (orig_phi, e->dest_idx, PHI_RESULT (new_phi));
+ }
+
+ /* End loop-exit-fixes after versioning. */
+
+ update_ssa (TODO_update_ssa);
+ /* Emit the statements that compute the versioning condition just
+ before the branch selecting between the two loop versions. */
+ if (cond_expr_stmt_list)
+ {
+ cond_exp_bsi = bsi_last (condition_bb);
+ bsi_insert_before (&cond_exp_bsi, cond_expr_stmt_list, BSI_SAME_STMT);
+ }
+}
+
/* Remove a group of stores (for SLP or interleaving), free their
stmt_vec_info. */
return is_store;
}
-
/* Function vect_transform_loop.
The analysis phase has determined that the loop is vectorizable.
if (vect_print_dump_info (REPORT_DETAILS))
fprintf (vect_dump, "=== vec_transform_loop ===");
- /* If the loop has data references that may or may not be aligned or/and
- has data reference relations whose independence was not proven then
- two versions of the loop need to be generated, one which is vectorized
- and one which isn't. A test is then generated to control which of the
- loops is executed. The test checks for the alignment of all of the
- data references that may or may not be aligned. An additional
- sequence of runtime tests is generated for each pairs of DDRs whose
- independence was not proven. The vectorized version of loop is
- executed only if both alias and alignment tests are passed. */
-
if (VEC_length (tree, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo))
|| VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
- {
- struct loop *nloop;
- tree cond_expr = NULL_TREE;
- tree cond_expr_stmt_list = NULL_TREE;
- basic_block condition_bb;
- block_stmt_iterator cond_exp_bsi;
- basic_block merge_bb;
- basic_block new_exit_bb;
- edge new_exit_e, e;
- tree orig_phi, new_phi, arg;
- unsigned prob = 4 * REG_BR_PROB_BASE / 5;
- tree gimplify_stmt_list;
-
- if (VEC_length (tree, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo)))
- cond_expr =
- vect_create_cond_for_align_checks (loop_vinfo, &cond_expr_stmt_list);
-
- if (VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
- vect_create_cond_for_alias_checks (loop_vinfo, &cond_expr,
- &cond_expr_stmt_list);
-
- cond_expr =
- fold_build2 (NE_EXPR, boolean_type_node, cond_expr, integer_zero_node);
- cond_expr =
- force_gimple_operand (cond_expr, &gimplify_stmt_list, true,
- NULL_TREE);
- append_to_statement_list (gimplify_stmt_list, &cond_expr_stmt_list);
-
- initialize_original_copy_tables ();
- nloop = loop_version (loop, cond_expr, &condition_bb,
- prob, prob, REG_BR_PROB_BASE - prob, true);
- free_original_copy_tables();
-
- /** Loop versioning violates an assumption we try to maintain during
- vectorization - that the loop exit block has a single predecessor.
- After versioning, the exit block of both loop versions is the same
- basic block (i.e. it has two predecessors). Just in order to simplify
- following transformations in the vectorizer, we fix this situation
- here by adding a new (empty) block on the exit-edge of the loop,
- with the proper loop-exit phis to maintain loop-closed-form. **/
-
- merge_bb = single_exit (loop)->dest;
- gcc_assert (EDGE_COUNT (merge_bb->preds) == 2);
- new_exit_bb = split_edge (single_exit (loop));
- new_exit_e = single_exit (loop);
- e = EDGE_SUCC (new_exit_bb, 0);
-
- for (orig_phi = phi_nodes (merge_bb); orig_phi;
- orig_phi = PHI_CHAIN (orig_phi))
- {
- new_phi = create_phi_node (SSA_NAME_VAR (PHI_RESULT (orig_phi)),
- new_exit_bb);
- arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, e);
- add_phi_arg (new_phi, arg, new_exit_e);
- SET_PHI_ARG_DEF (orig_phi, e->dest_idx, PHI_RESULT (new_phi));
- }
-
- /** end loop-exit-fixes after versioning **/
-
- update_ssa (TODO_update_ssa);
- cond_exp_bsi = bsi_last (condition_bb);
- bsi_insert_before (&cond_exp_bsi, cond_expr_stmt_list, BSI_SAME_STMT);
- }
+ vect_loop_versioning (loop_vinfo);
/* CHECKME: we wouldn't need this if we called update_ssa once
for all loops. */