2008-04-30 Paul Thomas <pault@gcc.gnu.org>

[pf3gnuchains/gcc-fork.git] / gcc / tree-vect-transform.c
diff --git a/gcc/tree-vect-transform.c b/gcc/tree-vect-transform.c

index c718e07..6168fda 100644 (file)
--- a/gcc/tree-vect-transform.c
+++ b/gcc/tree-vect-transform.c
@@ -1,5 +1,5 @@
  /* Transformation Utilities for Loop Vectorization.
-   Copyright (C) 2003, 2004, 2005, 2006, 2007 Free Software Foundation, Inc.
+   Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008 Free Software Foundation, Inc.
     Contributed by Dorit Naishlos <dorit@il.ibm.com>
  
  This file is part of GCC.
@@ -119,11 +119,13 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo)
    int vec_inside_cost = 0;
    int vec_outside_cost = 0;
    int scalar_single_iter_cost = 0;
+  int scalar_outside_cost = 0;
+  bool runtime_test = false;
    int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
    struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
    basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
    int nbbs = loop->num_nodes;
-  int byte_misalign;
+  int byte_misalign = LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo);
    int peel_guard_costs = 0;
    int innerloop_iters = 0, factor;
    VEC (slp_instance, heap) *slp_instances;
@@ -132,11 +134,18 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo)
    /* Cost model disabled.  */
    if (!flag_vect_cost_model)
      {
-      if (vect_print_dump_info (REPORT_DETAILS))
+      if (vect_print_dump_info (REPORT_COST))
          fprintf (vect_dump, "cost model disabled.");      
        return 0;
      }
  
+  /* If the number of iterations is unknown, or the
+     peeling-for-misalignment amount is unknown, we will have to generate
+     a runtime test to test the loop count against the threshold.    */
+  if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
+      || (byte_misalign < 0))
+    runtime_test = true;
+
    /* Requires loop versioning tests to handle misalignment.  */
  
    if (VEC_length (tree, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo)))
@@ -144,7 +153,7 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo)
        /*  FIXME: Make cost depend on complexity of individual check.  */
        vec_outside_cost +=
          VEC_length (tree, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));
-      if (vect_print_dump_info (REPORT_DETAILS))
+      if (vect_print_dump_info (REPORT_COST))
          fprintf (vect_dump, "cost model: Adding cost of checks for loop "
                   "versioning to treat misalignment.\n");
      }
@@ -154,7 +163,7 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo)
        /*  FIXME: Make cost depend on complexity of individual check.  */
        vec_outside_cost +=
          VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo));
-      if (vect_print_dump_info (REPORT_DETAILS))
+      if (vect_print_dump_info (REPORT_COST))
          fprintf (vect_dump, "cost model: Adding cost of checks for loop "
                   "versioning aliasing.\n");
      }
@@ -188,18 +197,20 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo)
         factor = 1;
  
        for (si = bsi_start (bb); !bsi_end_p (si); bsi_next (&si))
-        {
-          tree stmt = bsi_stmt (si);
-          stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
-          if (!STMT_VINFO_RELEVANT_P (stmt_info)
-              && !STMT_VINFO_LIVE_P (stmt_info))
-            continue;
-          scalar_single_iter_cost += cost_for_stmt (stmt) * factor;
-          vec_inside_cost += STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) * factor;
+       {
+         tree stmt = bsi_stmt (si);
+         stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
+         /* Skip stmts that are not vectorized inside the loop.  */
+         if (!STMT_VINFO_RELEVANT_P (stmt_info)
+             && (!STMT_VINFO_LIVE_P (stmt_info)
+                 || STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def))
+           continue;
+         scalar_single_iter_cost += cost_for_stmt (stmt) * factor;
+         vec_inside_cost += STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) * factor;
           /* FIXME: for stmts in the inner-loop in outer-loop vectorization,
              some of the "outside" costs are generated inside the outer-loop.  */
-          vec_outside_cost += STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info);
-        }
+         vec_outside_cost += STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info);
+       }
      }
  
    /* Add additional cost for the peeled instructions in prologue and epilogue
@@ -211,19 +222,17 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo)
       TODO: Build an expression that represents peel_iters for prologue and
       epilogue to be used in a run-time test.  */
  
-  byte_misalign = LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo);
-
    if (byte_misalign < 0)
      {
        peel_iters_prologue = vf/2;
-      if (vect_print_dump_info (REPORT_DETAILS))
+      if (vect_print_dump_info (REPORT_COST))
          fprintf (vect_dump, "cost model: "
                   "prologue peel iters set to vf/2.");
  
        /* If peeling for alignment is unknown, loop bound of main loop becomes
           unknown.  */
        peel_iters_epilogue = vf/2;
-      if (vect_print_dump_info (REPORT_DETAILS))
+      if (vect_print_dump_info (REPORT_COST))
          fprintf (vect_dump, "cost model: "
                   "epilogue peel iters set to vf/2 because "
                   "peeling for alignment is unknown .");
@@ -253,7 +262,7 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo)
        if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
          {
            peel_iters_epilogue = vf/2;
-          if (vect_print_dump_info (REPORT_DETAILS))
+          if (vect_print_dump_info (REPORT_COST))
              fprintf (vect_dump, "cost model: "
                       "epilogue peel iters set to vf/2 because "
                       "loop iterations are unknown .");
@@ -276,24 +285,72 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo)
                        + (peel_iters_epilogue * scalar_single_iter_cost)
                        + peel_guard_costs;
  
-  /* Allow targets add additional (outside-of-loop) costs. FORNOW, the only
-     information we provide for the target is whether testing against the
-     threshold involves a runtime test.  */
-  if (targetm.vectorize.builtin_vectorization_cost)
-    {
-      bool runtime_test = false;
-
-      /* If the number of iterations is unknown, or the
-        peeling-for-misalignment amount is unknown, we eill have to generate
-        a runtime test to test the loop count against the threshold.  */
-      if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
-         || (byte_misalign < 0))
-       runtime_test = true;
-      vec_outside_cost +=
-       targetm.vectorize.builtin_vectorization_cost (runtime_test);
-      if (vect_print_dump_info (REPORT_DETAILS))
-       fprintf (vect_dump, "cost model : Adding target out-of-loop cost = %d",
-                 targetm.vectorize.builtin_vectorization_cost (runtime_test));
+  /* FORNOW: The scalar outside cost is incremented in one of the
+     following ways:
+
+     1. The vectorizer checks for alignment and aliasing and generates
+     a condition that allows dynamic vectorization.  A cost model
+     check is ANDED with the versioning condition.  Hence scalar code
+     path now has the added cost of the versioning check.
+
+       if (cost > th & versioning_check)
+         jmp to vector code
+
+     Hence run-time scalar is incremented by not-taken branch cost.
+
+     2. The vectorizer then checks if a prologue is required.  If the
+     cost model check was not done before during versioning, it has to
+     be done before the prologue check.
+
+       if (cost <= th)
+         prologue = scalar_iters
+       if (prologue == 0)
+         jmp to vector code
+       else
+         execute prologue
+       if (prologue == num_iters)
+        go to exit
+
+     Hence the run-time scalar cost is incremented by a taken branch,
+     plus a not-taken branch, plus a taken branch cost.
+
+     3. The vectorizer then checks if an epilogue is required.  If the
+     cost model check was not done before during prologue check, it
+     has to be done with the epilogue check.
+
+       if (prologue == 0)
+         jmp to vector code
+       else
+         execute prologue
+       if (prologue == num_iters)
+        go to exit
+       vector code:
+         if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
+           jmp to epilogue
+
+     Hence the run-time scalar cost should be incremented by 2 taken
+     branches.
+
+     TODO: The back end may reorder the BBS's differently and reverse
+     conditions/branch directions.  Change the stimates below to
+     something more reasonable.  */
+
+  if (runtime_test)
+    {
+      /* Cost model check occurs at versioning.  */
+      if (VEC_length (tree, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo))
+         || VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
+       scalar_outside_cost += TARG_COND_NOT_TAKEN_BRANCH_COST;
+      else
+       {
+         /* Cost model occurs at prologue generation.  */
+         if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
+           scalar_outside_cost += 2 * TARG_COND_TAKEN_BRANCH_COST
+             + TARG_COND_NOT_TAKEN_BRANCH_COST;
+         /* Cost model check occurs at epilogue generation.  */
+         else
+           scalar_outside_cost += 2 * TARG_COND_TAKEN_BRANCH_COST;
+       }
      }
  
    /* Add SLP costs.  */
@@ -306,9 +363,13 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo)
  
    /* Calculate number of iterations required to make the vector version 
       profitable, relative to the loop bodies only. The following condition
-     must hold true: ((SIC*VF)-VIC)*niters > VOC*VF, where
+     must hold true: 
+     SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
+     where
       SIC = scalar iteration cost, VIC = vector iteration cost,
-     VOC = vector outside cost and VF = vectorization factor.  */
+     VOC = vector outside cost, VF = vectorization factor,
+     PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
+     SOC = scalar outside cost for run time cost model check.  */
  
    if ((scalar_single_iter_cost * vf) > vec_inside_cost)
      {
@@ -316,22 +377,22 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo)
          min_profitable_iters = 1;
        else
          {
-          min_profitable_iters = (vec_outside_cost * vf 
-                                  - vec_inside_cost * peel_iters_prologue
+          min_profitable_iters = ((vec_outside_cost - scalar_outside_cost) * vf
+                                 - vec_inside_cost * peel_iters_prologue
                                    - vec_inside_cost * peel_iters_epilogue)
                                   / ((scalar_single_iter_cost * vf)
                                      - vec_inside_cost);
  
            if ((scalar_single_iter_cost * vf * min_profitable_iters)
                <= ((vec_inside_cost * min_profitable_iters)
-                  + (vec_outside_cost * vf)))
+                  + ((vec_outside_cost - scalar_outside_cost) * vf)))
              min_profitable_iters++;
          }
      }
    /* vector version will never be profitable.  */
    else
      {
-      if (vect_print_dump_info (REPORT_DETAILS))
+      if (vect_print_dump_info (REPORT_COST))
          fprintf (vect_dump, "cost model: vector iteration cost = %d "
                   "is divisible by scalar iteration cost = %d by a factor "
                   "greater than or equal to the vectorization factor = %d .",
@@ -339,14 +400,16 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo)
        return -1;
      }
  
-  if (vect_print_dump_info (REPORT_DETAILS))
+  if (vect_print_dump_info (REPORT_COST))
      {
        fprintf (vect_dump, "Cost model analysis: \n");
        fprintf (vect_dump, "  Vector inside of loop cost: %d\n",
                vec_inside_cost);
        fprintf (vect_dump, "  Vector outside of loop cost: %d\n",
                vec_outside_cost);
-      fprintf (vect_dump, "  Scalar cost: %d\n", scalar_single_iter_cost);
+      fprintf (vect_dump, "  Scalar iteration cost: %d\n",
+              scalar_single_iter_cost);
+      fprintf (vect_dump, "  Scalar outside cost: %d\n", scalar_outside_cost);
        fprintf (vect_dump, "  prologue iterations: %d\n",
                 peel_iters_prologue);
        fprintf (vect_dump, "  epilogue iterations: %d\n",
@@ -363,7 +426,7 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo)
         then skip the vectorized loop.  */
    min_profitable_iters--;
  
-  if (vect_print_dump_info (REPORT_DETAILS))
+  if (vect_print_dump_info (REPORT_COST))
      fprintf (vect_dump, "  Profitability threshold = %d\n",
              min_profitable_iters);
      
@@ -380,7 +443,7 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo)
     generated within the strip-mine loop, the initial definition before
     the loop, and the epilogue code that must be generated.  */
  
-static void
+static bool 
  vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
                            int ncopies)
  {
@@ -401,6 +464,16 @@ vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
  
    reduction_op = TREE_OPERAND (operation, op_type-1);
    vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
+  if (!vectype)
+    {
+      if (vect_print_dump_info (REPORT_COST))
+        {
+          fprintf (vect_dump, "unsupported data-type ");
+          print_generic_expr (vect_dump, TREE_TYPE (reduction_op), TDF_SLIM);
+        }
+      return false;
+   }
+  
    mode = TYPE_MODE (vectype);
    orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
  
@@ -448,10 +521,12 @@ vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
  
    STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info) = outer_cost;
  
-  if (vect_print_dump_info (REPORT_DETAILS))
+  if (vect_print_dump_info (REPORT_COST))
      fprintf (vect_dump, "vect_model_reduction_cost: inside_cost = %d, "
               "outside_cost = %d .", STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info),
               STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info));
+
+  return true;
  }
  
  
@@ -467,7 +542,7 @@ vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
    /* prologue cost for vec_init and vec_step.  */
    STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info) = 2 * TARG_SCALAR_TO_VEC_COST;
    
-  if (vect_print_dump_info (REPORT_DETAILS))
+  if (vect_print_dump_info (REPORT_COST))
      fprintf (vect_dump, "vect_model_induction_cost: inside_cost = %d, "
               "outside_cost = %d .", STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info),
               STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info));
@@ -496,7 +571,7 @@ vect_model_simple_cost (stmt_vec_info stmt_info, int ncopies,
         outside_cost += TARG_SCALAR_TO_VEC_COST; 
      }
    
-  if (vect_print_dump_info (REPORT_DETAILS))
+  if (vect_print_dump_info (REPORT_COST))
      fprintf (vect_dump, "vect_model_simple_cost: inside_cost = %d, "
               "outside_cost = %d .", inside_cost, outside_cost);
  
@@ -554,7 +629,7 @@ vect_model_store_cost (stmt_vec_info stmt_info, int ncopies,
        inside_cost = ncopies * exact_log2(group_size) * group_size 
               * TARG_VEC_STMT_COST;
  
-      if (vect_print_dump_info (REPORT_DETAILS))
+      if (vect_print_dump_info (REPORT_COST))
          fprintf (vect_dump, "vect_model_store_cost: strided group_size = %d .",
                   group_size);
  
@@ -563,7 +638,7 @@ vect_model_store_cost (stmt_vec_info stmt_info, int ncopies,
    /* Costs of the stores.  */
    inside_cost += ncopies * TARG_VEC_STORE_COST;
  
-  if (vect_print_dump_info (REPORT_DETAILS))
+  if (vect_print_dump_info (REPORT_COST))
      fprintf (vect_dump, "vect_model_store_cost: inside_cost = %d, "
               "outside_cost = %d .", inside_cost, outside_cost);
  
@@ -614,7 +689,7 @@ vect_model_load_cost (stmt_vec_info stmt_info, int ncopies, slp_tree slp_node)
        inside_cost = ncopies * exact_log2(group_size) * group_size
         * TARG_VEC_STMT_COST;
  
-      if (vect_print_dump_info (REPORT_DETAILS))
+      if (vect_print_dump_info (REPORT_COST))
          fprintf (vect_dump, "vect_model_load_cost: strided group_size = %d .",
                   group_size);
  
@@ -627,7 +702,7 @@ vect_model_load_cost (stmt_vec_info stmt_info, int ncopies, slp_tree slp_node)
        {
          inside_cost += ncopies * TARG_VEC_LOAD_COST;
  
-        if (vect_print_dump_info (REPORT_DETAILS))
+        if (vect_print_dump_info (REPORT_COST))
            fprintf (vect_dump, "vect_model_load_cost: aligned.");
  
          break;
@@ -637,7 +712,7 @@ vect_model_load_cost (stmt_vec_info stmt_info, int ncopies, slp_tree slp_node)
          /* Here, we assign an additional cost for the unaligned load.  */
          inside_cost += ncopies * TARG_VEC_UNALIGNED_LOAD_COST;
  
-        if (vect_print_dump_info (REPORT_DETAILS))
+        if (vect_print_dump_info (REPORT_COST))
            fprintf (vect_dump, "vect_model_load_cost: unaligned supported by "
                     "hardware.");
  
@@ -657,7 +732,7 @@ vect_model_load_cost (stmt_vec_info stmt_info, int ncopies, slp_tree slp_node)
        }
      case dr_explicit_realign_optimized:
        {
-        if (vect_print_dump_info (REPORT_DETAILS))
+        if (vect_print_dump_info (REPORT_COST))
            fprintf (vect_dump, "vect_model_load_cost: unaligned software "
                     "pipelined.");
  
@@ -684,7 +759,7 @@ vect_model_load_cost (stmt_vec_info stmt_info, int ncopies, slp_tree slp_node)
        gcc_unreachable ();
      }
    
-  if (vect_print_dump_info (REPORT_DETAILS))
+  if (vect_print_dump_info (REPORT_COST))
      fprintf (vect_dump, "vect_model_load_cost: inside_cost = %d, "
               "outside_cost = %d .", inside_cost, outside_cost);
  
@@ -1027,8 +1102,12 @@ vect_create_data_ref_ptr (tree stmt, struct loop *at_loop,
    new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
                                                     offset, loop);
    pe = loop_preheader_edge (loop);
-  new_bb = bsi_insert_on_edge_immediate (pe, new_stmt_list);
-  gcc_assert (!new_bb);
+  if (new_stmt_list)
+    {
+      new_bb = bsi_insert_on_edge_immediate (pe, new_stmt_list);
+      gcc_assert (!new_bb);
+    }
+
    *initial_address = new_temp;
  
    /* Create: p = (vectype *) initial_base  */
@@ -1306,6 +1385,7 @@ vect_get_constant_vectors (slp_tree slp_node, VEC(tree,heap) **vec_oprnds,
    bool is_store = false;
    unsigned int number_of_vectors = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
    VEC (tree, heap) *voprnds = VEC_alloc (tree, heap, number_of_vectors);
+  bool constant_p;
  
    if (STMT_VINFO_DATA_REF (stmt_vinfo))
      is_store = true;
@@ -1329,6 +1409,7 @@ vect_get_constant_vectors (slp_tree slp_node, VEC(tree,heap) **vec_oprnds,
    number_of_copies = least_common_multiple (nunits, group_size) / group_size;
  
    number_of_places_left_in_vector = nunits;
+  constant_p = true;
    for (j = 0; j < number_of_copies; j++)
      {
        for (i = group_size - 1; VEC_iterate (tree, stmts, i, stmt); i--)
@@ -1338,6 +1419,8 @@ vect_get_constant_vectors (slp_tree slp_node, VEC(tree,heap) **vec_oprnds,
             op = operation;
           else
             op = TREE_OPERAND (operation, op_num);
+         if (!CONSTANT_CLASS_P (op))
+           constant_p = false;
  
            /* Create 'vect_ = {op0,op1,...,opn}'.  */
            t = tree_cons (NULL_TREE, op, t);
@@ -1349,7 +1432,12 @@ vect_get_constant_vectors (slp_tree slp_node, VEC(tree,heap) **vec_oprnds,
                number_of_places_left_in_vector = nunits;
  
               vector_type = get_vectype_for_scalar_type (TREE_TYPE (op));
-              vec_cst = build_constructor_from_list (vector_type, t);
+              gcc_assert (vector_type);
+             if (constant_p)
+               vec_cst = build_vector (vector_type, t);
+             else
+               vec_cst = build_constructor_from_list (vector_type, t);
+             constant_p = true;
                VEC_quick_push (tree, voprnds,
                                vect_init_vector (stmt, vec_cst, vector_type,
                                                 NULL));
@@ -1473,8 +1561,8 @@ get_initial_def_for_induction (tree iv_phi)
    loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
    struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
    tree scalar_type = TREE_TYPE (PHI_RESULT_TREE (iv_phi));
-  tree vectype = get_vectype_for_scalar_type (scalar_type);
-  int nunits =  TYPE_VECTOR_SUBPARTS (vectype);
+  tree vectype; 
+  int nunits;
    edge pe = loop_preheader_edge (loop);
    struct loop *iv_loop;
    basic_block new_bb;
@@ -1488,7 +1576,7 @@ get_initial_def_for_induction (tree iv_phi)
    int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
    int i;
    bool ok;
-  int ncopies = vf / nunits;
+  int ncopies;
    tree expr;
    stmt_vec_info phi_info = vinfo_for_stmt (iv_phi);
    bool nested_in_vect_loop = false;
@@ -1501,6 +1589,11 @@ get_initial_def_for_induction (tree iv_phi)
    block_stmt_iterator si;
    basic_block bb = bb_for_stmt (iv_phi);
  
+  vectype = get_vectype_for_scalar_type (scalar_type);
+  gcc_assert (vectype);
+  nunits = TYPE_VECTOR_SUBPARTS (vectype);
+  ncopies = vf / nunits;
+
    gcc_assert (phi_info);
    gcc_assert (ncopies >= 1);
  
@@ -1599,7 +1692,8 @@ get_initial_def_for_induction (tree iv_phi)
    t = NULL_TREE;
    for (i = 0; i < nunits; i++)
      t = tree_cons (NULL_TREE, unshare_expr (new_name), t);
-  vec = build_constructor_from_list (vectype, t);
+  gcc_assert (CONSTANT_CLASS_P (new_name));
+  vec = build_vector (vectype, t);
    vec_step = vect_init_vector (iv_phi, vec, vectype, NULL);
  
  
@@ -1655,7 +1749,8 @@ get_initial_def_for_induction (tree iv_phi)
        t = NULL_TREE;
        for (i = 0; i < nunits; i++)
         t = tree_cons (NULL_TREE, unshare_expr (new_name), t);
-      vec = build_constructor_from_list (vectype, t);
+      gcc_assert (CONSTANT_CLASS_P (new_name));
+      vec = build_vector (vectype, t);
        vec_step = vect_init_vector (iv_phi, vec, vectype, NULL);
  
        vec_def = induc_def;
@@ -1791,6 +1886,7 @@ vect_get_vec_def_for_operand (tree op, tree stmt, tree *scalar_def)
              t = tree_cons (NULL_TREE, op, t);
            }
          vector_type = get_vectype_for_scalar_type (TREE_TYPE (op));
+        gcc_assert (vector_type);
          vec_cst = build_vector (vector_type, t);
  
          return vect_init_vector (stmt, vec_cst, vector_type, NULL);
@@ -1813,6 +1909,7 @@ vect_get_vec_def_for_operand (tree op, tree stmt, tree *scalar_def)
  
         /* FIXME: use build_constructor directly.  */
         vector_type = get_vectype_for_scalar_type (TREE_TYPE (def));
+        gcc_assert (vector_type);
          vec_inv = build_constructor_from_list (vector_type, t);
          return vect_init_vector (stmt, vec_inv, vector_type, NULL);
        }
@@ -2018,11 +2115,7 @@ vect_finish_stmt_generation (tree stmt, tree vec_stmt,
    /* Make sure bsi points to the stmt that is being vectorized.  */
    gcc_assert (stmt == bsi_stmt (*bsi));
  
-#ifdef USE_MAPPED_LOCATION
    SET_EXPR_LOCATION (vec_stmt, EXPR_LOCATION (stmt));
-#else
-  SET_EXPR_LOCUS (vec_stmt, EXPR_LOCUS (stmt));
-#endif
  }
  
  
@@ -2087,7 +2180,7 @@ get_initial_def_for_reduction (tree stmt, tree init_val, tree *adjustment_def)
    tree vector_type;
    bool nested_in_vect_loop = false; 
  
-  gcc_assert (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type));
+  gcc_assert (POINTER_TYPE_P (type) || INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type));
    if (nested_in_vect_loop_p (loop, stmt))
      nested_in_vect_loop = true;
    else
@@ -2100,18 +2193,19 @@ get_initial_def_for_reduction (tree stmt, tree init_val, tree *adjustment_def)
    case WIDEN_SUM_EXPR:
    case DOT_PROD_EXPR:
    case PLUS_EXPR:
-      if (nested_in_vect_loop)
-       *adjustment_def = vecdef;
-      else
-       *adjustment_def = init_val;
-    /* Create a vector of zeros for init_def.  */
-    if (INTEGRAL_TYPE_P (type))
-      def_for_init = build_int_cst (type, 0);
+    if (nested_in_vect_loop)
+      *adjustment_def = vecdef;
      else
+      *adjustment_def = init_val;
+    /* Create a vector of zeros for init_def.  */
+    if (SCALAR_FLOAT_TYPE_P (type))
        def_for_init = build_real (type, dconst0);
-      for (i = nunits - 1; i >= 0; --i)
-    t = tree_cons (NULL_TREE, def_for_init, t);
+    else
+      def_for_init = build_int_cst (type, 0);
+    for (i = nunits - 1; i >= 0; --i)
+      t = tree_cons (NULL_TREE, def_for_init, t);
      vector_type = get_vectype_for_scalar_type (TREE_TYPE (def_for_init));
+    gcc_assert (vector_type);
      init_def = build_vector (vector_type, t);
      break;
  
@@ -2222,6 +2316,7 @@ vect_create_epilog_for_reduction (tree vect_def, tree stmt,
    op_type = TREE_OPERAND_LENGTH (operation);
    reduction_op = TREE_OPERAND (operation, op_type-1);
    vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
+  gcc_assert (vectype);
    mode = TYPE_MODE (vectype);
  
    /*** 1. Create the reduction def-use cycle  ***/
@@ -2426,7 +2521,6 @@ vect_create_epilog_for_reduction (tree vect_def, tree stmt,
           vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
           rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
                          bitsize_zero_node);
-         BIT_FIELD_REF_UNSIGNED (rhs) = TYPE_UNSIGNED (scalar_type);
           epilog_stmt = build_gimple_modify_stmt (new_scalar_dest, rhs);
           new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
           GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_temp;
@@ -2441,7 +2535,6 @@ vect_create_epilog_for_reduction (tree vect_def, tree stmt,
               tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
                                  bitpos);
                 
-             BIT_FIELD_REF_UNSIGNED (rhs) = TYPE_UNSIGNED (scalar_type);
               epilog_stmt = build_gimple_modify_stmt (new_scalar_dest, rhs);
               new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
               GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_name;
@@ -2477,7 +2570,6 @@ vect_create_epilog_for_reduction (tree vect_def, tree stmt,
         bitpos = bitsize_zero_node;
  
        rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp, bitsize, bitpos);
-      BIT_FIELD_REF_UNSIGNED (rhs) = TYPE_UNSIGNED (scalar_type);
        epilog_stmt = build_gimple_modify_stmt (new_scalar_dest, rhs);
        new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
        GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_temp; 
@@ -2694,6 +2786,9 @@ vectorizable_reduction (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
      return false;
    scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
    scalar_type = TREE_TYPE (scalar_dest);
+  if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type) 
+      && !SCALAR_FLOAT_TYPE_P (scalar_type))
+    return false;
  
    /* All uses but the last are expected to be defined in the loop.
       The last use is the reduction variable.  */
@@ -2795,6 +2890,16 @@ vectorizable_reduction (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
           reduction variable, and get the tree-code from orig_stmt.  */
        orig_code = TREE_CODE (GIMPLE_STMT_OPERAND (orig_stmt, 1));
        vectype = get_vectype_for_scalar_type (TREE_TYPE (def));
+      if (!vectype)
+       {
+          if (vect_print_dump_info (REPORT_DETAILS))
+            {
+              fprintf (vect_dump, "unsupported data-type ");
+              print_generic_expr (vect_dump, TREE_TYPE (def), TDF_SLIM);
+            }
+          return false;
+        }
+
        vec_mode = TYPE_MODE (vectype);
      }
    else
@@ -2823,7 +2928,8 @@ vectorizable_reduction (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
    if (!vec_stmt) /* transformation not required.  */
      {
        STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
-      vect_model_reduction_cost (stmt_info, epilog_reduc_code, ncopies);
+      if (!vect_model_reduction_cost (stmt_info, epilog_reduc_code, ncopies))
+        return false;
        return true;
      }
  
@@ -2961,14 +3067,6 @@ vectorizable_call (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
    if (STMT_SLP_TYPE (stmt_info))
      return false;
  
-  /* FORNOW: not yet supported.  */
-  if (STMT_VINFO_LIVE_P (stmt_info))
-    {
-      if (vect_print_dump_info (REPORT_DETAILS))
-        fprintf (vect_dump, "value used after loop.");
-      return false;
-    }
-
    /* Is STMT a vectorizable call?   */
    if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
      return false;
@@ -3307,14 +3405,6 @@ vectorizable_conversion (tree stmt, block_stmt_iterator *bsi,
    if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
      return false;
  
-  if (STMT_VINFO_LIVE_P (stmt_info))
-    {
-      /* FORNOW: not yet supported.  */
-      if (vect_print_dump_info (REPORT_DETAILS))
-       fprintf (vect_dump, "value used after loop.");
-      return false;
-    }
-
    if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
      return false;
  
@@ -3545,6 +3635,9 @@ vectorizable_conversion (tree stmt, block_stmt_iterator *bsi,
        *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
      }
  
+  if (vec_oprnds0)
+    VEC_free (tree, heap, vec_oprnds0); 
+
    return true;
  }
  
@@ -3585,14 +3678,6 @@ vectorizable_assignment (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt,
    if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
      return false;
  
-  /* FORNOW: not yet supported.  */
-  if (STMT_VINFO_LIVE_P (stmt_info))
-    {
-      if (vect_print_dump_info (REPORT_DETAILS))
-        fprintf (vect_dump, "value used after loop.");
-      return false;
-    }
-
    /* Is vectorizable assignment?  */
    if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
      return false;
@@ -3702,14 +3787,6 @@ vectorizable_induction (tree phi, block_stmt_iterator *bsi ATTRIBUTE_UNUSED,
  
    gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def);
  
-  if (STMT_VINFO_LIVE_P (stmt_info))
-    {
-      /* FORNOW: not yet supported.  */
-      if (vect_print_dump_info (REPORT_DETAILS))
-        fprintf (vect_dump, "value used after loop.");
-      return false;
-    }
-
    if (TREE_CODE (phi) != PHI_NODE)
      return false;
  
@@ -3771,6 +3848,8 @@ vectorizable_operation (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt,
    int j, i;
    VEC(tree,heap) *vec_oprnds0 = NULL, *vec_oprnds1 = NULL;
    tree vop0, vop1;
+  unsigned int k;
+  bool scalar_shift_arg = false;
  
    /* FORNOW: SLP with multiple types is not supported. The SLP analysis verifies
       this, so we can safely override NCOPIES with 1 here.  */
@@ -3791,14 +3870,6 @@ vectorizable_operation (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt,
    if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
      return false;
  
-  /* FORNOW: not yet supported.  */
-  if (STMT_VINFO_LIVE_P (stmt_info))
-    {
-      if (vect_print_dump_info (REPORT_DETAILS))
-        fprintf (vect_dump, "value used after loop.");
-      return false;
-    }
-
    /* Is STMT a vectorizable binary/unary operation?   */
    if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
      return false;
@@ -3865,18 +3936,21 @@ vectorizable_operation (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt,
      {
        if (vect_print_dump_info (REPORT_DETAILS))
         fprintf (vect_dump, "op not supported by target.");
+      /* Check only during analysis.  */
        if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
-          || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
-            < vect_min_worthwhile_factor (code))
+          || (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
+             < vect_min_worthwhile_factor (code)
+              && !vec_stmt))
          return false;
        if (vect_print_dump_info (REPORT_DETAILS))
         fprintf (vect_dump, "proceeding using word mode.");
      }
  
-  /* Worthwhile without SIMD support?  */
+  /* Worthwhile without SIMD support? Check only during analysis.  */
    if (!VECTOR_MODE_P (TYPE_MODE (vectype))
        && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
-        < vect_min_worthwhile_factor (code))
+        < vect_min_worthwhile_factor (code)
+      && !vec_stmt)
      {
        if (vect_print_dump_info (REPORT_DETAILS))
         fprintf (vect_dump, "not worthwhile without SIMD support.");
@@ -3892,14 +3966,18 @@ vectorizable_operation (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt,
        /* Invariant argument is needed for a vector shift
          by a scalar shift operand.  */
        optab_op2_mode = insn_data[icode].operand[2].mode;
-      if (! (VECTOR_MODE_P (optab_op2_mode)
-            || dt[1] == vect_constant_def
-            || dt[1] == vect_invariant_def))
+      if (!VECTOR_MODE_P (optab_op2_mode))
         {
-         if (vect_print_dump_info (REPORT_DETAILS))
-           fprintf (vect_dump, "operand mode requires invariant argument.");
-         return false;
-       }
+         if (dt[1] != vect_constant_def && dt[1] != vect_invariant_def)
+           {
+             if (vect_print_dump_info (REPORT_DETAILS))
+               fprintf (vect_dump, "operand mode requires invariant"
+                                    " argument.");
+             return false;
+           }
+
+          scalar_shift_arg = true;
+        }
      }
  
    if (!vec_stmt) /* transformation not required.  */
@@ -3919,10 +3997,21 @@ vectorizable_operation (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt,
    /* Handle def.  */
    vec_dest = vect_create_destination_var (scalar_dest, vectype);
  
+  /* Allocate VECs for vector operands. In case of SLP, vector operands are 
+     created in the previous stages of the recursion, so no allocation is
+     needed, except for the case of shift with scalar shift argument. In that
+     case we store the scalar operand in VEC_OPRNDS1 for every vector stmt to
+     be created to vectorize the SLP group, i.e., SLP_NODE->VEC_STMTS_SIZE.
+     In case of loop-based vectorization we allocate VECs of size 1. We 
+     allocate VEC_OPRNDS1 only in case of binary operation.  */ 
    if (!slp_node)
-    vec_oprnds0 = VEC_alloc (tree, heap, 1);
-  if (op_type == binary_op)
-    vec_oprnds1 = VEC_alloc (tree, heap, 1);
+    {
+      vec_oprnds0 = VEC_alloc (tree, heap, 1);
+      if (op_type == binary_op)
+        vec_oprnds1 = VEC_alloc (tree, heap, 1);
+    }
+  else if (scalar_shift_arg)
+    vec_oprnds1 = VEC_alloc (tree, heap, slp_node->vec_stmts_size);  
  
    /* In case the vectorization factor (VF) is bigger than the number
       of elements that we can fit in a vectype (nunits), we have to generate
@@ -3997,10 +4086,20 @@ vectorizable_operation (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt,
                     fprintf (vect_dump, "operand 1 using scalar mode.");
                   vec_oprnd1 = op1;
                   VEC_quick_push (tree, vec_oprnds1, vec_oprnd1);
+                 if (slp_node)
+                   {
+                     /* Store vec_oprnd1 for every vector stmt to be created
+                        for SLP_NODE. We check during the analysis that all the
+                         shift arguments are the same.  
+                        TODO: Allow different constants for different vector 
+                        stmts generated for an SLP instance.  */          
+                     for (k = 0; k < slp_node->vec_stmts_size - 1; k++)
+                       VEC_quick_push (tree, vec_oprnds1, vec_oprnd1);
+                   }
                 }
             }
          
-          /* vec_oprnd is available if operand 1 should be of a scalar-type 
+          /* vec_oprnd1 is available if operand 1 should be of a scalar-type 
               (a special case for certain kind of vector shifts); otherwise, 
               operand 1 should be of a vector type (the usual case).  */
           if (op_type == binary_op && !vec_oprnd1)
@@ -4088,14 +4187,6 @@ vectorizable_type_demotion (tree stmt, block_stmt_iterator *bsi,
    if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
      return false;
  
-  /* FORNOW: not yet supported.  */
-  if (STMT_VINFO_LIVE_P (stmt_info))
-    {
-      if (vect_print_dump_info (REPORT_DETAILS))
-        fprintf (vect_dump, "value used after loop.");
-      return false;
-    }
-
    /* Is STMT a vectorizable type-demotion operation?  */
    if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
      return false;
@@ -4249,14 +4340,6 @@ vectorizable_type_promotion (tree stmt, block_stmt_iterator *bsi,
    if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
      return false;
  
-  /* FORNOW: not yet supported.  */
-  if (STMT_VINFO_LIVE_P (stmt_info))
-    {
-      if (vect_print_dump_info (REPORT_DETAILS))
-        fprintf (vect_dump, "value used after loop.");
-      return false;
-    }
-
    /* Is STMT a vectorizable type-promotion operation?  */
    if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
      return false;
@@ -4506,11 +4589,8 @@ vect_permute_store_chain (VEC(tree,heap) *dr_chain,
    tree scalar_dest, tmp;
    int i;
    unsigned int j;
-  VEC(tree,heap) *first, *second;
    
    scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
-  first = VEC_alloc (tree, heap, length/2);
-  second = VEC_alloc (tree, heap, length/2);
  
    /* Check that the operation is supported.  */
    if (!vect_strided_store_supported (vectype))
@@ -4598,7 +4678,7 @@ vectorizable_store (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt,
    int nunits = TYPE_VECTOR_SUBPARTS (vectype);
    int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
    int j;
-  tree next_stmt, first_stmt;
+  tree next_stmt, first_stmt = NULL_TREE;
    bool strided_store = false;
    unsigned int group_size, i;
    VEC(tree,heap) *dr_chain = NULL, *oprnds = NULL, *result_chain = NULL;
@@ -4629,13 +4709,6 @@ vectorizable_store (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt,
    if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
      return false;
  
-  if (STMT_VINFO_LIVE_P (stmt_info))
-    {
-      if (vect_print_dump_info (REPORT_DETAILS))
-        fprintf (vect_dump, "value used after loop.");
-      return false;
-    }
-
    /* Is vectorizable store? */
  
    if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
@@ -4667,9 +4740,28 @@ vectorizable_store (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt,
    if (STMT_VINFO_STRIDED_ACCESS (stmt_info))
      {
        strided_store = true;
+      first_stmt = DR_GROUP_FIRST_DR (stmt_info);
        if (!vect_strided_store_supported (vectype)
           && !PURE_SLP_STMT (stmt_info) && !slp)
-       return false;      
+       return false;
+     
+      if (first_stmt == stmt)
+       {
+          /* STMT is the leader of the group. Check the operands of all the
+             stmts of the group.  */
+          next_stmt = DR_GROUP_NEXT_DR (stmt_info);
+          while (next_stmt)
+            {
+              op = GIMPLE_STMT_OPERAND (next_stmt, 1);
+              if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt))
+                {
+                  if (vect_print_dump_info (REPORT_DETAILS))
+                    fprintf (vect_dump, "use not simple.");
+                  return false;
+                }
+              next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt));
+            }
+        }
      }
  
    if (!vec_stmt) /* transformation not required.  */
@@ -4684,7 +4776,6 @@ vectorizable_store (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt,
  
    if (strided_store)
      {
-      first_stmt = DR_GROUP_FIRST_DR (stmt_info);
        first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
        group_size = DR_GROUP_SIZE (vinfo_for_stmt (first_stmt));
  
@@ -4830,8 +4921,9 @@ vectorizable_store (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt,
              OPRNDS are of size 1.  */
           for (i = 0; i < group_size; i++)
             {
-             vec_oprnd = vect_get_vec_def_for_stmt_copy (dt, 
-                                                  VEC_index (tree, oprnds, i));
+             op = VEC_index (tree, oprnds, i);
+             vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt);
+             vec_oprnd = vect_get_vec_def_for_stmt_copy (dt, op); 
               VEC_replace(tree, dr_chain, i, vec_oprnd);
               VEC_replace(tree, oprnds, i, vec_oprnd);
             }
@@ -4881,6 +4973,11 @@ vectorizable_store (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt,
         }
      }
  
+  VEC_free (tree, heap, dr_chain);  
+  VEC_free (tree, heap, oprnds);  
+  if (result_chain)
+    VEC_free (tree, heap, result_chain);  
+
    return true;
  }
  
@@ -5050,6 +5147,7 @@ vect_setup_realignment (tree stmt, block_stmt_iterator *bsi,
        new_stmt = build_gimple_modify_stmt (vec_dest, data_ref);
        new_temp = make_ssa_name (vec_dest, new_stmt);
        GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
+      mark_symbols_for_renaming (new_stmt);
        new_bb = bsi_insert_on_edge_immediate (pe, new_stmt);
        gcc_assert (!new_bb);
        msq_init = GIMPLE_STMT_OPERAND (new_stmt, 0);
@@ -5385,6 +5483,8 @@ vect_transform_strided_load (tree stmt, VEC(tree,heap) *dr_chain, int size,
             break;
          }
      }
+
+  VEC_free (tree, heap, result_chain);
    return true;
  }
  
@@ -5458,14 +5558,6 @@ vectorizable_load (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt,
    if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
      return false;
  
-  /* FORNOW: not yet supported.  */
-  if (STMT_VINFO_LIVE_P (stmt_info))
-    {
-      if (vect_print_dump_info (REPORT_DETAILS))
-        fprintf (vect_dump, "value used after loop.");
-      return false;
-    }
-
    /* Is vectorizable load? */
    if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
      return false;
@@ -5793,8 +5885,6 @@ vectorizable_load (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt,
                   bitpos = bitsize_zero_node;
                   vec_inv = build3 (BIT_FIELD_REF, scalar_type, new_temp, 
                                                             bitsize, bitpos);
-                 BIT_FIELD_REF_UNSIGNED (vec_inv) = 
-                                                TYPE_UNSIGNED (scalar_type);
                   vec_dest = 
                         vect_create_destination_var (scalar_dest, NULL_TREE);
                   new_stmt = build_gimple_modify_stmt (vec_dest, vec_inv);
@@ -5832,6 +5922,7 @@ vectorizable_load (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt,
           if (!vect_transform_strided_load (stmt, dr_chain, group_size, bsi))
             return false;         
           *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
+          VEC_free (tree, heap, dr_chain);
           dr_chain = VEC_alloc (tree, heap, group_size);
         }
        else
@@ -5844,6 +5935,9 @@ vectorizable_load (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt,
         }
      }
  
+  if (dr_chain)
+    VEC_free (tree, heap, dr_chain);
+
    return true;
  }
  
@@ -5893,7 +5987,7 @@ vectorizable_live_operation (tree stmt,
    for (i = 0; i < op_type; i++)
      {
        op = TREE_OPERAND (operation, i);
-      if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt))
+      if (op && !vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt))
          {
            if (vect_print_dump_info (REPORT_DETAILS))
              fprintf (vect_dump, "use not simple.");
@@ -6443,6 +6537,33 @@ vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo, tree niters,
      }
  }
  
+/* Return the more conservative threshold between the
+   min_profitable_iters returned by the cost model and the user
+   specified threshold, if provided.  */
+
+static unsigned int
+conservative_cost_threshold (loop_vec_info loop_vinfo,
+                            int min_profitable_iters)
+{
+  unsigned int th;
+  int min_scalar_loop_bound;
+
+  min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
+                           * LOOP_VINFO_VECT_FACTOR (loop_vinfo)) - 1);
+
+  /* Use the cost model only if it is more conservative than user specified
+     threshold.  */
+  th = (unsigned) min_scalar_loop_bound;
+  if (min_profitable_iters
+      && (!min_scalar_loop_bound
+          || min_profitable_iters > min_scalar_loop_bound))
+    th = (unsigned) min_profitable_iters;
+
+  if (th && vect_print_dump_info (REPORT_COST))
+    fprintf (vect_dump, "Vectorization may not be profitable.");
+
+  return th;
+}
  
  /* Function vect_do_peeling_for_loop_bound
  
@@ -6463,8 +6584,8 @@ vect_do_peeling_for_loop_bound (loop_vec_info loop_vinfo, tree *ratio)
    edge update_e;
    basic_block preheader;
    int loop_num;
-  unsigned int th;
-  int min_scalar_loop_bound;
+  bool check_profitability = false;
+  unsigned int th = 0;
    int min_profitable_iters;
  
    if (vect_print_dump_info (REPORT_DETAILS))
@@ -6482,28 +6603,24 @@ vect_do_peeling_for_loop_bound (loop_vec_info loop_vinfo, tree *ratio)
  
    loop_num  = loop->num; 
  
-  /* Analyze cost to set threshhold for vectorized loop.  */
-  min_profitable_iters = LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo);
-  min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
-                           * LOOP_VINFO_VECT_FACTOR (loop_vinfo)) - 1);
-
-  /* Use the cost model only if it is more conservative than user specified
-     threshold.  */
+  /* If cost model check not done during versioning and 
+     peeling for alignment.  */
+  if (!VEC_length (tree, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo))
+      && !VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo))
+      && !LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
+    {
+      check_profitability = true;
  
-  th = (unsigned) min_scalar_loop_bound;
-  if (min_profitable_iters
-      && (!min_scalar_loop_bound
-          || min_profitable_iters > min_scalar_loop_bound))
-    th = (unsigned) min_profitable_iters;
+      /* Get profitability threshold for vectorized loop.  */
+      min_profitable_iters = LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo);
  
-  if (((LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
-      || !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
-      && vect_print_dump_info (REPORT_DETAILS))
-    fprintf (vect_dump, "vectorization may not be profitable.");
+      th = conservative_cost_threshold (loop_vinfo, 
+                                       min_profitable_iters);
+    }
  
    new_loop = slpeel_tree_peel_loop_to_edge (loop, single_exit (loop),
                                              ratio_mult_vf_name, ni_name, false,
-                                            th);
+                                            th, check_profitability);
    gcc_assert (new_loop);
    gcc_assert (loop_num == loop->num);
  #ifdef ENABLE_CHECKING
@@ -6721,6 +6838,9 @@ vect_do_peeling_for_alignment (loop_vec_info loop_vinfo)
    tree niters_of_prolog_loop, ni_name;
    tree n_iters;
    struct loop *new_loop;
+  bool check_profitability = false;
+  unsigned int th = 0;
+  int min_profitable_iters;
  
    if (vect_print_dump_info (REPORT_DETAILS))
      fprintf (vect_dump, "=== vect_do_peeling_for_alignment ===");
@@ -6730,10 +6850,26 @@ vect_do_peeling_for_alignment (loop_vec_info loop_vinfo)
    ni_name = vect_build_loop_niters (loop_vinfo);
    niters_of_prolog_loop = vect_gen_niters_for_prolog_loop (loop_vinfo, ni_name);
    
+
+  /* If cost model check not done during versioning.  */
+  if (!VEC_length (tree, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo))
+      && !VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
+    {
+      check_profitability = true;
+
+      /* Get profitability threshold for vectorized loop.  */
+      min_profitable_iters = LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo);
+
+      th = conservative_cost_threshold (loop_vinfo, 
+                                       min_profitable_iters);
+    }
+
    /* Peel the prolog loop and iterate it niters_of_prolog_loop.  */
-  new_loop = 
-       slpeel_tree_peel_loop_to_edge (loop, loop_preheader_edge (loop), 
-                                      niters_of_prolog_loop, ni_name, true, 0); 
+  new_loop =
+    slpeel_tree_peel_loop_to_edge (loop, loop_preheader_edge (loop),
+                                  niters_of_prolog_loop, ni_name, true,
+                                  th, check_profitability);
+
    gcc_assert (new_loop);
  #ifdef ENABLE_CHECKING
    slpeel_verify_cfg_after_peeling (new_loop, loop);
@@ -6761,6 +6897,8 @@ vect_do_peeling_for_alignment (loop_vec_info loop_vinfo)
     checked at runtime.
  
     Input:
+   COND_EXPR  - input conditional expression.  New conditions will be chained
+                with logical AND operation.
     LOOP_VINFO - two fields of the loop information are used.
                  LOOP_VINFO_PTR_MASK is the mask used to check the alignment.
                  LOOP_VINFO_MAY_MISALIGN_STMTS contains the refs to be checked.
@@ -6777,8 +6915,9 @@ vect_do_peeling_for_alignment (loop_vec_info loop_vinfo)
          test can be done as a&(n-1) == 0.  For example, for 16
          byte vectors the test is a&0xf == 0.  */
  
-static tree
+static void
  vect_create_cond_for_align_checks (loop_vec_info loop_vinfo,
+                                   tree *cond_expr,
                                     tree *cond_expr_stmt_list)
  {
    struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
@@ -6794,6 +6933,7 @@ vect_create_cond_for_align_checks (loop_vec_info loop_vinfo,
    tree or_tmp_name = NULL_TREE;
    tree and_tmp, and_tmp_name, and_stmt;
    tree ptrsize_zero;
+  tree part_cond_expr;
  
    /* Check that mask is one less than a power of 2, i.e., mask is
       all zeros followed by all ones.  */
@@ -6867,8 +7007,13 @@ vect_create_cond_for_align_checks (loop_vec_info loop_vinfo,
    /* Make and_tmp the left operand of the conditional test against zero.
       if and_tmp has a nonzero bit then some address is unaligned.  */
    ptrsize_zero = build_int_cst (int_ptrsize_type, 0);
-  return build2 (EQ_EXPR, boolean_type_node,
-                 and_tmp_name, ptrsize_zero);
+  part_cond_expr = fold_build2 (EQ_EXPR, boolean_type_node,
+                               and_tmp_name, ptrsize_zero);
+  if (*cond_expr)
+    *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
+                             *cond_expr, part_cond_expr);
+  else
+    *cond_expr = part_cond_expr;
  }
  
  /* Function vect_vfa_segment_size.
@@ -6909,7 +7054,7 @@ vect_vfa_segment_size (struct data_reference *dr, tree vect_factor)
  
     Input:
     COND_EXPR  - input conditional expression.  New conditions will be chained
-                with logical and operation.
+                with logical AND operation.
     LOOP_VINFO - field LOOP_VINFO_MAY_ALIAS_STMTS contains the list of ddrs
                 to be checked.
  
@@ -7031,7 +7176,11 @@ vect_create_cond_for_alias_checks (loop_vec_info loop_vinfo,
     data references that may or may not be aligned.  An additional
     sequence of runtime tests is generated for each pairs of DDRs whose
     independence was not proven.  The vectorized version of loop is 
-   executed only if both alias and alignment tests are passed.  */
+   executed only if both alias and alignment tests are passed.  
+  
+   The test generated to check which version of loop is executed
+   is modified to also check for profitability as indicated by the 
+   cost model initially.  */
  
  static void
  vect_loop_versioning (loop_vec_info loop_vinfo)
@@ -7048,17 +7197,30 @@ vect_loop_versioning (loop_vec_info loop_vinfo)
    tree orig_phi, new_phi, arg;
    unsigned prob = 4 * REG_BR_PROB_BASE / 5;
    tree gimplify_stmt_list;
+  tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
+  int min_profitable_iters = 0;
+  unsigned int th;
  
-  if (!VEC_length (tree, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo))
-      && !VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
-    return;
+  /* Get profitability threshold for vectorized loop.  */
+  min_profitable_iters = LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo);
+
+  th = conservative_cost_threshold (loop_vinfo,
+                                   min_profitable_iters);
+
+  cond_expr =
+    build2 (GT_EXPR, boolean_type_node, scalar_loop_iters, 
+           build_int_cst (TREE_TYPE (scalar_loop_iters), th));
+
+  cond_expr = force_gimple_operand (cond_expr, &cond_expr_stmt_list,
+                                   false, NULL_TREE);
  
    if (VEC_length (tree, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo)))
-    cond_expr =
-      vect_create_cond_for_align_checks (loop_vinfo, &cond_expr_stmt_list);
+      vect_create_cond_for_align_checks (loop_vinfo, &cond_expr,
+                                        &cond_expr_stmt_list);
  
    if (VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
-    vect_create_cond_for_alias_checks (loop_vinfo, &cond_expr, &cond_expr_stmt_list);
+    vect_create_cond_for_alias_checks (loop_vinfo, &cond_expr, 
+                                      &cond_expr_stmt_list);
  
    cond_expr =
      fold_build2 (NE_EXPR, boolean_type_node, cond_expr, integer_zero_node);
@@ -7112,10 +7274,8 @@ vect_loop_versioning (loop_vec_info loop_vinfo)
  static void
  vect_remove_stores (tree first_stmt)
  {
-  stmt_ann_t ann;
    tree next = first_stmt;
    tree tmp;
-  stmt_vec_info next_stmt_info;
    block_stmt_iterator next_si;
  
    while (next)
@@ -7123,11 +7283,8 @@ vect_remove_stores (tree first_stmt)
        /* Free the attached stmt_vec_info and remove the stmt.  */
        next_si = bsi_for_stmt (next);
        bsi_remove (&next_si, true);
-      next_stmt_info = vinfo_for_stmt (next);
-      ann = stmt_ann (next);
-      tmp = DR_GROUP_NEXT_DR (next_stmt_info);
-      free (next_stmt_info);
-      set_stmt_info (ann, NULL);
+      tmp = DR_GROUP_NEXT_DR (vinfo_for_stmt (next));
+      free_stmt_vec_info (next);
        next = tmp;
      }
  }
@@ -7226,7 +7383,7 @@ vect_transform_loop (loop_vec_info loop_vinfo)
    struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
    basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
    int nbbs = loop->num_nodes;
-  block_stmt_iterator si, next_si;
+  block_stmt_iterator si;
    int i;
    tree ratio = NULL;
    int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
@@ -7236,7 +7393,10 @@ vect_transform_loop (loop_vec_info loop_vinfo)
  
    if (vect_print_dump_info (REPORT_DETAILS))
      fprintf (vect_dump, "=== vec_transform_loop ===");
-  vect_loop_versioning (loop_vinfo);
+
+  if (VEC_length (tree, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo))
+      || VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
+    vect_loop_versioning (loop_vinfo);
  
    /* CHECKME: we wouldn't need this if we called update_ssa once
       for all loops.  */
@@ -7388,37 +7548,19 @@ vect_transform_loop (loop_vec_info loop_vinfo)
           is_store = vect_transform_stmt (stmt, &si, &strided_store, NULL);
            if (is_store)
              {
-             stmt_ann_t ann;
               if (STMT_VINFO_STRIDED_ACCESS (stmt_info))
                 {
                   /* Interleaving. If IS_STORE is TRUE, the vectorization of the
                      interleaving chain was completed - free all the stores in
                      the chain.  */
-                 tree next = DR_GROUP_FIRST_DR (stmt_info);
-                 tree tmp;
-                 stmt_vec_info next_stmt_info;
-
-                 while (next)
-                   {
-                     next_si = bsi_for_stmt (next);
-                     next_stmt_info = vinfo_for_stmt (next);
-                     /* Free the attached stmt_vec_info and remove the stmt.  */
-                     ann = stmt_ann (next);
-                     tmp = DR_GROUP_NEXT_DR (next_stmt_info);
-                     free (next_stmt_info);
-                     set_stmt_info (ann, NULL);
-                     bsi_remove (&next_si, true);
-                     next = tmp;
-                   }
+                 vect_remove_stores (DR_GROUP_FIRST_DR (stmt_info));
                   bsi_remove (&si, true);
                   continue;
                 }
               else
                 {
                   /* Free the attached stmt_vec_info and remove the stmt.  */
-                 ann = stmt_ann (stmt);
-                 free (stmt_info);
-                 set_stmt_info (ann, NULL);
+                 free_stmt_vec_info (stmt);
                   bsi_remove (&si, true);
                   continue;
                 }