PR middle-end/38641
diff --git a/gcc/tree-vect-analyze.c b/gcc/tree-vect-analyze.c
index 6cfea2b..a4460b4 100644
@@ -1,12 +1,13 @@
 /* Analysis Utilities for Loop Vectorization.
-   Copyright (C) 2003, 2004, 2005, 2006, 2007 Free Software Foundation, Inc.
+   Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008 Free Software
+   Foundation, Inc.
    Contributed by Dorit Naishlos <dorit@il.ibm.com>
 
 This file is part of GCC.
 
 GCC is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free
-Software Foundation; either version 2, or (at your option) any later
+Software Foundation; either version 3, or (at your option) any later
 version.
 
 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
@@ -15,9 +16,8 @@ FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 for more details.
 
 You should have received a copy of the GNU General Public License
-along with GCC; see the file COPYING.  If not, write to the Free
-Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA
-02110-1301, USA.  */
+along with GCC; see the file COPYING3.  If not see
+<http://www.gnu.org/licenses/>.  */
 
 #include "config.h"
 #include "system.h"
@@ -25,6 +25,7 @@ Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA
 #include "tm.h"
 #include "ggc.h"
 #include "tree.h"
+#include "target.h"
 #include "basic-block.h"
 #include "diagnostic.h"
 #include "tree-flow.h"
@@ -39,30 +40,53 @@ Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA
 #include "tree-scalar-evolution.h"
 #include "tree-vectorizer.h"
 #include "toplev.h"
+#include "recog.h"
 
-/* Main analysis functions.  */
-static loop_vec_info vect_analyze_loop_form (struct loop *);
-static bool vect_analyze_data_refs (loop_vec_info);
-static bool vect_mark_stmts_to_be_vectorized (loop_vec_info);
-static void vect_analyze_scalar_cycles (loop_vec_info);
-static bool vect_analyze_data_ref_accesses (loop_vec_info);
-static bool vect_analyze_data_ref_dependences (loop_vec_info);
-static bool vect_analyze_data_refs_alignment (loop_vec_info);
-static bool vect_compute_data_refs_alignment (loop_vec_info);
-static bool vect_enhance_data_refs_alignment (loop_vec_info);
-static bool vect_analyze_operations (loop_vec_info);
-static bool vect_determine_vectorization_factor (loop_vec_info);
-
-/* Utility functions for the analyses.  */
-static bool exist_non_indexing_operands_for_use_p (tree, tree);
-static tree vect_get_loop_niters (struct loop *, tree *);
-static bool vect_analyze_data_ref_dependence
-  (struct data_dependence_relation *, loop_vec_info);
-static bool vect_compute_data_ref_alignment (struct data_reference *); 
-static bool vect_analyze_data_ref_access (struct data_reference *);
 static bool vect_can_advance_ivs_p (loop_vec_info);
-static void vect_update_misalignment_for_peel
-  (struct data_reference *, struct data_reference *, int npeel);
+
+/* Return the smallest scalar type used in STMT.
+   This is used to determine the vectype of the stmt.  We generally set the
+   vectype according to the type of the result (lhs).  For stmts whose
+   result-type is different from the type of the arguments (e.g., demotion,
+   promotion), vectype will be reset appropriately (later).  Note that we
+   have to visit the smallest datatype in this function, because that
+   determines the VF.  If the smallest datatype in the loop is present only
+   as the rhs of a promotion operation, we would miss it.
+   Such a case, where a variable of this datatype does not appear in the lhs
+   anywhere in the loop, can only occur if it is an invariant, e.g.
+   'int_x = (int) short_inv', which we would expect to have been optimized
+   away by invariant motion.  However, we cannot rely on invariant motion
+   to always take invariants out of the loop, so in the case of promotion
+   we also have to check the rhs.
+   LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
+   types.  */
+
+tree
+vect_get_smallest_scalar_type (gimple stmt, HOST_WIDE_INT *lhs_size_unit,
+                               HOST_WIDE_INT *rhs_size_unit)
+{
+  tree scalar_type = gimple_expr_type (stmt);
+  HOST_WIDE_INT lhs, rhs;
+
+  lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
+
+  if (is_gimple_assign (stmt)
+      && (gimple_assign_cast_p (stmt)
+          || gimple_assign_rhs_code (stmt) == WIDEN_MULT_EXPR
+          || gimple_assign_rhs_code (stmt) == FLOAT_EXPR))
+    {
+      tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
+
+      rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
+      if (rhs < lhs)
+        scalar_type = rhs_type;
+    }
+     
+  *lhs_size_unit = lhs; 
+  *rhs_size_unit = rhs;
+  return scalar_type;
+}
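
As a minimal illustration of the promotion case described above (a hypothetical loop, not part of this patch): if invariant motion does not hoist the cast, the only occurrence of a short inside the loop is on the rhs of the promotion stmt, so looking at lhs types alone would derive the VF from int rather than short.

   /* Hypothetical example: SHORT_INV is loop-invariant, so the smallest
      scalar type (short) occurs only as the promoted rhs of the cast.  */
   extern short short_inv;
   extern int a[256];

   void
   foo (void)
   {
     int i;
     for (i = 0; i < 256; i++)
       a[i] = (int) short_inv;
   }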
+
 
 /* Function vect_determine_vectorization_factor
 
@@ -95,14 +119,15 @@ vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
   int nbbs = loop->num_nodes;
-  block_stmt_iterator si;
+  gimple_stmt_iterator si;
   unsigned int vectorization_factor = 0;
   tree scalar_type;
-  tree phi;
+  gimple phi;
   tree vectype;
   unsigned int nunits;
   stmt_vec_info stmt_info;
   int i;
+  HOST_WIDE_INT dummy;
 
   if (vect_print_dump_info (REPORT_DETAILS))
     fprintf (vect_dump, "=== vect_determine_vectorization_factor ===");
@@ -111,13 +136,14 @@ vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
     {
       basic_block bb = bbs[i];
 
-      for (phi = phi_nodes (bb); phi; phi = PHI_CHAIN (phi))
+      for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
        {
+         phi = gsi_stmt (si);
          stmt_info = vinfo_for_stmt (phi);
          if (vect_print_dump_info (REPORT_DETAILS))
            {
              fprintf (vect_dump, "==> examining phi: ");
-             print_generic_expr (vect_dump, phi, TDF_SLIM);
+             print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
            }
 
          gcc_assert (stmt_info);
@@ -162,15 +188,15 @@ vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
            }
        }
 
-      for (si = bsi_start (bb); !bsi_end_p (si); bsi_next (&si))
+      for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
         {
-         tree stmt = bsi_stmt (si);
+         gimple stmt = gsi_stmt (si);
          stmt_info = vinfo_for_stmt (stmt);
 
          if (vect_print_dump_info (REPORT_DETAILS))
            {
              fprintf (vect_dump, "==> examining statement: ");
-             print_generic_expr (vect_dump, stmt, TDF_SLIM);
+             print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
            }
 
          gcc_assert (stmt_info);
@@ -184,23 +210,22 @@ vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
              continue;
            }
 
-         if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
+         if (gimple_get_lhs (stmt) == NULL_TREE)
            {
              if (vect_print_dump_info (REPORT_UNVECTORIZED_LOOPS))
                {
                  fprintf (vect_dump, "not vectorized: irregular stmt.");
-                 print_generic_expr (vect_dump, stmt, TDF_SLIM);
+                 print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
                }
              return false;
            }
 
-         if (!GIMPLE_STMT_P (stmt)
-             && VECTOR_MODE_P (TYPE_MODE (TREE_TYPE (stmt))))
+         if (VECTOR_MODE_P (TYPE_MODE (gimple_expr_type (stmt))))
            {
              if (vect_print_dump_info (REPORT_UNVECTORIZED_LOOPS))
                {
                  fprintf (vect_dump, "not vectorized: vector stmt in loop:");
-                 print_generic_expr (vect_dump, stmt, TDF_SLIM);
+                 print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
                }
              return false;
            }
@@ -216,21 +241,12 @@ vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
            }
          else
            {
+
              gcc_assert (! STMT_VINFO_DATA_REF (stmt_info)
                          && !is_pattern_stmt_p (stmt_info));
 
-             /* We set the vectype according to the type of the result (lhs).
-                For stmts whose result-type is different than the type of the
-                arguments (e.g. demotion, promotion), vectype will be reset 
-                appropriately (later).  Note that we have to visit the smallest 
-                datatype in this function, because that determines the VF.  
-                If the smallest datatype in the loop is present only as the 
-                rhs of a promotion operation - we'd miss it here.
-                However, in such a case, that a variable of this datatype
-                does not appear in the lhs anywhere in the loop, it shouldn't
-                affect the vectorization factor.   */
-             scalar_type = TREE_TYPE (GIMPLE_STMT_OPERAND (stmt, 0));
-
+             scalar_type = vect_get_smallest_scalar_type (stmt, &dummy, 
+                                                           &dummy);
              if (vect_print_dump_info (REPORT_DETAILS))
                {
                  fprintf (vect_dump, "get vectype for scalar type:  ");
@@ -283,6 +299,30 @@ vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
 }
 
 
+/* SLP costs are calculated according to the SLP instance unrolling factor
+   (i.e., the number of created vector stmts depends on the unrolling factor).
+   However, the actual number of vector stmts for every SLP node depends on
+   VF, which is set later in vect_analyze_operations ().  Hence, SLP costs
+   should be updated.  In this function we assume that the inside costs
+   calculated in vect_model_xxx_cost are linear in ncopies.  */
+
+static void
+vect_update_slp_costs_according_to_vf (loop_vec_info loop_vinfo)
+{
+  unsigned int i, vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
+  VEC (slp_instance, heap) *slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
+  slp_instance instance;
+
+  if (vect_print_dump_info (REPORT_SLP))
+    fprintf (vect_dump, "=== vect_update_slp_costs_according_to_vf ===");
+
+  for (i = 0; VEC_iterate (slp_instance, slp_instances, i, instance); i++)
+    /* We assume that costs are linear in ncopies.  */
+    SLP_INSTANCE_INSIDE_OF_LOOP_COST (instance) *= vf 
+      / SLP_INSTANCE_UNROLLING_FACTOR (instance);        
+}
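
A worked instance of the update above, with hypothetical numbers: an SLP instance built with an unrolling factor of 2 whose inside-of-loop cost was computed as 10 is, once the loop VF is later fixed at 8, executed 8 / 2 = 4 times per vector iteration, so its cost becomes 10 * (8 / 2) = 40 under the linearity assumption.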
+
+
 /* Function vect_analyze_operations.
 
    Scan the loop stmts and make sure they are all vectorizable.  */
@@ -293,16 +333,17 @@ vect_analyze_operations (loop_vec_info loop_vinfo)
   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
   int nbbs = loop->num_nodes;
-  block_stmt_iterator si;
+  gimple_stmt_iterator si;
   unsigned int vectorization_factor = 0;
   int i;
   bool ok;
-  tree phi;
+  gimple phi;
   stmt_vec_info stmt_info;
   bool need_to_vectorize = false;
   int min_profitable_iters;
   int min_scalar_loop_bound;
   unsigned int th;
+  bool only_slp_in_loop = true;
 
   if (vect_print_dump_info (REPORT_DETAILS))
     fprintf (vect_dump, "=== vect_analyze_operations ===");
@@ -314,15 +355,34 @@ vect_analyze_operations (loop_vec_info loop_vinfo)
     {
       basic_block bb = bbs[i];
 
-      for (phi = phi_nodes (bb); phi; phi = PHI_CHAIN (phi))
+      for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
         {
+         phi = gsi_stmt (si);
          ok = true;
 
          stmt_info = vinfo_for_stmt (phi);
          if (vect_print_dump_info (REPORT_DETAILS))
            {
              fprintf (vect_dump, "examining phi: ");
-             print_generic_expr (vect_dump, phi, TDF_SLIM);
+             print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
+           }
+
+         if (! is_loop_header_bb_p (bb))
+           {
+             /* Inner-loop loop-closed exit phi in outer-loop vectorization
+                (i.e., a phi in the tail of the outer-loop).
+                FORNOW: we currently don't support the case that these phis
+                are not used in the outer-loop, because this case requires
+                us to actually do something here.  */
+             if (!STMT_VINFO_RELEVANT_P (stmt_info) 
+                 || STMT_VINFO_LIVE_P (stmt_info))
+               {
+                 if (vect_print_dump_info (REPORT_DETAILS))
+                   fprintf (vect_dump, 
+                            "Unsupported loop-closed phi in outer-loop.");
+                 return false;
+               }
+             continue;
            }
 
          gcc_assert (stmt_info);
@@ -357,22 +417,22 @@ vect_analyze_operations (loop_vec_info loop_vinfo)
                {
                  fprintf (vect_dump,
                           "not vectorized: relevant phi not supported: ");
-                 print_generic_expr (vect_dump, phi, TDF_SLIM);
+                 print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
                }
              return false;
            }
        }
 
-      for (si = bsi_start (bb); !bsi_end_p (si); bsi_next (&si))
+      for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
        {
-         tree stmt = bsi_stmt (si);
+         gimple stmt = gsi_stmt (si);
          stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
-         enum vect_def_type relevance = STMT_VINFO_RELEVANT (stmt_info);
+         enum vect_relevant relevance = STMT_VINFO_RELEVANT (stmt_info);
 
          if (vect_print_dump_info (REPORT_DETAILS))
            {
              fprintf (vect_dump, "==> examining statement: ");
-             print_generic_expr (vect_dump, stmt, TDF_SLIM);
+             print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
            }
 
          gcc_assert (stmt_info);
@@ -398,7 +458,9 @@ vect_analyze_operations (loop_vec_info loop_vinfo)
              break;
        
            case vect_reduction_def:
-             gcc_assert (relevance == vect_unused_in_loop);
+             gcc_assert (relevance == vect_used_in_outer
+                         || relevance == vect_used_in_outer_by_reduction
+                         || relevance == vect_unused_in_loop);
              break;    
 
            case vect_induction_def:
@@ -411,38 +473,76 @@ vect_analyze_operations (loop_vec_info loop_vinfo)
 
          if (STMT_VINFO_RELEVANT_P (stmt_info))
            {
-             gcc_assert (GIMPLE_STMT_P (stmt)
-                         || !VECTOR_MODE_P (TYPE_MODE (TREE_TYPE (stmt))));
+             gcc_assert (!VECTOR_MODE_P (TYPE_MODE (gimple_expr_type (stmt))));
              gcc_assert (STMT_VINFO_VECTYPE (stmt_info));
              need_to_vectorize = true;
            }
 
-         ok = (vectorizable_type_promotion (stmt, NULL, NULL)
-               || vectorizable_type_demotion (stmt, NULL, NULL)
-               || vectorizable_conversion (stmt, NULL, NULL)
-               || vectorizable_operation (stmt, NULL, NULL)
-               || vectorizable_assignment (stmt, NULL, NULL)
-               || vectorizable_load (stmt, NULL, NULL)
+         ok = true;
+         if (STMT_VINFO_RELEVANT_P (stmt_info)
+             || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
+           ok = (vectorizable_type_promotion (stmt, NULL, NULL, NULL)
+               || vectorizable_type_demotion (stmt, NULL, NULL, NULL)
+               || vectorizable_conversion (stmt, NULL, NULL, NULL)
+               || vectorizable_operation (stmt, NULL, NULL, NULL)
+               || vectorizable_assignment (stmt, NULL, NULL, NULL)
+               || vectorizable_load (stmt, NULL, NULL, NULL, NULL)
                || vectorizable_call (stmt, NULL, NULL)
-               || vectorizable_store (stmt, NULL, NULL)
+               || vectorizable_store (stmt, NULL, NULL, NULL)
                || vectorizable_condition (stmt, NULL, NULL)
                || vectorizable_reduction (stmt, NULL, NULL));
 
+         if (!ok)
+           {
+             if (vect_print_dump_info (REPORT_UNVECTORIZED_LOOPS))
+               {
+                 fprintf (vect_dump, "not vectorized: relevant stmt not ");
+                 fprintf (vect_dump, "supported: ");
+                 print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
+               }
+             return false;
+           }
+
          /* Stmts that are (also) "live" (i.e. - that are used out of the loop)
             need extra handling, except for vectorizable reductions.  */
          if (STMT_VINFO_LIVE_P (stmt_info)
              && STMT_VINFO_TYPE (stmt_info) != reduc_vec_info_type) 
-           ok |= vectorizable_live_operation (stmt, NULL, NULL);
+           ok = vectorizable_live_operation (stmt, NULL, NULL);
 
          if (!ok)
            {
              if (vect_print_dump_info (REPORT_UNVECTORIZED_LOOPS))
                {
-                 fprintf (vect_dump, "not vectorized: stmt not supported: ");
-                 print_generic_expr (vect_dump, stmt, TDF_SLIM);
+                 fprintf (vect_dump, "not vectorized: live stmt not ");
+                 fprintf (vect_dump, "supported: ");
+                 print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
                }
              return false;
            }   
+
+         if (!PURE_SLP_STMT (stmt_info))
+           {
+             /* STMT needs loop-based vectorization.  */
+             only_slp_in_loop = false;
+
+             /* Groups of strided accesses whose size is not a power of 2 are
+                not yet vectorizable using loop-vectorization.  Therefore, if
+                this stmt feeds non-SLP-able stmts (i.e., this stmt has to be
+                both SLPed and loop-based vectorized), the loop cannot be
+                vectorized.  */
+             if (STMT_VINFO_STRIDED_ACCESS (stmt_info)
+                 && exact_log2 (DR_GROUP_SIZE (vinfo_for_stmt (
+                                 DR_GROUP_FIRST_DR (stmt_info)))) == -1)
+               {
+                 if (vect_print_dump_info (REPORT_DETAILS))
+                   {
+                     fprintf (vect_dump, "not vectorized: the size of group "
+                              "of strided accesses is not a power of 2");
+                     print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
+                   }
+                 return false;
+               }
+           }
        } /* stmts in bb */
     } /* bbs */
 
@@ -462,6 +562,18 @@ vect_analyze_operations (loop_vec_info loop_vinfo)
       return false;
     }
 
+  /* If all the stmts in the loop can be SLPed, we perform only SLP, and the
+     vectorization factor of the loop is the unrolling factor required by the
+     SLP instances.  If that unrolling factor is 1, we say that we perform
+     pure SLP on the loop; cross-iteration parallelism is not exploited.  */
+  if (only_slp_in_loop)
+    vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
+  else
+    vectorization_factor = least_common_multiple (vectorization_factor,
+                               LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
+  
+  LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
+
   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
       && vect_print_dump_info (REPORT_DETAILS))
     fprintf (vect_dump,
@@ -481,7 +593,12 @@ vect_analyze_operations (loop_vec_info loop_vinfo)
 
   /* Analyze cost. Decide if worth while to vectorize.  */
 
+  /* Once VF is set, SLP costs should be updated since the number of created
+     vector stmts depends on VF.  */
+  vect_update_slp_costs_according_to_vf (loop_vinfo);
+
   min_profitable_iters = vect_estimate_min_profitable_iters (loop_vinfo);
+  LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo) = min_profitable_iters;
 
   if (min_profitable_iters < 0)
     {
@@ -493,8 +610,8 @@ vect_analyze_operations (loop_vec_info loop_vinfo)
       return false;
     }
 
-  min_scalar_loop_bound = (PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND))
-                          * vectorization_factor;
+  min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
+                           * vectorization_factor) - 1);
 
   /* Use the cost model only if it is more conservative than user specified
      threshold.  */
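
With hypothetical numbers, the two bounds combine as follows: PARAM_MIN_VECT_LOOP_BOUND == 2 and a vectorization factor of 4 give min_scalar_loop_bound = 2 * 4 - 1 = 7; if the cost model returns min_profitable_iters == 5, the more conservative value 7 is used as the threshold th, and with the <= comparison below a loop known to execute at most 7 iterations is rejected while one with 8 or more may be vectorized.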
@@ -506,7 +623,7 @@ vect_analyze_operations (loop_vec_info loop_vinfo)
     th = (unsigned) min_profitable_iters;
 
   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
-      && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
+      && LOOP_VINFO_INT_NITERS (loop_vinfo) <= th)
     {
       if (vect_print_dump_info (REPORT_UNVECTORIZED_LOOPS))          
         fprintf (vect_dump, "not vectorized: vectorization not "
@@ -550,7 +667,7 @@ vect_analyze_operations (loop_vec_info loop_vinfo)
    used in STMT for anything other than indexing an array.  */
 
 static bool
-exist_non_indexing_operands_for_use_p (tree use, tree stmt)
+exist_non_indexing_operands_for_use_p (tree use, gimple stmt)
 {
   tree operand;
   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
@@ -574,10 +691,12 @@ exist_non_indexing_operands_for_use_p (tree use, tree stmt)
      Therefore, all we need to check is if STMT falls into the
      first case, and whether var corresponds to USE.  */
  
-  if (TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 0)) == SSA_NAME)
+  if (TREE_CODE (gimple_assign_lhs (stmt)) == SSA_NAME)
     return false;
 
-  operand = GIMPLE_STMT_OPERAND (stmt, 1);
+  if (!gimple_assign_copy_p (stmt))
+    return false;
+  operand = gimple_assign_rhs1 (stmt);
 
   if (TREE_CODE (operand) != SSA_NAME)
     return false;
@@ -589,60 +708,28 @@ exist_non_indexing_operands_for_use_p (tree use, tree stmt)
 }
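
A hypothetical illustration of the distinction this predicate draws for a use of I:

   void
   example (int *a, int i)
   {
     a[i] = 0;   /* I is used only to index A: no non-indexing operand.  */
     a[i] = i;   /* I is also the stored value: a non-indexing use.  */
   }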
 
 
-/* Function vect_analyze_scalar_cycles.
-
-   Examine the cross iteration def-use cycles of scalar variables, by
-   analyzing the loop (scalar) PHIs; Classify each cycle as one of the
-   following: invariant, induction, reduction, unknown.
-   
-   Some forms of scalar cycles are not yet supported.
-
-   Example1: reduction: (unsupported yet)
-
-              loop1:
-              for (i=0; i<N; i++)
-                 sum += a[i];
-
-   Example2: induction: (unsupported yet)
-
-              loop2:
-              for (i=0; i<N; i++)
-                 a[i] = i;
-
-   Note: the following loop *is* vectorizable:
-
-              loop3:
-              for (i=0; i<N; i++)
-                 a[i] = b[i];
-
-         even though it has a def-use cycle caused by the induction variable i:
-
-              loop: i_2 = PHI (i_0, i_1)
-                    a[i_2] = ...;
-                    i_1 = i_2 + 1;
-                    GOTO loop;
+/* Function vect_analyze_scalar_cycles_1.
 
-         because the def-use cycle in loop3 is considered "not relevant" - i.e.,
-         it does not need to be vectorized because it is only used for array
-         indexing (see 'mark_stmts_to_be_vectorized'). The def-use cycle in
-         loop2 on the other hand is relevant (it is being written to memory).
-*/
+   Examine the cross iteration def-use cycles of scalar variables
+   in LOOP. LOOP_VINFO represents the loop that is now being
+   considered for vectorization (can be LOOP, or an outer-loop
+   enclosing LOOP).  */
 
 static void
-vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
+vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
 {
-  tree phi;
-  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
   basic_block bb = loop->header;
   tree dumy;
-  VEC(tree,heap) *worklist = VEC_alloc (tree, heap, 64);
+  VEC(gimple,heap) *worklist = VEC_alloc (gimple, heap, 64);
+  gimple_stmt_iterator gsi;
 
   if (vect_print_dump_info (REPORT_DETAILS))
     fprintf (vect_dump, "=== vect_analyze_scalar_cycles ===");
 
   /* First - identify all inductions.  */
-  for (phi = phi_nodes (bb); phi; phi = PHI_CHAIN (phi))
+  for (gsi = gsi_start_phis  (bb); !gsi_end_p (gsi); gsi_next (&gsi))
     {
+      gimple phi = gsi_stmt (gsi);
       tree access_fn = NULL;
       tree def = PHI_RESULT (phi);
       stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
@@ -650,7 +737,7 @@ vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
       if (vect_print_dump_info (REPORT_DETAILS))
        {
          fprintf (vect_dump, "Analyze phi: ");
-         print_generic_expr (vect_dump, phi, TDF_SLIM);
+         print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
        }
 
       /* Skip virtual phi's. The data dependences that are associated with
@@ -671,7 +758,7 @@ vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
       if (!access_fn
          || !vect_is_simple_iv_evolution (loop->num, access_fn, &dumy, &dumy)) 
        {
-         VEC_safe_push (tree, heap, worklist, phi);      
+         VEC_safe_push (gimple, heap, worklist, phi);    
          continue;
        }
 
@@ -682,23 +769,23 @@ vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
 
 
   /* Second - identify all reductions.  */
-  while (VEC_length (tree, worklist) > 0)
+  while (VEC_length (gimple, worklist) > 0)
     {
-      tree phi = VEC_pop (tree, worklist);
+      gimple phi = VEC_pop (gimple, worklist);
       tree def = PHI_RESULT (phi);
       stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
-      tree reduc_stmt;
+      gimple reduc_stmt;
 
       if (vect_print_dump_info (REPORT_DETAILS))
         { 
           fprintf (vect_dump, "Analyze phi: ");
-          print_generic_expr (vect_dump, phi, TDF_SLIM);
+          print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
         }
 
       gcc_assert (is_gimple_reg (SSA_NAME_VAR (def)));
       gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
 
-      reduc_stmt = vect_is_simple_reduction (loop, phi);
+      reduc_stmt = vect_is_simple_reduction (loop_vinfo, phi);
       if (reduc_stmt)
         {
           if (vect_print_dump_info (REPORT_DETAILS))
@@ -712,11 +799,78 @@ vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
           fprintf (vect_dump, "Unknown def-use cycle pattern.");
     }
 
-  VEC_free (tree, heap, worklist);
+  VEC_free (gimple, heap, worklist);
   return;
 }
 
 
+/* Function vect_analyze_scalar_cycles.
+
+   Examine the cross iteration def-use cycles of scalar variables, by
+   analyzing the loop-header PHIs of scalar variables; Classify each 
+   cycle as one of the following: invariant, induction, reduction, unknown.
+   We do that for the loop represented by LOOP_VINFO, and also for its
+   inner-loop, if it exists.
+   Examples for scalar cycles:
+
+   Example1: reduction:
+
+              loop1:
+              for (i=0; i<N; i++)
+                 sum += a[i];
+
+   Example2: induction:
+
+              loop2:
+              for (i=0; i<N; i++)
+                 a[i] = i;  */
+
+static void
+vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
+{
+  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
+
+  vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
+
+  /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
+     Reductions in such an inner-loop therefore have different properties than
+     the reductions in the nest that gets vectorized:
+     1. When vectorized, they are executed in the same order as in the original
+        scalar loop, so we can't change the order of computation when
+        vectorizing them.
+     2. FIXME: Inner-loop reductions can be used in the inner-loop, so the 
+        current checks are too strict.  */
+
+  if (loop->inner)
+    vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
+}
+
+
+/* Find the place of the data-ref in STMT in the interleaving chain that starts
+   from FIRST_STMT. Return -1 if the data-ref is not a part of the chain.  */
+
+static int 
+vect_get_place_in_interleaving_chain (gimple stmt, gimple first_stmt)
+{
+  gimple next_stmt = first_stmt;
+  int result = 0;
+
+  if (first_stmt != DR_GROUP_FIRST_DR (vinfo_for_stmt (stmt)))
+    return -1;
+
+  while (next_stmt && next_stmt != stmt)
+    {
+      result++;
+      next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt));
+    }
+
+  if (next_stmt)
+    return result;
+  else
+    return -1;
+}
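
For example (hypothetical chain): for strided loads of b[4*i], b[4*i + 1] and b[4*i + 2] whose chain starts at the b[4*i] load, the function returns 0, 1 and 2 respectively, and -1 for a stmt whose DR_GROUP_FIRST_DR is a different chain head.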
+
+
 /* Function vect_insert_into_interleaving_chain.
 
    Insert DRA into the interleaving chain of DRB according to DRA's INIT.  */
@@ -725,7 +879,8 @@ static void
 vect_insert_into_interleaving_chain (struct data_reference *dra,
                                     struct data_reference *drb)
 {
-  tree prev, next, next_init;
+  gimple prev, next;
+  tree next_init;
   stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra)); 
   stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
 
@@ -747,7 +902,7 @@ vect_insert_into_interleaving_chain (struct data_reference *dra,
 
   /* We got to the end of the list. Insert here.  */
   DR_GROUP_NEXT_DR (vinfo_for_stmt (prev)) = DR_STMT (dra);
-  DR_GROUP_NEXT_DR (stmtinfo_a) = NULL_TREE;
+  DR_GROUP_NEXT_DR (stmtinfo_a) = NULL;
 }
 
 
@@ -780,8 +935,10 @@ vect_update_interleaving_chain (struct data_reference *drb,
 {
   stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra)); 
   stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
-  tree next_init, init_dra_chain, init_drb_chain, first_a, first_b;
-  tree node, prev, next, node_init, first_stmt;
+  tree next_init, init_dra_chain, init_drb_chain;
+  gimple first_a, first_b;
+  tree node_init;
+  gimple node, prev, next, first_stmt;
 
   /* 1. New stmts - both DRA and DRB are not a part of any chain.   */
   if (!DR_GROUP_FIRST_DR (stmtinfo_a) && !DR_GROUP_FIRST_DR (stmtinfo_b))
@@ -804,10 +961,10 @@ vect_update_interleaving_chain (struct data_reference *drb,
   /* 3. DRA is a part of a chain and DRB is not.  */  
   if (DR_GROUP_FIRST_DR (stmtinfo_a) && !DR_GROUP_FIRST_DR (stmtinfo_b))
     {
-      tree old_first_stmt = DR_GROUP_FIRST_DR (stmtinfo_a);
+      gimple old_first_stmt = DR_GROUP_FIRST_DR (stmtinfo_a);
       tree init_old = DR_INIT (STMT_VINFO_DATA_REF (vinfo_for_stmt (
                                                              old_first_stmt)));
-      tree tmp;
+      gimple tmp;
 
       if (tree_int_cst_compare (init_old, DR_INIT (drb)) > 0)
        {
@@ -883,7 +1040,7 @@ vect_update_interleaving_chain (struct data_reference *drb,
        {
          /* We got to the end of the list. Insert here.  */
          DR_GROUP_NEXT_DR (vinfo_for_stmt (prev)) = node;
-         DR_GROUP_NEXT_DR (vinfo_for_stmt (node)) = NULL_TREE;
+         DR_GROUP_NEXT_DR (vinfo_for_stmt (node)) = NULL;
          prev = node;
        }                       
       DR_GROUP_FIRST_DR (vinfo_for_stmt (node)) = first_stmt;
@@ -952,7 +1109,9 @@ vect_check_interleaving (struct data_reference *dra,
   type_size_b = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
 
   if (type_size_a != type_size_b
-      || tree_int_cst_compare (DR_STEP (dra), DR_STEP (drb)))
+      || tree_int_cst_compare (DR_STEP (dra), DR_STEP (drb))
+      || !types_compatible_p (TREE_TYPE (DR_REF (dra)), 
+                              TREE_TYPE (DR_REF (drb))))
     return;
 
   init_a = TREE_INT_CST_LOW (DR_INIT (dra));
@@ -1005,11 +1164,85 @@ vect_check_interleaving (struct data_reference *dra,
     }
 }
 
+/* Check if the data references pointed to by DR_I and DR_J are the same or
+   belong to the same interleaving group.  Return FALSE if the drs are
+   different, otherwise return TRUE.  */
+
+static bool
+vect_same_range_drs (data_reference_p dr_i, data_reference_p dr_j)
+{
+  gimple stmt_i = DR_STMT (dr_i);
+  gimple stmt_j = DR_STMT (dr_j);
+
+  if (operand_equal_p (DR_REF (dr_i), DR_REF (dr_j), 0)
+      || (DR_GROUP_FIRST_DR (vinfo_for_stmt (stmt_i))
+           && DR_GROUP_FIRST_DR (vinfo_for_stmt (stmt_j))
+           && (DR_GROUP_FIRST_DR (vinfo_for_stmt (stmt_i))
+               == DR_GROUP_FIRST_DR (vinfo_for_stmt (stmt_j)))))
+    return true;
+  else
+    return false;
+}
+
+/* If address ranges represented by DDR_I and DDR_J are equal,
+   return TRUE, otherwise return FALSE.  */
+
+static bool
+vect_vfa_range_equal (ddr_p ddr_i, ddr_p ddr_j)
+{
+  if ((vect_same_range_drs (DDR_A (ddr_i), DDR_A (ddr_j))
+       && vect_same_range_drs (DDR_B (ddr_i), DDR_B (ddr_j)))
+      || (vect_same_range_drs (DDR_A (ddr_i), DDR_B (ddr_j))
+         && vect_same_range_drs (DDR_B (ddr_i), DDR_A (ddr_j))))
+    return true;
+  else
+    return false;
+}
+
+/* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
+   tested at run-time.  Return TRUE if DDR was successfully inserted.
+   Return FALSE if versioning is not supported.  */
+
+static bool
+vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
+{
+  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
+
+  if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
+    return false;
+
+  if (vect_print_dump_info (REPORT_DR_DETAILS))
+    {
+      fprintf (vect_dump, "mark for run-time aliasing test between ");
+      print_generic_expr (vect_dump, DR_REF (DDR_A (ddr)), TDF_SLIM);
+      fprintf (vect_dump, " and ");
+      print_generic_expr (vect_dump, DR_REF (DDR_B (ddr)), TDF_SLIM);
+    }
+
+  if (optimize_loop_nest_for_size_p (loop))
+    {
+      if (vect_print_dump_info (REPORT_DR_DETAILS))
+       fprintf (vect_dump, "versioning not supported when optimizing for size.");
+      return false;
+    }
+
+  /* FORNOW: We don't support versioning with outer-loop vectorization.  */
+  if (loop->inner)
+    {
+      if (vect_print_dump_info (REPORT_DR_DETAILS))
+       fprintf (vect_dump, "versioning not yet supported for outer-loops.");
+      return false;
+    }
+
+  VEC_safe_push (ddr_p, heap, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo), ddr);
+  return true;
+}
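
The ddrs collected here later drive loop versioning; schematically (a sketch of the transformation's effect, not code produced by this patch), the guard amounts to a pairwise overlap test of the address ranges the two data-refs touch:

   /* Schematic only: A and B each cover N ints.  */
   void
   example (int *a, int *b, int n)
   {
     int i;
     if (a + n <= b || b + n <= a)
       for (i = 0; i < n; i++)   /* No overlap: vectorized version.  */
         a[i] = b[i] + 1;
     else
       for (i = 0; i < n; i++)   /* Possible aliasing: scalar fallback.  */
         a[i] = b[i] + 1;
   }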
 
 /* Function vect_analyze_data_ref_dependence.
 
    Return TRUE if there (might) exist a dependence between a memory-reference
-   DRA and a memory-reference DRB.  */
+   DRA and a memory-reference DRB.  When versioning for alias can check the
+   dependence at run-time instead, return FALSE.  */
       
 static bool
 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
@@ -1039,27 +1272,29 @@ vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
   
   if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
     {
-      if (vect_print_dump_info (REPORT_UNVECTORIZED_LOOPS))
+      if (vect_print_dump_info (REPORT_DR_DETAILS))
         {
           fprintf (vect_dump,
-                   "not vectorized: can't determine dependence between ");
+                   "versioning for alias required: can't determine dependence between ");
           print_generic_expr (vect_dump, DR_REF (dra), TDF_SLIM);
           fprintf (vect_dump, " and ");
           print_generic_expr (vect_dump, DR_REF (drb), TDF_SLIM);
         }
-      return true;
+      /* Add to list of ddrs that need to be tested at run-time.  */
+      return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
     }
 
   if (DDR_NUM_DIST_VECTS (ddr) == 0)
     {
-      if (vect_print_dump_info (REPORT_UNVECTORIZED_LOOPS))
+      if (vect_print_dump_info (REPORT_DR_DETAILS))
         {
-          fprintf (vect_dump, "not vectorized: bad dist vector for ");
+          fprintf (vect_dump, "versioning for alias required: bad dist vector for ");
           print_generic_expr (vect_dump, DR_REF (dra), TDF_SLIM);
           fprintf (vect_dump, " and ");
           print_generic_expr (vect_dump, DR_REF (drb), TDF_SLIM);
         }
-      return true;
+      /* Add to list of ddrs that need to be tested at run-time.  */
+      return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
     }    
 
   loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
@@ -1099,19 +1334,23 @@ vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
           continue;
        }
 
-      if (abs (dist) >= vectorization_factor)
+      if (abs (dist) >= vectorization_factor 
+          || (dist > 0 && DDR_REVERSED_P (ddr)))
        {
-         /* Dependence distance does not create dependence, as far as vectorization
-            is concerned, in this case.  */
+         /* Dependence distance does not create dependence, as far as
+            vectorization is concerned, in this case.  If DDR_REVERSED_P, the
+            order of the data-refs in DDR was reversed (to make the distance
+            vector positive), and the actual distance is negative.  */
          if (vect_print_dump_info (REPORT_DR_DETAILS))
-           fprintf (vect_dump, "dependence distance >= VF.");
+           fprintf (vect_dump, "dependence distance >= VF or negative.");
          continue;
        }
 
       if (vect_print_dump_info (REPORT_UNVECTORIZED_LOOPS))
        {
          fprintf (vect_dump,
-                  "not vectorized: possible dependence between data-refs ");
+                  "not vectorized, possible dependence "
+                  "between data-refs ");
          print_generic_expr (vect_dump, DR_REF (dra), TDF_SLIM);
          fprintf (vect_dump, " and ");
          print_generic_expr (vect_dump, DR_REF (drb), TDF_SLIM);
@@ -1123,7 +1362,6 @@ vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
   return false;
 }
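
A concrete instance of the distance test above (hypothetical loop): a[i + 5] = a[i] + 1 carries a dependence of distance 5; with a vectorization factor of 4, each vector iteration reads a[i..i+3] and writes a[i+5..i+8], so abs (dist) >= VF holds and the dependence is harmless, whereas VF = 8 would put the same location in both the read and write ranges of one vector iteration and the loop is rejected.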
 
-
 /* Function vect_analyze_data_ref_dependences.
           
    Examine all the data references in the loop, and make sure there do not
@@ -1133,7 +1371,7 @@ static bool
 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo)
 {
   unsigned int i;
-  VEC (ddr_p, heap) *ddrs = LOOP_VINFO_DDRS (loop_vinfo);
+  VEC (ddr_p, heap) * ddrs = LOOP_VINFO_DDRS (loop_vinfo);
   struct data_dependence_relation *ddr;
 
   if (vect_print_dump_info (REPORT_DETAILS)) 
@@ -1162,8 +1400,10 @@ vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo)
 static bool
 vect_compute_data_ref_alignment (struct data_reference *dr)
 {
-  tree stmt = DR_STMT (dr);
+  gimple stmt = DR_STMT (dr);
   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);  
+  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
+  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
   tree ref = DR_REF (dr);
   tree vectype;
   tree base, base_addr;
@@ -1180,13 +1420,42 @@ vect_compute_data_ref_alignment (struct data_reference *dr)
   misalign = DR_INIT (dr);
   aligned_to = DR_ALIGNED_TO (dr);
   base_addr = DR_BASE_ADDRESS (dr);
-  base = build_fold_indirect_ref (base_addr);
   vectype = STMT_VINFO_VECTYPE (stmt_info);
+
+  /* In case the dataref is in an inner-loop of the loop that is being
+     vectorized (LOOP), we use the base and misalignment information
+     relative to the outer-loop (LOOP).  This is ok only if the misalignment
+     stays the same throughout the execution of the inner-loop, which is why
+     we have to check that the stride of the dataref in the inner-loop is
+     evenly divisible by the vector size.  */
+  if (nested_in_vect_loop_p (loop, stmt))
+    {
+      tree step = DR_STEP (dr);
+      HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
+    
+      if (dr_step % GET_MODE_SIZE (TYPE_MODE (vectype)) == 0)
+        {
+          if (vect_print_dump_info (REPORT_ALIGNMENT))
+            fprintf (vect_dump, "inner step divides the vector-size.");
+         misalign = STMT_VINFO_DR_INIT (stmt_info);
+         aligned_to = STMT_VINFO_DR_ALIGNED_TO (stmt_info);
+         base_addr = STMT_VINFO_DR_BASE_ADDRESS (stmt_info);
+        }
+      else
+       {
+         if (vect_print_dump_info (REPORT_ALIGNMENT))
+           fprintf (vect_dump, "inner step doesn't divide the vector-size.");
+         misalign = NULL_TREE;
+       }
+    }
+
+  base = build_fold_indirect_ref (base_addr);
   alignment = ssize_int (TYPE_ALIGN (vectype)/BITS_PER_UNIT);
 
-  if (tree_int_cst_compare (aligned_to, alignment) < 0)
+  if ((aligned_to && tree_int_cst_compare (aligned_to, alignment) < 0)
+      || !misalign)
     {
-      if (vect_print_dump_info (REPORT_DETAILS))
+      if (vect_print_dump_info (REPORT_ALIGNMENT))
        {
          fprintf (vect_dump, "Unknown alignment for access: ");
          print_generic_expr (vect_dump, base, TDF_SLIM);
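
A worked instance of the inner-loop divisibility check added above, with hypothetical numbers: for a V4SI vectype the vector size is 16 bytes, so an inner-loop DR_STEP of 32 bytes satisfies 32 % 16 == 0 and the outer-loop base and misalignment can be reused, while a step of 12 bytes would leave the misalignment changing from one inner-loop iteration to the next, and it is set to unknown.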
@@ -1299,9 +1568,9 @@ vect_update_misalignment_for_peel (struct data_reference *dr,
 
  /* For interleaved data accesses the step in the loop must be multiplied by
      the size of the interleaving group.  */
-  if (DR_GROUP_FIRST_DR (stmt_info))
+  if (STMT_VINFO_STRIDED_ACCESS (stmt_info))
     dr_size *= DR_GROUP_SIZE (vinfo_for_stmt (DR_GROUP_FIRST_DR (stmt_info)));
-  if (DR_GROUP_FIRST_DR (peel_stmt_info))
+  if (STMT_VINFO_STRIDED_ACCESS (peel_stmt_info))
     dr_peel_size *= DR_GROUP_SIZE (peel_stmt_info);
 
   /* It can be assumed that the data refs with the same alignment as dr_peel
@@ -1322,8 +1591,9 @@ vect_update_misalignment_for_peel (struct data_reference *dr,
       && known_alignment_for_access_p (dr_peel))
     {
       int misal = DR_MISALIGNMENT (dr);
+      tree vectype = STMT_VINFO_VECTYPE (stmt_info);
       misal += npeel * dr_size;
-      misal %= UNITS_PER_SIMD_WORD;
+      misal %= GET_MODE_SIZE (TYPE_MODE (vectype));
       SET_DR_MISALIGNMENT (dr, misal);
       return;
     }
@@ -1349,11 +1619,11 @@ vect_verify_datarefs_alignment (loop_vec_info loop_vinfo)
 
   for (i = 0; VEC_iterate (data_reference_p, datarefs, i, dr); i++)
     {
-      tree stmt = DR_STMT (dr);
+      gimple stmt = DR_STMT (dr);
       stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
 
       /* For interleaving, only the alignment of the first access matters.  */
-      if (DR_GROUP_FIRST_DR (stmt_info)
+      if (STMT_VINFO_STRIDED_ACCESS (stmt_info)
           && DR_GROUP_FIRST_DR (stmt_info) != stmt)
         continue;
 
@@ -1379,6 +1649,76 @@ vect_verify_datarefs_alignment (loop_vec_info loop_vinfo)
 }
 
 
+/* Function vector_alignment_reachable_p
+
+   Return true if vector alignment for DR is reachable by peeling
+   a few loop iterations.  Return false otherwise.  */
+
+static bool
+vector_alignment_reachable_p (struct data_reference *dr)
+{
+  gimple stmt = DR_STMT (dr);
+  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
+  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
+
+  if (STMT_VINFO_STRIDED_ACCESS (stmt_info))
+    {
+      /* For interleaved accesses we peel only if the number of iterations in
+        the prolog loop (VF - misalignment) is a multiple of the
+        number of interleaved accesses.  */
+      int elem_size, mis_in_elements;
+      int nelements = TYPE_VECTOR_SUBPARTS (vectype);
+
+      /* FORNOW: handle only known alignment.  */
+      if (!known_alignment_for_access_p (dr))
+       return false;
+
+      elem_size = GET_MODE_SIZE (TYPE_MODE (vectype)) / nelements;
+      mis_in_elements = DR_MISALIGNMENT (dr) / elem_size;
+
+      if ((nelements - mis_in_elements) % DR_GROUP_SIZE (stmt_info))
+       return false;
+    }
+
+  /* If misalignment is known at the compile time then allow peeling
+     only if natural alignment is reachable through peeling.  */
+  if (known_alignment_for_access_p (dr) && !aligned_access_p (dr))
+    {
+      HOST_WIDE_INT elmsize = 
+               int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
+      if (vect_print_dump_info (REPORT_DETAILS))
+       {
+         fprintf (vect_dump, "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
+         fprintf (vect_dump, ". misalignment = %d. ", DR_MISALIGNMENT (dr));
+       }
+      if (DR_MISALIGNMENT (dr) % elmsize)
+       {
+         if (vect_print_dump_info (REPORT_DETAILS))
+           fprintf (vect_dump, "data size does not divide the misalignment.\n");
+         return false;
+       }
+    }
+
+  if (!known_alignment_for_access_p (dr))
+    {
+      tree type = (TREE_TYPE (DR_REF (dr)));
+      tree ba = DR_BASE_OBJECT (dr);
+      bool is_packed = false;
+
+      if (ba)
+       is_packed = contains_packed_reference (ba);
+
+      if (vect_print_dump_info (REPORT_DETAILS))
+       fprintf (vect_dump, "Unknown misalignment, is_packed = %d",is_packed);
+      if (targetm.vectorize.vector_alignment_reachable (type, is_packed))
+       return true;
+      else
+       return false;
+    }
+
+  return true;
+}
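
Hypothetical numbers for the interleaved case: with nelements = 4, a misalignment of 2 elements and DR_GROUP_SIZE = 2, the prologue peels 4 - 2 = 2 elements-worth of iterations, and 2 % 2 == 0, so peeling is allowed; with a group size of 3, (4 - 2) % 3 != 0 and the function returns false.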
+
 /* Function vect_enhance_data_refs_alignment
 
    This pass will use loop versioning and loop peeling in order to enhance
@@ -1482,8 +1822,9 @@ vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
   bool do_peeling = false;
   bool do_versioning = false;
   bool stat;
-  tree stmt;
+  gimple stmt;
   stmt_vec_info stmt_info;
+  int vect_versioning_for_alias_required;
 
   if (vect_print_dump_info (REPORT_DETAILS))
     fprintf (vect_dump, "=== vect_enhance_data_refs_alignment ===");
@@ -1534,46 +1875,30 @@ vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
 
       /* For interleaving, only the alignment of the first access
          matters.  */
-      if (DR_GROUP_FIRST_DR (stmt_info)
+      if (STMT_VINFO_STRIDED_ACCESS (stmt_info)
           && DR_GROUP_FIRST_DR (stmt_info) != stmt)
         continue;
 
       if (!DR_IS_READ (dr) && !aligned_access_p (dr))
         {
-         if (DR_GROUP_FIRST_DR (stmt_info))
-           {
-             /* For interleaved access we peel only if number of iterations in
-                the prolog loop ({VF - misalignment}), is a multiple of the
-                number of the interleaved accesses.  */
-             int elem_size, mis_in_elements;
-             tree vectype = STMT_VINFO_VECTYPE (stmt_info);
-             int nelements = TYPE_VECTOR_SUBPARTS (vectype);
-
-             /* FORNOW: handle only known alignment.  */
-             if (!known_alignment_for_access_p (dr))
-               {
-                 do_peeling = false;
-                 break;
-               }
-
-             elem_size = UNITS_PER_SIMD_WORD / nelements;
-             mis_in_elements = DR_MISALIGNMENT (dr) / elem_size;
-
-             if ((nelements - mis_in_elements) % DR_GROUP_SIZE (stmt_info))
-               {
-                 do_peeling = false;
-                 break;
-               }
-           }
-         dr0 = dr;
-         do_peeling = true;
+         do_peeling = vector_alignment_reachable_p (dr);
+         if (do_peeling)
+           dr0 = dr;
+         if (!do_peeling && vect_print_dump_info (REPORT_DETAILS))
+            fprintf (vect_dump, "vector alignment may not be reachable");
          break;
        }
     }
 
-  /* Often peeling for alignment will require peeling for loop-bound, which in 
-     turn requires that we know how to adjust the loop ivs after the loop.  */
-  if (!vect_can_advance_ivs_p (loop_vinfo)
+  vect_versioning_for_alias_required =
+    (VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)) > 0);
+
+  /* Temporarily, if versioning for alias is required, we disable peeling
+     until we support peeling and versioning.  Often peeling for alignment
+     will require peeling for loop-bound, which in turn requires that we
+     know how to adjust the loop ivs after the loop.  */
+  if (vect_versioning_for_alias_required
+       || !vect_can_advance_ivs_p (loop_vinfo)
       || !slpeel_can_duplicate_loop_p (loop, single_exit (loop)))
     do_peeling = false;
 
@@ -1581,7 +1906,7 @@ vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
     {
       int mis;
       int npeel = 0;
-      tree stmt = DR_STMT (dr0);
+      gimple stmt = DR_STMT (dr0);
       stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
       tree vectype = STMT_VINFO_VECTYPE (stmt_info);
       int nelements = TYPE_VECTOR_SUBPARTS (vectype);
@@ -1600,7 +1925,7 @@ vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
             members of the group, therefore we divide the number of iterations
             by the group size.  */
          stmt_info = vinfo_for_stmt (DR_STMT (dr0));     
-         if (DR_GROUP_FIRST_DR (stmt_info))
+         if (STMT_VINFO_STRIDED_ACCESS (stmt_info))
            npeel /= DR_GROUP_SIZE (stmt_info);
 
           if (vect_print_dump_info (REPORT_DETAILS))
@@ -1619,7 +1944,7 @@ vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
          stmt_info = vinfo_for_stmt (stmt);
          /* For interleaving, only the alignment of the first access
             matters.  */
-         if (DR_GROUP_FIRST_DR (stmt_info)
+         if (STMT_VINFO_STRIDED_ACCESS (stmt_info)
              && DR_GROUP_FIRST_DR (stmt_info) != stmt)
            continue;
 
@@ -1668,13 +1993,16 @@ vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
 
   /* Try versioning if:
      1) flag_tree_vect_loop_version is TRUE
-     2) optimize_size is FALSE
+     2) optimize loop for speed
      3) there is at least one unsupported misaligned data ref with an unknown
         misalignment, and
      4) all misaligned data refs with a known misalignment are supported, and
      5) the number of runtime alignment checks is within reason.  */
 
-  do_versioning = flag_tree_vect_loop_version && (!optimize_size);
+  do_versioning = 
+       flag_tree_vect_loop_version 
+       && optimize_loop_nest_for_speed_p (loop)
+       && (!loop->inner); /* FORNOW */
 
   if (do_versioning)
     {
@@ -1686,7 +2014,7 @@ vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
          /* For interleaving, only the alignment of the first access
             matters.  */
          if (aligned_access_p (dr)
-             || (DR_GROUP_FIRST_DR (stmt_info)
+             || (STMT_VINFO_STRIDED_ACCESS (stmt_info)
                  && DR_GROUP_FIRST_DR (stmt_info) != stmt))
            continue;
 
@@ -1694,14 +2022,14 @@ vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
 
           if (!supportable_dr_alignment)
             {
-              tree stmt;
+              gimple stmt;
               int mask;
               tree vectype;
 
               if (known_alignment_for_access_p (dr)
-                  || VEC_length (tree,
+                  || VEC_length (gimple,
                                  LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo))
-                     >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_CHECKS))
+                     >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
                 {
                   do_versioning = false;
                   break;
@@ -1725,29 +2053,29 @@ vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
               gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
                           || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
               LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
-              VEC_safe_push (tree, heap,
+              VEC_safe_push (gimple, heap,
                              LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo),
                              DR_STMT (dr));
             }
         }
       
       /* Versioning requires at least one misaligned data reference.  */
-      if (VEC_length (tree, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo)) == 0)
+      if (VEC_length (gimple, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo)) == 0)
         do_versioning = false;
       else if (!do_versioning)
-        VEC_truncate (tree, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo), 0);
+        VEC_truncate (gimple, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo), 0);
     }
 
   if (do_versioning)
     {
-      VEC(tree,heap) *may_misalign_stmts
+      VEC(gimple,heap) *may_misalign_stmts
         = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
-      tree stmt;
+      gimple stmt;
 
       /* It can now be assumed that the data references in the statements
          in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
          of the loop being vectorized.  */
-      for (i = 0; VEC_iterate (tree, may_misalign_stmts, i, stmt); i++)
+      for (i = 0; VEC_iterate (gimple, may_misalign_stmts, i, stmt); i++)
         {
           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
           dr = STMT_VINFO_DATA_REF (stmt_info);
@@ -1798,37 +2126,27 @@ vect_analyze_data_refs_alignment (loop_vec_info loop_vinfo)
 }
 
 
-/* Function vect_analyze_data_ref_access.
-
-   Analyze the access pattern of the data-reference DR. For now, a data access
-   has to be consecutive to be considered vectorizable.  */
+/* Analyze groups of strided accesses: check that DR belongs to a group of
+   strided accesses of legal size, step, etc.  Detect gaps, single-element
+   interleaving, and other special cases.  Set strided access info.
+   Collect groups of strided stores for further use in SLP analysis.  */
 
 static bool
-vect_analyze_data_ref_access (struct data_reference *dr)
+vect_analyze_group_access (struct data_reference *dr)
 {
   tree step = DR_STEP (dr);
-  HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
   tree scalar_type = TREE_TYPE (DR_REF (dr));
   HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
-  tree stmt = DR_STMT (dr);
+  gimple stmt = DR_STMT (dr);
+  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
+  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
+  HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
+  HOST_WIDE_INT stride;
+  bool slp_impossible = false;
+
   /* For interleaving, STRIDE is STEP counted in elements, i.e., the size of the 
      interleaving group (including gaps).  */
-  HOST_WIDE_INT stride = dr_step / type_size;
-
-  if (!step)
-    {
-      if (vect_print_dump_info (REPORT_DETAILS))
-       fprintf (vect_dump, "bad data-ref access");
-      return false;
-    }
-
-  /* Consecutive?  */
-  if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type)))
-    {
-      /* Mark that it is not interleaving.  */
-      DR_GROUP_FIRST_DR (vinfo_for_stmt (stmt)) = NULL_TREE;
-      return true;
-    }
+  stride = dr_step / type_size; 
 
   /* Not consecutive access is possible only if it is a part of interleaving.  */
   if (!DR_GROUP_FIRST_DR (vinfo_for_stmt (stmt)))
@@ -1863,155 +2181,1230 @@ vect_analyze_data_ref_access (struct data_reference *dr)
   if (DR_GROUP_FIRST_DR (vinfo_for_stmt (stmt)) == stmt)
     {
       /* First stmt in the interleaving chain. Check the chain.  */
-      tree next = DR_GROUP_NEXT_DR (vinfo_for_stmt (stmt));
+      gimple next = DR_GROUP_NEXT_DR (vinfo_for_stmt (stmt));
       struct data_reference *data_ref = dr;
       unsigned int count = 1;
       tree next_step;
       tree prev_init = DR_INIT (data_ref);
-      tree prev = stmt;
+      gimple prev = stmt;
       HOST_WIDE_INT diff, count_in_bytes;
 
       while (next)
-       {
-         /* Skip same data-refs. In case that two or more stmts share data-ref
-            (supported only for loads), we vectorize only the first stmt, and
-            the rest get their vectorized loads from the first one.  */
-         if (!tree_int_cst_compare (DR_INIT (data_ref),
-                                    DR_INIT (STMT_VINFO_DATA_REF (
-                                                     vinfo_for_stmt (next)))))
-           {
+        {
+          /* Skip same data-refs.  If two or more stmts share a data-ref
+             (supported only for loads), we vectorize only the first stmt, and
+             the rest get their vectorized loads from the first one.  */
+          if (!tree_int_cst_compare (DR_INIT (data_ref),
+                                     DR_INIT (STMT_VINFO_DATA_REF (
+                                                  vinfo_for_stmt (next)))))
+            {
               if (!DR_IS_READ (data_ref))
-                { 
+                {
                   if (vect_print_dump_info (REPORT_DETAILS))
                     fprintf (vect_dump, "Two store stmts share the same dr.");
-                  return false; 
+                  return false;
                 }
 
-              /* Check that there is no load-store dependencies for this loads 
+              /* Check that there are no load-store dependencies for these loads
                  to prevent a case of load-store-load to the same location.  */
               if (DR_GROUP_READ_WRITE_DEPENDENCE (vinfo_for_stmt (next))
                   || DR_GROUP_READ_WRITE_DEPENDENCE (vinfo_for_stmt (prev)))
                 {
                   if (vect_print_dump_info (REPORT_DETAILS))
-                    fprintf (vect_dump, 
+                    fprintf (vect_dump,
                              "READ_WRITE dependence in interleaving.");
                   return false;
                 }
 
-             /* For load use the same data-ref load.  */
-             DR_GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev;
+              /* For load use the same data-ref load.  */
+              DR_GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev;
 
-             prev = next;
-             next = DR_GROUP_NEXT_DR (vinfo_for_stmt (next));
-             continue;
-           }
-         prev = next;
+              prev = next;
+              next = DR_GROUP_NEXT_DR (vinfo_for_stmt (next));
+              continue;
+            }
+          prev = next;
 
-         /* Check that all the accesses have the same STEP.  */
-         next_step = DR_STEP (STMT_VINFO_DATA_REF (vinfo_for_stmt (next)));
-         if (tree_int_cst_compare (step, next_step))
-           {
-             if (vect_print_dump_info (REPORT_DETAILS))
-               fprintf (vect_dump, "not consecutive access in interleaving");
-             return false;
-           }
+          /* Check that all the accesses have the same STEP.  */
+          next_step = DR_STEP (STMT_VINFO_DATA_REF (vinfo_for_stmt (next)));
+          if (tree_int_cst_compare (step, next_step))
+            {
+              if (vect_print_dump_info (REPORT_DETAILS))
+                fprintf (vect_dump, "not consecutive access in interleaving");
+              return false;
+            }
 
-         data_ref = STMT_VINFO_DATA_REF (vinfo_for_stmt (next));
-         /* Check that the distance between two accesses is equal to the type
-            size. Otherwise, we have gaps.  */
-         diff = (TREE_INT_CST_LOW (DR_INIT (data_ref)) 
-                 - TREE_INT_CST_LOW (prev_init)) / type_size;
-         if (!DR_IS_READ (data_ref) && diff != 1)
+          data_ref = STMT_VINFO_DATA_REF (vinfo_for_stmt (next));
+          /* Check that the distance between two accesses is equal to the type
+             size. Otherwise, we have gaps.  */
+          diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
+                  - TREE_INT_CST_LOW (prev_init)) / type_size;
+         if (diff != 1)
            {
-             if (vect_print_dump_info (REPORT_DETAILS))
-               fprintf (vect_dump, "interleaved store with gaps");
-             return false;
+             /* FORNOW: SLP of accesses with gaps is not supported.  */
+             slp_impossible = true;
+             if (!DR_IS_READ (data_ref))
+               {
+                 if (vect_print_dump_info (REPORT_DETAILS))
+                   fprintf (vect_dump, "interleaved store with gaps");
+                 return false;
+               }
            }
-         /* Store the gap from the previous member of the group. If there is no
+
+          /* Store the gap from the previous member of the group. If there is no
              gap in the access, DR_GROUP_GAP is always 1.  */
-         DR_GROUP_GAP (vinfo_for_stmt (next)) = diff;
+          DR_GROUP_GAP (vinfo_for_stmt (next)) = diff;
 
-         prev_init = DR_INIT (data_ref);
-         next = DR_GROUP_NEXT_DR (vinfo_for_stmt (next));
-         /* Count the number of data-refs in the chain.  */
-         count++;
-       }
+          prev_init = DR_INIT (data_ref);
+          next = DR_GROUP_NEXT_DR (vinfo_for_stmt (next));
+          /* Count the number of data-refs in the chain.  */
+          count++;
+        }
 
-      /* COUNT is the number of accesses found, we multiply it by the size of 
-        the type to get COUNT_IN_BYTES.  */
+      /* COUNT is the number of accesses found, we multiply it by the size of
+         the type to get COUNT_IN_BYTES.  */
       count_in_bytes = type_size * count;
 
       /* Check that the size of the interleaving is not greater than STEP.  */
-      if (dr_step < count_in_bytes) 
+      if (dr_step < count_in_bytes)
+        {
+          if (vect_print_dump_info (REPORT_DETAILS))
+            {
+              fprintf (vect_dump, "interleaving size is greater than step for ");
+              print_generic_expr (vect_dump, DR_REF (dr), TDF_SLIM);
+            }
+          return false;
+        }
+
+      /* Check that the size of the interleaving is equal to STEP for stores,
+         i.e., that there are no gaps.  */
+      if (dr_step != count_in_bytes)
+        {
+          if (DR_IS_READ (dr))
+            {
+              slp_impossible = true;
+              /* There is a gap after the last load in the group. This gap is
+                 the difference between the stride and the number of elements.
+                 When there is no gap, this difference should be 0.  */
+              DR_GROUP_GAP (vinfo_for_stmt (stmt)) = stride - count; 
+            }
+          else
+            {
+              if (vect_print_dump_info (REPORT_DETAILS))
+                fprintf (vect_dump, "interleaved store with gaps");
+              return false;
+            }
+        }
+
+      /* Check that STEP is a multiple of type size.  */
+      if ((dr_step % type_size) != 0)
+        {
+          if (vect_print_dump_info (REPORT_DETAILS))
+            {
+              fprintf (vect_dump, "step is not a multiple of type size: step ");
+              print_generic_expr (vect_dump, step, TDF_SLIM);
+              fprintf (vect_dump, " size ");
+              print_generic_expr (vect_dump, TYPE_SIZE_UNIT (scalar_type),
+                                  TDF_SLIM);
+            }
+          return false;
+        }
+
+      /* FORNOW: we handle only interleaving that is a power of 2.  
+         We don't fail here if it may still be possible to vectorize the
+         group using SLP. If not, the size of the group will be checked in
+         vect_analyze_operations, and the vectorization will fail.  */
+      if (exact_log2 (stride) == -1)
        {
          if (vect_print_dump_info (REPORT_DETAILS))
+           fprintf (vect_dump, "interleaving is not a power of 2");
+
+         if (slp_impossible)
+           return false;
+       }
+      DR_GROUP_SIZE (vinfo_for_stmt (stmt)) = stride;
+      if (vect_print_dump_info (REPORT_DETAILS))
+        fprintf (vect_dump, "Detected interleaving of size %d", (int)stride);
+
+      /* SLP: create an SLP data structure for every interleaving group of 
+        stores for further analysis in vect_analyse_slp.  */
+      if (!DR_IS_READ (dr) && !slp_impossible)
+       VEC_safe_push (gimple, heap, LOOP_VINFO_STRIDED_STORES (loop_vinfo), stmt);
+    }
+
+  return true;
+}
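
   A minimal sketch of a store group that passes the checks above
   (illustrative code, not part of the patch; the function name is
   hypothetical).  With int elements, dr_step == 2 * sizeof (int) and
   type_size == sizeof (int), so stride == 2, count == 2 and
   count_in_bytes == dr_step: no gaps, the stride is a power of 2, and the
   group is pushed onto LOOP_VINFO_STRIDED_STORES for SLP analysis.

       void
       store_pair (int *a, int n)
       {
         int i;
         for (i = 0; i < n; i++)
           {
             a[2 * i]     = 1;  /* DR_GROUP_FIRST_DR of the chain.       */
             a[2 * i + 1] = 2;  /* DR_GROUP_NEXT_DR; DR_GROUP_GAP == 1.  */
           }
       }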
+
+
+/* Analyze the access pattern of the data-reference DR.
+   In case of non-consecutive accesses call vect_analyze_group_access() to
+   analyze groups of strided accesses.  */
+
+static bool
+vect_analyze_data_ref_access (struct data_reference *dr)
+{
+  tree step = DR_STEP (dr);
+  tree scalar_type = TREE_TYPE (DR_REF (dr));
+  gimple stmt = DR_STMT (dr);
+  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
+  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
+  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
+  HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
+
+  if (!step)
+    {
+      if (vect_print_dump_info (REPORT_DETAILS))
+       fprintf (vect_dump, "bad data-ref access");
+      return false;
+    }
+
+  /* Don't allow invariant accesses.  */
+  if (dr_step == 0)
+    return false; 
+
+  if (nested_in_vect_loop_p (loop, stmt))
+    {
+      /* Interleaved accesses are not yet supported within outer-loop
+        vectorization for references in the inner-loop.  */
+      DR_GROUP_FIRST_DR (vinfo_for_stmt (stmt)) = NULL;
+
+      /* For the rest of the analysis we use the outer-loop step.  */
+      step = STMT_VINFO_DR_STEP (stmt_info);
+      dr_step = TREE_INT_CST_LOW (step);
+      
+      if (dr_step == 0)
+       {
+         if (vect_print_dump_info (REPORT_ALIGNMENT))
+           fprintf (vect_dump, "zero step in outer loop.");
+         if (DR_IS_READ (dr))
+           return true; 
+         else
+           return false;
+       }
+    }
+
+  /* Consecutive?  */
+  if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type)))
+    {
+      /* Mark that it is not interleaving.  */
+      DR_GROUP_FIRST_DR (vinfo_for_stmt (stmt)) = NULL;
+      return true;
+    }
+
+  if (nested_in_vect_loop_p (loop, stmt))
+    {
+      if (vect_print_dump_info (REPORT_ALIGNMENT))
+       fprintf (vect_dump, "strided access in outer loop.");
+      return false;
+    }
+
+  /* Not a consecutive access - check whether it is part of an interleaving
+     group.  */
+  return vect_analyze_group_access (dr);
+}
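
   The cases the function distinguishes, as a short sketch under the same
   assumptions (hypothetical loop, int elements):

       void
       access_kinds (int *a, int *b, int c, int n)
       {
         int i;
         for (i = 0; i < n; i++)
           {
             a[i] = c;      /* step == sizeof (int): consecutive access.    */
             b[2 * i] = c;  /* larger step: handed to the group analysis
                               (and rejected there if gaps remain).         */
           }
         /* An access whose address is loop-invariant (step 0) is rejected
            outright above.  */
       }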
+
+
+/* Function vect_analyze_data_ref_accesses.
+
+   Analyze the access pattern of all the data references in the loop.
+
+   FORNOW: the only access pattern that is considered vectorizable is a
+          simple step 1 (consecutive) access.
+
+   FORNOW: handle only arrays and pointer accesses.  */
+
+static bool
+vect_analyze_data_ref_accesses (loop_vec_info loop_vinfo)
+{
+  unsigned int i;
+  VEC (data_reference_p, heap) *datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
+  struct data_reference *dr;
+
+  if (vect_print_dump_info (REPORT_DETAILS))
+    fprintf (vect_dump, "=== vect_analyze_data_ref_accesses ===");
+
+  for (i = 0; VEC_iterate (data_reference_p, datarefs, i, dr); i++)
+    if (!vect_analyze_data_ref_access (dr))
+      {
+       if (vect_print_dump_info (REPORT_UNVECTORIZED_LOOPS))
+         fprintf (vect_dump, "not vectorized: complicated access pattern.");
+       return false;
+      }
+
+  return true;
+}
+
+/* Function vect_prune_runtime_alias_test_list.
+
+   Prune a list of ddrs to be tested at run-time by versioning for alias.
+   Return FALSE if the resulting list of ddrs is longer than allowed by
+   PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE.  */
+
+static bool
+vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
+{
+  VEC (ddr_p, heap) * ddrs =
+    LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
+  unsigned i, j;
+
+  if (vect_print_dump_info (REPORT_DETAILS))
+    fprintf (vect_dump, "=== vect_prune_runtime_alias_test_list ===");
+
+  for (i = 0; i < VEC_length (ddr_p, ddrs); )
+    {
+      bool found;
+      ddr_p ddr_i;
+
+      ddr_i = VEC_index (ddr_p, ddrs, i);
+      found = false;
+
+      for (j = 0; j < i; j++)
+        {
+         ddr_p ddr_j = VEC_index (ddr_p, ddrs, j);
+
+         if (vect_vfa_range_equal (ddr_i, ddr_j))
            {
-             fprintf (vect_dump, "interleaving size is greater than step for ");
-             print_generic_expr (vect_dump, DR_REF (dr), TDF_SLIM); 
+             if (vect_print_dump_info (REPORT_DR_DETAILS))
+               {
+                 fprintf (vect_dump, "found equal ranges ");
+                 print_generic_expr (vect_dump, DR_REF (DDR_A (ddr_i)), TDF_SLIM);
+                 fprintf (vect_dump, ", ");
+                 print_generic_expr (vect_dump, DR_REF (DDR_B (ddr_i)), TDF_SLIM);
+                 fprintf (vect_dump, " and ");
+                 print_generic_expr (vect_dump, DR_REF (DDR_A (ddr_j)), TDF_SLIM);
+                 fprintf (vect_dump, ", ");
+                 print_generic_expr (vect_dump, DR_REF (DDR_B (ddr_j)), TDF_SLIM);
+               }
+             found = true;
+             break;
            }
+       }
+      
+      if (found)
+      {
+       VEC_ordered_remove (ddr_p, ddrs, i);
+       continue;
+      }
+      i++;
+    }
+
+  if (VEC_length (ddr_p, ddrs) >
+       (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
+    {
+      if (vect_print_dump_info (REPORT_DR_DETAILS))
+       {
+         fprintf (vect_dump,
+                  "disable versioning for alias - max number of generated "
+                  "checks exceeded.");
+       }
+
+      VEC_truncate (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo), 0);
+
+      return false;
+    }
+
+  return true;
+}
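
   For orientation: each ddr that survives the pruning becomes one runtime
   overlap test in the loop-versioning condition, so the parameter bounds
   the number of such tests.  A hedged sketch of the shape of one test
   (conceptual only; the actual condition is generated elsewhere in the
   vectorizer):

       /* Segments [a, a+len) and [b, b+len) do not overlap iff:  */
       static int
       segments_disjoint (const char *a, const char *b, unsigned long len)
       {
         return a + len <= b || b + len <= a;
       }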
+
+/* Recursively free the memory allocated for the SLP tree rooted at NODE.  */
+
+static void
+vect_free_slp_tree (slp_tree node)
+{
+  if (!node)
+    return;
+
+  if (SLP_TREE_LEFT (node))
+    vect_free_slp_tree (SLP_TREE_LEFT (node));
+   
+  if (SLP_TREE_RIGHT (node))
+    vect_free_slp_tree (SLP_TREE_RIGHT (node));
+   
+  VEC_free (gimple, heap, SLP_TREE_SCALAR_STMTS (node));
+  
+  if (SLP_TREE_VEC_STMTS (node))
+    VEC_free (gimple, heap, SLP_TREE_VEC_STMTS (node));
+
+  free (node);
+}
+
+
+/* Free the memory allocated for the SLP instance.  */
+
+void
+vect_free_slp_instance (slp_instance instance)
+{
+  vect_free_slp_tree (SLP_INSTANCE_TREE (instance));
+  VEC_free (int, heap, SLP_INSTANCE_LOAD_PERMUTATION (instance));
+  VEC_free (slp_tree, heap, SLP_INSTANCE_LOADS (instance));
+}
+
+
+/* Get the defs for the rhs of STMT (collect them in DEF_STMTS0/1), check that
+   they are of a legal type and that they match the defs of the first stmt of
+   the SLP group (stored in FIRST_STMT_...).  */
+
+static bool
+vect_get_and_check_slp_defs (loop_vec_info loop_vinfo, slp_tree slp_node,
+                            gimple stmt, VEC (gimple, heap) **def_stmts0,
+                            VEC (gimple, heap) **def_stmts1,
+                            enum vect_def_type *first_stmt_dt0,
+                            enum vect_def_type *first_stmt_dt1,
+                            tree *first_stmt_def0_type, 
+                            tree *first_stmt_def1_type,
+                            tree *first_stmt_const_oprnd,
+                            int ncopies_for_cost,
+                             bool *pattern0, bool *pattern1)
+{
+  tree oprnd;
+  unsigned int i, number_of_oprnds;
+  tree def;
+  gimple def_stmt;
+  enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
+  stmt_vec_info stmt_info = 
+    vinfo_for_stmt (VEC_index (gimple, SLP_TREE_SCALAR_STMTS (slp_node), 0));
+  enum gimple_rhs_class rhs_class;
+  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
+
+  rhs_class = get_gimple_rhs_class (gimple_assign_rhs_code (stmt));
+  number_of_oprnds = gimple_num_ops (stmt) - 1;        /* RHS only */
+
+  for (i = 0; i < number_of_oprnds; i++)
+    {
+      oprnd = gimple_op (stmt, i + 1);
+
+      if (!vect_is_simple_use (oprnd, loop_vinfo, &def_stmt, &def, &dt[i])
+         || (!def_stmt && dt[i] != vect_constant_def))
+       {
+         if (vect_print_dump_info (REPORT_SLP)) 
+           {
+             fprintf (vect_dump, "Build SLP failed: can't find def for ");
+             print_generic_expr (vect_dump, oprnd, TDF_SLIM);
+           }
+
          return false;
        }
 
-      /* Check that the size of the interleaving is equal to STEP for stores, 
-         i.e., that there are no gaps.  */ 
-      if (!DR_IS_READ (dr) && dr_step != count_in_bytes) 
+      /* Check if DEF_STMT is a part of a pattern and get the def stmt from
+         the pattern. Check that all the stmts of the node are in the
+         pattern.  */
+      if (def_stmt && gimple_bb (def_stmt)
+          && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
+          && vinfo_for_stmt (def_stmt)
+          && STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (def_stmt)))
+        {
+          if (!*first_stmt_dt0)
+            *pattern0 = true;
+          else
+            {
+              if (i == 1 && !*first_stmt_dt1)
+                *pattern1 = true;
+              else if ((i == 0 && !*pattern0) || (i == 1 && !*pattern1))
+                {
+                  if (vect_print_dump_info (REPORT_DETAILS))
+                    {
+                      fprintf (vect_dump, "Build SLP failed: some of the stmts"
+                                     " are in a pattern, and others are not ");
+                      print_generic_expr (vect_dump, oprnd, TDF_SLIM);
+                    }
+
+                  return false;
+                }
+            }
+
+          def_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (def_stmt));
+          dt[i] = STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt));
+
+          if (*dt == vect_unknown_def_type)
+            {
+              if (vect_print_dump_info (REPORT_DETAILS))
+                fprintf (vect_dump, "Unsupported pattern.");
+              return false;
+            }
+
+          switch (gimple_code (def_stmt))
+            {
+              case GIMPLE_PHI:
+                def = gimple_phi_result (def_stmt);
+                break;
+
+              case GIMPLE_ASSIGN:
+                def = gimple_assign_lhs (def_stmt);
+                break;
+
+              default:
+                if (vect_print_dump_info (REPORT_DETAILS))
+                  fprintf (vect_dump, "unsupported defining stmt: ");
+                return false;
+            }
+        }
+
+      if (!*first_stmt_dt0)
+       {
+         /* op0 of the first stmt of the group - store its info.  */
+         *first_stmt_dt0 = dt[i];
+         if (def)
+           *first_stmt_def0_type = TREE_TYPE (def);
+         else
+           *first_stmt_const_oprnd = oprnd;
+
+         /* Analyze costs (for the first stmt of the group only).  */
+         if (rhs_class != GIMPLE_SINGLE_RHS)
+           /* Not a memory operation (we don't call this function for loads).  */
+           vect_model_simple_cost (stmt_info, ncopies_for_cost, dt, slp_node);
+         else
+           /* Store.  */
+           vect_model_store_cost (stmt_info, ncopies_for_cost, dt[0], slp_node);
+       }
+      
+      else
+       {
+         if (!*first_stmt_dt1 && i == 1)
+           {
+             /* op1 of the first stmt of the group - store its info.  */
+             *first_stmt_dt1 = dt[i];
+             if (def)
+               *first_stmt_def1_type = TREE_TYPE (def);
+             else
+               {
+                 /* We assume that the stmt contains only one constant 
+                    operand. We fail otherwise, to be on the safe side.  */
+                 if (*first_stmt_const_oprnd)
+                   {
+                     if (vect_print_dump_info (REPORT_SLP)) 
+                       fprintf (vect_dump, "Build SLP failed: two constant "
+                                "oprnds in stmt");                 
+                     return false;
+                   }
+                 *first_stmt_const_oprnd = oprnd;
+               }
+           }
+         else
+           {
+             /* Not first stmt of the group, check that the def-stmt/s match 
+                the def-stmt/s of the first stmt.  */
+             if ((i == 0 
+                  && (*first_stmt_dt0 != dt[i]
+                      || (*first_stmt_def0_type && def
+                          && *first_stmt_def0_type != TREE_TYPE (def))))
+                 || (i == 1 
+                     && (*first_stmt_dt1 != dt[i]
+                         || (*first_stmt_def1_type && def
+                             && *first_stmt_def1_type != TREE_TYPE (def))))              
+                 || (!def 
+                     && TREE_TYPE (*first_stmt_const_oprnd) 
+                     != TREE_TYPE (oprnd)))
+               { 
+                 if (vect_print_dump_info (REPORT_SLP)) 
+                   fprintf (vect_dump, "Build SLP failed: different types ");
+                 
+                 return false;
+               }
+           }
+       }
+
+      /* Check the types of the definitions.  */
+      switch (dt[i])
        {
-         if (vect_print_dump_info (REPORT_DETAILS))
-           fprintf (vect_dump, "interleaved store with gaps");
+       case vect_constant_def:
+       case vect_invariant_def:
+         break;
+         
+       case vect_loop_def:
+         if (i == 0)
+           VEC_safe_push (gimple, heap, *def_stmts0, def_stmt);
+         else
+           VEC_safe_push (gimple, heap, *def_stmts1, def_stmt);
+         break;
+
+       default:
+         /* FORNOW: Not supported.  */
+         if (vect_print_dump_info (REPORT_SLP)) 
+           {
+             fprintf (vect_dump, "Build SLP failed: illegal type of def ");
+             print_generic_expr (vect_dump, def, TDF_SLIM);
+           }
+
          return false;
        }
+    }
 
-      /* Check that STEP is a multiple of type size.  */
-      if ((dr_step % type_size) != 0)
+  return true;
+}
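
   A sketch of a group whose def-stmts satisfy these checks (hypothetical
   loop; X is loop-invariant):

       void
       isomorphic_defs (int *a, int *b, int x, int n)
       {
         int i;
         for (i = 0; i < n; i += 2)
           {
             a[i]     = b[i]     + x;  /* op0: vect_loop_def, op1: invariant. */
             a[i + 1] = b[i + 1] + x;  /* same def-types and types: OK.       */
           }
         /* Replacing the second "+ x" by "+ 5" would fail the comparison
            against FIRST_STMT_DT1 (invariant vs. constant def) with
            "Build SLP failed: different types".  */
       }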
+
+
+/* Recursively build an SLP tree starting from NODE.
+   Fail (and return FALSE) if def-stmts are not isomorphic, require data 
+   permutation or are of unsupported types of operation. Otherwise, return 
+   TRUE.  */
+
+static bool
+vect_build_slp_tree (loop_vec_info loop_vinfo, slp_tree *node, 
+                    unsigned int group_size, 
+                    int *inside_cost, int *outside_cost,
+                    int ncopies_for_cost, unsigned int *max_nunits,
+                     VEC (int, heap) **load_permutation,
+                     VEC (slp_tree, heap) **loads)
+{
+  VEC (gimple, heap) *def_stmts0 = VEC_alloc (gimple, heap, group_size);
+  VEC (gimple, heap) *def_stmts1 =  VEC_alloc (gimple, heap, group_size);
+  unsigned int i;
+  VEC (gimple, heap) *stmts = SLP_TREE_SCALAR_STMTS (*node);
+  gimple stmt = VEC_index (gimple, stmts, 0);
+  enum vect_def_type first_stmt_dt0 = 0, first_stmt_dt1 = 0;
+  enum tree_code first_stmt_code = 0, rhs_code;
+  tree first_stmt_def1_type = NULL_TREE, first_stmt_def0_type = NULL_TREE;
+  tree lhs;
+  bool stop_recursion = false, need_same_oprnds = false;
+  tree vectype, scalar_type, first_op1 = NULL_TREE;
+  unsigned int vectorization_factor = 0, ncopies;
+  optab optab;
+  int icode;
+  enum machine_mode optab_op2_mode;
+  enum machine_mode vec_mode;
+  tree first_stmt_const_oprnd = NULL_TREE;
+  struct data_reference *first_dr;
+  bool pattern0 = false, pattern1 = false;
+  HOST_WIDE_INT dummy;
+  bool permutation = false;
+  unsigned int load_place;
+  gimple first_load;
+
+  /* For every stmt in NODE find its def stmt/s.  */
+  for (i = 0; VEC_iterate (gimple, stmts, i, stmt); i++)
+    {
+      if (vect_print_dump_info (REPORT_SLP)) 
        {
-         if (vect_print_dump_info (REPORT_DETAILS)) 
+         fprintf (vect_dump, "Build SLP for ");
+         print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
+       }
+
+      lhs = gimple_get_lhs (stmt);
+      if (lhs == NULL_TREE)
+       {
+         if (vect_print_dump_info (REPORT_SLP)) 
+           {
+             fprintf (vect_dump,
+                      "Build SLP failed: not GIMPLE_ASSIGN nor GIMPLE_CALL");
+             print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
+           }
+         
+         return false;
+       }
+
+      scalar_type = vect_get_smallest_scalar_type (stmt, &dummy, &dummy); 
+      vectype = get_vectype_for_scalar_type (scalar_type);
+      if (!vectype)
+        {
+          if (vect_print_dump_info (REPORT_SLP))
             {
-              fprintf (vect_dump, "step is not a multiple of type size: step ");
-              print_generic_expr (vect_dump, step, TDF_SLIM);
-              fprintf (vect_dump, " size ");
-              print_generic_expr (vect_dump, TYPE_SIZE_UNIT (scalar_type),
-                                  TDF_SLIM);
+              fprintf (vect_dump, "Build SLP failed: unsupported data-type ");
+              print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
+            }
+          return false;
+        }
+
+      gcc_assert (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
+      vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
+      ncopies = vectorization_factor / TYPE_VECTOR_SUBPARTS (vectype);
+      if (ncopies > 1 && vect_print_dump_info (REPORT_SLP))
+        fprintf (vect_dump, "SLP with multiple types ");
+
+      /* In case of multiple types we need to detect the smallest type.  */
+      if (*max_nunits < TYPE_VECTOR_SUBPARTS (vectype))
+        *max_nunits = TYPE_VECTOR_SUBPARTS (vectype);
+         
+      if (is_gimple_call (stmt))
+       rhs_code = CALL_EXPR;
+      else
+       rhs_code = gimple_assign_rhs_code (stmt);
+
+      /* Check the operation.  */
+      if (i == 0)
+       {
+         first_stmt_code = rhs_code;
+
+         /* Shift arguments should be equal in all the packed stmts for a 
+            vector shift with scalar shift operand.  */
+         if (rhs_code == LSHIFT_EXPR || rhs_code == RSHIFT_EXPR
+             || rhs_code == LROTATE_EXPR
+             || rhs_code == RROTATE_EXPR)
+           {
+             vec_mode = TYPE_MODE (vectype);
+
+             /* First see if we have a vector/vector shift.  */
+             optab = optab_for_tree_code (rhs_code, vectype,
+                                          optab_vector);
+
+             if (!optab
+                 || (optab->handlers[(int) vec_mode].insn_code
+                     == CODE_FOR_nothing))
+               {
+                 /* No vector/vector shift, try for a vector/scalar shift.  */
+                 optab = optab_for_tree_code (rhs_code, vectype,
+                                              optab_scalar);
+
+                 if (!optab)
+                   {
+                     if (vect_print_dump_info (REPORT_SLP))
+                       fprintf (vect_dump, "Build SLP failed: no optab.");
+                     return false;
+                   }
+                 icode = (int) optab->handlers[(int) vec_mode].insn_code;
+                 if (icode == CODE_FOR_nothing)
+                   {
+                     if (vect_print_dump_info (REPORT_SLP))
+                       fprintf (vect_dump, "Build SLP failed: "
+                                           "op not supported by target.");
+                     return false;
+                   }
+                 optab_op2_mode = insn_data[icode].operand[2].mode;
+                 if (!VECTOR_MODE_P (optab_op2_mode))
+                   {
+                     need_same_oprnds = true;
+                     first_op1 = gimple_assign_rhs2 (stmt);
+                   }
+               }
+           }
+       }
+      else
+       {
+         if (first_stmt_code != rhs_code
+             && (first_stmt_code != IMAGPART_EXPR
+                 || rhs_code != REALPART_EXPR)
+             && (first_stmt_code != REALPART_EXPR
+                 || rhs_code != IMAGPART_EXPR))
+           {
+             if (vect_print_dump_info (REPORT_SLP)) 
+               {
+                 fprintf (vect_dump, 
+                          "Build SLP failed: different operation in stmt ");
+                 print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
+               }
+             
+             return false;
+           }
+         
+         if (need_same_oprnds 
+             && !operand_equal_p (first_op1, gimple_assign_rhs2 (stmt), 0))
+           {
+             if (vect_print_dump_info (REPORT_SLP)) 
+               {
+                 fprintf (vect_dump, 
+                          "Build SLP failed: different shift arguments in ");
+                 print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
+               }
+             
+             return false;
+           }
+       }
+
+      /* Strided store or load.  */
+      if (STMT_VINFO_STRIDED_ACCESS (vinfo_for_stmt (stmt)))
+       {
+         if (REFERENCE_CLASS_P (lhs))
+           {
+             /* Store.  */
+             if (!vect_get_and_check_slp_defs (loop_vinfo, *node, stmt,
+                                               &def_stmts0, &def_stmts1, 
+                                               &first_stmt_dt0, 
+                                               &first_stmt_dt1, 
+                                               &first_stmt_def0_type, 
+                                               &first_stmt_def1_type,
+                                               &first_stmt_const_oprnd,
+                                               ncopies_for_cost,
+                                                &pattern0, &pattern1))
+               return false;
+           }
+           else
+             {
+               /* Load.  */
+                /* FORNOW: Check that there is no gap between the loads.  */
+                if ((DR_GROUP_FIRST_DR (vinfo_for_stmt (stmt)) == stmt
+                     && DR_GROUP_GAP (vinfo_for_stmt (stmt)) != 0)
+                    || (DR_GROUP_FIRST_DR (vinfo_for_stmt (stmt)) != stmt
+                        && DR_GROUP_GAP (vinfo_for_stmt (stmt)) != 1))
+                  {
+                    if (vect_print_dump_info (REPORT_SLP))
+                      {
+                        fprintf (vect_dump, "Build SLP failed: strided "
+                                            "loads have gaps ");
+                        print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
+                      }
+                    return false;
+                  }
+                first_load = DR_GROUP_FIRST_DR (vinfo_for_stmt (stmt));
+              if (first_load == stmt)
+                {
+                  first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
+                  if (vect_supportable_dr_alignment (first_dr)
+                      == dr_unaligned_unsupported)
+                    {
+                      if (vect_print_dump_info (REPORT_SLP))
+                        {
+                          fprintf (vect_dump, "Build SLP failed: unsupported "
+                                              "unaligned load ");
+                          print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
+                        }
+  
+                      return false;
+                    }
+                  /* Analyze costs (for the first stmt in the group).  */
+                  vect_model_load_cost (vinfo_for_stmt (stmt),
+                                        ncopies_for_cost, *node);
+                }
+  
+              /* Store the place of this load in the interleaving chain. In
+                 case a permutation is needed, we later decide whether that
+                 specific permutation is supported.  */
+              load_place = vect_get_place_in_interleaving_chain (stmt,
+                                                                 first_load);
+              if (load_place != i)
+                permutation = true;
+              VEC_safe_push (int, heap, *load_permutation, load_place);
+              /* We stop building the tree when we reach a group of loads.  */
+              stop_recursion = true;
+             continue;
+           }
+        } /* Strided access.  */
+      else
+       {
+         if (TREE_CODE_CLASS (rhs_code) == tcc_reference)
+           {
+             /* Not a strided load.  */
+             if (vect_print_dump_info (REPORT_SLP)) 
+               {
+                 fprintf (vect_dump, "Build SLP failed: not strided load ");
+                 print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
+               }
+
+             /* FORNOW: non-strided loads are not supported.  */
+             return false;
+           }
+
+         /* Not memory operation.  */
+         if (TREE_CODE_CLASS (rhs_code) != tcc_binary
+             && TREE_CODE_CLASS (rhs_code) != tcc_unary)
+           {
+             if (vect_print_dump_info (REPORT_SLP)) 
+               {
+                 fprintf (vect_dump, "Build SLP failed: operation");
+                 fprintf (vect_dump, " unsupported ");
+                 print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
+               }
+
+             return false;
+           }
+
+         /* Find the def-stmts.  */ 
+         if (!vect_get_and_check_slp_defs (loop_vinfo, *node, stmt,
+                                           &def_stmts0, &def_stmts1,
+                                           &first_stmt_dt0, &first_stmt_dt1, 
+                                           &first_stmt_def0_type, 
+                                           &first_stmt_def1_type,
+                                           &first_stmt_const_oprnd,
+                                           ncopies_for_cost,
+                                            &pattern0, &pattern1))
+           return false;
+       }
+    }
+
+  /* Add the costs of the node to the overall instance costs.  */
+  *inside_cost += SLP_TREE_INSIDE_OF_LOOP_COST (*node); 
+  *outside_cost += SLP_TREE_OUTSIDE_OF_LOOP_COST (*node);
+
+  /* Strided loads were reached - stop the recursion.  */
+  if (stop_recursion)
+    {
+      if (permutation)
+        {
+          VEC_safe_push (slp_tree, heap, *loads, *node); 
+          *inside_cost += TARG_VEC_PERMUTE_COST * group_size;  
+        }
+
+      return true;
+    }
+
+  /* Create SLP_TREE nodes for the definition node/s.  */ 
+  if (first_stmt_dt0 == vect_loop_def)
+    {
+      slp_tree left_node = XNEW (struct _slp_tree);
+      SLP_TREE_SCALAR_STMTS (left_node) = def_stmts0;
+      SLP_TREE_VEC_STMTS (left_node) = NULL;
+      SLP_TREE_LEFT (left_node) = NULL;
+      SLP_TREE_RIGHT (left_node) = NULL;
+      SLP_TREE_OUTSIDE_OF_LOOP_COST (left_node) = 0;
+      SLP_TREE_INSIDE_OF_LOOP_COST (left_node) = 0;
+      if (!vect_build_slp_tree (loop_vinfo, &left_node, group_size, 
+                               inside_cost, outside_cost, ncopies_for_cost, 
+                               max_nunits, load_permutation, loads))
+       return false;
+      
+      SLP_TREE_LEFT (*node) = left_node;
+    }
+
+  if (first_stmt_dt1 == vect_loop_def)
+    {
+      slp_tree right_node = XNEW (struct _slp_tree);
+      SLP_TREE_SCALAR_STMTS (right_node) = def_stmts1;
+      SLP_TREE_VEC_STMTS (right_node) = NULL;
+      SLP_TREE_LEFT (right_node) = NULL;
+      SLP_TREE_RIGHT (right_node) = NULL;
+      SLP_TREE_OUTSIDE_OF_LOOP_COST (right_node) = 0;
+      SLP_TREE_INSIDE_OF_LOOP_COST (right_node) = 0;
+      if (!vect_build_slp_tree (loop_vinfo, &right_node, group_size,
+                               inside_cost, outside_cost, ncopies_for_cost,
+                               max_nunits, load_permutation, loads))
+       return false;
+      
+      SLP_TREE_RIGHT (*node) = right_node;
+    }
+
+  return true;
+}
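
   For the group in the earlier sketch, the tree built here has the strided
   stores at the root, the isomorphic additions as the left child, and the
   load group as a leaf where the recursion stops (illustration only, in
   terms of SLP_TREE_LEFT):

       root:              { a[i] = t0,      a[i+1] = t1 }      strided stores
       root->left:        { t0 = b[i] + x,  t1 = b[i+1] + x }  additions
       root->left->left:  { b[i],           b[i+1] }           strided loads;
                                                               stop_recursion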
+
+
+static void
+vect_print_slp_tree (slp_tree node)
+{
+  int i;
+  gimple stmt;
+
+  if (!node)
+    return;
+
+  fprintf (vect_dump, "node ");
+  for (i = 0; VEC_iterate (gimple, SLP_TREE_SCALAR_STMTS (node), i, stmt); i++)
+    {
+      fprintf (vect_dump, "\n\tstmt %d ", i);
+      print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);  
+    }
+  fprintf (vect_dump, "\n");
+
+  vect_print_slp_tree (SLP_TREE_LEFT (node));
+  vect_print_slp_tree (SLP_TREE_RIGHT (node));
+}
+
+
+/* Mark the tree rooted at NODE with MARK (PURE_SLP or HYBRID). 
+   If MARK is HYBRID, it refers to a specific stmt in NODE (the stmt at index 
+   J). Otherwise, MARK is PURE_SLP and J is -1, which indicates that all the 
+   stmts in NODE are to be marked.  */
+
+static void
+vect_mark_slp_stmts (slp_tree node, enum slp_vect_type mark, int j)
+{
+  int i;
+  gimple stmt;
+
+  if (!node)
+    return;
+
+  for (i = 0; VEC_iterate (gimple, SLP_TREE_SCALAR_STMTS (node), i, stmt); i++)
+    if (j < 0 || i == j)
+      STMT_SLP_TYPE (vinfo_for_stmt (stmt)) = mark;
+
+  vect_mark_slp_stmts (SLP_TREE_LEFT (node), mark, j);
+  vect_mark_slp_stmts (SLP_TREE_RIGHT (node), mark, j);
+}
+
+
+/* Check if the permutation required by the SLP INSTANCE is supported.  
+   Reorganize the SLP nodes stored in SLP_INSTANCE_LOADS if needed.  */
+
+static bool
+vect_supported_slp_permutation_p (slp_instance instance)
+{
+  slp_tree node = VEC_index (slp_tree, SLP_INSTANCE_LOADS (instance), 0);
+  gimple stmt = VEC_index (gimple, SLP_TREE_SCALAR_STMTS (node), 0);
+  gimple first_load = DR_GROUP_FIRST_DR (vinfo_for_stmt (stmt));
+  VEC (slp_tree, heap) *sorted_loads = NULL;
+  int index;
+  slp_tree *tmp_loads = NULL;
+  int group_size = SLP_INSTANCE_GROUP_SIZE (instance), i, j; 
+  slp_tree load;
+  /* FORNOW: The only supported load permutation is the one in which all the
+     loads in a node access the same location, and the data-refs in the
+     nodes of LOADS constitute an interleaving chain.
+     Sort the nodes according to the order of accesses in the chain.  */
+  tmp_loads = (slp_tree *) xmalloc (sizeof (slp_tree) * group_size);
+  for (i = 0, j = 0; 
+       VEC_iterate (int, SLP_INSTANCE_LOAD_PERMUTATION (instance), i, index) 
+       && VEC_iterate (slp_tree, SLP_INSTANCE_LOADS (instance), j, load); 
+       i += group_size, j++)
+    {
+      gimple scalar_stmt = VEC_index (gimple, SLP_TREE_SCALAR_STMTS (load), 0);
+      /* Check that the loads are all in the same interleaving chain.  */
+      if (DR_GROUP_FIRST_DR (vinfo_for_stmt (scalar_stmt)) != first_load)
+        {
+          if (vect_print_dump_info (REPORT_DETAILS))
+            {
+              fprintf (vect_dump, "Build SLP failed: unsupported data "
+                                   "permutation ");
+              print_gimple_stmt (vect_dump, scalar_stmt, 0, TDF_SLIM);
+            }
+             
+          free (tmp_loads);
+          return false; 
+        }
+
+      tmp_loads[index] = load;
+    }
+  
+  sorted_loads = VEC_alloc (slp_tree, heap, group_size);
+  for (i = 0; i < group_size; i++)
+     VEC_safe_push (slp_tree, heap, sorted_loads, tmp_loads[i]);
+
+  VEC_free (slp_tree, heap, SLP_INSTANCE_LOADS (instance));
+  SLP_INSTANCE_LOADS (instance) = sorted_loads;
+  free (tmp_loads);
+
+  if (!vect_transform_slp_perm_load (stmt, NULL, NULL,
+                                     SLP_INSTANCE_UNROLLING_FACTOR (instance),
+                                     instance, true))
+    return false;
+
+  return true;
+}
+
+
+/* Check if the required load permutation is supported.
+   LOAD_PERMUTATION contains a list of indices of the loads.
+   In SLP this permutation is relative to the order of strided stores that are
+   the base of the SLP instance.  */
+
+static bool
+vect_supported_load_permutation_p (slp_instance slp_instn, int group_size,
+                                   VEC (int, heap) *load_permutation)
+{
+  int i = 0, j, prev = -1, next, k;
+  bool supported;
+
+  /* FORNOW: permutations are only supported for loop-aware SLP.  */
+  if (!slp_instn)
+    return false;
+
+  if (vect_print_dump_info (REPORT_SLP))
+    {
+      fprintf (vect_dump, "Load permutation ");
+      for (i = 0; VEC_iterate (int, load_permutation, i, next); i++)
+        fprintf (vect_dump, "%d ", next);
+    }
+
+  /* FORNOW: the only supported permutation is 0..01..1.., i.e., GROUP_SIZE
+     runs of identical indices, each run of length GROUP_SIZE (so the whole
+     permutation has GROUP_SIZE * GROUP_SIZE entries).  */
+  if (VEC_length (int, load_permutation)
+      != (unsigned int) (group_size * group_size))
+    return false;
+
+  supported = true;
+  for (j = 0; j < group_size; j++)
+    {
+      for (i = j * group_size, k = 0;
+           VEC_iterate (int, load_permutation, i, next) && k < group_size;
+           i++, k++)
+       {
+         if (i != j * group_size && next != prev)
+           {
+             supported = false;
+             break;
+           }
+
+         prev = next;
+       }  
+    }
+
+  if (supported && i == group_size * group_size
+      && vect_supported_slp_permutation_p (slp_instn))
+    return true;
+
+  return false; 
+}
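
   A worked instance of the supported shape, assuming GROUP_SIZE == 2 (so
   LOAD_PERMUTATION has 2 * 2 entries):

       { 0, 0, 1, 1 }   supported: two runs of one repeated index each
       { 0, 1, 0, 1 }   rejected:  next != prev inside a run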
+
+
+/* Find the first load in the loop that belongs to INSTANCE. 
+   When loads are in several SLP nodes, there can be a case in which the first
+   load does not appear in the first SLP node to be transformed, causing an
+   incorrect order of statements. Since we generate all the loads together,
+   they must be inserted before the first load of the SLP instance and not
+   before the first load of the first node of the instance.  */
+static gimple 
+vect_find_first_load_in_slp_instance (slp_instance instance) 
+{
+  int i, j;
+  slp_tree load_node;
+  gimple first_load = NULL, load;
+
+  for (i = 0; 
+       VEC_iterate (slp_tree, SLP_INSTANCE_LOADS (instance), i, load_node); 
+       i++)
+    for (j = 0; 
+         VEC_iterate (gimple, SLP_TREE_SCALAR_STMTS (load_node), j, load);
+         j++)
+      first_load = get_earlier_stmt (load, first_load);
+  
+  return first_load;
+}
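
   To illustrate (hypothetical ordering): if the permutation sorted the load
   nodes so that the transform visits node1 = { b[i+1], b[i+3] } before
   node2 = { b[i], b[i+2] }, the earliest load b[i] lives in the second
   node, and the generated vector loads must be inserted before it rather
   than before b[i+1].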
+
+
+/* Analyze an SLP instance starting from a group of strided stores. Call
+   vect_build_slp_tree to build a tree of packed stmts if possible.  
+   Return FALSE if it's impossible to SLP any stmt in the loop.  */
+
+static bool
+vect_analyze_slp_instance (loop_vec_info loop_vinfo, gimple stmt)
+{
+  slp_instance new_instance;
+  slp_tree node = XNEW (struct _slp_tree);
+  unsigned int group_size = DR_GROUP_SIZE (vinfo_for_stmt (stmt));
+  unsigned int unrolling_factor = 1, nunits;
+  tree vectype, scalar_type;
+  gimple next;
+  unsigned int vectorization_factor = 0, ncopies;
+  bool slp_impossible = false; 
+  int inside_cost = 0, outside_cost = 0, ncopies_for_cost;
+  unsigned int max_nunits = 0;
+  VEC (int, heap) *load_permutation;
+  VEC (slp_tree, heap) *loads;
+  scalar_type = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (
+                                             vinfo_for_stmt (stmt))));
+  vectype = get_vectype_for_scalar_type (scalar_type);
+  if (!vectype)
+    {
+      if (vect_print_dump_info (REPORT_SLP))
+        {
+          fprintf (vect_dump, "Build SLP failed: unsupported data-type ");
+          print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
+        }
+      return false;
+    }
+
+  nunits = TYPE_VECTOR_SUBPARTS (vectype);
+  vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
+  ncopies = vectorization_factor / nunits;
+
+  /* Create a node (a root of the SLP tree) for the packed strided stores.  */ 
+  SLP_TREE_SCALAR_STMTS (node) = VEC_alloc (gimple, heap, group_size);
+  next = stmt;
+  /* Collect the stores and store them in SLP_TREE_SCALAR_STMTS.  */
+  while (next)
+    {
+      VEC_safe_push (gimple, heap, SLP_TREE_SCALAR_STMTS (node), next);
+      next = DR_GROUP_NEXT_DR (vinfo_for_stmt (next));
+    }
+
+  SLP_TREE_VEC_STMTS (node) = NULL;
+  SLP_TREE_NUMBER_OF_VEC_STMTS (node) = 0;
+  SLP_TREE_LEFT (node) = NULL;
+  SLP_TREE_RIGHT (node) = NULL;
+  SLP_TREE_OUTSIDE_OF_LOOP_COST (node) = 0;
+  SLP_TREE_INSIDE_OF_LOOP_COST (node) = 0;
+
+  /* Calculate the unrolling factor.  */
+  unrolling_factor = least_common_multiple (nunits, group_size) / group_size;
+       
+  /* Calculate the number of vector stmts to create based on the unrolling
+     factor (number of vectors is 1 if NUNITS >= GROUP_SIZE, and is
+     GROUP_SIZE / NUNITS otherwise).  */
+  ncopies_for_cost = unrolling_factor * group_size / nunits;
+  
+  load_permutation = VEC_alloc (int, heap, group_size * group_size); 
+  loads = VEC_alloc (slp_tree, heap, group_size); 
+
+  /* Build the tree for the SLP instance.  */
+  if (vect_build_slp_tree (loop_vinfo, &node, group_size, &inside_cost,  
+                          &outside_cost, ncopies_for_cost, &max_nunits,
+                           &load_permutation, &loads))
+    {
+      /* Create a new SLP instance.  */  
+      new_instance = XNEW (struct _slp_instance);
+      SLP_INSTANCE_TREE (new_instance) = node;
+      SLP_INSTANCE_GROUP_SIZE (new_instance) = group_size;
+      /* Calculate the unrolling factor based on the smallest type in the
+         loop.  */
+      if (max_nunits > nunits)
+        unrolling_factor = least_common_multiple (max_nunits, group_size)
+                           / group_size;
+
+      SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
+      SLP_INSTANCE_OUTSIDE_OF_LOOP_COST (new_instance) = outside_cost;
+      SLP_INSTANCE_INSIDE_OF_LOOP_COST (new_instance) = inside_cost;
+      SLP_INSTANCE_LOADS (new_instance) = loads;
+      SLP_INSTANCE_FIRST_LOAD_STMT (new_instance) = NULL;
+      SLP_INSTANCE_LOAD_PERMUTATION (new_instance) = load_permutation;
+      if (VEC_length (slp_tree, loads))
+        {
+          if (!vect_supported_load_permutation_p (new_instance, group_size,
+                                                  load_permutation)) 
+            {
+              if (vect_print_dump_info (REPORT_SLP))
+                {
+                  fprintf (vect_dump, "Build SLP failed: unsupported load "
+                                      "permutation ");
+                  print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
+                }
+
+              vect_free_slp_instance (new_instance);
+              return false;
             }
-         return false;
-       }
 
-      /* FORNOW: we handle only interleaving that is a power of 2.  */
-      if (exact_log2 (stride) == -1)
-       {
-         if (vect_print_dump_info (REPORT_DETAILS))
-           fprintf (vect_dump, "interleaving is not a power of 2");
-         return false;
-       }
-      DR_GROUP_SIZE (vinfo_for_stmt (stmt)) = stride;
-    }
-  return true;
-}
+          SLP_INSTANCE_FIRST_LOAD_STMT (new_instance)
+             = vect_find_first_load_in_slp_instance (new_instance);
+        }
+      else
+        VEC_free (int, heap, SLP_INSTANCE_LOAD_PERMUTATION (new_instance));
 
+      VEC_safe_push (slp_instance, heap, LOOP_VINFO_SLP_INSTANCES (loop_vinfo), 
+                    new_instance);
+      if (vect_print_dump_info (REPORT_SLP))
+       vect_print_slp_tree (node);
 
-/* Function vect_analyze_data_ref_accesses.
+      return true;
+    }
 
-   Analyze the access pattern of all the data references in the loop.
+  /* Failed to SLP.  */
+  /* Free the allocated memory.  */
+  vect_free_slp_tree (node);
+  VEC_free (int, heap, load_permutation);
+  VEC_free (slp_tree, heap, loads);
+   
+  if (slp_impossible)
+    return false;
 
-   FORNOW: the only access pattern that is considered vectorizable is a
-          simple step 1 (consecutive) access.
+  /* SLP failed for this instance, but it is still possible to SLP other stmts 
+     in the loop.  */
+  return true;
+}
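
   A worked example of the two formulas above, assuming V4SI vectors
   (nunits == 4) and a store group of size 2:

       unrolling_factor = least_common_multiple (4, 2) / 2  == 2
       ncopies_for_cost = unrolling_factor * 2 / 4          == 1

   i.e. two unrolled copies of the group fill exactly one vector store.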
 
-   FORNOW: handle only arrays and pointer accesses.  */
+
+/* Check if there are stmts in the loop that can be vectorized using SLP.
+   Build SLP trees of packed scalar stmts if SLP is possible.  */
 
 static bool
-vect_analyze_data_ref_accesses (loop_vec_info loop_vinfo)
+vect_analyze_slp (loop_vec_info loop_vinfo)
 {
   unsigned int i;
-  VEC (data_reference_p, heap) *datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
-  struct data_reference *dr;
+  VEC (gimple, heap) *strided_stores = LOOP_VINFO_STRIDED_STORES (loop_vinfo);
+  gimple store;
 
-  if (vect_print_dump_info (REPORT_DETAILS))
-    fprintf (vect_dump, "=== vect_analyze_data_ref_accesses ===");
+  if (vect_print_dump_info (REPORT_SLP))
+    fprintf (vect_dump, "=== vect_analyze_slp ===");
 
-  for (i = 0; VEC_iterate (data_reference_p, datarefs, i, dr); i++)
-    if (!vect_analyze_data_ref_access (dr))
+  for (i = 0; VEC_iterate (gimple, strided_stores, i, store); i++)
+    if (!vect_analyze_slp_instance (loop_vinfo, store))
       {
-       if (vect_print_dump_info (REPORT_UNVECTORIZED_LOOPS))
-         fprintf (vect_dump, "not vectorized: complicated access pattern.");
+       /* SLP failed. No instance can be SLPed in the loop.  */
+       if (vect_print_dump_info (REPORT_UNVECTORIZED_LOOPS))   
+         fprintf (vect_dump, "SLP failed.");
+
        return false;
       }
 
@@ -2019,6 +3412,86 @@ vect_analyze_data_ref_accesses (loop_vec_info loop_vinfo)
 }
 
 
+/* For each possible SLP instance decide whether to SLP it and calculate the
+   overall unrolling factor needed to SLP the loop.  */
+
+static void
+vect_make_slp_decision (loop_vec_info loop_vinfo)
+{
+  unsigned int i, unrolling_factor = 1;
+  VEC (slp_instance, heap) *slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
+  slp_instance instance;
+  int decided_to_slp = 0;
+
+  if (vect_print_dump_info (REPORT_SLP))
+    fprintf (vect_dump, "=== vect_make_slp_decision ===");
+
+  for (i = 0; VEC_iterate (slp_instance, slp_instances, i, instance); i++)
+    {
+      /* FORNOW: SLP if you can.  */
+      if (unrolling_factor < SLP_INSTANCE_UNROLLING_FACTOR (instance))
+       unrolling_factor = SLP_INSTANCE_UNROLLING_FACTOR (instance);
+
+      /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts. Later we 
+        call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and 
+        loop-based vectorization. Such stmts will be marked as HYBRID.  */
+      vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance), pure_slp, -1);
+      decided_to_slp++;
+    }
+
+  LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo) = unrolling_factor;
+
+  if (decided_to_slp && vect_print_dump_info (REPORT_SLP)) 
+    fprintf (vect_dump, "Decided to SLP %d instances. Unrolling factor %d", 
+            decided_to_slp, unrolling_factor);
+}
+
+
+/* Find stmts that must be both vectorized and SLPed (since they feed stmts that
+   can't be SLPed) in the tree rooted at NODE. Mark such stmts as HYBRID.  */
+
+static void
+vect_detect_hybrid_slp_stmts (slp_tree node)
+{
+  int i;
+  gimple stmt;
+  imm_use_iterator imm_iter;
+  gimple use_stmt;
+
+  if (!node)
+    return;
+
+  for (i = 0; VEC_iterate (gimple, SLP_TREE_SCALAR_STMTS (node), i, stmt); i++)
+    if (PURE_SLP_STMT (vinfo_for_stmt (stmt))
+       && TREE_CODE (gimple_op (stmt, 0)) == SSA_NAME)
+      FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, gimple_op (stmt, 0))
+       if (vinfo_for_stmt (use_stmt)
+           && !STMT_SLP_TYPE (vinfo_for_stmt (use_stmt))
+            && STMT_VINFO_RELEVANT (vinfo_for_stmt (use_stmt)))
+         vect_mark_slp_stmts (node, hybrid, i);
+
+  vect_detect_hybrid_slp_stmts (SLP_TREE_LEFT (node));
+  vect_detect_hybrid_slp_stmts (SLP_TREE_RIGHT (node));
+}
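
   A sketch of a stmt that ends up HYBRID (hypothetical loop): the loads
   feed an SLP store group, and one of them also feeds a reduction that is
   vectorized by the loop-based scheme.

       void
       hybrid_use (int *a, int *b, int n, int *out)
       {
         int i, sum = 0;
         for (i = 0; i < n; i += 2)
           {
             int t0 = b[i], t1 = b[i + 1];
             a[i]     = t0;   /* t0, t1 belong to an SLP instance.        */
             a[i + 1] = t1;
             sum += t0;       /* relevant non-SLP use of t0: its def is
                                 re-marked as hybrid.                     */
           }
         *out = sum;
       }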
+
+
+/* Find stmts that must be both vectorized and SLPed.  */
+
+static void
+vect_detect_hybrid_slp (loop_vec_info loop_vinfo)
+{
+  unsigned int i;
+  VEC (slp_instance, heap) *slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
+  slp_instance instance;
+
+  if (vect_print_dump_info (REPORT_SLP))
+    fprintf (vect_dump, "=== vect_detect_hybrid_slp ===");
+
+  for (i = 0; VEC_iterate (slp_instance, slp_instances, i, instance); i++)
+    vect_detect_hybrid_slp_stmts (SLP_INSTANCE_TREE (instance));
+}
+
+
 /* Function vect_analyze_data_refs.
 
   Find all the data references in the loop.
@@ -2055,8 +3528,10 @@ vect_analyze_data_refs (loop_vec_info loop_vinfo)
 
   for (i = 0; VEC_iterate (data_reference_p, datarefs, i, dr); i++)
     {
-      tree stmt;
+      gimple stmt;
       stmt_vec_info stmt_info;
+      basic_block bb;
+      tree base, offset, init; 
    
       if (!dr || !DR_REF (dr))
         {
@@ -2064,31 +3539,18 @@ vect_analyze_data_refs (loop_vec_info loop_vinfo)
            fprintf (vect_dump, "not vectorized: unhandled data-ref ");
           return false;
         }
-      /* Update DR field in stmt_vec_info struct.  */
+
       stmt = DR_STMT (dr);
       stmt_info = vinfo_for_stmt (stmt);
 
-      if (STMT_VINFO_DATA_REF (stmt_info))
-        {
-          if (vect_print_dump_info (REPORT_UNVECTORIZED_LOOPS))
-            {
-              fprintf (vect_dump,
-                       "not vectorized: more than one data ref in stmt: ");
-              print_generic_expr (vect_dump, stmt, TDF_SLIM);
-            }
-          return false;
-        }
-      STMT_VINFO_DATA_REF (stmt_info) = dr;
-     
       /* Check that analysis of the data-ref succeeded.  */
       if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
-          || !DR_STEP (dr))   
+          || !DR_STEP (dr))
         {
           if (vect_print_dump_info (REPORT_UNVECTORIZED_LOOPS))
             {
               fprintf (vect_dump, "not vectorized: data ref analysis failed ");
-              print_generic_expr (vect_dump, stmt, TDF_SLIM);
+              print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
             }
           return false;
         }
@@ -2110,7 +3572,129 @@ vect_analyze_data_refs (loop_vec_info loop_vinfo)
             }
           return false;
         }
-                       
+
+      base = unshare_expr (DR_BASE_ADDRESS (dr));
+      offset = unshare_expr (DR_OFFSET (dr));
+      init = unshare_expr (DR_INIT (dr));
+       
+      /* Update DR field in stmt_vec_info struct.  */
+      bb = gimple_bb (stmt);
+
+      /* If the dataref is in an inner-loop of the loop that is considered
+        for vectorization, we also want to analyze the access relative to
+        the outer-loop (DR contains information only relative to the 
+        inner-most enclosing loop).  We do that by building a reference to the
+        first location accessed by the inner-loop, and analyze it relative to
+        the outer-loop.  */    
+      if (nested_in_vect_loop_p (loop, stmt)) 
+       {
+         tree outer_step, outer_base, outer_init;
+         HOST_WIDE_INT pbitsize, pbitpos;
+         tree poffset;
+         enum machine_mode pmode;
+         int punsignedp, pvolatilep;
+         affine_iv base_iv, offset_iv;
+         tree dinit;
+
+         /* Build a reference to the first location accessed by the 
+            inner-loop: *(BASE+INIT). (The first location is actually
+            BASE+INIT+OFFSET, but we add OFFSET separately later).  */
+          tree inner_base = build_fold_indirect_ref
+                                (fold_build2 (POINTER_PLUS_EXPR,
+                                              TREE_TYPE (base), base, 
+                                              fold_convert (sizetype, init)));
+
+         if (vect_print_dump_info (REPORT_DETAILS))
+           {
+             fprintf (vect_dump, "analyze in outer-loop: ");
+             print_generic_expr (vect_dump, inner_base, TDF_SLIM);
+           }
+
+         outer_base = get_inner_reference (inner_base, &pbitsize, &pbitpos, 
+                         &poffset, &pmode, &punsignedp, &pvolatilep, false);
+         gcc_assert (outer_base != NULL_TREE);
+
+         if (pbitpos % BITS_PER_UNIT != 0)
+           {
+             if (vect_print_dump_info (REPORT_DETAILS))
+               fprintf (vect_dump, "failed: bit offset alignment.\n");
+             return false;
+           }
+
+         outer_base = build_fold_addr_expr (outer_base);
+         if (!simple_iv (loop, stmt, outer_base, &base_iv, false))
+           {
+             if (vect_print_dump_info (REPORT_DETAILS))
+               fprintf (vect_dump, "failed: evolution of base is not affine.\n");
+             return false;
+           }
+
+         if (offset)
+           {
+             if (poffset)
+               poffset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset), offset, poffset);
+             else
+               poffset = offset;
+           }
+
+         if (!poffset)
+           {
+             offset_iv.base = ssize_int (0);
+             offset_iv.step = ssize_int (0);
+           }
+         else if (!simple_iv (loop, stmt, poffset, &offset_iv, false))
+           {
+             if (vect_print_dump_info (REPORT_DETAILS))
+               fprintf (vect_dump, "evolution of offset is not affine.\n");
+             return false;
+           }
+
+         outer_init = ssize_int (pbitpos / BITS_PER_UNIT);
+         split_constant_offset (base_iv.base, &base_iv.base, &dinit);
+         outer_init =  size_binop (PLUS_EXPR, outer_init, dinit);
+         split_constant_offset (offset_iv.base, &offset_iv.base, &dinit);
+         outer_init =  size_binop (PLUS_EXPR, outer_init, dinit);
+
+         outer_step = size_binop (PLUS_EXPR,
+                               fold_convert (ssizetype, base_iv.step),
+                               fold_convert (ssizetype, offset_iv.step));
+
+         STMT_VINFO_DR_STEP (stmt_info) = outer_step;
+         /* FIXME: Use canonicalize_base_object_address (base_iv.base); */
+         STMT_VINFO_DR_BASE_ADDRESS (stmt_info) = base_iv.base; 
+         STMT_VINFO_DR_INIT (stmt_info) = outer_init;
+         STMT_VINFO_DR_OFFSET (stmt_info) = 
+                               fold_convert (ssizetype, offset_iv.base);
+         STMT_VINFO_DR_ALIGNED_TO (stmt_info) = 
+                               size_int (highest_pow2_factor (offset_iv.base));
+
+         if (vect_print_dump_info (REPORT_DETAILS))
+           {
+             fprintf (vect_dump, "\touter base_address: ");
+             print_generic_expr (vect_dump, STMT_VINFO_DR_BASE_ADDRESS (stmt_info), TDF_SLIM);
+             fprintf (vect_dump, "\n\touter offset from base address: ");
+             print_generic_expr (vect_dump, STMT_VINFO_DR_OFFSET (stmt_info), TDF_SLIM);
+             fprintf (vect_dump, "\n\touter constant offset from base address: ");
+             print_generic_expr (vect_dump, STMT_VINFO_DR_INIT (stmt_info), TDF_SLIM);
+             fprintf (vect_dump, "\n\touter step: ");
+             print_generic_expr (vect_dump, STMT_VINFO_DR_STEP (stmt_info), TDF_SLIM);
+             fprintf (vect_dump, "\n\touter aligned to: ");
+             print_generic_expr (vect_dump, STMT_VINFO_DR_ALIGNED_TO (stmt_info), TDF_SLIM);
+           }
+       }
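      /* Illustrative sketch (an assumption for exposition, not part of the
         patch): for a nested access

           for (i = 0; i < N; i++)      <-- loop considered for vectorization
             for (j = 0; j < M; j++)
               ... = a[i][j];

         DR describes the access relative to the j-loop (step is the element
         size), whereas the fields just computed describe it relative to the
         i-loop: the first location *(BASE+INIT) is a[i][0], and
         STMT_VINFO_DR_STEP becomes M * element_size.  */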
+
+      if (STMT_VINFO_DATA_REF (stmt_info))
+        {
+          if (vect_print_dump_info (REPORT_UNVECTORIZED_LOOPS))
+            {
+              fprintf (vect_dump,
+                       "not vectorized: more than one data ref in stmt: ");
+              print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
+            }
+          return false;
+        }
+      STMT_VINFO_DATA_REF (stmt_info) = dr;
+     
       /* Set vectype for STMT.  */
       scalar_type = TREE_TYPE (DR_REF (dr));
       STMT_VINFO_VECTYPE (stmt_info) =
@@ -2121,7 +3705,7 @@ vect_analyze_data_refs (loop_vec_info loop_vinfo)
             {
               fprintf (vect_dump,
                        "not vectorized: no vectype for stmt: ");
-              print_generic_expr (vect_dump, stmt, TDF_SLIM);
+              print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
               fprintf (vect_dump, " scalar_type: ");
               print_generic_expr (vect_dump, scalar_type, TDF_DETAILS);
             }
@@ -2140,7 +3724,7 @@ vect_analyze_data_refs (loop_vec_info loop_vinfo)
    Mark STMT as "relevant for vectorization" and add it to WORKLIST.  */
 
 static void
-vect_mark_relevant (VEC(tree,heap) **worklist, tree stmt,
+vect_mark_relevant (VEC(gimple,heap) **worklist, gimple stmt,
                    enum vect_relevant relevant, bool live_p)
 {
   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
@@ -2152,15 +3736,17 @@ vect_mark_relevant (VEC(tree,heap) **worklist, tree stmt,
 
   if (STMT_VINFO_IN_PATTERN_P (stmt_info))
     {
-      tree pattern_stmt;
+      gimple pattern_stmt;
 
       /* This is the last stmt in a sequence that was detected as a 
          pattern that can potentially be vectorized.  Don't mark the stmt
-         as relevant/live because it's not going to vectorized.
+         as relevant/live because it's not going to be vectorized.
          Instead mark the pattern-stmt that replaces it.  */
+
+      pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
+
       if (vect_print_dump_info (REPORT_DETAILS))
         fprintf (vect_dump, "last stmt in pattern. don't mark relevant/live.");
-      pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
       stmt_info = vinfo_for_stmt (pattern_stmt);
       gcc_assert (STMT_VINFO_RELATED_STMT (stmt_info) == stmt);
       save_relevant = STMT_VINFO_RELEVANT (stmt_info);
@@ -2180,7 +3766,7 @@ vect_mark_relevant (VEC(tree,heap) **worklist, tree stmt,
       return;
     }
 
-  VEC_safe_push (tree, heap, *worklist, stmt);
+  VEC_safe_push (gimple, heap, *worklist, stmt);
 }
 
 
@@ -2197,7 +3783,7 @@ vect_mark_relevant (VEC(tree,heap) **worklist, tree stmt,
    CHECKME: what other side effects would the vectorizer allow?  */
 
 static bool
-vect_stmt_relevant_p (tree stmt, loop_vec_info loop_vinfo,
+vect_stmt_relevant_p (gimple stmt, loop_vec_info loop_vinfo,
                      enum vect_relevant *relevant, bool *live_p)
 {
   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
@@ -2210,11 +3796,12 @@ vect_stmt_relevant_p (tree stmt, loop_vec_info loop_vinfo,
   *live_p = false;
 
   /* cond stmt other than loop exit cond.  */
-  if (is_ctrl_stmt (stmt) && (stmt != LOOP_VINFO_EXIT_COND (loop_vinfo)))
+  if (is_ctrl_stmt (stmt) 
+      && STMT_VINFO_TYPE (vinfo_for_stmt (stmt)) != loop_exit_ctrl_vec_info_type) 
     *relevant = vect_used_in_loop;
 
   /* changing memory.  */
-  if (TREE_CODE (stmt) != PHI_NODE)
+  if (gimple_code (stmt) != GIMPLE_PHI)
     if (!ZERO_SSA_OPERANDS (stmt, SSA_OP_VIRTUAL_DEFS))
       {
        if (vect_print_dump_info (REPORT_DETAILS))
@@ -2227,7 +3814,7 @@ vect_stmt_relevant_p (tree stmt, loop_vec_info loop_vinfo,
     {
       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, DEF_FROM_PTR (def_p))
        {
-         basic_block bb = bb_for_stmt (USE_STMT (use_p));
+         basic_block bb = gimple_bb (USE_STMT (use_p));
          if (!flow_bb_inside_loop_p (loop, bb))
            {
              if (vect_print_dump_info (REPORT_DETAILS))
@@ -2235,7 +3822,7 @@ vect_stmt_relevant_p (tree stmt, loop_vec_info loop_vinfo,
 
              /* We expect all such uses to be in the loop exit phis
                 (because of loop-closed SSA form).  */
-             gcc_assert (TREE_CODE (USE_STMT (use_p)) == PHI_NODE);
+             gcc_assert (gimple_code (USE_STMT (use_p)) == GIMPLE_PHI);
              gcc_assert (bb == single_exit (loop)->dest);
 
               *live_p = true;
@@ -2253,8 +3840,8 @@ vect_stmt_relevant_p (tree stmt, loop_vec_info loop_vinfo,
    Inputs:
    - a USE in STMT in a loop represented by LOOP_VINFO
    - LIVE_P, RELEVANT - enum values to be set in the STMT_VINFO of the stmt 
-     that defined USE. This is dont by calling mark_relevant and passing it
-     the WORKLIST (to add DEF_STMT to the WORKlist in case itis relevant). 
+     that defined USE. This is done by calling mark_relevant and passing it
+     the WORKLIST (to add DEF_STMT to the WORKLIST in case it is relevant).
 
    Outputs:
    Generally, LIVE_P and RELEVANT are used to define the liveness and
@@ -2267,18 +3854,21 @@ vect_stmt_relevant_p (tree stmt, loop_vec_info loop_vinfo,
    of the respective DEF_STMT is left unchanged.
    - case 2: If STMT is a reduction phi and DEF_STMT is a reduction stmt, we 
    skip DEF_STMT because it has already been processed.
+   - case 3: If DEF_STMT and STMT are in different nests, then "relevant" will
+   be modified accordingly.
 
    Return true if everything is as expected. Return false otherwise.  */
 
 static bool
-process_use (tree stmt, tree use, loop_vec_info loop_vinfo, bool live_p, 
-            enum vect_relevant relevant, VEC(tree,heap) **worklist)
+process_use (gimple stmt, tree use, loop_vec_info loop_vinfo, bool live_p, 
+            enum vect_relevant relevant, VEC(gimple,heap) **worklist)
 {
   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
   stmt_vec_info dstmt_vinfo;
-  basic_block def_bb;
-  tree def, def_stmt;
+  basic_block bb, def_bb;
+  tree def;
+  gimple def_stmt;
   enum vect_def_type dt;
 
   /* case 1: we are only interested in uses that need to be vectorized.  Uses 
@@ -2293,22 +3883,32 @@ process_use (tree stmt, tree use, loop_vec_info loop_vinfo, bool live_p,
       return false;
     }
 
-  if (!def_stmt || IS_EMPTY_STMT (def_stmt))
+  if (!def_stmt || gimple_nop_p (def_stmt))
     return true;
 
-  def_bb = bb_for_stmt (def_stmt);
+  def_bb = gimple_bb (def_stmt);
   if (!flow_bb_inside_loop_p (loop, def_bb))
-    return true;
+    {
+      if (vect_print_dump_info (REPORT_DETAILS))
+       fprintf (vect_dump, "def_stmt is out of loop.");
+      return true;
+    }
 
-  /* case 2: A reduction phi defining a reduction stmt (DEF_STMT). DEF_STMT 
-     must have already been processed, so we just check that everything is as 
-     expected, and we are done.  */
+  /* case 2: A reduction phi (STMT) defined by a reduction stmt (DEF_STMT). 
+     DEF_STMT must have already been processed, because this should be the 
+     only way that STMT, which is a reduction-phi, was put in the worklist, 
+     as there should be no other uses for DEF_STMT in the loop.  So we just 
+     check that everything is as expected, and we are done.  */
   dstmt_vinfo = vinfo_for_stmt (def_stmt);
-  if (TREE_CODE (stmt) == PHI_NODE
+  bb = gimple_bb (stmt);
+  if (gimple_code (stmt) == GIMPLE_PHI
       && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def
-      && TREE_CODE (def_stmt) != PHI_NODE
-      && STMT_VINFO_DEF_TYPE (dstmt_vinfo) == vect_reduction_def)
+      && gimple_code (def_stmt) != GIMPLE_PHI
+      && STMT_VINFO_DEF_TYPE (dstmt_vinfo) == vect_reduction_def
+      && bb->loop_father == def_bb->loop_father)
     {
+      if (vect_print_dump_info (REPORT_DETAILS))
+       fprintf (vect_dump, "reduc-stmt defining reduc-phi in the same nest.");
       if (STMT_VINFO_IN_PATTERN_P (dstmt_vinfo))
        dstmt_vinfo = vinfo_for_stmt (STMT_VINFO_RELATED_STMT (dstmt_vinfo));
       gcc_assert (STMT_VINFO_RELEVANT (dstmt_vinfo) < vect_used_by_reduction);
@@ -2317,6 +3917,73 @@ process_use (tree stmt, tree use, loop_vec_info loop_vinfo, bool live_p,
       return true;
     }
 
+  /* case 3a: outer-loop stmt defining an inner-loop stmt:
+       outer-loop-header-bb:
+               d = def_stmt
+       inner-loop:
+               stmt # use (d)
+       outer-loop-tail-bb:
+               ...               */
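+  /* At the source level this corresponds to, e.g. (an editorial sketch;
+     the names are illustrative):
+
+         for (i = 0; i < N; i++)
+           {
+             d = b[i];                  <-- def_stmt (outer loop)
+             for (j = 0; j < M; j++)
+               a[i][j] = d + c[j];      <-- stmt using d (inner loop)
+           }                                                            */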
+  if (flow_loop_nested_p (def_bb->loop_father, bb->loop_father))
+    {
+      if (vect_print_dump_info (REPORT_DETAILS))
+       fprintf (vect_dump, "outer-loop def-stmt defining inner-loop stmt.");
+      switch (relevant)
+       {
+       case vect_unused_in_loop:
+         relevant = (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def) ?
+                       vect_used_by_reduction : vect_unused_in_loop;
+         break;
+       case vect_used_in_outer_by_reduction:
+         relevant = vect_used_by_reduction;
+         break;
+       case vect_used_in_outer:
+         relevant = vect_used_in_loop;
+         break;
+       case vect_used_by_reduction: 
+       case vect_used_in_loop:
+         break;
+
+       default:
+         gcc_unreachable ();
+       }   
+    }
+
+  /* case 3b: inner-loop stmt defining an outer-loop stmt:
+       outer-loop-header-bb:
+               ...
+       inner-loop:
+               d = def_stmt
+       outer-loop-tail-bb:
+               stmt # use (d)          */
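+  /* At the source level this corresponds to, e.g. (an editorial sketch;
+     the names are illustrative):
+
+         for (i = 0; i < N; i++)
+           {
+             s = 0;
+             for (j = 0; j < M; j++)
+               s = s + a[i][j];         <-- def_stmt (inner loop)
+             b[i] = s;                  <-- stmt using s (outer-loop tail)
+           }                                                            */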
+  else if (flow_loop_nested_p (bb->loop_father, def_bb->loop_father))
+    {
+      if (vect_print_dump_info (REPORT_DETAILS))
+       fprintf (vect_dump, "inner-loop def-stmt defining outer-loop stmt.");
+      switch (relevant)
+        {
+        case vect_unused_in_loop:
+          relevant = (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def) ?
+                        vect_used_in_outer_by_reduction : vect_unused_in_loop;
+          break;
+
+        case vect_used_in_outer_by_reduction:
+        case vect_used_in_outer:
+          break;
+
+        case vect_used_by_reduction:
+          relevant = vect_used_in_outer_by_reduction;
+          break;
+
+        case vect_used_in_loop:
+          relevant = vect_used_in_outer;
+          break;
+
+        default:
+          gcc_unreachable ();
+        }
+    }
+
   vect_mark_relevant (worklist, def_stmt, relevant, live_p);
   return true;
 }
@@ -2341,47 +4008,47 @@ process_use (tree stmt, tree use, loop_vec_info loop_vinfo, bool live_p,
 static bool
 vect_mark_stmts_to_be_vectorized (loop_vec_info loop_vinfo)
 {
-  VEC(tree,heap) *worklist;
+  VEC(gimple,heap) *worklist;
   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
   unsigned int nbbs = loop->num_nodes;
-  block_stmt_iterator si;
-  tree stmt;
-  stmt_ann_t ann;
+  gimple_stmt_iterator si;
+  gimple stmt;
   unsigned int i;
   stmt_vec_info stmt_vinfo;
   basic_block bb;
-  tree phi;
+  gimple phi;
   bool live_p;
   enum vect_relevant relevant;
 
   if (vect_print_dump_info (REPORT_DETAILS))
     fprintf (vect_dump, "=== vect_mark_stmts_to_be_vectorized ===");
 
-  worklist = VEC_alloc (tree, heap, 64);
+  worklist = VEC_alloc (gimple, heap, 64);
 
   /* 1. Init worklist.  */
   for (i = 0; i < nbbs; i++)
     {
       bb = bbs[i];
-      for (phi = phi_nodes (bb); phi; phi = PHI_CHAIN (phi))
+      for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
        { 
+         phi = gsi_stmt (si);
          if (vect_print_dump_info (REPORT_DETAILS))
            {
              fprintf (vect_dump, "init: phi relevant? ");
-             print_generic_expr (vect_dump, phi, TDF_SLIM);
+             print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
            }
 
          if (vect_stmt_relevant_p (phi, loop_vinfo, &relevant, &live_p))
            vect_mark_relevant (&worklist, phi, relevant, live_p);
        }
-      for (si = bsi_start (bb); !bsi_end_p (si); bsi_next (&si))
+      for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
        {
-         stmt = bsi_stmt (si);
+         stmt = gsi_stmt (si);
          if (vect_print_dump_info (REPORT_DETAILS))
            {
              fprintf (vect_dump, "init: stmt relevant? ");
-             print_generic_expr (vect_dump, stmt, TDF_SLIM);
+             print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
            } 
 
          if (vect_stmt_relevant_p (stmt, loop_vinfo, &relevant, &live_p))
@@ -2390,22 +4057,21 @@ vect_mark_stmts_to_be_vectorized (loop_vec_info loop_vinfo)
     }
 
   /* 2. Process_worklist */
-  while (VEC_length (tree, worklist) > 0)
+  while (VEC_length (gimple, worklist) > 0)
     {
       use_operand_p use_p;
       ssa_op_iter iter;
 
-      stmt = VEC_pop (tree, worklist);
+      stmt = VEC_pop (gimple, worklist);
       if (vect_print_dump_info (REPORT_DETAILS))
        {
           fprintf (vect_dump, "worklist: examine stmt: ");
-          print_generic_expr (vect_dump, stmt, TDF_SLIM);
+          print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
        }
 
       /* Examine the USEs of STMT. For each USE, mark the stmt that defines it 
         (DEF_STMT) as relevant/irrelevant and live/dead according to the 
         liveness and relevance properties of STMT.  */
-      ann = stmt_ann (stmt);
       stmt_vinfo = vinfo_for_stmt (stmt);
       relevant = STMT_VINFO_RELEVANT (stmt_vinfo);
       live_p = STMT_VINFO_LIVE_P (stmt_vinfo);
@@ -2425,33 +4091,47 @@ vect_mark_stmts_to_be_vectorized (loop_vec_info loop_vinfo)
         identify stmts that are used solely by a reduction, and therefore the 
         order of the results that they produce does not have to be kept.
 
-         Reduction phis are expected to be used by a reduction stmt;  Other 
-        reduction stmts are expected to be unused in the loop.  These are the 
-        expected values of "relevant" for reduction phis/stmts in the loop:
+        Reduction phis are expected to be used by a reduction stmt, or by
+        a stmt in an outer loop;  Other reduction stmts are expected to be
+        unused in the loop, and possibly used by a stmt in an outer loop.
+        Here are the expected values of "relevant" for reduction phis/stmts:
 
         relevance:                             phi     stmt
         vect_unused_in_loop                            ok
+        vect_used_in_outer_by_reduction        ok      ok
+        vect_used_in_outer                     ok      ok
         vect_used_by_reduction                 ok
         vect_used_in_loop                                                */
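+      /* For example (an editorial sketch; the names are illustrative),
+        given the reduction
+
+            sum_1 = PHI <0, sum_2>       <-- reduction phi
+            ...
+            sum_2 = sum_1 + a[i];        <-- reduction stmt
+
+        the phi is used only by the reduction stmt (vect_used_by_reduction),
+        and the reduction stmt has no scalar uses inside the loop
+        (vect_unused_in_loop); the vect_used_in_outer* values cover results
+        that are also used in an enclosing loop.  */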
 
       if (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def)
         {
-         switch (relevant)
+         enum vect_relevant tmp_relevant = relevant;
+         switch (tmp_relevant)
            {
            case vect_unused_in_loop:
-             gcc_assert (TREE_CODE (stmt) != PHI_NODE);
+             gcc_assert (gimple_code (stmt) != GIMPLE_PHI);
+             relevant = vect_used_by_reduction;
+             break;
+
+           case vect_used_in_outer_by_reduction:
+           case vect_used_in_outer:
+             gcc_assert (gimple_code (stmt) != GIMPLE_ASSIGN
+                          || (gimple_assign_rhs_code (stmt) != WIDEN_SUM_EXPR
+                              && (gimple_assign_rhs_code (stmt)
+                                  != DOT_PROD_EXPR)));
              break;
+
            case vect_used_by_reduction:
-             if (TREE_CODE (stmt) == PHI_NODE)
+             if (gimple_code (stmt) == GIMPLE_PHI)
                break;
+             /* fall through */
            case vect_used_in_loop:
            default:
              if (vect_print_dump_info (REPORT_DETAILS))
                fprintf (vect_dump, "unsupported use of reduction.");
-             VEC_free (tree, heap, worklist);
+             VEC_free (gimple, heap, worklist);
              return false;
            }
-         relevant = vect_used_by_reduction;
          live_p = false;       
        }
 
@@ -2460,13 +4140,13 @@ vect_mark_stmts_to_be_vectorized (loop_vec_info loop_vinfo)
          tree op = USE_FROM_PTR (use_p);
          if (!process_use (stmt, op, loop_vinfo, live_p, relevant, &worklist))
            {
-             VEC_free (tree, heap, worklist);
+             VEC_free (gimple, heap, worklist);
              return false;
            }
        }
     } /* while worklist */
 
-  VEC_free (tree, heap, worklist);
+  VEC_free (gimple, heap, worklist);
   return true;
 }
 
@@ -2485,22 +4165,24 @@ vect_can_advance_ivs_p (loop_vec_info loop_vinfo)
 {
   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
   basic_block bb = loop->header;
-  tree phi;
+  gimple phi;
+  gimple_stmt_iterator gsi;
 
   /* Analyze phi functions of the loop header.  */
 
   if (vect_print_dump_info (REPORT_DETAILS))
     fprintf (vect_dump, "vect_can_advance_ivs_p:");
 
-  for (phi = phi_nodes (bb); phi; phi = PHI_CHAIN (phi))
+  for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
     {
       tree access_fn = NULL;
       tree evolution_part;
 
+      phi = gsi_stmt (gsi);
       if (vect_print_dump_info (REPORT_DETAILS))
        {
           fprintf (vect_dump, "Analyze phi: ");
-          print_generic_expr (vect_dump, phi, TDF_SLIM);
+          print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
        }
 
       /* Skip virtual phis.  The data dependences that are associated with
@@ -2567,7 +4249,7 @@ vect_can_advance_ivs_p (loop_vec_info loop_vinfo)
    can be constructed, place it in NUMBER_OF_ITERATIONS.
    Return the loop exit condition.  */
 
-static tree
+static gimple
 vect_get_loop_niters (struct loop *loop, tree *number_of_iterations)
 {
   tree niters;
@@ -2593,47 +4275,178 @@ vect_get_loop_niters (struct loop *loop, tree *number_of_iterations)
 }
 
 
+/* Function vect_analyze_loop_1.
+
+   Apply a set of analyses on LOOP, and create a loop_vec_info struct
+   for it. The different analyses will record information in the
+   loop_vec_info struct.  This is a subset of the analyses applied in
+   vect_analyze_loop, to be applied on an inner-loop nested in the loop
+   that is now considered for (outer-loop) vectorization.  */
+
+static loop_vec_info
+vect_analyze_loop_1 (struct loop *loop)
+{
+  loop_vec_info loop_vinfo;
+
+  if (vect_print_dump_info (REPORT_DETAILS))
+    fprintf (vect_dump, "===== analyze_loop_nest_1 =====");
+
+  /* Check the CFG characteristics of the loop (nesting, entry/exit, etc.).  */
+
+  loop_vinfo = vect_analyze_loop_form (loop);
+  if (!loop_vinfo)
+    {
+      if (vect_print_dump_info (REPORT_DETAILS))
+        fprintf (vect_dump, "bad inner-loop form.");
+      return NULL;
+    }
+
+  return loop_vinfo;
+}
+
+
 /* Function vect_analyze_loop_form.
 
-   Verify the following restrictions (some may be relaxed in the future):
-   - it's an inner-most loop
-   - number of BBs = 2 (which are the loop header and the latch)
+   Verify that certain CFG restrictions hold, including:
    - the loop has a pre-header
    - the loop has a single entry and exit
    - the loop exit condition is simple enough, and the number of iterations
      can be analyzed (a countable loop).  */
 
-static loop_vec_info
+loop_vec_info
 vect_analyze_loop_form (struct loop *loop)
 {
   loop_vec_info loop_vinfo;
-  tree loop_cond;
+  gimple loop_cond;
   tree number_of_iterations = NULL;
+  loop_vec_info inner_loop_vinfo = NULL;
 
   if (vect_print_dump_info (REPORT_DETAILS))
     fprintf (vect_dump, "=== vect_analyze_loop_form ===");
 
-  if (loop->inner)
+  /* Different restrictions apply when we are considering an inner-most loop,
+     vs. an outer (nested) loop.  
+     (FORNOW. May want to relax some of these restrictions in the future).  */
+
+  if (!loop->inner)
     {
-      if (vect_print_dump_info (REPORT_OUTER_LOOPS))
-        fprintf (vect_dump, "not vectorized: nested loop.");
+      /* Inner-most loop.  We currently require that the number of BBs is 
+        exactly 2 (the header and latch).  Vectorizable inner-most loops 
+        look like this:
+
+                        (pre-header)
+                           |
+                          header <--------+
+                           | |            |
+                           | +--> latch --+
+                           |
+                        (exit-bb)  */
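+
+      /* A minimal C source example that yields this form (an editorial
+        sketch; the names are illustrative):
+
+            for (i = 0; i < n; i++)
+              a[i] = b[i] + c[i];
+
+        The body and the exit test typically end up together in the header,
+        with an (almost) empty latch.  */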
+
+      if (loop->num_nodes != 2)
+        {
+          if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
+            fprintf (vect_dump, "not vectorized: too many BBs in loop.");
+          return NULL;
+        }
+
+      if (empty_block_p (loop->header))
+    {
+      if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
+        fprintf (vect_dump, "not vectorized: empty loop.");
       return NULL;
     }
+    }
+  else
+    {
+      struct loop *innerloop = loop->inner;
+      edge backedge, entryedge;
+
+      /* Nested loop. We currently require that the loop is doubly-nested,
+        contains a single inner loop, and the number of BBs is exactly 5. 
+        Vectorizable outer-loops look like this:
+
+                       (pre-header)
+                          |
+                         header <---+
+                          |         |
+                         inner-loop |
+                          |         |
+                         tail ------+
+                          | 
+                       (exit-bb)
+
+        The inner-loop has the properties expected of inner-most loops
+        as described above.  */
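+
+      /* For example (an editorial sketch; the names are illustrative), the
+        doubly-nested loop
+
+            for (i = 0; i < n; i++)
+              for (j = 0; j < m; j++)
+                a[i][j] *= 2;
+
+        matches this form, provided the inner-loop count m is invariant in
+        the outer loop (checked below).  */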
+
+      if (innerloop->inner || innerloop->next)
+       {
+         if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
+           fprintf (vect_dump, "not vectorized: multiple nested loops.");
+         return NULL;
+       }
+
+      /* Analyze the inner-loop.  */
+      inner_loop_vinfo = vect_analyze_loop_1 (loop->inner);
+      if (!inner_loop_vinfo)
+       {
+         if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
+           fprintf (vect_dump, "not vectorized: bad inner loop.");
+         return NULL;
+       }
+
+      if (!expr_invariant_in_loop_p (loop,
+                                       LOOP_VINFO_NITERS (inner_loop_vinfo)))
+       {
+         if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
+           fprintf (vect_dump,
+                    "not vectorized: inner-loop count not invariant.");
+         destroy_loop_vec_info (inner_loop_vinfo, true);
+         return NULL;
+       }
+
+      if (loop->num_nodes != 5) 
+        {
+         if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
+           fprintf (vect_dump, "not vectorized: too many BBs in loop.");
+         destroy_loop_vec_info (inner_loop_vinfo, true);
+         return NULL;
+        }
+
+      gcc_assert (EDGE_COUNT (innerloop->header->preds) == 2);
+      backedge = EDGE_PRED (innerloop->header, 1);       
+      entryedge = EDGE_PRED (innerloop->header, 0);
+      if (EDGE_PRED (innerloop->header, 0)->src == innerloop->latch)
+       {
+         backedge = EDGE_PRED (innerloop->header, 0);
+         entryedge = EDGE_PRED (innerloop->header, 1); 
+       }
+       
+      if (entryedge->src != loop->header
+         || !single_exit (innerloop)
+         || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
+       {
+         if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
+           fprintf (vect_dump, "not vectorized: unsupported outerloop form.");
+         destroy_loop_vec_info (inner_loop_vinfo, true);
+         return NULL;
+       }
+
+      if (vect_print_dump_info (REPORT_DETAILS))
+        fprintf (vect_dump, "Considering outer-loop vectorization.");
+    }
   
   if (!single_exit (loop) 
-      || loop->num_nodes != 2
       || EDGE_COUNT (loop->header->preds) != 2)
     {
       if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
         {
           if (!single_exit (loop))
             fprintf (vect_dump, "not vectorized: multiple exits.");
-          else if (loop->num_nodes != 2)
-            fprintf (vect_dump, "not vectorized: too many BBs in loop.");
           else if (EDGE_COUNT (loop->header->preds) != 2)
             fprintf (vect_dump, "not vectorized: too many incoming edges.");
         }
-
+      if (inner_loop_vinfo)
+       destroy_loop_vec_info (inner_loop_vinfo, true);
       return NULL;
     }
 
@@ -2646,6 +4459,8 @@ vect_analyze_loop_form (struct loop *loop)
     {
       if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
         fprintf (vect_dump, "not vectorized: unexpected loop form.");
+      if (inner_loop_vinfo)
+       destroy_loop_vec_info (inner_loop_vinfo, true);
       return NULL;
     }
 
@@ -2663,22 +4478,19 @@ vect_analyze_loop_form (struct loop *loop)
        {
          if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
            fprintf (vect_dump, "not vectorized: abnormal loop exit edge.");
+         if (inner_loop_vinfo)
+           destroy_loop_vec_info (inner_loop_vinfo, true);
          return NULL;
        }
     }
 
-  if (empty_block_p (loop->header))
-    {
-      if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
-        fprintf (vect_dump, "not vectorized: empty loop.");
-      return NULL;
-    }
-
   loop_cond = vect_get_loop_niters (loop, &number_of_iterations);
   if (!loop_cond)
     {
       if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
        fprintf (vect_dump, "not vectorized: complicated exit condition.");
+      if (inner_loop_vinfo)
+       destroy_loop_vec_info (inner_loop_vinfo, true);
       return NULL;
     }
   
@@ -2687,6 +4499,8 @@ vect_analyze_loop_form (struct loop *loop)
       if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
        fprintf (vect_dump, 
                 "not vectorized: number of iterations cannot be computed.");
+      if (inner_loop_vinfo)
+       destroy_loop_vec_info (inner_loop_vinfo, true);
       return NULL;
     }
 
@@ -2694,7 +4508,9 @@ vect_analyze_loop_form (struct loop *loop)
     {
       if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
         fprintf (vect_dump, "Infinite number of iterations.");
-      return false;
+      if (inner_loop_vinfo)
+       destroy_loop_vec_info (inner_loop_vinfo, true);
+      return NULL;
     }
 
   if (!NITERS_KNOWN_P (number_of_iterations))
@@ -2709,12 +4525,20 @@ vect_analyze_loop_form (struct loop *loop)
     {
       if (vect_print_dump_info (REPORT_UNVECTORIZED_LOOPS))
         fprintf (vect_dump, "not vectorized: number of iterations = 0.");
+      if (inner_loop_vinfo)
+        destroy_loop_vec_info (inner_loop_vinfo, false);
       return NULL;
     }
 
   loop_vinfo = new_loop_vec_info (loop);
   LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
-  LOOP_VINFO_EXIT_COND (loop_vinfo) = loop_cond;
+  LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
+
+  STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond)) = loop_exit_ctrl_vec_info_type;
+
+  /* CHECKME: May want to keep it around in the future.  */
+  if (inner_loop_vinfo)
+    destroy_loop_vec_info (inner_loop_vinfo, false);
 
   gcc_assert (!loop->aux);
   loop->aux = loop_vinfo;
@@ -2736,6 +4560,15 @@ vect_analyze_loop (struct loop *loop)
   if (vect_print_dump_info (REPORT_DETAILS))
     fprintf (vect_dump, "===== analyze_loop_nest =====");
 
+  if (loop_outer (loop) 
+      && loop_vec_info_for_loop (loop_outer (loop))
+      && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
+    {
+      if (vect_print_dump_info (REPORT_DETAILS))
+       fprintf (vect_dump, "outer-loop already vectorized.");
+      return NULL;
+    }
+
   /* Check the CFG characteristics of the loop (nesting, entry/exit, etc.).  */
 
   loop_vinfo = vect_analyze_loop_form (loop);
@@ -2757,7 +4590,7 @@ vect_analyze_loop (struct loop *loop)
     {
       if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "bad data references.");
-      destroy_loop_vec_info (loop_vinfo);
+      destroy_loop_vec_info (loop_vinfo, true);
       return NULL;
     }
 
@@ -2775,7 +4608,7 @@ vect_analyze_loop (struct loop *loop)
     {
       if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "unexpected pattern.");
-      destroy_loop_vec_info (loop_vinfo);
+      destroy_loop_vec_info (loop_vinfo, true);
       return NULL;
     }
 
@@ -2787,7 +4620,7 @@ vect_analyze_loop (struct loop *loop)
     {
       if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "bad data alignment.");
-      destroy_loop_vec_info (loop_vinfo);
+      destroy_loop_vec_info (loop_vinfo, true);
       return NULL;
     }
 
@@ -2796,7 +4629,7 @@ vect_analyze_loop (struct loop *loop)
     {
       if (vect_print_dump_info (REPORT_DETAILS))
         fprintf (vect_dump, "can't determine vectorization factor.");
-      destroy_loop_vec_info (loop_vinfo);
+      destroy_loop_vec_info (loop_vinfo, true);
       return NULL;
     }
 
@@ -2808,7 +4641,7 @@ vect_analyze_loop (struct loop *loop)
     {
       if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "bad data dependence.");
-      destroy_loop_vec_info (loop_vinfo);
+      destroy_loop_vec_info (loop_vinfo, true);
       return NULL;
     }
 
@@ -2820,10 +4653,34 @@ vect_analyze_loop (struct loop *loop)
     {
       if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "bad data access.");
-      destroy_loop_vec_info (loop_vinfo);
+      destroy_loop_vec_info (loop_vinfo, true);
+      return NULL;
+    }
+
+  /* Prune the list of ddrs to be tested at run-time by versioning for alias.
+     It is important to call pruning after vect_analyze_data_ref_accesses,
+     since we use grouping information gathered by interleaving analysis.  */
+  ok = vect_prune_runtime_alias_test_list (loop_vinfo);
+  if (!ok)
+    {
+      if (vect_print_dump_info (REPORT_DETAILS))
+       fprintf (vect_dump, "too long list of versioning for alias "
+                           "run-time tests.");
+      destroy_loop_vec_info (loop_vinfo, true);
       return NULL;
     }
 
+  /* Check the SLP opportunities in the loop, analyze and build SLP trees.  */
+  ok = vect_analyze_slp (loop_vinfo);
+  if (ok)
+    {
+      /* Decide which of the detected SLP instances to vectorize with SLP.  */
+      vect_make_slp_decision (loop_vinfo);
+
+      /* Find stmts that need to be both vectorized and SLPed.  */
+      vect_detect_hybrid_slp (loop_vinfo);
+    }
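+
+  /* An SLP opportunity, for illustration (an editorial sketch; the names
+     are illustrative): the isomorphic stores in
+
+       for (i = 0; i < n; i++)
+         {
+           a[2*i]     = x[i];
+           a[2*i + 1] = y[i];
+         }
+
+     form an interleaved group that can be vectorized as one unit; stmts
+     that also have non-SLP uses are the "hybrid" ones detected above.  */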
+
   /* This pass will decide on using loop versioning and/or loop peeling in
      order to enhance the alignment of data references in the loop.  */
 
@@ -2832,7 +4689,7 @@ vect_analyze_loop (struct loop *loop)
     {
       if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "bad data alignment.");
-      destroy_loop_vec_info (loop_vinfo);
+      destroy_loop_vec_info (loop_vinfo, true);
       return NULL;
     }
 
@@ -2844,7 +4701,7 @@ vect_analyze_loop (struct loop *loop)
     {
       if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "bad operation or unsupported loop bound.");
-      destroy_loop_vec_info (loop_vinfo);
+      destroy_loop_vec_info (loop_vinfo, true);
       return NULL;
     }