OSDN Git Service

Factor out common tests in 8-byte reg/reg move splitters on 32-bit sparc.
[pf3gnuchains/gcc-fork.git] / gcc / tree-vect-slp.c
index 2dbe328..f75817b 100644 (file)
@@ -1,5 +1,5 @@
 /* SLP - Basic Block Vectorization
-   Copyright (C) 2007, 2008, 2009, 2010
+   Copyright (C) 2007, 2008, 2009, 2010, 2011
    Free Software Foundation, Inc.
    Contributed by Dorit Naishlos <dorit@il.ibm.com>
    and Ira Rosen <irar@il.ibm.com>
@@ -38,6 +38,7 @@ along with GCC; see the file COPYING3.  If not see
 #include "recog.h"
 #include "optabs.h"
 #include "tree-vectorizer.h"
+#include "langhooks.h"
 
 /* Extract the location of the basic block in the source code.
    Return the basic block location if succeed and NULL if not.  */
@@ -115,13 +116,15 @@ vect_get_and_check_slp_defs (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
 {
   tree oprnd;
   unsigned int i, number_of_oprnds;
-  tree def;
+  tree def[2];
   gimple def_stmt;
   enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
   stmt_vec_info stmt_info =
     vinfo_for_stmt (VEC_index (gimple, SLP_TREE_SCALAR_STMTS (slp_node), 0));
   enum gimple_rhs_class rhs_class;
   struct loop *loop = NULL;
+  enum tree_code rhs_code;
+  bool different_types = false;
 
   if (loop_vinfo)
     loop = LOOP_VINFO_LOOP (loop_vinfo);
@@ -133,7 +136,7 @@ vect_get_and_check_slp_defs (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
     {
       oprnd = gimple_op (stmt, i + 1);
 
-      if (!vect_is_simple_use (oprnd, loop_vinfo, bb_vinfo, &def_stmt, &def,
+      if (!vect_is_simple_use (oprnd, loop_vinfo, bb_vinfo, &def_stmt, &def[i],
                                &dt[i])
          || (!def_stmt && dt[i] != vect_constant_def))
        {
@@ -152,7 +155,9 @@ vect_get_and_check_slp_defs (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
       if (loop && def_stmt && gimple_bb (def_stmt)
           && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
           && vinfo_for_stmt (def_stmt)
-          && STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (def_stmt)))
+          && STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (def_stmt))
+          && !STMT_VINFO_RELEVANT (vinfo_for_stmt (def_stmt))
+          && !STMT_VINFO_LIVE_P (vinfo_for_stmt (def_stmt)))
         {
           if (!*first_stmt_dt0)
             *pattern0 = true;
@@ -186,11 +191,11 @@ vect_get_and_check_slp_defs (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
           switch (gimple_code (def_stmt))
             {
               case GIMPLE_PHI:
-                def = gimple_phi_result (def_stmt);
+                def[i] = gimple_phi_result (def_stmt);
                 break;
 
               case GIMPLE_ASSIGN:
-                def = gimple_assign_lhs (def_stmt);
+                def[i] = gimple_assign_lhs (def_stmt);
                 break;
 
               default:
@@ -204,8 +209,8 @@ vect_get_and_check_slp_defs (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
        {
          /* op0 of the first stmt of the group - store its info.  */
          *first_stmt_dt0 = dt[i];
-         if (def)
-           *first_stmt_def0_type = TREE_TYPE (def);
+         if (def[i])
+           *first_stmt_def0_type = TREE_TYPE (def[i]);
          else
            *first_stmt_const_oprnd = oprnd;
 
@@ -215,7 +220,8 @@ vect_get_and_check_slp_defs (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
            vect_model_simple_cost (stmt_info, ncopies_for_cost, dt, slp_node);
          else
            /* Store.  */
-           vect_model_store_cost (stmt_info, ncopies_for_cost, dt[0], slp_node);
+           vect_model_store_cost (stmt_info, ncopies_for_cost, false,
+                                  dt[0], slp_node);
        }
 
       else
@@ -224,8 +230,8 @@ vect_get_and_check_slp_defs (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
            {
              /* op1 of the first stmt of the group - store its info.  */
              *first_stmt_dt1 = dt[i];
-             if (def)
-               *first_stmt_def1_type = TREE_TYPE (def);
+             if (def[i])
+               *first_stmt_def1_type = TREE_TYPE (def[i]);
              else
                {
                  /* We assume that the stmt contains only one constant
@@ -243,25 +249,64 @@ vect_get_and_check_slp_defs (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
          else
            {
              /* Not first stmt of the group, check that the def-stmt/s match
-                the def-stmt/s of the first stmt.  */
+                the def-stmt/s of the first stmt.  Allow different definition
+                types for reduction chains: the first stmt must be a
+                vect_reduction_def (a phi node), and the rest
+                vect_internal_def.  */
              if ((i == 0
-                  && (*first_stmt_dt0 != dt[i]
-                      || (*first_stmt_def0_type && def
+                  && ((*first_stmt_dt0 != dt[i]
+                        && !(*first_stmt_dt0 == vect_reduction_def
+                             && dt[i] == vect_internal_def))
+                      || (*first_stmt_def0_type && def[0]
                           && !types_compatible_p (*first_stmt_def0_type,
-                                                  TREE_TYPE (def)))))
+                                                  TREE_TYPE (def[0])))))
                  || (i == 1
-                     && (*first_stmt_dt1 != dt[i]
-                         || (*first_stmt_def1_type && def
+                     && ((*first_stmt_dt1 != dt[i]
+                           && !(*first_stmt_dt1 == vect_reduction_def
+                                && dt[i] == vect_internal_def))
+                         || (*first_stmt_def1_type && def[1]
                              && !types_compatible_p (*first_stmt_def1_type,
-                                                     TREE_TYPE (def)))))
-                 || (!def
+                                                     TREE_TYPE (def[1])))))
+                 || (!def[i]
                      && !types_compatible_p (TREE_TYPE (*first_stmt_const_oprnd),
-                                             TREE_TYPE (oprnd))))
+                                             TREE_TYPE (oprnd)))
+                 || different_types)
                {
-                 if (vect_print_dump_info (REPORT_SLP))
-                   fprintf (vect_dump, "Build SLP failed: different types ");
+                  if (i != number_of_oprnds - 1)
+                    different_types = true;
+                  else
+                   {
+                      if (is_gimple_assign (stmt)
+                          && (rhs_code = gimple_assign_rhs_code (stmt))
+                         && TREE_CODE_CLASS (rhs_code) == tcc_binary
+                         && commutative_tree_code (rhs_code)
+                         && *first_stmt_dt0 == dt[1]
+                         && *first_stmt_dt1 == dt[0]
+                         && def[0] && def[1]
+                         && !(*first_stmt_def0_type
+                              && !types_compatible_p (*first_stmt_def0_type,
+                                                      TREE_TYPE (def[1])))
+                          && !(*first_stmt_def1_type
+                               && !types_compatible_p (*first_stmt_def1_type,
+                                                       TREE_TYPE (def[0]))))
+                        {
+                           if (vect_print_dump_info (REPORT_SLP))
+                            {
+                              fprintf (vect_dump, "Swapping operands of ");
+                              print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
+                            }
+
+                          swap_tree_operands (stmt, gimple_assign_rhs1_ptr (stmt),
+                                             gimple_assign_rhs2_ptr (stmt));
+                       }
+                      else
+                        {
+                         if (vect_print_dump_info (REPORT_SLP))
+                           fprintf (vect_dump, "Build SLP failed: different types ");
 
-                 return false;
+                         return false;
+                       }
+                   }
                }
            }
        }
@@ -275,10 +320,10 @@ vect_get_and_check_slp_defs (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
 
        case vect_internal_def:
         case vect_reduction_def:
-         if (i == 0)
+         if ((i == 0 && !different_types) || (i == 1 && different_types))
            VEC_safe_push (gimple, heap, *def_stmts0, def_stmt);
          else
-           VEC_safe_push (gimple, heap, *def_stmts1, def_stmt);
+           VEC_safe_push (gimple, heap, *def_stmts1, def_stmt);
          break;
 
        default:
@@ -286,7 +331,7 @@ vect_get_and_check_slp_defs (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
          if (vect_print_dump_info (REPORT_SLP))
            {
              fprintf (vect_dump, "Build SLP failed: illegal type of def ");
-             print_generic_expr (vect_dump, def, TDF_SLIM);
+             print_generic_expr (vect_dump, def[i], TDF_SLIM);
            }
 
          return false;
@@ -309,7 +354,7 @@ vect_build_slp_tree (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
                      int ncopies_for_cost, unsigned int *max_nunits,
                      VEC (int, heap) **load_permutation,
                      VEC (slp_tree, heap) **loads,
-                     unsigned int vectorization_factor)
+                     unsigned int vectorization_factor, bool *loads_permuted)
 {
   VEC (gimple, heap) *def_stmts0 = VEC_alloc (gimple, heap, group_size);
   VEC (gimple, heap) *def_stmts1 =  VEC_alloc (gimple, heap, group_size);
@@ -383,20 +428,15 @@ vect_build_slp_tree (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
           return false;
         }
 
-      ncopies = vectorization_factor / TYPE_VECTOR_SUBPARTS (vectype);
-      if (ncopies != 1)
+      /* In case of multiple types we need to detect the smallest type.  */
+      if (*max_nunits < TYPE_VECTOR_SUBPARTS (vectype))
         {
-         if (vect_print_dump_info (REPORT_SLP))
-            fprintf (vect_dump, "SLP with multiple types ");
-
-          /* FORNOW: multiple types are unsupported in BB SLP.  */
-         if (bb_vinfo)
-           return false;
+          *max_nunits = TYPE_VECTOR_SUBPARTS (vectype);
+          if (bb_vinfo)
+            vectorization_factor = *max_nunits;
         }
 
-      /* In case of multiple types we need to detect the smallest type.  */
-      if (*max_nunits < TYPE_VECTOR_SUBPARTS (vectype))
-        *max_nunits = TYPE_VECTOR_SUBPARTS (vectype);
+      ncopies = vectorization_factor / TYPE_VECTOR_SUBPARTS (vectype);
 
       if (is_gimple_call (stmt))
        rhs_code = CALL_EXPR;
@@ -449,6 +489,11 @@ vect_build_slp_tree (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
                    }
                }
            }
+         else if (rhs_code == WIDEN_LSHIFT_EXPR)
+            {
+              need_same_oprnds = true;
+              first_op1 = gimple_assign_rhs2 (stmt);
+            }
        }
       else
        {
@@ -508,10 +553,10 @@ vect_build_slp_tree (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
            {
              /* Load.  */
               /* FORNOW: Check that there is no gap between the loads.  */
-              if ((DR_GROUP_FIRST_DR (vinfo_for_stmt (stmt)) == stmt
-                   && DR_GROUP_GAP (vinfo_for_stmt (stmt)) != 0)
-                  || (DR_GROUP_FIRST_DR (vinfo_for_stmt (stmt)) != stmt
-                      && DR_GROUP_GAP (vinfo_for_stmt (stmt)) != 1))
+              if ((GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt
+                   && GROUP_GAP (vinfo_for_stmt (stmt)) != 0)
+                  || (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) != stmt
+                      && GROUP_GAP (vinfo_for_stmt (stmt)) != 1))
                 {
                   if (vect_print_dump_info (REPORT_SLP))
                     {
@@ -525,7 +570,8 @@ vect_build_slp_tree (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
 
               /* Check that the size of interleaved loads group is not
                  greater than the SLP group size.  */
-              if (DR_GROUP_SIZE (vinfo_for_stmt (stmt)) > ncopies * group_size)
+              if (loop_vinfo
+                  && GROUP_SIZE (vinfo_for_stmt (stmt)) > ncopies * group_size)
                 {
                   if (vect_print_dump_info (REPORT_SLP))
                     {
@@ -538,7 +584,7 @@ vect_build_slp_tree (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
                   return false;
                 }
 
-              first_load = DR_GROUP_FIRST_DR (vinfo_for_stmt (stmt));
+              first_load = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
               if (prev_first_load)
                 {
                   /* Check that there are no loads from different interleaving
@@ -579,7 +625,7 @@ vect_build_slp_tree (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
 
                   /* Analyze costs (for the first stmt in the group).  */
                   vect_model_load_cost (vinfo_for_stmt (stmt),
-                                        ncopies_for_cost, *node);
+                                        ncopies_for_cost, false, *node);
                 }
 
               /* Store the place of this load in the interleaving chain.  In
@@ -646,19 +692,22 @@ vect_build_slp_tree (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
   /* Strided loads were reached - stop the recursion.  */
   if (stop_recursion)
     {
+      VEC_safe_push (slp_tree, heap, *loads, *node);
       if (permutation)
         {
-          VEC_safe_push (slp_tree, heap, *loads, *node);
+
+          *loads_permuted = true;
           *inside_cost 
             += targetm.vectorize.builtin_vectorization_cost (vec_perm, NULL, 0) 
                * group_size;
         }
       else
-        { 
-          /* We don't check here complex numbers chains, so we keep them in
-            LOADS for further check in vect_supported_load_permutation_p.  */ 
+        {
+          /* We don't check here complex numbers chains, so we set
+             LOADS_PERMUTED for further check in
+             vect_supported_load_permutation_p.  */
           if (rhs_code == REALPART_EXPR || rhs_code == IMAGPART_EXPR)
-            VEC_safe_push (slp_tree, heap, *loads, *node);
+            *loads_permuted = true;
         }
 
       return true;
@@ -677,7 +726,7 @@ vect_build_slp_tree (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
       if (!vect_build_slp_tree (loop_vinfo, bb_vinfo, &left_node, group_size,
                                inside_cost, outside_cost, ncopies_for_cost,
                                max_nunits, load_permutation, loads,
-                               vectorization_factor))
+                               vectorization_factor, loads_permuted))
        return false;
 
       SLP_TREE_LEFT (*node) = left_node;
@@ -695,7 +744,7 @@ vect_build_slp_tree (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
       if (!vect_build_slp_tree (loop_vinfo, bb_vinfo, &right_node, group_size,
                                inside_cost, outside_cost, ncopies_for_cost,
                                max_nunits, load_permutation, loads,
-                               vectorization_factor))
+                               vectorization_factor, loads_permuted))
        return false;
 
       SLP_TREE_RIGHT (*node) = right_node;
@@ -783,7 +832,7 @@ vect_supported_slp_permutation_p (slp_instance instance)
 {
   slp_tree node = VEC_index (slp_tree, SLP_INSTANCE_LOADS (instance), 0);
   gimple stmt = VEC_index (gimple, SLP_TREE_SCALAR_STMTS (node), 0);
-  gimple first_load = DR_GROUP_FIRST_DR (vinfo_for_stmt (stmt));
+  gimple first_load = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
   VEC (slp_tree, heap) *sorted_loads = NULL;
   int index;
   slp_tree *tmp_loads = NULL;
@@ -802,7 +851,7 @@ vect_supported_slp_permutation_p (slp_instance instance)
     {
       gimple scalar_stmt = VEC_index (gimple, SLP_TREE_SCALAR_STMTS (load), 0);
       /* Check that the loads are all in the same interleaving chain.  */
-      if (DR_GROUP_FIRST_DR (vinfo_for_stmt (scalar_stmt)) != first_load)
+      if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (scalar_stmt)) != first_load)
         {
           if (vect_print_dump_info (REPORT_DETAILS))
             {
@@ -881,8 +930,10 @@ vect_supported_load_permutation_p (slp_instance slp_instn, int group_size,
   bool supported, bad_permutation = false;
   sbitmap load_index;
   slp_tree node, other_complex_node;
-  gimple stmt, first = NULL, other_node_first;
+  gimple stmt, first = NULL, other_node_first, load, next_load, first_load;
   unsigned complex_numbers = 0;
+  struct data_reference *dr;
+  bb_vec_info bb_vinfo;
 
   /* FORNOW: permutations are only supported in SLP.  */
   if (!slp_instn)
@@ -932,7 +983,7 @@ vect_supported_load_permutation_p (slp_instance slp_instn, int group_size,
                 first = stmt;
               else
                 {
-                  if (DR_GROUP_FIRST_DR (vinfo_for_stmt (stmt)) != first)
+                  if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) != first)
                     {
                       if (complex_numbers != 2)
                         return false;
@@ -947,7 +998,7 @@ vect_supported_load_permutation_p (slp_instance slp_instn, int group_size,
                       other_node_first = VEC_index (gimple, 
                                 SLP_TREE_SCALAR_STMTS (other_complex_node), 0);
 
-                      if (DR_GROUP_FIRST_DR (vinfo_for_stmt (stmt)) 
+                      if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt))
                           != other_node_first)
                        return false;
                     }
@@ -973,8 +1024,10 @@ vect_supported_load_permutation_p (slp_instance slp_instn, int group_size,
      GROUP_SIZE.  */
   number_of_groups = VEC_length (int, load_permutation) / group_size;
 
-  /* Reduction (there are no data-refs in the root).  */
-  if (!STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt)))
+  /* Reduction (there are no data-refs in the root).
+     In reduction chain the order of the loads is important.  */
+  if (!STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))
+      && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
     {
       int first_group_load_index;
 
@@ -1002,7 +1055,36 @@ vect_supported_load_permutation_p (slp_instance slp_instn, int group_size,
 
       if (!bad_permutation)
         {
-          /* This permutaion is valid for reduction.  Since the order of the
+          /* Check that the loads in the first sequence are different and there
+             are no gaps between them.  */
+          load_index = sbitmap_alloc (group_size);
+          sbitmap_zero (load_index);
+          for (k = 0; k < group_size; k++)
+            {
+              first_group_load_index = VEC_index (int, load_permutation, k);
+              if (TEST_BIT (load_index, first_group_load_index))
+                {
+                  bad_permutation = true;
+                  break;
+                }
+
+              SET_BIT (load_index, first_group_load_index);
+            }
+
+          if (!bad_permutation)
+            for (k = 0; k < group_size; k++)
+              if (!TEST_BIT (load_index, k))
+                {
+                  bad_permutation = true;
+                  break;
+                }
+
+          sbitmap_free (load_index);
+        }
+
+      if (!bad_permutation)
+        {
+          /* This permutation is valid for reduction.  Since the order of the
              statements in the nodes is not important unless they are memory
              accesses, we can rearrange the statements in all the nodes 
              according to the order of the loads.  */
@@ -1013,6 +1095,76 @@ vect_supported_load_permutation_p (slp_instance slp_instn, int group_size,
         }
     }
 
+  /* In basic block vectorization we allow any subchain of an interleaving
+     chain.
+     FORNOW: not supported in loop SLP because of realignment compications.  */
+  bb_vinfo = STMT_VINFO_BB_VINFO (vinfo_for_stmt (stmt));
+  bad_permutation = false;
+  /* Check that for every node in the instance teh loads form a subchain.  */
+  if (bb_vinfo)
+    {
+      FOR_EACH_VEC_ELT (slp_tree, SLP_INSTANCE_LOADS (slp_instn), i, node)
+        {
+          next_load = NULL;
+          first_load = NULL;
+          FOR_EACH_VEC_ELT (gimple, SLP_TREE_SCALAR_STMTS (node), j, load)
+            {
+              if (!first_load)
+                first_load = GROUP_FIRST_ELEMENT (vinfo_for_stmt (load));
+              else if (first_load
+                         != GROUP_FIRST_ELEMENT (vinfo_for_stmt (load)))
+                {
+                  bad_permutation = true;
+                 break;
+               }
+
+              if (j != 0 && next_load != load)
+                {
+                  bad_permutation = true;
+                  break;
+                }
+
+              next_load = GROUP_NEXT_ELEMENT (vinfo_for_stmt (load));
+            }
+
+          if (bad_permutation)
+            break;
+        }
+
+      /* Check that the alignment of the first load in every subchain, i.e.,
+         the first statement in every load node, is supported.  */
+      if (!bad_permutation)
+        {
+          FOR_EACH_VEC_ELT (slp_tree, SLP_INSTANCE_LOADS (slp_instn), i, node)
+            {
+              first_load = VEC_index (gimple, SLP_TREE_SCALAR_STMTS (node), 0);
+              if (first_load
+                    != GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_load)))
+                {
+                  dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_load));
+                  if (vect_supportable_dr_alignment (dr, false)
+                      == dr_unaligned_unsupported)
+                    {
+                     if (vect_print_dump_info (REPORT_SLP))
+                       {
+                         fprintf (vect_dump, "unsupported unaligned load ");
+                          print_gimple_stmt (vect_dump, first_load, 0,
+                                            TDF_SLIM);
+                        }
+                     bad_permutation = true;
+                      break;
+                    }
+               }
+            }
+
+          if (!bad_permutation)
+            {
+              VEC_free (int, heap, SLP_INSTANCE_LOAD_PERMUTATION (slp_instn));
+              return true;
+           }
+        }
+    }
+
   /* FORNOW: the only supported permutation is 0..01..1.. of length equal to
      GROUP_SIZE and where each sequence of same drs is of GROUP_SIZE length as
      well (unless it's reduction).  */
@@ -1112,7 +1264,7 @@ vect_analyze_slp_instance (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
 {
   slp_instance new_instance;
   slp_tree node = XNEW (struct _slp_tree);
-  unsigned int group_size = DR_GROUP_SIZE (vinfo_for_stmt (stmt));
+  unsigned int group_size = GROUP_SIZE (vinfo_for_stmt (stmt));
   unsigned int unrolling_factor = 1, nunits;
   tree vectype, scalar_type = NULL_TREE;
   gimple next;
@@ -1122,12 +1274,22 @@ vect_analyze_slp_instance (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
   VEC (int, heap) *load_permutation;
   VEC (slp_tree, heap) *loads;
   struct data_reference *dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
+  bool loads_permuted = false;
 
-  if (dr)
+  if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
     {
-      scalar_type = TREE_TYPE (DR_REF (dr));
-      vectype = get_vectype_for_scalar_type (scalar_type);
-      group_size = DR_GROUP_SIZE (vinfo_for_stmt (stmt));
+      if (dr)
+        {
+          scalar_type = TREE_TYPE (DR_REF (dr));
+          vectype = get_vectype_for_scalar_type (scalar_type);
+        }
+      else
+        {
+          gcc_assert (loop_vinfo);
+          vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
+        }
+
+      group_size = GROUP_SIZE (vinfo_for_stmt (stmt));
     }
   else
     {
@@ -1151,7 +1313,6 @@ vect_analyze_slp_instance (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
   if (loop_vinfo)
     vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
   else
-    /* No multitypes in BB SLP.  */
     vectorization_factor = nunits;
 
   /* Calculate the unrolling factor.  */
@@ -1168,13 +1329,13 @@ vect_analyze_slp_instance (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
   /* Create a node (a root of the SLP tree) for the packed strided stores.  */
   SLP_TREE_SCALAR_STMTS (node) = VEC_alloc (gimple, heap, group_size);
   next = stmt;
-  if (dr)
+  if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
     {
       /* Collect the stores and store them in SLP_TREE_SCALAR_STMTS.  */
       while (next)
         {
           VEC_safe_push (gimple, heap, SLP_TREE_SCALAR_STMTS (node), next);
-          next = DR_GROUP_NEXT_DR (vinfo_for_stmt (next));
+          next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
         }
     }
   else
@@ -1183,14 +1344,7 @@ vect_analyze_slp_instance (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
       for (i = 0; VEC_iterate (gimple, LOOP_VINFO_REDUCTIONS (loop_vinfo), i, 
                                next); 
            i++)
-        {
-          VEC_safe_push (gimple, heap, SLP_TREE_SCALAR_STMTS (node), next);
-          if (vect_print_dump_info (REPORT_DETAILS))
-            {
-              fprintf (vect_dump, "pushing reduction into node: ");
-              print_gimple_stmt (vect_dump, next, 0, TDF_SLIM);
-            }
-        }
+        VEC_safe_push (gimple, heap, SLP_TREE_SCALAR_STMTS (node), next);
     }
 
   SLP_TREE_VEC_STMTS (node) = NULL;
@@ -1212,25 +1366,33 @@ vect_analyze_slp_instance (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
   if (vect_build_slp_tree (loop_vinfo, bb_vinfo, &node, group_size,
                            &inside_cost, &outside_cost, ncopies_for_cost,
                           &max_nunits, &load_permutation, &loads,
-                          vectorization_factor))
+                          vectorization_factor, &loads_permuted))
     {
-      /* Create a new SLP instance.  */
-      new_instance = XNEW (struct _slp_instance);
-      SLP_INSTANCE_TREE (new_instance) = node;
-      SLP_INSTANCE_GROUP_SIZE (new_instance) = group_size;
-      /* Calculate the unrolling factor based on the smallest type in the
-         loop.  */
+      /* Calculate the unrolling factor based on the smallest type.  */
       if (max_nunits > nunits)
         unrolling_factor = least_common_multiple (max_nunits, group_size)
                            / group_size;
 
+      if (unrolling_factor != 1 && !loop_vinfo)
+        {
+          if (vect_print_dump_info (REPORT_SLP))
+            fprintf (vect_dump, "Build SLP failed: unrolling required in basic"
+                               " block SLP");
+          return false;
+        }
+
+      /* Create a new SLP instance.  */
+      new_instance = XNEW (struct _slp_instance);
+      SLP_INSTANCE_TREE (new_instance) = node;
+      SLP_INSTANCE_GROUP_SIZE (new_instance) = group_size;
       SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
       SLP_INSTANCE_OUTSIDE_OF_LOOP_COST (new_instance) = outside_cost;
       SLP_INSTANCE_INSIDE_OF_LOOP_COST (new_instance) = inside_cost;
       SLP_INSTANCE_LOADS (new_instance) = loads;
       SLP_INSTANCE_FIRST_LOAD_STMT (new_instance) = NULL;
       SLP_INSTANCE_LOAD_PERMUTATION (new_instance) = load_permutation;
-      if (VEC_length (slp_tree, loads))
+
+      if (loads_permuted)
         {
           if (!vect_supported_load_permutation_p (new_instance, group_size,
                                                   load_permutation))
@@ -1283,8 +1445,8 @@ bool
 vect_analyze_slp (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
 {
   unsigned int i;
-  VEC (gimple, heap) *strided_stores, *reductions = NULL;
-  gimple store;
+  VEC (gimple, heap) *strided_stores, *reductions = NULL, *reduc_chains = NULL;
+  gimple first_element;
   bool ok = false;
 
   if (vect_print_dump_info (REPORT_SLP))
@@ -1293,14 +1455,15 @@ vect_analyze_slp (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
   if (loop_vinfo)
     {
       strided_stores = LOOP_VINFO_STRIDED_STORES (loop_vinfo);
+      reduc_chains = LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo);
       reductions = LOOP_VINFO_REDUCTIONS (loop_vinfo);
     }
   else
     strided_stores = BB_VINFO_STRIDED_STORES (bb_vinfo);
 
   /* Find SLP sequences starting from groups of strided stores.  */
-  FOR_EACH_VEC_ELT (gimple, strided_stores, i, store)
-    if (vect_analyze_slp_instance (loop_vinfo, bb_vinfo, store))
+  FOR_EACH_VEC_ELT (gimple, strided_stores, i, first_element)
+    if (vect_analyze_slp_instance (loop_vinfo, bb_vinfo, first_element))
       ok = true;
 
   if (bb_vinfo && !ok)
@@ -1311,6 +1474,21 @@ vect_analyze_slp (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
       return false;
     }
 
+  if (loop_vinfo
+      && VEC_length (gimple, LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)) > 0)
+    {
+      /* Find SLP sequences starting from reduction chains.  */
+      FOR_EACH_VEC_ELT (gimple, reduc_chains, i, first_element)
+        if (vect_analyze_slp_instance (loop_vinfo, bb_vinfo, first_element))
+          ok = true;
+        else
+          return false;
+
+      /* Don't try to vectorize SLP reductions if reduction chain was
+         detected.  */
+      return ok;
+    }
+
   /* Find SLP sequences starting from groups of reductions.  */
   if (loop_vinfo && VEC_length (gimple, LOOP_VINFO_REDUCTIONS (loop_vinfo)) > 1
       && vect_analyze_slp_instance (loop_vinfo, bb_vinfo, 
@@ -1322,9 +1500,10 @@ vect_analyze_slp (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
 
 
 /* For each possible SLP instance decide whether to SLP it and calculate overall
-   unrolling factor needed to SLP the loop.  */
+   unrolling factor needed to SLP the loop.  Return TRUE if decided to SLP at
+   least one instance.  */
 
-void
+bool
 vect_make_slp_decision (loop_vec_info loop_vinfo)
 {
   unsigned int i, unrolling_factor = 1;
@@ -1353,6 +1532,8 @@ vect_make_slp_decision (loop_vec_info loop_vinfo)
   if (decided_to_slp && vect_print_dump_info (REPORT_SLP))
     fprintf (vect_dump, "Decided to SLP %d instances. Unrolling factor %d",
             decided_to_slp, unrolling_factor);
+
+  return (decided_to_slp > 0);
 }
 
 
@@ -1457,6 +1638,8 @@ destroy_bb_vec_info (bb_vec_info bb_vinfo)
         free_stmt_vec_info (stmt);
     }
 
+  free_data_refs (BB_VINFO_DATAREFS (bb_vinfo));
+  free_dependence_relations (BB_VINFO_DDRS (bb_vinfo));
   VEC_free (gimple, heap, BB_VINFO_STRIDED_STORES (bb_vinfo));
   VEC_free (slp_instance, heap, BB_VINFO_SLP_INSTANCES (bb_vinfo));
   free (bb_vinfo);
@@ -1523,43 +1706,6 @@ vect_slp_analyze_operations (bb_vec_info bb_vinfo)
   return true;
 }
 
-/* Check if loads and stores are mixed in the basic block (in that
-   case if we are not sure that the accesses differ, we can't vectorize the
-   basic block).  Also return FALSE in case that there is statement marked as
-   not vectorizable.  */
-
-static bool
-vect_bb_vectorizable_with_dependencies (bb_vec_info bb_vinfo)
-{
-  basic_block bb = BB_VINFO_BB (bb_vinfo);
-  gimple_stmt_iterator si;
-  bool detected_store = false;
-  gimple stmt;
-  struct data_reference *dr;
-
-  for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
-    {
-      stmt = gsi_stmt (si);
-
-      /* We can't allow not analyzed statements, since they may contain data
-         accesses.  */ 
-      if (!STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (stmt)))
-        return false;
-
-      if (!STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt)))
-        continue;
-
-      dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
-      if (DR_IS_READ (dr) && detected_store)
-        return false;
-
-      if (!DR_IS_READ (dr))
-        detected_store = true;
-    }
-
-  return true;
-}
-
 /* Check if vectorization of the basic block is profitable.  */
 
 static bool
@@ -1630,41 +1776,16 @@ vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo)
 
 /* Check if the basic block can be vectorized.  */
 
-bb_vec_info
-vect_slp_analyze_bb (basic_block bb)
+static bb_vec_info
+vect_slp_analyze_bb_1 (basic_block bb)
 {
   bb_vec_info bb_vinfo;
   VEC (ddr_p, heap) *ddrs;
   VEC (slp_instance, heap) *slp_instances;
   slp_instance instance;
-  int i, insns = 0;
-  gimple_stmt_iterator gsi;
+  int i;
   int min_vf = 2;
   int max_vf = MAX_VECTORIZATION_FACTOR;
-  bool data_dependence_in_bb = false;
-
-  current_vector_size = 0;
-
-  if (vect_print_dump_info (REPORT_DETAILS))
-    fprintf (vect_dump, "===vect_slp_analyze_bb===\n");
-
-  for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
-    {
-      gimple stmt = gsi_stmt (gsi);
-      if (!is_gimple_debug (stmt)
-         && !gimple_nop_p (stmt)
-         && gimple_code (stmt) != GIMPLE_LABEL)
-       insns++;
-    }
-
-  if (insns > PARAM_VALUE (PARAM_SLP_MAX_INSNS_IN_BB))
-    {
-      if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
-        fprintf (vect_dump, "not vectorized: too many instructions in basic "
-                            "block.\n");
-
-      return NULL;
-    }
 
   bb_vinfo = new_bb_vec_info (bb);
   if (!bb_vinfo)
@@ -1691,11 +1812,8 @@ vect_slp_analyze_bb (basic_block bb)
       return NULL;
     }
 
-   if (!vect_analyze_data_ref_dependences (NULL, bb_vinfo, &max_vf, 
-                                           &data_dependence_in_bb)
-       || min_vf > max_vf
-       || (data_dependence_in_bb 
-           && !vect_bb_vectorizable_with_dependencies (bb_vinfo)))
+   if (!vect_analyze_data_ref_dependences (NULL, bb_vinfo, &max_vf)
+       || min_vf > max_vf)
      {
        if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
         fprintf (vect_dump, "not vectorized: unhandled data dependence "
@@ -1785,6 +1903,61 @@ vect_slp_analyze_bb (basic_block bb)
 }
 
 
+bb_vec_info
+vect_slp_analyze_bb (basic_block bb)
+{
+  bb_vec_info bb_vinfo;
+  int insns = 0;
+  gimple_stmt_iterator gsi;
+  unsigned int vector_sizes;
+
+  if (vect_print_dump_info (REPORT_DETAILS))
+    fprintf (vect_dump, "===vect_slp_analyze_bb===\n");
+
+  for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
+    {
+      gimple stmt = gsi_stmt (gsi);
+      if (!is_gimple_debug (stmt)
+          && !gimple_nop_p (stmt)
+          && gimple_code (stmt) != GIMPLE_LABEL)
+        insns++;
+    }
+
+  if (insns > PARAM_VALUE (PARAM_SLP_MAX_INSNS_IN_BB))
+    {
+      if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
+        fprintf (vect_dump, "not vectorized: too many instructions in basic "
+                            "block.\n");
+
+      return NULL;
+    }
+
+  /* Autodetect first vector size we try.  */
+  current_vector_size = 0;
+  vector_sizes = targetm.vectorize.autovectorize_vector_sizes ();
+
+  while (1)
+    {
+      bb_vinfo = vect_slp_analyze_bb_1 (bb);
+      if (bb_vinfo)
+        return bb_vinfo;
+
+      destroy_bb_vec_info (bb_vinfo);
+
+      vector_sizes &= ~current_vector_size;
+      if (vector_sizes == 0
+          || current_vector_size == 0)
+        return NULL;
+
+      /* Try the next biggest vector size.  */
+      current_vector_size = 1 << floor_log2 (vector_sizes);
+      if (vect_print_dump_info (REPORT_DETAILS))
+        fprintf (vect_dump, "***** Re-trying analysis with "
+                 "vector size %d\n", current_vector_size);
+    }
+}
+
+
 /* SLP costs are calculated according to SLP instance unrolling factor (i.e.,
    the number of created vector stmts depends on the unrolling factor).
    However, the actual number of vector stmts for every SLP node depends on
@@ -1811,13 +1984,14 @@ vect_update_slp_costs_according_to_vf (loop_vec_info loop_vinfo)
 
 /* For constant and loop invariant defs of SLP_NODE this function returns
    (vector) defs (VEC_OPRNDS) that will be used in the vectorized stmts.
-   OP_NUM determines if we gather defs for operand 0 or operand 1 of the scalar
-   stmts. NUMBER_OF_VECTORS is the number of vector defs to create.  
+   OP_NUM determines if we gather defs for operand 0 or operand 1 of the RHS of
+   scalar stmts.  NUMBER_OF_VECTORS is the number of vector defs to create.
    REDUC_INDEX is the index of the reduction operand in the statements, unless
    it is -1.  */
 
 static void
-vect_get_constant_vectors (slp_tree slp_node, VEC(tree,heap) **vec_oprnds,
+vect_get_constant_vectors (tree op, slp_tree slp_node,
+                           VEC (tree, heap) **vec_oprnds,
                           unsigned int op_num, unsigned int number_of_vectors,
                            int reduc_index)
 {
@@ -1829,25 +2003,22 @@ vect_get_constant_vectors (slp_tree slp_node, VEC(tree,heap) **vec_oprnds,
   tree t = NULL_TREE;
   int j, number_of_places_left_in_vector;
   tree vector_type;
-  tree op, vop;
+  tree vop;
   int group_size = VEC_length (gimple, stmts);
   unsigned int vec_num, i;
   int number_of_copies = 1;
   VEC (tree, heap) *voprnds = VEC_alloc (tree, heap, number_of_vectors);
   bool constant_p, is_store;
   tree neutral_op = NULL;
+  enum tree_code code = gimple_assign_rhs_code (stmt);
+  gimple def_stmt;
+  struct loop *loop;
 
-  if (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def)
+  if (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def
+      && reduc_index != -1)
     {
-      enum tree_code code = gimple_assign_rhs_code (stmt);
-      if (reduc_index == -1)
-        {
-          VEC_free (tree, heap, *vec_oprnds);
-          return;
-        }
-
       op_num = reduc_index - 1;
-      op = gimple_op (stmt, op_num + 1);
+      op = gimple_op (stmt, reduc_index);
       /* For additional copies (see the explanation of NUMBER_OF_COPIES below)
          we need either neutral operands or the original operands.  See
          get_initial_def_for_reduction() for details.  */
@@ -1878,8 +2049,16 @@ vect_get_constant_vectors (slp_tree slp_node, VEC(tree,heap) **vec_oprnds,
             neutral_op = build_int_cst (TREE_TYPE (op), -1);
             break;
 
+          case MAX_EXPR:
+          case MIN_EXPR:
+            def_stmt = SSA_NAME_DEF_STMT (op);
+            loop = (gimple_bb (stmt))->loop_father;
+            neutral_op = PHI_ARG_DEF_FROM_EDGE (def_stmt,
+                                                loop_preheader_edge (loop));
+            break;
+
           default:
-             neutral_op = NULL;
+            neutral_op = NULL;
         }
     }
 
@@ -1889,10 +2068,9 @@ vect_get_constant_vectors (slp_tree slp_node, VEC(tree,heap) **vec_oprnds,
       op = gimple_assign_rhs1 (stmt);
     }
   else
-    {
-      is_store = false;
-      op = gimple_op (stmt, op_num + 1);
-    }
+    is_store = false;
+
+  gcc_assert (op);
 
   if (CONSTANT_CLASS_P (op))
     constant_p = true;
@@ -1901,7 +2079,6 @@ vect_get_constant_vectors (slp_tree slp_node, VEC(tree,heap) **vec_oprnds,
 
   vector_type = get_vectype_for_scalar_type (TREE_TYPE (op));
   gcc_assert (vector_type);
-
   nunits = TYPE_VECTOR_SUBPARTS (vector_type);
 
   /* NUMBER_OF_COPIES is the number of times we need to use the same values in
@@ -1934,15 +2111,21 @@ vect_get_constant_vectors (slp_tree slp_node, VEC(tree,heap) **vec_oprnds,
 
           if (reduc_index != -1)
             {
-              struct loop *loop = (gimple_bb (stmt))->loop_father;
-              gimple def_stmt = SSA_NAME_DEF_STMT (op);
+              loop = (gimple_bb (stmt))->loop_father;
+              def_stmt = SSA_NAME_DEF_STMT (op);
 
               gcc_assert (loop);
-              /* Get the def before the loop.  */
-              op = PHI_ARG_DEF_FROM_EDGE (def_stmt, 
-                                          loop_preheader_edge (loop));
-              if (j != (number_of_copies - 1) && neutral_op)
+
+              /* Get the def before the loop.  In reduction chain we have only
+                 one initial value.  */
+              if ((j != (number_of_copies - 1)
+                   || (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt))
+                       && i != 0))
+                  && neutral_op)
                 op = neutral_op;
+              else
+                op = PHI_ARG_DEF_FROM_EDGE (def_stmt,
+                                            loop_preheader_edge (loop));
             }
 
           /* Create 'vect_ = {op0,op1,...,opn}'.  */
@@ -1987,12 +2170,7 @@ vect_get_constant_vectors (slp_tree slp_node, VEC(tree,heap) **vec_oprnds,
       if (neutral_op)
         {
           if (!neutral_vec)
-            {
-              t = NULL;
-              for (i = 0; i < (unsigned) nunits; i++)
-                 t = tree_cons (NULL_TREE, neutral_op, t);
-              neutral_vec = build_vector (vector_type, t);
-            }
+           neutral_vec = build_vector_from_val (vector_type, neutral_op);
 
           VEC_quick_push (tree, *vec_oprnds, neutral_vec);
         }
@@ -2036,7 +2214,8 @@ vect_get_slp_vect_defs (slp_tree slp_node, VEC (tree,heap) **vec_oprnds)
    the right node. This is used when the second operand must remain scalar.  */
 
 void
-vect_get_slp_defs (slp_tree slp_node, VEC (tree,heap) **vec_oprnds0,
+vect_get_slp_defs (tree op0, tree op1, slp_tree slp_node,
+                   VEC (tree,heap) **vec_oprnds0,
                    VEC (tree,heap) **vec_oprnds1, int reduc_index)
 {
   gimple first_stmt;
@@ -2076,7 +2255,7 @@ vect_get_slp_defs (slp_tree slp_node, VEC (tree,heap) **vec_oprnds0,
     vect_get_slp_vect_defs (SLP_TREE_LEFT (slp_node), vec_oprnds0);
   else
     /* Build vectors from scalar defs.  */
-    vect_get_constant_vectors (slp_node, vec_oprnds0, 0, number_of_vects,
+    vect_get_constant_vectors (op0, slp_node, vec_oprnds0, 0, number_of_vects,
                                reduc_index);
 
   if (STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt)))
@@ -2089,7 +2268,7 @@ vect_get_slp_defs (slp_tree slp_node, VEC (tree,heap) **vec_oprnds0,
     return;
 
   code = gimple_assign_rhs_code (first_stmt);
-  if (get_gimple_rhs_class (code) != GIMPLE_BINARY_RHS || !vec_oprnds1)
+  if (get_gimple_rhs_class (code) != GIMPLE_BINARY_RHS || !vec_oprnds1 || !op1)
     return;
 
   /* The number of vector defs is determined by the number of vector statements
@@ -2106,7 +2285,8 @@ vect_get_slp_defs (slp_tree slp_node, VEC (tree,heap) **vec_oprnds0,
     vect_get_slp_vect_defs (SLP_TREE_RIGHT (slp_node), vec_oprnds1);
   else
     /* Build vectors from scalar defs.  */
-    vect_get_constant_vectors (slp_node, vec_oprnds1, 1, number_of_vects, -1);
+    vect_get_constant_vectors (op1, slp_node, vec_oprnds1, 1, number_of_vects,
+                               -1);
 }
 
 
@@ -2123,8 +2303,7 @@ static inline void
 vect_create_mask_and_perm (gimple stmt, gimple next_scalar_stmt,
                            tree mask, int first_vec_indx, int second_vec_indx,
                            gimple_stmt_iterator *gsi, slp_tree node,
-                           tree builtin_decl, tree vectype,
-                           VEC(tree,heap) *dr_chain,
+                           tree vectype, VEC(tree,heap) *dr_chain,
                            int ncopies, int vect_stmts_counter)
 {
   tree perm_dest;
@@ -2148,10 +2327,10 @@ vect_create_mask_and_perm (gimple stmt, gimple next_scalar_stmt,
       second_vec = VEC_index (tree, dr_chain, second_vec_indx);
 
       /* Generate the permute statement.  */
-      perm_stmt = gimple_build_call (builtin_decl,
-                                    3, first_vec, second_vec, mask);
+      perm_stmt = gimple_build_assign_with_ops3 (VEC_PERM_EXPR, perm_dest,
+                                                first_vec, second_vec, mask);
       data_ref = make_ssa_name (perm_dest, perm_stmt);
-      gimple_call_set_lhs (perm_stmt, data_ref);
+      gimple_set_lhs (perm_stmt, data_ref);
       vect_finish_stmt_generation (stmt, perm_stmt, gsi);
 
       /* Store the vector statement in NODE.  */
@@ -2258,9 +2437,9 @@ vect_transform_slp_perm_load (gimple stmt, VEC (tree, heap) *dr_chain,
 {
   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
   tree mask_element_type = NULL_TREE, mask_type;
-  int i, j, k, m, scale, mask_nunits, nunits, vec_index = 0, scalar_index;
+  int i, j, k, nunits, vec_index = 0, scalar_index;
   slp_tree node;
-  tree vectype = STMT_VINFO_VECTYPE (stmt_info), builtin_decl;
+  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
   gimple next_scalar_stmt;
   int group_size = SLP_INSTANCE_GROUP_SIZE (slp_node_instance);
   int first_mask_element;
@@ -2271,35 +2450,24 @@ vect_transform_slp_perm_load (gimple stmt, VEC (tree, heap) *dr_chain,
   bool mask_fixed = false;
   bool needs_first_vector = false;
 
-  if (!targetm.vectorize.builtin_vec_perm)
+  if (!can_vec_perm_expr_p (vectype, NULL_TREE))
     {
       if (vect_print_dump_info (REPORT_DETAILS))
         {
-          fprintf (vect_dump, "no builtin for vect permute for ");
+          fprintf (vect_dump, "no vect permute for ");
           print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
         }
-
-       return false;
-    }
-
-  builtin_decl = targetm.vectorize.builtin_vec_perm (vectype,
-                                                     &mask_element_type);
-  if (!builtin_decl || !mask_element_type)
-    {
-      if (vect_print_dump_info (REPORT_DETAILS))
-        {
-          fprintf (vect_dump, "no builtin for vect permute for ");
-          print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
-        }
-
-       return false;
+      return false;
     }
 
+  /* The generic VEC_PERM_EXPR code always uses an integral type of the
+     same size as the vector element being permuted.  */
+  mask_element_type
+    = lang_hooks.types.type_for_size
+    (TREE_INT_CST_LOW (TYPE_SIZE (TREE_TYPE (vectype))), 1);
   mask_type = get_vectype_for_scalar_type (mask_element_type);
-  mask_nunits = TYPE_VECTOR_SUBPARTS (mask_type);
-  mask = (int *) xmalloc (sizeof (int) * mask_nunits);
   nunits = TYPE_VECTOR_SUBPARTS (vectype);
-  scale = mask_nunits / nunits;
+  mask = (int *) xmalloc (sizeof (int) * nunits);
   unroll_factor = SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance);
 
   /* The number of vector stmts to generate based only on SLP_NODE_INSTANCE
@@ -2322,8 +2490,7 @@ vect_transform_slp_perm_load (gimple stmt, VEC (tree, heap) *dr_chain,
      for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
      ...
 
-     The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9} (in target
-     scpecific type, e.g., in bytes for Altivec.
+     The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
      The last mask is illegal since we assume two operands for permute
      operation, and the mask element values can't be outside that range.
      Hence, the last mask must be converted into {2,5,5,5}.
@@ -2348,20 +2515,17 @@ vect_transform_slp_perm_load (gimple stmt, VEC (tree, heap) *dr_chain,
         {
           for (k = 0; k < group_size; k++)
             {
-              first_mask_element = (i + j * group_size) * scale;
-              for (m = 0; m < scale; m++)
-                {
-                  if (!vect_get_mask_element (stmt, first_mask_element, m,
-                                   mask_nunits, only_one_vec, index, mask,
-                                   &current_mask_element, &need_next_vector,
-                                   &number_of_mask_fixes, &mask_fixed,
-                                   &needs_first_vector))
-                    return false;
-
-                  mask[index++] = current_mask_element;
-                }
+              first_mask_element = i + j * group_size;
+              if (!vect_get_mask_element (stmt, first_mask_element, 0,
+                                         nunits, only_one_vec, index,
+                                         mask, &current_mask_element,
+                                         &need_next_vector,
+                                         &number_of_mask_fixes, &mask_fixed,
+                                         &needs_first_vector))
+               return false;
+             mask[index++] = current_mask_element;
 
-              if (index == mask_nunits)
+              if (index == nunits)
                 {
                  tree mask_vec = NULL;
 
@@ -2373,8 +2537,7 @@ vect_transform_slp_perm_load (gimple stmt, VEC (tree, heap) *dr_chain,
                  mask_vec = build_vector (mask_type, mask_vec);
                  index = 0;
 
-                 if (!targetm.vectorize.builtin_vec_perm_ok (vectype,
-                                                             mask_vec))
+                 if (!can_vec_perm_expr_p (vectype, mask_vec))
                    {
                      if (vect_print_dump_info (REPORT_DETAILS))
                        {
@@ -2398,7 +2561,7 @@ vect_transform_slp_perm_load (gimple stmt, VEC (tree, heap) *dr_chain,
 
                       vect_create_mask_and_perm (stmt, next_scalar_stmt,
                                mask_vec, first_vec_index, second_vec_index,
-                              gsi, node, builtin_decl, vectype, dr_chain,
+                              gsi, node, vectype, dr_chain,
                               ncopies, vect_stmts_counter++);
                     }
                 }
@@ -2480,8 +2643,11 @@ vect_schedule_slp_instance (slp_tree node, slp_instance instance,
   /* Loads should be inserted before the first load.  */
   if (SLP_INSTANCE_FIRST_LOAD_STMT (instance)
       && STMT_VINFO_STRIDED_ACCESS (stmt_info)
-      && !REFERENCE_CLASS_P (gimple_get_lhs (stmt)))
+      && !REFERENCE_CLASS_P (gimple_get_lhs (stmt))
+      && SLP_INSTANCE_LOAD_PERMUTATION (instance))
     si = gsi_for_stmt (SLP_INSTANCE_FIRST_LOAD_STMT (instance));
+  else if (is_pattern_stmt_p (stmt_info))
+    si = gsi_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info));
   else
     si = gsi_for_stmt (stmt);
 
@@ -2493,6 +2659,16 @@ vect_schedule_slp_instance (slp_tree node, slp_instance instance,
       si = gsi_for_stmt (last_store);
     }
 
+  /* Mark the first element of the reduction chain as reduction to properly
+     transform the node.  In the analysis phase only the last element of the
+     chain is marked as reduction.  */
+  if (GROUP_FIRST_ELEMENT (stmt_info) && !STMT_VINFO_STRIDED_ACCESS (stmt_info)
+      && GROUP_FIRST_ELEMENT (stmt_info) == stmt)
+    {
+      STMT_VINFO_DEF_TYPE (stmt_info) = vect_reduction_def;
+      STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
+    }
+
   is_store = vect_transform_stmt (stmt, &si, &strided_store, node, instance);
   return is_store;
 }