2010-04-13 Richard Guenther <rguenther@suse.de>

[pf3gnuchains/gcc-fork.git] / gcc / tree-vect-loop.c
diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c

index 33b0a9d..809f3e1 100644 (file)
--- a/gcc/tree-vect-loop.c
+++ b/gcc/tree-vect-loop.c
@@ -1,7 +1,7 @@
  /* Loop Vectorization
-   Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009 Free Software
-   Foundation, Inc.
-   Contributed by Dorit Naishlos <dorit@il.ibm.com> and 
+   Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010
+   Free Software Foundation, Inc.
+   Contributed by Dorit Naishlos <dorit@il.ibm.com> and
     Ira Rosen <irar@il.ibm.com>
  
  This file is part of GCC.
@@ -43,7 +43,7 @@ along with GCC; see the file COPYING3.  If not see
  
  /* Loop Vectorization Pass.
  
-   This pass tries to vectorize loops. 
+   This pass tries to vectorize loops.
  
     For example, the vectorizer transforms the following simple loop:
  
@@ -240,6 +240,7 @@ vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
  
        for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
          {
+         tree vf_vectype;
           gimple stmt = gsi_stmt (si);
           stmt_info = vinfo_for_stmt (stmt);
  
@@ -282,55 +283,89 @@ vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
  
           if (STMT_VINFO_VECTYPE (stmt_info))
             {
-             /* The only case when a vectype had been already set is for stmts 
+             /* The only case when a vectype had been already set is for stmts
                  that contain a dataref, or for "pattern-stmts" (stmts generated
                  by the vectorizer to represent/replace a certain idiom).  */
-             gcc_assert (STMT_VINFO_DATA_REF (stmt_info) 
+             gcc_assert (STMT_VINFO_DATA_REF (stmt_info)
                           || is_pattern_stmt_p (stmt_info));
               vectype = STMT_VINFO_VECTYPE (stmt_info);
             }
           else
             {
-
-             gcc_assert (! STMT_VINFO_DATA_REF (stmt_info)
+             gcc_assert (!STMT_VINFO_DATA_REF (stmt_info)
                           && !is_pattern_stmt_p (stmt_info));
  
-             scalar_type = vect_get_smallest_scalar_type (stmt, &dummy, 
-                                                           &dummy);
+             scalar_type = TREE_TYPE (gimple_get_lhs (stmt));
               if (vect_print_dump_info (REPORT_DETAILS))
                 {
                   fprintf (vect_dump, "get vectype for scalar type:  ");
                   print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
                 }
-
               vectype = get_vectype_for_scalar_type (scalar_type);
               if (!vectype)
                 {
                   if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
                     {
-                     fprintf (vect_dump, 
+                     fprintf (vect_dump,
                                "not vectorized: unsupported data-type ");
                       print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
                     }
                   return false;
                 }
+
               STMT_VINFO_VECTYPE (stmt_info) = vectype;
              }
  
+         /* The vectorization factor is determined by the smallest
+            scalar type (or the largest vector size, but we only
+            support one vector size per loop).  */
+         scalar_type = vect_get_smallest_scalar_type (stmt, &dummy,
+                                                      &dummy);
+         if (vect_print_dump_info (REPORT_DETAILS))
+           {
+             fprintf (vect_dump, "get vectype for scalar type:  ");
+             print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
+           }
+         vf_vectype = get_vectype_for_scalar_type (scalar_type);
+         if (!vf_vectype)
+           {
+             if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
+               {
+                 fprintf (vect_dump,
+                          "not vectorized: unsupported data-type ");
+                 print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
+               }
+             return false;
+           }
+
+         if ((GET_MODE_SIZE (TYPE_MODE (vectype))
+              != GET_MODE_SIZE (TYPE_MODE (vf_vectype))))
+           {
+             if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
+               {
+                 fprintf (vect_dump,
+                          "not vectorized: different sized vector "
+                          "types in statement, ");
+                 print_generic_expr (vect_dump, vectype, TDF_SLIM);
+                 fprintf (vect_dump, " and ");
+                 print_generic_expr (vect_dump, vf_vectype, TDF_SLIM);
+               }
+             return false;
+           }
+
           if (vect_print_dump_info (REPORT_DETAILS))
             {
               fprintf (vect_dump, "vectype: ");
-             print_generic_expr (vect_dump, vectype, TDF_SLIM);
+             print_generic_expr (vect_dump, vf_vectype, TDF_SLIM);
             }
  
-         nunits = TYPE_VECTOR_SUBPARTS (vectype);
+         nunits = TYPE_VECTOR_SUBPARTS (vf_vectype);
           if (vect_print_dump_info (REPORT_DETAILS))
             fprintf (vect_dump, "nunits = %d", nunits);
  
           if (!vectorization_factor
               || (nunits > vectorization_factor))
             vectorization_factor = nunits;
-
          }
      }
  
@@ -410,11 +445,14 @@ vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
    tree dumy;
    VEC(gimple,heap) *worklist = VEC_alloc (gimple, heap, 64);
    gimple_stmt_iterator gsi;
+  bool double_reduc;
  
    if (vect_print_dump_info (REPORT_DETAILS))
      fprintf (vect_dump, "=== vect_analyze_scalar_cycles ===");
  
-  /* First - identify all inductions.  */
+  /* First - identify all inductions.  Reduction detection assumes that all
+     the inductions have already been identified, so this order must not be
+     changed.  */
    for (gsi = gsi_start_phis  (bb); !gsi_end_p (gsi); gsi_next (&gsi))
      {
        gimple phi = gsi_stmt (gsi);
@@ -444,9 +482,9 @@ vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
         }
  
        if (!access_fn
-         || !vect_is_simple_iv_evolution (loop->num, access_fn, &dumy, &dumy)) 
+         || !vect_is_simple_iv_evolution (loop->num, access_fn, &dumy, &dumy))
         {
-         VEC_safe_push (gimple, heap, worklist, phi);    
+         VEC_safe_push (gimple, heap, worklist, phi);
           continue;
         }
  
@@ -456,16 +494,17 @@ vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
      }
  
  
-  /* Second - identify all reductions.  */
+  /* Second - identify all reductions and nested cycles.  */
    while (VEC_length (gimple, worklist) > 0)
      {
        gimple phi = VEC_pop (gimple, worklist);
        tree def = PHI_RESULT (phi);
        stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
        gimple reduc_stmt;
+      bool nested_cycle;
  
        if (vect_print_dump_info (REPORT_DETAILS))
-        { 
+        {
            fprintf (vect_dump, "Analyze phi: ");
            print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
          }
@@ -473,14 +512,41 @@ vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
        gcc_assert (is_gimple_reg (SSA_NAME_VAR (def)));
        gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
  
-      reduc_stmt = vect_is_simple_reduction (loop_vinfo, phi);
+      nested_cycle = (loop != LOOP_VINFO_LOOP (loop_vinfo));
+      reduc_stmt = vect_is_simple_reduction (loop_vinfo, phi, !nested_cycle,
+                                             &double_reduc);
        if (reduc_stmt)
          {
-          if (vect_print_dump_info (REPORT_DETAILS))
-            fprintf (vect_dump, "Detected reduction.");
-          STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
-          STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
-                                                        vect_reduction_def;
+          if (double_reduc)
+            {
+              if (vect_print_dump_info (REPORT_DETAILS))
+                fprintf (vect_dump, "Detected double reduction.");
+
+              STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
+              STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
+                                                    vect_double_reduction_def;
+            }
+          else
+            {
+              if (nested_cycle)
+                {
+                  if (vect_print_dump_info (REPORT_DETAILS))
+                    fprintf (vect_dump, "Detected vectorizable nested cycle.");
+
+                  STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
+                  STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
+                                                             vect_nested_cycle;
+                }
+              else
+                {
+                  if (vect_print_dump_info (REPORT_DETAILS))
+                    fprintf (vect_dump, "Detected reduction.");
+
+                  STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
+                  STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
+                                                           vect_reduction_def;
+                }
+            }
          }
        else
          if (vect_print_dump_info (REPORT_DETAILS))
@@ -488,14 +554,13 @@ vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
      }
  
    VEC_free (gimple, heap, worklist);
-  return;
  }
  
  
  /* Function vect_analyze_scalar_cycles.
  
     Examine the cross iteration def-use cycles of scalar variables, by
-   analyzing the loop-header PHIs of scalar variables; Classify each 
+   analyzing the loop-header PHIs of scalar variables; Classify each
     cycle as one of the following: invariant, induction, reduction, unknown.
     We do that for the loop represented by LOOP_VINFO, and also to its
     inner-loop, if exists.
@@ -526,7 +591,7 @@ vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
       1. When vectorized, they are executed in the same order as in the original
          scalar loop, so we can't change the order of computation when
          vectorizing them.
-     2. FIXME: Inner-loop reductions can be used in the inner-loop, so the 
+     2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
          current checks are too strict.  */
  
    if (loop->inner)
@@ -826,13 +891,13 @@ vect_analyze_loop_form (struct loop *loop)
      fprintf (vect_dump, "=== vect_analyze_loop_form ===");
  
    /* Different restrictions apply when we are considering an inner-most loop,
-     vs. an outer (nested) loop.  
+     vs. an outer (nested) loop.
       (FORNOW. May want to relax some of these restrictions in the future).  */
  
    if (!loop->inner)
      {
-      /* Inner-most loop.  We currently require that the number of BBs is 
-        exactly 2 (the header and latch).  Vectorizable inner-most loops 
+      /* Inner-most loop.  We currently require that the number of BBs is
+        exactly 2 (the header and latch).  Vectorizable inner-most loops
          look like this:
  
                          (pre-header)
@@ -846,7 +911,7 @@ vect_analyze_loop_form (struct loop *loop)
        if (loop->num_nodes != 2)
          {
            if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
-            fprintf (vect_dump, "not vectorized: too many BBs in loop.");
+            fprintf (vect_dump, "not vectorized: control flow in loop.");
            return NULL;
          }
  
@@ -860,10 +925,10 @@ vect_analyze_loop_form (struct loop *loop)
    else
      {
        struct loop *innerloop = loop->inner;
-      edge backedge, entryedge;
+      edge entryedge;
  
        /* Nested loop. We currently require that the loop is doubly-nested,
-        contains a single inner loop, and the number of BBs is exactly 5. 
+        contains a single inner loop, and the number of BBs is exactly 5.
          Vectorizable outer-loops look like this:
  
                         (pre-header)
@@ -873,7 +938,7 @@ vect_analyze_loop_form (struct loop *loop)
                           inner-loop |
                            |         |
                           tail ------+
-                          | 
+                          |
                         (exit-bb)
  
          The inner-loop has the properties expected of inner-most loops
@@ -905,23 +970,19 @@ vect_analyze_loop_form (struct loop *loop)
           return NULL;
         }
  
-      if (loop->num_nodes != 5) 
+      if (loop->num_nodes != 5)
          {
           if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
-           fprintf (vect_dump, "not vectorized: too many BBs in loop.");
+           fprintf (vect_dump, "not vectorized: control flow in loop.");
           destroy_loop_vec_info (inner_loop_vinfo, true);
           return NULL;
          }
  
        gcc_assert (EDGE_COUNT (innerloop->header->preds) == 2);
-      backedge = EDGE_PRED (innerloop->header, 1);       
        entryedge = EDGE_PRED (innerloop->header, 0);
        if (EDGE_PRED (innerloop->header, 0)->src == innerloop->latch)
-       {
-         backedge = EDGE_PRED (innerloop->header, 0);
-         entryedge = EDGE_PRED (innerloop->header, 1); 
-       }
-       
+       entryedge = EDGE_PRED (innerloop->header, 1);
+
        if (entryedge->src != loop->header
           || !single_exit (innerloop)
           || single_exit (innerloop)->dest !=  EDGE_PRED (loop->latch, 0)->src)
@@ -935,8 +996,8 @@ vect_analyze_loop_form (struct loop *loop)
        if (vect_print_dump_info (REPORT_DETAILS))
          fprintf (vect_dump, "Considering outer-loop vectorization.");
      }
-  
-  if (!single_exit (loop) 
+
+  if (!single_exit (loop)
        || EDGE_COUNT (loop->header->preds) != 2)
      {
        if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
@@ -956,7 +1017,7 @@ vect_analyze_loop_form (struct loop *loop)
       before the loop if needed), where the loop header contains all the
       executable statements, and the latch is empty.  */
    if (!empty_block_p (loop->latch)
-        || phi_nodes (loop->latch))
+        || !gimple_seq_empty_p (phi_nodes (loop->latch)))
      {
        if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
          fprintf (vect_dump, "not vectorized: unexpected loop form.");
@@ -994,11 +1055,11 @@ vect_analyze_loop_form (struct loop *loop)
         destroy_loop_vec_info (inner_loop_vinfo, true);
        return NULL;
      }
-  
-  if (!number_of_iterations) 
+
+  if (!number_of_iterations)
      {
        if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
-       fprintf (vect_dump, 
+       fprintf (vect_dump,
                  "not vectorized: number of iterations cannot be computed.");
        if (inner_loop_vinfo)
         destroy_loop_vec_info (inner_loop_vinfo, true);
@@ -1095,10 +1156,13 @@ vect_analyze_loop_operations (loop_vec_info loop_vinfo)
                /* inner-loop loop-closed exit phi in outer-loop vectorization
                   (i.e. a phi in the tail of the outer-loop).
                   FORNOW: we currently don't support the case that these phis
-                 are not used in the outerloop, cause this case requires
-                 to actually do something here.  */
-              if (!STMT_VINFO_RELEVANT_P (stmt_info)
-                  || STMT_VINFO_LIVE_P (stmt_info))
+                 are not used in the outerloop (unless it is double reduction,
+                 i.e., this phi is vect_double_reduction_def), because this
+                 case requires us to actually do something here.  */
+              if ((!STMT_VINFO_RELEVANT_P (stmt_info)
+                   || STMT_VINFO_LIVE_P (stmt_info))
+                  && STMT_VINFO_DEF_TYPE (stmt_info)
+                     != vect_double_reduction_def)
                  {
                    if (vect_print_dump_info (REPORT_DETAILS))
                      fprintf (vect_dump,
@@ -1156,10 +1220,13 @@ vect_analyze_loop_operations (loop_vec_info loop_vinfo)
           if (!vect_analyze_stmt (stmt, &need_to_vectorize, NULL))
             return false;
  
-          if (STMT_VINFO_RELEVANT_P (stmt_info) && !PURE_SLP_STMT (stmt_info))
+          if ((STMT_VINFO_RELEVANT_P (stmt_info)
+               || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
+              && !PURE_SLP_STMT (stmt_info))
+
              /* STMT needs both SLP and loop-based vectorization.  */
              only_slp_in_loop = false;
-        } 
+        }
      } /* bbs */
  
    /* All operations in the loop are either irrelevant (deal with loop
@@ -1287,11 +1354,13 @@ vect_analyze_loop (struct loop *loop)
  {
    bool ok;
    loop_vec_info loop_vinfo;
+  int max_vf = MAX_VECTORIZATION_FACTOR;
+  int min_vf = 2;
  
    if (vect_print_dump_info (REPORT_DETAILS))
      fprintf (vect_dump, "===== analyze_loop_nest =====");
  
-  if (loop_outer (loop) 
+  if (loop_outer (loop)
        && loop_vec_info_for_loop (loop_outer (loop))
        && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
      {
@@ -1311,12 +1380,13 @@ vect_analyze_loop (struct loop *loop)
      }
  
    /* Find all data references in the loop (which correspond to vdefs/vuses)
-     and analyze their evolution in the loop.
+     and analyze their evolution in the loop.  Also adjust the minimal
+     vectorization factor according to the loads and stores.
  
       FORNOW: Handle only simple, array references, which
       alignment can be forced, and aligned pointer-references.  */
  
-  ok = vect_analyze_data_refs (loop_vinfo, NULL);
+  ok = vect_analyze_data_refs (loop_vinfo, NULL, &min_vf);
    if (!ok)
      {
        if (vect_print_dump_info (REPORT_DETAILS))
@@ -1343,14 +1413,17 @@ vect_analyze_loop (struct loop *loop)
        return NULL;
      }
  
-  /* Analyze the alignment of the data-refs in the loop.
-     Fail if a data reference is found that cannot be vectorized.  */
+  /* Analyze data dependences between the data-refs in the loop
+     and adjust the maximum vectorization factor according to
+     the dependences.
+     FORNOW: fail at the first data dependence that we encounter.  */
  
-  ok = vect_analyze_data_refs_alignment (loop_vinfo, NULL);
-  if (!ok)
+  ok = vect_analyze_data_ref_dependences (loop_vinfo, NULL, &max_vf);
+  if (!ok
+      || max_vf < min_vf)
      {
        if (vect_print_dump_info (REPORT_DETAILS))
-       fprintf (vect_dump, "bad data alignment.");
+       fprintf (vect_dump, "bad data dependence.");
        destroy_loop_vec_info (loop_vinfo, true);
        return NULL;
      }
@@ -1363,15 +1436,22 @@ vect_analyze_loop (struct loop *loop)
        destroy_loop_vec_info (loop_vinfo, true);
        return NULL;
      }
+  if (max_vf < LOOP_VINFO_VECT_FACTOR (loop_vinfo))
+    {
+      if (vect_print_dump_info (REPORT_DETAILS))
+       fprintf (vect_dump, "bad data dependence.");
+      destroy_loop_vec_info (loop_vinfo, true);
+      return NULL;
+    }
  
-  /* Analyze data dependences between the data-refs in the loop. 
-     FORNOW: fail at the first data dependence that we encounter.  */
+  /* Analyze the alignment of the data-refs in the loop.
+     Fail if a data reference is found that cannot be vectorized.  */
  
-  ok = vect_analyze_data_ref_dependences (loop_vinfo, NULL);
+  ok = vect_analyze_data_refs_alignment (loop_vinfo, NULL);
    if (!ok)
      {
        if (vect_print_dump_info (REPORT_DETAILS))
-       fprintf (vect_dump, "bad data dependence.");
+       fprintf (vect_dump, "bad data alignment.");
        destroy_loop_vec_info (loop_vinfo, true);
        return NULL;
      }
@@ -1450,31 +1530,40 @@ vect_analyze_loop (struct loop *loop)
     Output:
     REDUC_CODE - the corresponding tree-code to be used to reduce the
        vector of partial results into a single scalar result (which
-      will also reside in a vector).
+      will also reside in a vector), or ERROR_MARK if CODE is a supported
+      reduction operation that has no corresponding reduction tree-code.
  
-   Return TRUE if a corresponding REDUC_CODE was found, FALSE otherwise.  */
+   Return FALSE if CODE currently cannot be vectorized as reduction.  */
  
  static bool
  reduction_code_for_scalar_code (enum tree_code code,
                                  enum tree_code *reduc_code)
  {
    switch (code)
-  {
-  case MAX_EXPR:
-    *reduc_code = REDUC_MAX_EXPR;
-    return true;
-
-  case MIN_EXPR:
-    *reduc_code = REDUC_MIN_EXPR;
-    return true;
-
-  case PLUS_EXPR:
-    *reduc_code = REDUC_PLUS_EXPR;
-    return true;
-
-  default:
-    return false;
-  }
+    {
+      case MAX_EXPR:
+        *reduc_code = REDUC_MAX_EXPR;
+        return true;
+
+      case MIN_EXPR:
+        *reduc_code = REDUC_MIN_EXPR;
+        return true;
+
+      case PLUS_EXPR:
+        *reduc_code = REDUC_PLUS_EXPR;
+        return true;
+
+      case MULT_EXPR:
+      case MINUS_EXPR:
+      case BIT_IOR_EXPR:
+      case BIT_XOR_EXPR:
+      case BIT_AND_EXPR:
+        *reduc_code = ERROR_MARK;
+        return true;
+
+      default:
+       return false;
+    }
  }
  
  
@@ -1491,46 +1580,66 @@ report_vect_op (gimple stmt, const char *msg)
  
  /* Function vect_is_simple_reduction
  
-   Detect a cross-iteration def-use cycle that represents a simple
+   (1) Detect a cross-iteration def-use cycle that represents a simple
     reduction computation. We look for the following pattern:
  
     loop_header:
       a1 = phi < a0, a2 >
       a3 = ...
       a2 = operation (a3, a1)
-  
+
     such that:
-   1. operation is commutative and associative and it is safe to 
-      change the order of the computation.
+   1. operation is commutative and associative and it is safe to
+      change the order of the computation (if CHECK_REDUCTION is true)
     2. no uses for a2 in the loop (a2 is used out of the loop)
     3. no uses of a1 in the loop besides the reduction operation.
  
     Condition 1 is tested here.
-   Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.  */
+   Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
+
+   (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
+   nested cycles, if CHECK_REDUCTION is false.
+
+   (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
+   reductions:
+
+     a1 = phi < a0, a2 >
+     inner loop (def of a3)
+     a2 = phi < a3 >
+*/
  
  gimple
-vect_is_simple_reduction (loop_vec_info loop_info, gimple phi)
+vect_is_simple_reduction (loop_vec_info loop_info, gimple phi,
+                          bool check_reduction, bool *double_reduc)
  {
    struct loop *loop = (gimple_bb (phi))->loop_father;
    struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
    edge latch_e = loop_latch_edge (loop);
    tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
-  gimple def_stmt, def1, def2;
+  gimple def_stmt, def1 = NULL, def2 = NULL;
    enum tree_code code;
-  tree op1, op2;
+  tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
    tree type;
    int nloop_uses;
    tree name;
    imm_use_iterator imm_iter;
    use_operand_p use_p;
+  bool phi_def;
+
+  *double_reduc = false;
  
-  gcc_assert (loop == vect_loop || flow_loop_nested_p (vect_loop, loop));
+  /* If CHECK_REDUCTION is true, we assume inner-most loop vectorization,
+     otherwise, we assume outer loop vectorization.  */
+  gcc_assert ((check_reduction && loop == vect_loop)
+              || (!check_reduction && flow_loop_nested_p (vect_loop, loop)));
  
    name = PHI_RESULT (phi);
    nloop_uses = 0;
    FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
      {
        gimple use_stmt = USE_STMT (use_p);
+      if (is_gimple_debug (use_stmt))
+       continue;
        if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
           && vinfo_for_stmt (use_stmt)
           && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
@@ -1561,18 +1670,30 @@ vect_is_simple_reduction (loop_vec_info loop_info, gimple phi)
        return NULL;
      }
  
-  if (!is_gimple_assign (def_stmt))
+  if (!is_gimple_assign (def_stmt) && gimple_code (def_stmt) != GIMPLE_PHI)
      {
        if (vect_print_dump_info (REPORT_DETAILS))
          print_gimple_stmt (vect_dump, def_stmt, 0, TDF_SLIM);
        return NULL;
      }
  
-  name = gimple_assign_lhs (def_stmt);
+  if (is_gimple_assign (def_stmt))
+    {
+      name = gimple_assign_lhs (def_stmt);
+      phi_def = false;
+    }
+  else
+    {
+      name = PHI_RESULT (def_stmt);
+      phi_def = true;
+    }
+
    nloop_uses = 0;
    FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
      {
        gimple use_stmt = USE_STMT (use_p);
+      if (is_gimple_debug (use_stmt))
+       continue;
        if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
           && vinfo_for_stmt (use_stmt)
           && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
@@ -1585,9 +1706,41 @@ vect_is_simple_reduction (loop_vec_info loop_info, gimple phi)
         }
      }
  
+  /* If DEF_STMT is a phi node itself, we expect it to have a single argument
+     defined in the inner loop.  */
+  if (phi_def)
+    {
+      op1 = PHI_ARG_DEF (def_stmt, 0);
+
+      if (gimple_phi_num_args (def_stmt) != 1
+          || TREE_CODE (op1) != SSA_NAME)
+        {
+          if (vect_print_dump_info (REPORT_DETAILS))
+            fprintf (vect_dump, "unsupported phi node definition.");
+
+          return NULL;
+        }
+
+      def1 = SSA_NAME_DEF_STMT (op1);
+      if (flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
+          && loop->inner
+          && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
+          && is_gimple_assign (def1))
+        {
+          if (vect_print_dump_info (REPORT_DETAILS))
+            report_vect_op (def_stmt, "detected double reduction: ");
+
+          *double_reduc = true;
+          return def_stmt;
+        }
+
+      return NULL;
+    }
+
    code = gimple_assign_rhs_code (def_stmt);
  
-  if (!commutative_tree_code (code) || !associative_tree_code (code))
+  if (check_reduction
+      && (!commutative_tree_code (code) || !associative_tree_code (code)))
      {
        if (vect_print_dump_info (REPORT_DETAILS))
          report_vect_op (def_stmt, "reduction: not commutative/associative: ");
@@ -1596,24 +1749,55 @@ vect_is_simple_reduction (loop_vec_info loop_info, gimple phi)
  
    if (get_gimple_rhs_class (code) != GIMPLE_BINARY_RHS)
      {
-      if (vect_print_dump_info (REPORT_DETAILS))
-       report_vect_op (def_stmt, "reduction: not binary operation: ");
-      return NULL;
-    }
+      if (code != COND_EXPR)
+        {
+          if (vect_print_dump_info (REPORT_DETAILS))
+           report_vect_op (def_stmt, "reduction: not binary operation: ");
  
-  op1 = gimple_assign_rhs1 (def_stmt);
-  op2 = gimple_assign_rhs2 (def_stmt);
-  if (TREE_CODE (op1) != SSA_NAME || TREE_CODE (op2) != SSA_NAME)
-    {
-      if (vect_print_dump_info (REPORT_DETAILS))
-       report_vect_op (def_stmt, "reduction: uses not ssa_names: ");
-      return NULL;
+          return NULL;
+        }
+
+      op3 = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
+      if (COMPARISON_CLASS_P (op3))
+        {
+          op4 = TREE_OPERAND (op3, 1);
+          op3 = TREE_OPERAND (op3, 0);
+        }
+
+      op1 = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 1);
+      op2 = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 2);
+
+      if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
+        {
+          if (vect_print_dump_info (REPORT_DETAILS))
+            report_vect_op (def_stmt, "reduction: uses not ssa_names: ");
+
+          return NULL;
+        }
      }
+  else
+    {
+      op1 = gimple_assign_rhs1 (def_stmt);
+      op2 = gimple_assign_rhs2 (def_stmt);
+
+      if (TREE_CODE (op1) != SSA_NAME || TREE_CODE (op2) != SSA_NAME)
+        {
+          if (vect_print_dump_info (REPORT_DETAILS))
+           report_vect_op (def_stmt, "reduction: uses not ssa_names: ");
+
+          return NULL;
+        }
+   }
  
-  /* Check that it's ok to change the order of the computation.  */
    type = TREE_TYPE (gimple_assign_lhs (def_stmt));
-  if (TYPE_MAIN_VARIANT (type) != TYPE_MAIN_VARIANT (TREE_TYPE (op1))
-      || TYPE_MAIN_VARIANT (type) != TYPE_MAIN_VARIANT (TREE_TYPE (op2)))
+  if ((TREE_CODE (op1) == SSA_NAME
+       && !types_compatible_p (type,TREE_TYPE (op1)))
+      || (TREE_CODE (op2) == SSA_NAME
+          && !types_compatible_p (type, TREE_TYPE (op2)))
+      || (op3 && TREE_CODE (op3) == SSA_NAME
+          && !types_compatible_p (type, TREE_TYPE (op3)))
+      || (op4 && TREE_CODE (op4) == SSA_NAME
+          && !types_compatible_p (type, TREE_TYPE (op4))))
      {
        if (vect_print_dump_info (REPORT_DETAILS))
          {
@@ -1623,20 +1807,33 @@ vect_is_simple_reduction (loop_vec_info loop_info, gimple phi)
            print_generic_expr (vect_dump, TREE_TYPE (op1), TDF_SLIM);
            fprintf (vect_dump, ",");
            print_generic_expr (vect_dump, TREE_TYPE (op2), TDF_SLIM);
+          if (op3)
+            {
+              fprintf (vect_dump, ",");
+              print_generic_expr (vect_dump, TREE_TYPE (op3), TDF_SLIM);
+            }
+
+          if (op4)
+            {
+              fprintf (vect_dump, ",");
+              print_generic_expr (vect_dump, TREE_TYPE (op4), TDF_SLIM);
+            }
          }
+
        return NULL;
      }
  
-  /* Generally, when vectorizing a reduction we change the order of the
+  /* Check that it's ok to change the order of the computation.
+     Generally, when vectorizing a reduction we change the order of the
       computation.  This may change the behavior of the program in some
-     cases, so we need to check that this is ok.  One exception is when 
+     cases, so we need to check that this is ok.  One exception is when
       vectorizing an outer-loop: the inner-loop is executed sequentially,
       and therefore vectorizing reductions in the inner-loop during
       outer-loop vectorization is safe.  */
  
    /* CHECKME: check for !flag_finite_math_only too?  */
    if (SCALAR_FLOAT_TYPE_P (type) && !flag_associative_math
-      && !nested_in_vect_loop_p (vect_loop, def_stmt)) 
+      && check_reduction)
      {
        /* Changing the order of operations changes the semantics.  */
        if (vect_print_dump_info (REPORT_DETAILS))
@@ -1644,74 +1841,95 @@ vect_is_simple_reduction (loop_vec_info loop_info, gimple phi)
        return NULL;
      }
    else if (INTEGRAL_TYPE_P (type) && TYPE_OVERFLOW_TRAPS (type)
-          && !nested_in_vect_loop_p (vect_loop, def_stmt))
+          && check_reduction)
      {
        /* Changing the order of operations changes the semantics.  */
        if (vect_print_dump_info (REPORT_DETAILS))
         report_vect_op (def_stmt, "reduction: unsafe int math optimization: ");
        return NULL;
      }
-  else if (SAT_FIXED_POINT_TYPE_P (type))
+  else if (SAT_FIXED_POINT_TYPE_P (type) && check_reduction)
      {
        /* Changing the order of operations changes the semantics.  */
        if (vect_print_dump_info (REPORT_DETAILS))
-       report_vect_op (def_stmt, 
+       report_vect_op (def_stmt,
                         "reduction: unsafe fixed-point math optimization: ");
        return NULL;
      }
  
-  /* reduction is safe. we're dealing with one of the following:
+  /* Reduction is safe. We're dealing with one of the following:
       1) integer arithmetic and no trapv
-     2) floating point arithmetic, and special flags permit this optimization.
-   */
-  def1 = SSA_NAME_DEF_STMT (op1);
-  def2 = SSA_NAME_DEF_STMT (op2);
-  if (!def1 || !def2 || gimple_nop_p (def1) || gimple_nop_p (def2))
+     2) floating point arithmetic, and special flags permit this optimization
+     3) nested cycle (i.e., outer loop vectorization).  */
+  if (TREE_CODE (op1) == SSA_NAME)
+    def1 = SSA_NAME_DEF_STMT (op1);
+
+  if (TREE_CODE (op2) == SSA_NAME)
+    def2 = SSA_NAME_DEF_STMT (op2);
+
+  if (code != COND_EXPR
+      && (!def1 || !def2 || gimple_nop_p (def1) || gimple_nop_p (def2)))
      {
        if (vect_print_dump_info (REPORT_DETAILS))
         report_vect_op (def_stmt, "reduction: no defs for operands: ");
        return NULL;
      }
  
-
    /* Check that one def is the reduction def, defined by PHI,
       the other def is either defined in the loop ("vect_internal_def"),
       or it's an induction (defined by a loop-header phi-node).  */
  
-  if (def2 == phi
-      && flow_bb_inside_loop_p (loop, gimple_bb (def1))
-      && (is_gimple_assign (def1)
-         || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1)) == vect_induction_def
-         || (gimple_code (def1) == GIMPLE_PHI
-             && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1)) == vect_internal_def
-             && !is_loop_header_bb_p (gimple_bb (def1)))))
+  if (def2 && def2 == phi
+      && (code == COND_EXPR
+          || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1))
+              && (is_gimple_assign (def1)
+                 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
+                      == vect_induction_def
+                 || (gimple_code (def1) == GIMPLE_PHI
+                     && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
+                          == vect_internal_def
+                     && !is_loop_header_bb_p (gimple_bb (def1)))))))
      {
        if (vect_print_dump_info (REPORT_DETAILS))
-       report_vect_op (def_stmt, "detected reduction:");
+       report_vect_op (def_stmt, "detected reduction: ");
        return def_stmt;
      }
-  else if (def1 == phi
-          && flow_bb_inside_loop_p (loop, gimple_bb (def2))
-          && (is_gimple_assign (def2)
-              || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2)) == vect_induction_def
-              || (gimple_code (def2) == GIMPLE_PHI
-                  && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2)) == vect_internal_def
-                  && !is_loop_header_bb_p (gimple_bb (def2)))))
-    {
-      /* Swap operands (just for simplicity - so that the rest of the code
-        can assume that the reduction variable is always the last (second)
-        argument).  */
-      if (vect_print_dump_info (REPORT_DETAILS))
-       report_vect_op (def_stmt ,
-                       "detected reduction: need to swap operands:");
-      swap_tree_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
-                         gimple_assign_rhs2_ptr (def_stmt));
+  else if (def1 && def1 == phi
+          && (code == COND_EXPR
+               || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2))
+                  && (is_gimple_assign (def2)
+                      || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
+                           == vect_induction_def
+                      || (gimple_code (def2) == GIMPLE_PHI
+                          && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
+                               == vect_internal_def
+                          && !is_loop_header_bb_p (gimple_bb (def2)))))))
+    {
+      if (check_reduction)
+        {
+          /* Swap operands (just for simplicity - so that the rest of the code
+            can assume that the reduction variable is always the last (second)
+            argument).  */
+          if (vect_print_dump_info (REPORT_DETAILS))
+           report_vect_op (def_stmt,
+                           "detected reduction: need to swap operands: ");
+
+          swap_tree_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
+                             gimple_assign_rhs2_ptr (def_stmt));
+        }
+      else
+        {
+          if (vect_print_dump_info (REPORT_DETAILS))
+            report_vect_op (def_stmt, "detected reduction: ");
+        }
+
        return def_stmt;
      }
    else
      {
        if (vect_print_dump_info (REPORT_DETAILS))
-       report_vect_op (def_stmt, "reduction: unknown pattern.");
+       report_vect_op (def_stmt, "reduction: unknown pattern: ");
+
        return NULL;
      }
  }
@@ -1751,12 +1969,12 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo)
    if (!flag_vect_cost_model)
      {
        if (vect_print_dump_info (REPORT_COST))
-        fprintf (vect_dump, "cost model disabled.");      
+        fprintf (vect_dump, "cost model disabled.");
        return 0;
      }
  
    /* Requires loop versioning tests to handle misalignment.  */
-  if (VEC_length (gimple, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo)))
+  if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
      {
        /*  FIXME: Make cost depend on complexity of individual check.  */
        vec_outside_cost +=
@@ -1766,7 +1984,8 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo)
                   "versioning to treat misalignment.\n");
      }
  
-  if (VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
+  /* Requires loop versioning with alias checks.  */
+  if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
      {
        /*  FIXME: Make cost depend on complexity of individual check.  */
        vec_outside_cost +=
@@ -1776,11 +1995,9 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo)
                   "versioning aliasing.\n");
      }
  
-  if (VEC_length (gimple, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo))
-      || VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
-    {
-      vec_outside_cost += TARG_COND_TAKEN_BRANCH_COST;
-    }
+  if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
+      || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
+    vec_outside_cost += TARG_COND_TAKEN_BRANCH_COST;
  
    /* Count statements in scalar loop.  Using this as scalar cost for a single
       iteration for now.
@@ -1852,7 +2069,7 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo)
        peel_guard_costs +=  2 * (TARG_COND_TAKEN_BRANCH_COST
                                + TARG_COND_NOT_TAKEN_BRANCH_COST);
      }
-  else 
+  else
      {
        if (byte_misalign)
         {
@@ -1879,10 +2096,10 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo)
           peel_guard_costs +=  2 * TARG_COND_TAKEN_BRANCH_COST;
  
          }
-      else      
+      else
         {
           int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
-         peel_iters_prologue = niters < peel_iters_prologue ? 
+         peel_iters_prologue = niters < peel_iters_prologue ?
                                         niters : peel_iters_prologue;
           peel_iters_epilogue = (niters - peel_iters_prologue) % vf;
         }
@@ -1946,12 +2163,12 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo)
       decide whether to vectorize at compile time. Hence the scalar version
       do not carry cost model guard costs.  */
    if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
-      || VEC_length (gimple, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo))
-      || VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
+      || LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
+      || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
      {
        /* Cost model check occurs at versioning.  */
-      if (VEC_length (gimple, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo))
-         || VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
+      if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
+          || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
         scalar_outside_cost += TARG_COND_NOT_TAKEN_BRANCH_COST;
        else
         {
@@ -1973,9 +2190,9 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo)
        vec_inside_cost += SLP_INSTANCE_INSIDE_OF_LOOP_COST (instance);
      }
  
-  /* Calculate number of iterations required to make the vector version 
+  /* Calculate number of iterations required to make the vector version
       profitable, relative to the loop bodies only. The following condition
-     must hold true: 
+     must hold true:
       SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
       where
       SIC = scalar iteration cost, VIC = vector iteration cost,
@@ -2005,9 +2222,9 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo)
    else
      {
        if (vect_print_dump_info (REPORT_COST))
-        fprintf (vect_dump, "cost model: vector iteration cost = %d "
-                 "is divisible by scalar iteration cost = %d by a factor "
-                 "greater than or equal to the vectorization factor = %d .",
+        fprintf (vect_dump, "cost model: the vector iteration cost = %d "
+                "divided by the scalar iteration cost = %d "
+                "is greater or equal to the vectorization factor = %d.",
                   vec_inside_cost, scalar_single_iter_cost, vf);
        return -1;
      }
@@ -2030,7 +2247,7 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo)
                min_profitable_iters);
      }
  
-  min_profitable_iters = 
+  min_profitable_iters =
         min_profitable_iters < vf ? vf : min_profitable_iters;
  
    /* Because the condition we create is:
@@ -2041,21 +2258,21 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo)
    if (vect_print_dump_info (REPORT_COST))
      fprintf (vect_dump, "  Profitability threshold = %d\n",
              min_profitable_iters);
-    
+
    return min_profitable_iters;
  }
  
  
-/* TODO: Close dependency between vect_model_*_cost and vectorizable_* 
+/* TODO: Close dependency between vect_model_*_cost and vectorizable_*
     functions. Design better to avoid maintenance issues.  */
-    
-/* Function vect_model_reduction_cost.  
  
-   Models cost for a reduction operation, including the vector ops 
+/* Function vect_model_reduction_cost.
+
+   Models cost for a reduction operation, including the vector ops
     generated within the strip-mine loop, the initial definition before
     the loop, and the epilogue code that must be generated.  */
  
-static bool 
+static bool
  vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
                            int ncopies)
  {
@@ -2101,11 +2318,11 @@ vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
          }
        return false;
     }
-  
+
    mode = TYPE_MODE (vectype);
    orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
  
-  if (!orig_stmt) 
+  if (!orig_stmt)
      orig_stmt = STMT_VINFO_STMT (stmt_info);
  
    code = gimple_assign_rhs_code (orig_stmt);
@@ -2122,7 +2339,7 @@ vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
      {
        if (reduc_code != ERROR_MARK)
         outer_cost += TARG_VEC_STMT_COST + TARG_VEC_TO_SCALAR_COST;
-      else 
+      else
         {
           int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
           tree bitsize =
@@ -2139,7 +2356,7 @@ vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
             /* Final reduction via vector shifts and the reduction operator. Also
                requires scalar extract.  */
             outer_cost += ((exact_log2(nelements) * 2) * TARG_VEC_STMT_COST
-                               + TARG_VEC_TO_SCALAR_COST); 
+                               + TARG_VEC_TO_SCALAR_COST);
           else
             /* Use extracts and reduction op for final reduction.  For N elements,
                 we have N extracts and N-1 reduction ops.  */
@@ -2169,7 +2386,7 @@ vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
    STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) = ncopies * TARG_VEC_STMT_COST;
    /* prologue cost for vec_init and vec_step.  */
    STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info) = 2 * TARG_SCALAR_TO_VEC_COST;
-  
+
    if (vect_print_dump_info (REPORT_COST))
      fprintf (vect_dump, "vect_model_induction_cost: inside_cost = %d, "
               "outside_cost = %d .", STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info),
@@ -2186,7 +2403,7 @@ vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
     Output:
     Return a vector variable, initialized with the first VF values of
     the induction variable. E.g., for an iv with IV_PHI='X' and
-   evolution S, for a vector of 4 units, we want to return: 
+   evolution S, for a vector of 4 units, we want to return:
     [X, X + S, X + 2*S, X + 3*S].  */
  
  static tree
@@ -2196,7 +2413,7 @@ get_initial_def_for_induction (gimple iv_phi)
    loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
    struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
    tree scalar_type = TREE_TYPE (gimple_phi_result (iv_phi));
-  tree vectype; 
+  tree vectype;
    int nunits;
    edge pe = loop_preheader_edge (loop);
    struct loop *iv_loop;
@@ -2259,7 +2476,7 @@ get_initial_def_for_induction (gimple iv_phi)
    access_fn = analyze_scalar_evolution (iv_loop, PHI_RESULT (iv_phi));
    gcc_assert (access_fn);
    ok = vect_is_simple_iv_evolution (iv_loop->num, access_fn,
-                                  &init_expr, &step_expr);
+                                    &init_expr, &step_expr);
    gcc_assert (ok);
    pe = loop_preheader_edge (iv_loop);
  
@@ -2269,7 +2486,8 @@ get_initial_def_for_induction (gimple iv_phi)
        /* iv_loop is nested in the loop to be vectorized.  init_expr had already
          been created during vectorization of previous stmts; We obtain it from
          the STMT_VINFO_VEC_STMT of the defining stmt. */
-      tree iv_def = PHI_ARG_DEF_FROM_EDGE (iv_phi, loop_preheader_edge (iv_loop));
+      tree iv_def = PHI_ARG_DEF_FROM_EDGE (iv_phi,
+                                           loop_preheader_edge (iv_loop));
        vec_init = vect_get_vec_def_for_operand (iv_def, iv_phi, NULL);
      }
    else
@@ -2363,12 +2581,13 @@ get_initial_def_for_induction (gimple iv_phi)
    vec_def = make_ssa_name (vec_dest, new_stmt);
    gimple_assign_set_lhs (new_stmt, vec_def);
    gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
-  set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo, 
+  set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo,
                                                     NULL));
  
    /* Set the arguments of the phi node:  */
-  add_phi_arg (induction_phi, vec_init, pe);
-  add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop));
+  add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
+  add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
+              UNKNOWN_LOCATION);
  
  
    /* In case that vectorization factor (VF) is bigger than the number
@@ -2376,7 +2595,7 @@ get_initial_def_for_induction (gimple iv_phi)
       more than one vector stmt - i.e - we need to "unroll" the
       vector stmt by a factor VF/nunits.  For more details see documentation
       in vectorizable_operation.  */
-  
+
    if (ncopies > 1)
      {
        stmt_vec_info prev_stmt_vinfo;
@@ -2408,7 +2627,7 @@ get_initial_def_for_induction (gimple iv_phi)
           set_vinfo_for_stmt (new_stmt,
                               new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
           STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
-         prev_stmt_vinfo = vinfo_for_stmt (new_stmt); 
+         prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
         }
      }
  
@@ -2425,7 +2644,7 @@ get_initial_def_for_induction (gimple iv_phi)
               break;
             }
          }
-      if (exit_phi) 
+      if (exit_phi)
         {
           stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
           /* FORNOW. Currently not supporting the case that an inner-loop induction
@@ -2470,18 +2689,16 @@ get_initial_def_for_induction (gimple iv_phi)
          vector of partial results.
  
     Option1 (adjust in epilog): Initialize the vector as follows:
-     add:         [0,0,...,0,0]
-     mult:        [1,1,...,1,1]
-     min/max:     [init_val,init_val,..,init_val,init_val]
-     bit and/or:  [init_val,init_val,..,init_val,init_val]
+     add/bit or/xor:    [0,0,...,0,0]
+     mult/bit and:      [1,1,...,1,1]
+     min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
     and when necessary (e.g. add/mult case) let the caller know
     that it needs to adjust the result by init_val.
  
     Option2: Initialize the vector as follows:
-     add:         [0,0,...,0,init_val]
-     mult:        [1,1,...,1,init_val]
-     min/max:     [init_val,init_val,...,init_val]
-     bit and/or:  [init_val,init_val,...,init_val]
+     add/bit or/xor:    [init_val,0,0,...,0]
+     mult/bit and:      [init_val,1,1,...,1]
+     min/max/cond_expr: [init_val,init_val,...,init_val]
     and no adjustments are needed.
  
     For example, for the following code:
@@ -2496,11 +2713,14 @@ get_initial_def_for_induction (gimple iv_phi)
     the result at the end by 'init_val'.
  
     FORNOW, we are using the 'adjust in epilog' scheme, because this way the
-   initialization vector is simpler (same element in all entries).
+   initialization vector is simpler (same element in all entries), if
+   ADJUSTMENT_DEF is not NULL, and Option2 otherwise.
+
     A cost model should help decide between these two schemes.  */
  
  tree
-get_initial_def_for_reduction (gimple stmt, tree init_val, tree *adjustment_def)
+get_initial_def_for_reduction (gimple stmt, tree init_val,
+                               tree *adjustment_def)
  {
    stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
    loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
@@ -2513,58 +2733,136 @@ get_initial_def_for_reduction (gimple stmt, tree init_val, tree *adjustment_def)
    tree init_def;
    tree t = NULL_TREE;
    int i;
-  bool nested_in_vect_loop = false; 
+  bool nested_in_vect_loop = false;
+  tree init_value;
+  REAL_VALUE_TYPE real_init_val = dconst0;
+  int int_init_val = 0;
+  gimple def_stmt = NULL;
  
    gcc_assert (vectype);
    nunits = TYPE_VECTOR_SUBPARTS (vectype);
  
    gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
               || SCALAR_FLOAT_TYPE_P (scalar_type));
+
    if (nested_in_vect_loop_p (loop, stmt))
      nested_in_vect_loop = true;
    else
      gcc_assert (loop == (gimple_bb (stmt))->loop_father);
  
+  /* In case of double reduction we only create a vector variable to be put
+     in the reduction phi node. The actual statement creation is done in
+     vect_create_epilog_for_reduction.  */
+  if (adjustment_def && nested_in_vect_loop
+      && TREE_CODE (init_val) == SSA_NAME
+      && (def_stmt = SSA_NAME_DEF_STMT (init_val))
+      && gimple_code (def_stmt) == GIMPLE_PHI
+      && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
+      && vinfo_for_stmt (def_stmt)
+      && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
+          == vect_double_reduction_def)
+    {
+      *adjustment_def = NULL;
+      return vect_create_destination_var (init_val, vectype);
+    }
+
+  if (TREE_CONSTANT (init_val))
+    {
+      if (SCALAR_FLOAT_TYPE_P (scalar_type))
+        init_value = build_real (scalar_type, TREE_REAL_CST (init_val));
+      else
+        init_value = build_int_cst (scalar_type, TREE_INT_CST_LOW (init_val));
+    }
+  else
+    init_value = init_val;
+
    switch (code)
-  {
-  case WIDEN_SUM_EXPR:
-  case DOT_PROD_EXPR:
-  case PLUS_EXPR:
-    if (nested_in_vect_loop)
-      *adjustment_def = vect_get_vec_def_for_operand (init_val, stmt, NULL);
-    else
-      *adjustment_def = init_val;
-    /* Create a vector of zeros for init_def.  */
-    if (SCALAR_FLOAT_TYPE_P (scalar_type))
-      def_for_init = build_real (scalar_type, dconst0);
-    else
-      def_for_init = build_int_cst (scalar_type, 0);
-      
-    for (i = nunits - 1; i >= 0; --i)
-      t = tree_cons (NULL_TREE, def_for_init, t);
-    init_def = build_vector (vectype, t);
-    break;
-
-  case MIN_EXPR:
-  case MAX_EXPR:
-    *adjustment_def = NULL_TREE;
-    init_def = vect_get_vec_def_for_operand (init_val, stmt, NULL);
-    break;
-
-  default:
-    gcc_unreachable ();
-  }
+    {
+      case WIDEN_SUM_EXPR:
+      case DOT_PROD_EXPR:
+      case PLUS_EXPR:
+      case MINUS_EXPR:
+      case BIT_IOR_EXPR:
+      case BIT_XOR_EXPR:
+      case MULT_EXPR:
+      case BIT_AND_EXPR:
+        /* ADJUSTMENT_DEF is NULL when called from
+           vect_create_epilog_for_reduction to vectorize double reduction.  */
+        if (adjustment_def)
+          {
+            if (nested_in_vect_loop)
+              *adjustment_def = vect_get_vec_def_for_operand (init_val, stmt,
+                                                              NULL);
+            else
+              *adjustment_def = init_val;
+          }
+
+        if (code == MULT_EXPR || code == BIT_AND_EXPR)
+          {
+            real_init_val = dconst1;
+            int_init_val = 1;
+          }
+
+        if (SCALAR_FLOAT_TYPE_P (scalar_type))
+          def_for_init = build_real (scalar_type, real_init_val);
+        else
+          def_for_init = build_int_cst (scalar_type, int_init_val);
+
+        /* Create a vector of '0' or '1' except the first element.  */
+        for (i = nunits - 2; i >= 0; --i)
+          t = tree_cons (NULL_TREE, def_for_init, t);
+
+        /* Option1: the first element is '0' or '1' as well.  */
+        if (adjustment_def)
+          {
+            t = tree_cons (NULL_TREE, def_for_init, t);
+            init_def = build_vector (vectype, t);
+            break;
+          }
+
+        /* Option2: the first element is INIT_VAL.  */
+        t = tree_cons (NULL_TREE, init_value, t);
+        if (TREE_CONSTANT (init_val))
+          init_def = build_vector (vectype, t);
+        else
+          init_def = build_constructor_from_list (vectype, t);
+
+        break;
+
+      case MIN_EXPR:
+      case MAX_EXPR:
+      case COND_EXPR:
+        if (adjustment_def)
+          {
+            *adjustment_def = NULL_TREE;
+            init_def = vect_get_vec_def_for_operand (init_val, stmt, NULL);
+            break;
+          }
+
+        for (i = nunits - 1; i >= 0; --i)
+          t = tree_cons (NULL_TREE, init_value, t);
+
+        if (TREE_CONSTANT (init_val))
+          init_def = build_vector (vectype, t);
+        else
+          init_def = build_constructor_from_list (vectype, t);
+
+        break;
+
+      default:
+        gcc_unreachable ();
+    }
  
    return init_def;
  }
  
  
  /* Function vect_create_epilog_for_reduction
-    
+
     Create code at the loop-epilog to finalize the result of a reduction
-   computation. 
-  
-   VECT_DEF is a vector of partial results. 
+   computation.
+
+   VECT_DEF is a vector of partial results.
     REDUC_CODE is the tree-code for the epilog reduction.
     NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
       number of elements that we can fit in a vectype (nunits). In this case
@@ -2573,20 +2871,23 @@ get_initial_def_for_reduction (gimple stmt, tree init_val, tree *adjustment_def)
       in vectorizable_operation.
     STMT is the scalar reduction stmt that is being vectorized.
     REDUCTION_PHI is the phi-node that carries the reduction computation.
+   REDUC_INDEX is the index of the operand in the right hand side of the
+     statement that is defined by REDUCTION_PHI.
+   DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
  
     This function:
-   1. Creates the reduction def-use cycle: sets the arguments for 
+   1. Creates the reduction def-use cycle: sets the arguments for
        REDUCTION_PHI:
        The loop-entry argument is the vectorized initial-value of the reduction.
        The loop-latch argument is VECT_DEF - the vector of partial sums.
     2. "Reduces" the vector of partial results VECT_DEF into a single result,
-      by applying the operation specified by REDUC_CODE if available, or by 
+      by applying the operation specified by REDUC_CODE if available, or by
        other means (whole-vector shifts or a scalar loop).
-      The function also creates a new phi node at the loop exit to preserve 
+      The function also creates a new phi node at the loop exit to preserve
        loop-closed form, as illustrated below.
-  
+
       The flow at the entry to this function:
-    
+
          loop:
            vec_def = phi <null, null>            # REDUCTION_PHI
            VECT_DEF = vector_stmt                # vectorized form of STMT
@@ -2601,7 +2902,7 @@ get_initial_def_for_reduction (gimple stmt, tree init_val, tree *adjustment_def)
          loop:
            vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
            VECT_DEF = vector_stmt                # vectorized form of STMT
-          s_loop = scalar_stmt                  # (scalar) STMT 
+          s_loop = scalar_stmt                  # (scalar) STMT
          loop_exit:
            s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
            v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
@@ -2616,14 +2917,16 @@ static void
  vect_create_epilog_for_reduction (tree vect_def, gimple stmt,
                                   int ncopies,
                                   enum tree_code reduc_code,
-                                 gimple reduction_phi)
+                                 gimple reduction_phi,
+                                  int reduc_index,
+                                  bool double_reduc)
  {
    stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
    stmt_vec_info prev_phi_info;
    tree vectype;
    enum machine_mode mode;
    loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
-  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
+  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
    basic_block exit_bb;
    tree scalar_dest;
    tree scalar_type;
@@ -2635,7 +2938,7 @@ vect_create_epilog_for_reduction (tree vect_def, gimple stmt,
    gimple epilog_stmt = NULL;
    tree new_scalar_dest, new_dest;
    gimple exit_phi;
-  tree bitsize, bitpos, bytesize; 
+  tree bitsize, bitpos;
    enum tree_code code = gimple_assign_rhs_code (stmt);
    tree adjustment_def;
    tree vec_initial_def, def;
@@ -2650,24 +2953,27 @@ vect_create_epilog_for_reduction (tree vect_def, gimple stmt,
    VEC(gimple,heap) *phis = NULL;
    enum vect_def_type dt = vect_unknown_def_type;
    int j, i;
-  
+
    if (nested_in_vect_loop_p (loop, stmt))
      {
+      outer_loop = loop;
        loop = loop->inner;
        nested_in_vect_loop = true;
      }
-  
+
    switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
      {
      case GIMPLE_SINGLE_RHS:
-      gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt)) == ternary_op);
-      reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), 2);
+      gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt))
+                                       == ternary_op);
+      reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), reduc_index);
        break;
      case GIMPLE_UNARY_RHS:
        reduction_op = gimple_assign_rhs1 (stmt);
        break;
      case GIMPLE_BINARY_RHS:
-      reduction_op = gimple_assign_rhs2 (stmt);
+      reduction_op = reduc_index ?
+                     gimple_assign_rhs2 (stmt) : gimple_assign_rhs1 (stmt);
        break;
      default:
        gcc_unreachable ();
@@ -2678,31 +2984,32 @@ vect_create_epilog_for_reduction (tree vect_def, gimple stmt,
    mode = TYPE_MODE (vectype);
  
    /*** 1. Create the reduction def-use cycle  ***/
-  
+
    /* For the case of reduction, vect_get_vec_def_for_operand returns
       the scalar def before the loop, that defines the initial value
       of the reduction variable.  */
    vec_initial_def = vect_get_vec_def_for_operand (reduction_op, stmt,
-                                                 &adjustment_def);
+                                                 &adjustment_def);
  
    phi = reduction_phi;
    def = vect_def;
    for (j = 0; j < ncopies; j++)
      {
        /* 1.1 set the loop-entry arg of the reduction-phi:  */
-      add_phi_arg (phi, vec_initial_def, loop_preheader_edge (loop));
+      add_phi_arg (phi, vec_initial_def, loop_preheader_edge (loop),
+                  UNKNOWN_LOCATION);
  
        /* 1.2 set the loop-latch arg for the reduction-phi:  */
        if (j > 0)
          def = vect_get_vec_def_for_stmt_copy (dt, def);
-      add_phi_arg (phi, def, loop_latch_edge (loop));
+      add_phi_arg (phi, def, loop_latch_edge (loop), UNKNOWN_LOCATION);
  
        if (vect_print_dump_info (REPORT_DETAILS))
         {
           fprintf (vect_dump, "transform reduction: created def-use cycle: ");
           print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
-         fprintf (vect_dump, "\n");
-         print_gimple_stmt (vect_dump, SSA_NAME_DEF_STMT (def), 0, TDF_SLIM);
+          fprintf (vect_dump, "\n");
+          print_gimple_stmt (vect_dump, SSA_NAME_DEF_STMT (def), 0, TDF_SLIM);
         }
  
        phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
@@ -2719,9 +3026,9 @@ vect_create_epilog_for_reduction (tree vect_def, gimple stmt,
            Step 1 can be accomplished using one the following three schemes:
            (scheme 1) using reduc_code, if available.
            (scheme 2) using whole-vector shifts, if available.
-          (scheme 3) using a scalar loop. In this case steps 1+2 above are 
+          (scheme 3) using a scalar loop. In this case steps 1+2 above are
                       combined.
-                
+
            The overall epilog code looks like this:
  
            s_out0 = phi <s_loop>         # original EXIT_PHI
@@ -2755,16 +3062,17 @@ vect_create_epilog_for_reduction (tree vect_def, gimple stmt,
        SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
        prev_phi_info = vinfo_for_stmt (phi);
      }
+
    exit_gsi = gsi_after_labels (exit_bb);
  
-  /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3 
+  /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
           (i.e. when reduc_code is not available) and in the final adjustment
          code (if needed).  Also get the original scalar reduction variable as
-         defined in the loop.  In case STMT is a "pattern-stmt" (i.e. - it 
-         represents a reduction pattern), the tree-code and scalar-def are 
-         taken from the original stmt that the pattern-stmt (STMT) replaces.  
+         defined in the loop.  In case STMT is a "pattern-stmt" (i.e. - it
+         represents a reduction pattern), the tree-code and scalar-def are
+         taken from the original stmt that the pattern-stmt (STMT) replaces.
           Otherwise (it is a regular reduction) - the tree-code and scalar-def
-         are taken from STMT.  */ 
+         are taken from STMT.  */
  
    orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
    if (!orig_stmt)
@@ -2779,22 +3087,32 @@ vect_create_epilog_for_reduction (tree vect_def, gimple stmt,
        gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
        gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
      }
+
    code = gimple_assign_rhs_code (orig_stmt);
    scalar_dest = gimple_assign_lhs (orig_stmt);
    scalar_type = TREE_TYPE (scalar_dest);
    new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
    bitsize = TYPE_SIZE (scalar_type);
-  bytesize = TYPE_SIZE_UNIT (scalar_type);
  
+  /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
+     partial results are added and not subtracted.  */
+  if (code == MINUS_EXPR)
+    code = PLUS_EXPR;
  
    /* In case this is a reduction in an inner-loop while vectorizing an outer
       loop - we don't need to extract a single scalar result at the end of the
-     inner-loop.  The final vector of partial results will be used in the
-     vectorized outer-loop, or reduced to a scalar result at the end of the
-     outer-loop.  */
-  if (nested_in_vect_loop)
+     inner-loop (unless it is double reduction, i.e., the use of reduction is
+     outside the outer-loop). The final vector of partial results will be used
+     in the vectorized outer-loop, or reduced to a scalar result at the end of
+     the outer-loop.  */
+  if (nested_in_vect_loop && !double_reduc)
      goto vect_finalize_reduction;
  
+  /* The epilogue is created for the outer-loop, i.e., for the loop being
+     vectorized.  */
+  if (double_reduc)
+    loop = outer_loop;
+
    /* FORNOW */
    gcc_assert (ncopies == 1);
  
@@ -2869,6 +3187,7 @@ vect_create_epilog_for_reduction (tree vect_def, gimple stmt,
                bit_offset /= 2)
             {
               tree bitpos = size_int (bit_offset);
+
               epilog_stmt = gimple_build_assign_with_ops (shift_code, vec_dest,
                                                           new_temp, bitpos);
               new_name = make_ssa_name (vec_dest, epilog_stmt);
@@ -2888,10 +3207,10 @@ vect_create_epilog_for_reduction (tree vect_def, gimple stmt,
          {
           tree rhs;
  
-         /*** Case 3: Create:  
+         /*** Case 3: Create:
              s = extract_field <v_out2, 0>
-            for (offset = element_size; 
-                 offset < vector_size; 
+            for (offset = element_size;
+                 offset < vector_size;
                   offset += element_size;)
                {
                  Create:  s' = extract_field <v_out2, offset>
@@ -2909,15 +3228,15 @@ vect_create_epilog_for_reduction (tree vect_def, gimple stmt,
           new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
           gimple_assign_set_lhs (epilog_stmt, new_temp);
           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
-             
+
           for (bit_offset = element_bitsize;
                bit_offset < vec_size_in_bits;
                bit_offset += element_bitsize)
-           { 
+           {
               tree bitpos = bitsize_int (bit_offset);
               tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
                                  bitpos);
-               
+
               epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
               new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
               gimple_assign_set_lhs (epilog_stmt, new_name);
@@ -2937,12 +3256,12 @@ vect_create_epilog_for_reduction (tree vect_def, gimple stmt,
  
    /* 2.4  Extract the final scalar result.  Create:
           s_out3 = extract_field <v_out2, bitpos>  */
-  
+
    if (extract_scalar_result)
      {
        tree rhs;
  
-      gcc_assert (!nested_in_vect_loop);
+      gcc_assert (!nested_in_vect_loop || double_reduc);
        if (vect_print_dump_info (REPORT_DETAILS))
         fprintf (vect_dump, "extract scalar result");
  
@@ -2962,6 +3281,9 @@ vect_create_epilog_for_reduction (tree vect_def, gimple stmt,
  
  vect_finalize_reduction:
  
+  if (double_reduc)
+    loop = loop->inner;
+
    /* 2.5 Adjust the final result by the initial value of the reduction
          variable. (When such adjustment is not needed, then
          'adjustment_def' is zero).  For example, if code is PLUS we create:
@@ -2981,6 +3303,7 @@ vect_finalize_reduction:
           expr = build2 (code, scalar_type, new_temp, adjustment_def);
           new_dest = vect_create_destination_var (scalar_dest, scalar_type);
         }
+
        epilog_stmt = gimple_build_assign (new_dest, expr);
        new_temp = make_ssa_name (new_dest, epilog_stmt);
        gimple_assign_set_lhs (epilog_stmt, new_temp);
@@ -2993,7 +3316,7 @@ vect_finalize_reduction:
  
    /* Replace uses of s_out0 with uses of s_out3:
       Find the loop-closed-use at the loop exit of the original scalar result.
-     (The reduction result is expected to have two immediate uses - one at the 
+     (The reduction result is expected to have two immediate uses - one at the
       latch block, and one at the loop exit).  */
    phis = VEC_alloc (gimple, heap, 10);
    FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
@@ -3004,6 +3327,7 @@ vect_finalize_reduction:
           VEC_quick_push (gimple, phis, exit_phi);
         }
      }
+
    /* We expect to have found an exit_phi because of loop-closed-ssa form.  */
    gcc_assert (!VEC_empty (gimple, phis));
  
@@ -3012,22 +3336,104 @@ vect_finalize_reduction:
        if (nested_in_vect_loop)
         {
           stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
+          gimple vect_phi;
  
           /* FORNOW. Currently not supporting the case that an inner-loop
              reduction is not used in the outer-loop (but only outside the
-            outer-loop).  */
-         gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo) 
-                     && !STMT_VINFO_LIVE_P (stmt_vinfo));
+            outer-loop), unless it is double reduction.  */
+         gcc_assert ((STMT_VINFO_RELEVANT_P (stmt_vinfo)
+                      && !STMT_VINFO_LIVE_P (stmt_vinfo)) || double_reduc);
  
           epilog_stmt = adjustment_def ? epilog_stmt : new_phi;
           STMT_VINFO_VEC_STMT (stmt_vinfo) = epilog_stmt;
-         set_vinfo_for_stmt (epilog_stmt, 
-                             new_stmt_vec_info (epilog_stmt, loop_vinfo, 
+         set_vinfo_for_stmt (epilog_stmt,
+                             new_stmt_vec_info (epilog_stmt, loop_vinfo,
                                                  NULL));
           if (adjustment_def)
             STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
                 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
-         continue;
+
+          if (!double_reduc
+              || STMT_VINFO_DEF_TYPE (stmt_vinfo) != vect_double_reduction_def)
+            continue;
+
+          /* Handle double reduction:
+
+             stmt1: s1 = phi <s0, s2>  - double reduction phi (outer loop)
+             stmt2:   s3 = phi <s1, s4> - (regular) reduction phi (inner loop)
+             stmt3:   s4 = use (s3)     - (regular) reduction stmt (inner loop)
+             stmt4: s2 = phi <s4>      - double reduction stmt (outer loop)
+
+             At that point the regular reduction (stmt2 and stmt3) is already
+             vectorized, as well as the exit phi node, stmt4.
+             Here we vectorize the phi node of double reduction, stmt1, and
+             update all relevant statements.  */
+
+          /* Go through all the uses of s2 to find double reduction phi node,
+             i.e., stmt1 above.  */
+          orig_name = PHI_RESULT (exit_phi);
+          FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
+            {
+              stmt_vec_info use_stmt_vinfo = vinfo_for_stmt (use_stmt);
+              stmt_vec_info new_phi_vinfo;
+              tree vect_phi_init, preheader_arg, vect_phi_res, init_def;
+              basic_block bb = gimple_bb (use_stmt);
+              gimple use;
+
+              /* Check that USE_STMT is really double reduction phi node.  */
+              if (gimple_code (use_stmt) != GIMPLE_PHI
+                  || gimple_phi_num_args (use_stmt) != 2
+                  || !use_stmt_vinfo
+                  || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
+                      != vect_double_reduction_def
+                  || bb->loop_father != outer_loop)
+                continue;
+
+              /* Create vector phi node for double reduction:
+                 vs1 = phi <vs0, vs2>
+                 vs1 was created previously in this function by a call to
+                 vect_get_vec_def_for_operand and is stored in vec_initial_def;
+                 vs2 is defined by EPILOG_STMT, the vectorized EXIT_PHI;
+                 vs0 is created here.  */
+
+              /* Create vector phi node.  */
+              vect_phi = create_phi_node (vec_initial_def, bb);
+              new_phi_vinfo = new_stmt_vec_info (vect_phi,
+                                    loop_vec_info_for_loop (outer_loop), NULL);
+              set_vinfo_for_stmt (vect_phi, new_phi_vinfo);
+
+              /* Create vs0 - initial def of the double reduction phi.  */
+              preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
+                                             loop_preheader_edge (outer_loop));
+              init_def = get_initial_def_for_reduction (stmt, preheader_arg,
+                                                        NULL);
+              vect_phi_init = vect_init_vector (use_stmt, init_def, vectype,
+                                                NULL);
+
+              /* Update phi node arguments with vs0 and vs2.  */
+              add_phi_arg (vect_phi, vect_phi_init,
+                           loop_preheader_edge (outer_loop), UNKNOWN_LOCATION);
+              add_phi_arg (vect_phi, PHI_RESULT (epilog_stmt),
+                           loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
+              if (vect_print_dump_info (REPORT_DETAILS))
+                {
+                  fprintf (vect_dump, "created double reduction phi node: ");
+                  print_gimple_stmt (vect_dump, vect_phi, 0, TDF_SLIM);
+                }
+
+              vect_phi_res = PHI_RESULT (vect_phi);
+
+              /* Replace the use, i.e., set the correct vs1 in the regular
+                 reduction phi node. FORNOW, NCOPIES is always 1, so the loop
+                 is redundant.  */
+              use = reduction_phi;
+              for (j = 0; j < ncopies; j++)
+                {
+                  edge pr_edge = loop_preheader_edge (loop);
+                  SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res);
+                  use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
+                }
+            }
         }
  
        /* Replace the uses:  */
@@ -3036,29 +3442,30 @@ vect_finalize_reduction:
         FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
           SET_USE (use_p, new_temp);
      }
+
    VEC_free (gimple, heap, phis);
-} 
+}
  
  
  /* Function vectorizable_reduction.
  
     Check if STMT performs a reduction operation that can be vectorized.
     If VEC_STMT is also passed, vectorize the STMT: create a vectorized
-   stmt to replace it, put it in VEC_STMT, and insert it at BSI.
+   stmt to replace it, put it in VEC_STMT, and insert it at GSI.
     Return FALSE if not a vectorizable STMT, TRUE otherwise.
  
-   This function also handles reduction idioms (patterns) that have been 
+   This function also handles reduction idioms (patterns) that have been
     recognized in advance during vect_pattern_recog. In this case, STMT may be
     of this form:
       X = pattern_expr (arg0, arg1, ..., X)
     and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
     sequence that had been detected and replaced by the pattern-stmt (STMT).
-  
+
     In some cases of reduction patterns, the type of the reduction variable X is
     different than the type of the other arguments of STMT.
     In such cases, the vectype that is used when transforming STMT into a vector
     stmt is different than the vectype that is used to determine the
-   vectorization factor, because it consists of a different number of elements 
+   vectorization factor, because it consists of a different number of elements
     than the actual number of elements that are being operated upon in parallel.
  
     For example, consider an accumulation of shorts into an int accumulator.
@@ -3088,7 +3495,8 @@ vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
    tree scalar_dest;
    tree loop_vec_def0 = NULL_TREE, loop_vec_def1 = NULL_TREE;
    stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
-  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
+  tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
+  tree vectype_in = NULL_TREE;
    loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
    struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
    enum tree_code code, orig_code, epilog_reduc_code;
@@ -3106,28 +3514,37 @@ vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
    stmt_vec_info orig_stmt_info;
    tree expr = NULL_TREE;
    int i;
-  int nunits = TYPE_VECTOR_SUBPARTS (vectype);
-  int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
+  int ncopies;
    int epilog_copies;
    stmt_vec_info prev_stmt_info, prev_phi_info;
    gimple first_phi = NULL;
    bool single_defuse_cycle = false;
-  tree reduc_def;
+  tree reduc_def = NULL_TREE;
    gimple new_stmt = NULL;
    int j;
    tree ops[3];
+  bool nested_cycle = false, found_nested_cycle_def = false;
+  gimple reduc_def_stmt = NULL;
+  /* The default is that the reduction variable is the last in statement.  */
+  int reduc_index = 2;
+  bool double_reduc = false, dummy;
+  basic_block def_bb;
+  struct loop * def_stmt_loop, *outer_loop = NULL;
+  tree def_arg;
+  gimple def_arg_stmt;
  
    if (nested_in_vect_loop_p (loop, stmt))
-    loop = loop->inner;
-
-  gcc_assert (ncopies >= 1);
+    {
+      outer_loop = loop;
+      loop = loop->inner;
+      nested_cycle = true;
+    }
  
    /* FORNOW: SLP not supported.  */
    if (STMT_SLP_TYPE (stmt_info))
      return false;
  
    /* 1. Is vectorizable reduction?  */
-
    /* Not supportable if the reduction variable is used in the loop.  */
    if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer)
      return false;
@@ -3139,10 +3556,11 @@ vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
      return false;
  
    /* Make sure it was already recognized as a reduction computation.  */
-  if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def)
+  if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
+      && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
      return false;
  
-  /* 2. Has this been recognized as a reduction pattern? 
+  /* 2. Has this been recognized as a reduction pattern?
  
       Check if STMT represents a pattern that has been recognized
       in earlier analysis stages.  For stmts that represent a pattern,
@@ -3157,7 +3575,7 @@ vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
        gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
        gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
      }
- 
+
    /* 3. Check the operands of the operation. The first operands are defined
          inside the loop body. The last operand is the reduction variable,
          which is defined by the loop-header-phi.  */
@@ -3198,68 +3616,121 @@ vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
  
    scalar_dest = gimple_assign_lhs (stmt);
    scalar_type = TREE_TYPE (scalar_dest);
-  if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type) 
+  if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
        && !SCALAR_FLOAT_TYPE_P (scalar_type))
      return false;
  
    /* All uses but the last are expected to be defined in the loop.
-     The last use is the reduction variable.  */
+     The last use is the reduction variable. In case of nested cycle this
+     assumption is not true: we use reduc_index to record the index of the
+     reduction variable.  */
    for (i = 0; i < op_type-1; i++)
      {
-      is_simple_use = vect_is_simple_use (ops[i], loop_vinfo, NULL, &def_stmt,
-                                         &def, &dt);
+      tree tem;
+
+      /* The condition of COND_EXPR is checked in vectorizable_condition().  */
+      if (i == 0 && code == COND_EXPR)
+        continue;
+
+      is_simple_use = vect_is_simple_use_1 (ops[i], loop_vinfo, NULL,
+                                           &def_stmt, &def, &dt, &tem);
+      if (!vectype_in)
+       vectype_in = tem;
        gcc_assert (is_simple_use);
        if (dt != vect_internal_def
           && dt != vect_external_def
           && dt != vect_constant_def
-         && dt != vect_induction_def)
+         && dt != vect_induction_def
+          && !(dt == vect_nested_cycle && nested_cycle))
         return false;
+
+      if (dt == vect_nested_cycle)
+        {
+          found_nested_cycle_def = true;
+          reduc_def_stmt = def_stmt;
+          reduc_index = i;
+        }
      }
  
-  is_simple_use = vect_is_simple_use (ops[i], loop_vinfo, NULL, &def_stmt, 
-                                      &def, &dt);
+  is_simple_use = vect_is_simple_use (ops[i], loop_vinfo, NULL, &def_stmt,
+                                     &def, &dt);
    gcc_assert (is_simple_use);
-  gcc_assert (dt == vect_reduction_def);
-  gcc_assert (gimple_code (def_stmt) == GIMPLE_PHI);
-  if (orig_stmt) 
-    gcc_assert (orig_stmt == vect_is_simple_reduction (loop_vinfo, def_stmt));
+  gcc_assert (dt == vect_reduction_def
+              || dt == vect_nested_cycle
+              || ((dt == vect_internal_def || dt == vect_external_def
+                   || dt == vect_constant_def || dt == vect_induction_def)
+                   && nested_cycle && found_nested_cycle_def));
+  if (!found_nested_cycle_def)
+    reduc_def_stmt = def_stmt;
+
+  gcc_assert (gimple_code (reduc_def_stmt) == GIMPLE_PHI);
+  if (orig_stmt)
+    gcc_assert (orig_stmt == vect_is_simple_reduction (loop_vinfo,
+                                                       reduc_def_stmt,
+                                                       !nested_cycle,
+                                                       &dummy));
    else
-    gcc_assert (stmt == vect_is_simple_reduction (loop_vinfo, def_stmt));
-  
-  if (STMT_VINFO_LIVE_P (vinfo_for_stmt (def_stmt)))
+    gcc_assert (stmt == vect_is_simple_reduction (loop_vinfo, reduc_def_stmt,
+                                                  !nested_cycle, &dummy));
+
+  if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt)))
      return false;
  
-  /* 4. Supportable by target?  */
  
-  /* 4.1. check support for the operation in the loop  */
-  optab = optab_for_tree_code (code, vectype, optab_default);
-  if (!optab)
+  ncopies = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
+            / TYPE_VECTOR_SUBPARTS (vectype_in));
+  gcc_assert (ncopies >= 1);
+
+  vec_mode = TYPE_MODE (vectype_in);
+
+  if (code == COND_EXPR)
      {
-      if (vect_print_dump_info (REPORT_DETAILS))
-        fprintf (vect_dump, "no optab.");
-      return false;
+      if (!vectorizable_condition (stmt, gsi, NULL, ops[reduc_index], 0))
+        {
+          if (vect_print_dump_info (REPORT_DETAILS))
+            fprintf (vect_dump, "unsupported condition in reduction");
+
+            return false;
+        }
      }
-  vec_mode = TYPE_MODE (vectype);
-  if (optab_handler (optab, vec_mode)->insn_code == CODE_FOR_nothing)
+  else
      {
-      if (vect_print_dump_info (REPORT_DETAILS))
-        fprintf (vect_dump, "op not supported by target.");
-      if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
-          || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
-            < vect_min_worthwhile_factor (code))
-        return false;
-      if (vect_print_dump_info (REPORT_DETAILS))
-       fprintf (vect_dump, "proceeding using word mode.");
-    }
+      /* 4. Supportable by target?  */
  
-  /* Worthwhile without SIMD support?  */
-  if (!VECTOR_MODE_P (TYPE_MODE (vectype))
-      && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
-        < vect_min_worthwhile_factor (code))
-    {
-      if (vect_print_dump_info (REPORT_DETAILS))
-       fprintf (vect_dump, "not worthwhile without SIMD support.");
-      return false;
+      /* 4.1. check support for the operation in the loop  */
+      optab = optab_for_tree_code (code, vectype_in, optab_default);
+      if (!optab)
+        {
+          if (vect_print_dump_info (REPORT_DETAILS))
+            fprintf (vect_dump, "no optab.");
+
+          return false;
+        }
+
+      if (optab_handler (optab, vec_mode)->insn_code == CODE_FOR_nothing)
+        {
+          if (vect_print_dump_info (REPORT_DETAILS))
+            fprintf (vect_dump, "op not supported by target.");
+
+          if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
+              || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
+                 < vect_min_worthwhile_factor (code))
+            return false;
+
+          if (vect_print_dump_info (REPORT_DETAILS))
+           fprintf (vect_dump, "proceeding using word mode.");
+        }
+
+      /* Worthwhile without SIMD support?  */
+      if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
+          && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
+            < vect_min_worthwhile_factor (code))
+        {
+          if (vect_print_dump_info (REPORT_DETAILS))
+           fprintf (vect_dump, "not worthwhile without SIMD support.");
+
+          return false;
+        }
      }
  
    /* 4.2. Check support for the epilog operation.
@@ -3275,24 +3746,24 @@ vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
                          STMT: int_acc = widen_sum <short_a, int_acc>
  
            This means that:
-          1. The tree-code that is used to create the vector operation in the 
-             epilog code (that reduces the partial results) is not the 
-             tree-code of STMT, but is rather the tree-code of the original 
-             stmt from the pattern that STMT is replacing. I.e, in the example 
-             above we want to use 'widen_sum' in the loop, but 'plus' in the 
+          1. The tree-code that is used to create the vector operation in the
+             epilog code (that reduces the partial results) is not the
+             tree-code of STMT, but is rather the tree-code of the original
+             stmt from the pattern that STMT is replacing. I.e, in the example
+             above we want to use 'widen_sum' in the loop, but 'plus' in the
               epilog.
            2. The type (mode) we use to check available target support
-             for the vector operation to be created in the *epilog*, is 
-             determined by the type of the reduction variable (in the example 
+             for the vector operation to be created in the *epilog*, is
+             determined by the type of the reduction variable (in the example
               above we'd check this: plus_optab[vect_int_mode]).
               However the type (mode) we use to check available target support
               for the vector operation to be created *inside the loop*, is
               determined by the type of the other arguments to STMT (in the
               example we'd check this: widen_sum_optab[vect_short_mode]).
-  
-          This is contrary to "regular" reductions, in which the types of all 
-          the arguments are the same as the type of the reduction variable. 
-          For "regular" reductions we can therefore use the same vector type 
+
+          This is contrary to "regular" reductions, in which the types of all
+          the arguments are the same as the type of the reduction variable.
+          For "regular" reductions we can therefore use the same vector type
            (and also the same tree-code) when generating the epilog code and
            when generating the code inside the loop.  */
  
@@ -3301,18 +3772,8 @@ vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
        /* This is a reduction pattern: get the vectype from the type of the
           reduction variable, and get the tree-code from orig_stmt.  */
        orig_code = gimple_assign_rhs_code (orig_stmt);
-      vectype = get_vectype_for_scalar_type (TREE_TYPE (def));
-      if (!vectype)
-       {
-          if (vect_print_dump_info (REPORT_DETAILS))
-            {
-              fprintf (vect_dump, "unsupported data-type ");
-              print_generic_expr (vect_dump, TREE_TYPE (def), TDF_SLIM);
-            }
-          return false;
-        }
-
-      vec_mode = TYPE_MODE (vectype);
+      gcc_assert (vectype_out);
+      vec_mode = TYPE_MODE (vectype_out);
      }
    else
      {
@@ -3321,22 +3782,64 @@ vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
        orig_code = code;
      }
  
-  if (!reduction_code_for_scalar_code (orig_code, &epilog_reduc_code))
-    return false;
-  reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype, optab_default);
-  if (!reduc_optab)
+  if (nested_cycle)
      {
-      if (vect_print_dump_info (REPORT_DETAILS))
-        fprintf (vect_dump, "no optab for reduction.");
-      epilog_reduc_code = ERROR_MARK;
+      def_bb = gimple_bb (reduc_def_stmt);
+      def_stmt_loop = def_bb->loop_father;
+      def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
+                                       loop_preheader_edge (def_stmt_loop));
+      if (TREE_CODE (def_arg) == SSA_NAME
+          && (def_arg_stmt = SSA_NAME_DEF_STMT (def_arg))
+          && gimple_code (def_arg_stmt) == GIMPLE_PHI
+          && flow_bb_inside_loop_p (outer_loop, gimple_bb (def_arg_stmt))
+          && vinfo_for_stmt (def_arg_stmt)
+          && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_arg_stmt))
+              == vect_double_reduction_def)
+        double_reduc = true;
      }
-  if (optab_handler (reduc_optab, vec_mode)->insn_code == CODE_FOR_nothing)
+
+  epilog_reduc_code = ERROR_MARK;
+  if (reduction_code_for_scalar_code (orig_code, &epilog_reduc_code))
+    {
+      reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype_out,
+                                         optab_default);
+      if (!reduc_optab)
+        {
+          if (vect_print_dump_info (REPORT_DETAILS))
+            fprintf (vect_dump, "no optab for reduction.");
+
+          epilog_reduc_code = ERROR_MARK;
+        }
+
+      if (reduc_optab
+          && optab_handler (reduc_optab, vec_mode)->insn_code
+              == CODE_FOR_nothing)
+        {
+          if (vect_print_dump_info (REPORT_DETAILS))
+            fprintf (vect_dump, "reduc op not supported by target.");
+
+          epilog_reduc_code = ERROR_MARK;
+        }
+    }
+  else
+    {
+      if (!nested_cycle || double_reduc)
+        {
+          if (vect_print_dump_info (REPORT_DETAILS))
+            fprintf (vect_dump, "no reduc code for scalar code.");
+
+          return false;
+        }
+    }
+
+  if (double_reduc && ncopies > 1)
      {
        if (vect_print_dump_info (REPORT_DETAILS))
-        fprintf (vect_dump, "reduc op not supported by target.");
-      epilog_reduc_code = ERROR_MARK;
+        fprintf (vect_dump, "multiple types in double reduction");
+
+      return false;
      }
- 
+
    if (!vec_stmt) /* transformation not required.  */
      {
        STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
@@ -3350,8 +3853,12 @@ vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
    if (vect_print_dump_info (REPORT_DETAILS))
      fprintf (vect_dump, "transform reduction.");
  
+  /* FORNOW: Multiple types are not supported for condition.  */
+  if (code == COND_EXPR)
+    gcc_assert (ncopies == 1);
+
    /* Create the destination vector  */
-  vec_dest = vect_create_destination_var (scalar_dest, vectype);
+  vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
  
    /* In case the vectorization factor (VF) is bigger than the number
       of elements that we can fit in a vectype (nunits), we have to generate
@@ -3396,21 +3903,38 @@ vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
         {
           /* Create the reduction-phi that defines the reduction-operand.  */
           new_phi = create_phi_node (vec_dest, loop->header);
-         set_vinfo_for_stmt (new_phi, new_stmt_vec_info (new_phi, loop_vinfo, 
+         set_vinfo_for_stmt (new_phi, new_stmt_vec_info (new_phi, loop_vinfo,
                                                           NULL));
+          /* Get the vector def for the reduction variable from the phi
+             node.  */
+          reduc_def = PHI_RESULT (new_phi);
         }
  
+      if (code == COND_EXPR)
+        {
+          first_phi = new_phi;
+          vectorizable_condition (stmt, gsi, vec_stmt, reduc_def, reduc_index);
+          /* Multiple types are not supported for condition.  */
+          break;
+        }
+
        /* Handle uses.  */
        if (j == 0)
          {
-         loop_vec_def0 = vect_get_vec_def_for_operand (ops[0], stmt, NULL);
+         loop_vec_def0 = vect_get_vec_def_for_operand (ops[!reduc_index],
+                                                        stmt, NULL);
            if (op_type == ternary_op)
              {
-             loop_vec_def1 = vect_get_vec_def_for_operand (ops[1], stmt, NULL);
+              if (reduc_index == 0)
+               loop_vec_def1 = vect_get_vec_def_for_operand (ops[2], stmt,
+                                                              NULL);
+              else
+                loop_vec_def1 = vect_get_vec_def_for_operand (ops[1], stmt,
+                                                              NULL);
              }
  
-          /* Get the vector def for the reduction variable from the phi node */
-          reduc_def = PHI_RESULT (new_phi);
+          /* Get the vector def for the reduction variable from the phi
+             node.  */
           first_phi = new_phi;
          }
        else
@@ -3428,12 +3952,30 @@ vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
           STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
          }
  
-      /* Arguments are ready. create the new vector stmt.  */
+      /* Arguments are ready. Create the new vector stmt.  */
        if (op_type == binary_op)
-        expr = build2 (code, vectype, loop_vec_def0, reduc_def);
+        {
+          if (reduc_index == 0)
+            expr = build2 (code, vectype_out, reduc_def, loop_vec_def0);
+          else
+            expr = build2 (code, vectype_out, loop_vec_def0, reduc_def);
+        }
        else
-        expr = build3 (code, vectype, loop_vec_def0, loop_vec_def1, 
-                      reduc_def);
+        {
+          if (reduc_index == 0)
+            expr = build3 (code, vectype_out, reduc_def, loop_vec_def0,
+                           loop_vec_def1);
+          else
+            {
+              if (reduc_index == 1)
+                expr = build3 (code, vectype_out, loop_vec_def0, reduc_def,
+                               loop_vec_def1);
+              else
+                expr = build3 (code, vectype_out, loop_vec_def0, loop_vec_def1,
+                              reduc_def);
+            }
+        }
+
        new_stmt = gimple_build_assign (vec_dest, expr);
        new_temp = make_ssa_name (vec_dest, new_stmt);
        gimple_assign_set_lhs (new_stmt, new_temp);
@@ -3443,16 +3985,19 @@ vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
         STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
        else
         STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
+
        prev_stmt_info = vinfo_for_stmt (new_stmt);
        prev_phi_info = vinfo_for_stmt (new_phi);
      }
  
    /* Finalize the reduction-phi (set its arguments) and create the
       epilog reduction code.  */
-  if (!single_defuse_cycle)
+  if (!single_defuse_cycle || code == COND_EXPR)
      new_temp = gimple_assign_lhs (*vec_stmt);
+
    vect_create_epilog_for_reduction (new_temp, stmt, epilog_copies,
-                                   epilog_reduc_code, first_phi);
+                                   epilog_reduc_code, first_phi, reduc_index,
+                                    double_reduc);
    return true;
  }
  
@@ -3544,7 +4089,7 @@ vectorizable_induction (gimple phi, gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
  
  /* Function vectorizable_live_operation.
  
-   STMT computes a value that is used outside the loop. Check if 
+   STMT computes a value that is used outside the loop.  Check if
     it can be supported.  */
  
  bool
@@ -3560,7 +4105,7 @@ vectorizable_live_operation (gimple stmt,
    tree op;
    tree def;
    gimple def_stmt;
-  enum vect_def_type dt; 
+  enum vect_def_type dt;
    enum tree_code code;
    enum gimple_rhs_class rhs_class;
  
@@ -3611,6 +4156,44 @@ vectorizable_live_operation (gimple stmt,
    return true;
  }
  
+/* For each SSA name defined by STMT (a PHI or an ordinary statement),
+   visit the statements that use it.  Debug bind statements found
+   outside LOOP get their bound value reset and are then updated; users
+   that are not debug statements, or that lie inside LOOP, are skipped.
+   Any other kind of debug statement outside LOOP is not expected.  */
+
+static void
+vect_loop_kill_debug_uses (struct loop *loop, gimple stmt)
+{
+  def_operand_p def;
+  ssa_op_iter def_iter;
+
+  FOR_EACH_PHI_OR_STMT_DEF (def, stmt, def_iter, SSA_OP_DEF)
+    {
+      imm_use_iterator use_iter;
+      gimple use_stmt;
+
+      FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def))
+       {
+         /* Only debug statements located outside LOOP matter here.  */
+         if (!is_gimple_debug (use_stmt)
+             || flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
+           continue;
+
+         if (!gimple_debug_bind_p (use_stmt))
+           gcc_unreachable ();
+         else
+           {
+             if (vect_print_dump_info (REPORT_DETAILS))
+               fprintf (vect_dump, "killing debug use");
+
+             gimple_debug_bind_reset_value (use_stmt);
+             update_stmt (use_stmt);
+           }
+       }
+    }
+}
+
  /* Function vect_transform_loop.
  
     The analysis phase has determined that the loop is vectorizable.
@@ -3648,16 +4231,12 @@ vect_transform_loop (loop_vec_info loop_vinfo)
         || (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
            && LOOP_VINFO_INT_NITERS (loop_vinfo) % vectorization_factor != 0));
  
-  if (VEC_length (gimple, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo))
-      || VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
+  if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
+      || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
      vect_loop_versioning (loop_vinfo,
                           !do_peeling_for_loop_bound,
                           &cond_expr, &cond_expr_stmt_list);
  
-  /* CHECKME: we wouldn't need this if we called update_ssa once
-     for all loops.  */
-  bitmap_zero (vect_memsyms_to_rename);
-  
    /* If the loop has a symbolic number of iterations 'n' (i.e. it's not a
       compile time constant), or it is a constant that doesn't divide by the
       vectorization factor, then an epilog loop needs to be created.
@@ -3681,8 +4260,8 @@ vect_transform_loop (loop_vec_info loop_vinfo)
    split_edge (loop_preheader_edge (loop));
  
    /* FORNOW: the vectorizer supports only loops which body consist
-     of one basic block (header + empty latch). When the vectorizer will 
-     support more involved loop forms, the order by which the BBs are 
+     of one basic block (header + empty latch).  When the vectorizer
+     supports more involved loop forms, the order in which the BBs are
       traversed need to be reconsidered.  */
  
    for (i = 0; i < nbbs; i++)
@@ -3703,6 +4282,9 @@ vect_transform_loop (loop_vec_info loop_vinfo)
           if (!stmt_info)
             continue;
  
+         if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
+           vect_loop_kill_debug_uses (loop, phi);
+
           if (!STMT_VINFO_RELEVANT_P (stmt_info)
               && !STMT_VINFO_LIVE_P (stmt_info))
             continue;
@@ -3729,7 +4311,7 @@ vect_transform_loop (loop_vec_info loop_vinfo)
             {
               fprintf (vect_dump, "------>vectorizing statement: ");
               print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
-           }   
+           }
  
           stmt_info = vinfo_for_stmt (stmt);
  
@@ -3742,6 +4324,9 @@ vect_transform_loop (loop_vec_info loop_vinfo)
               continue;
             }
  
+         if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
+           vect_loop_kill_debug_uses (loop, stmt);
+
           if (!STMT_VINFO_RELEVANT_P (stmt_info)
               && !STMT_VINFO_LIVE_P (stmt_info))
             {
@@ -3780,7 +4365,7 @@ vect_transform_loop (loop_vec_info loop_vinfo)
                   continue;
                 }
             }
-         
+
           /* -------- vectorize statement ------------ */
           if (vect_print_dump_info (REPORT_DETAILS))
             fprintf (vect_dump, "transform statement.");
@@ -3812,8 +4397,6 @@ vect_transform_loop (loop_vec_info loop_vinfo)
  
    slpeel_make_loop_iterate_ntimes (loop, ratio);
  
-  mark_set_for_renaming (vect_memsyms_to_rename);
-
    /* The memory tags and pointers in vectorized statements need to
       have their SSA forms updated.  FIXME, why can't this be delayed
       until all the loops have been transformed?  */