PR c++/34513

[pf3gnuchains/gcc-fork.git] / gcc / tree-vectorizer.c
diff --git a/gcc/tree-vectorizer.c b/gcc/tree-vectorizer.c

index 396146d..0131b9a 100644 (file)
--- a/gcc/tree-vectorizer.c
+++ b/gcc/tree-vectorizer.c
@@ -918,20 +918,29 @@ slpeel_tree_duplicate_loop_to_edge_cfg (struct loop *loop, edge e)
  
  static edge
  slpeel_add_loop_guard (basic_block guard_bb, tree cond, basic_block exit_bb,
-                       basic_block dom_bb)
+                      basic_block dom_bb)
  {
    block_stmt_iterator bsi;
    edge new_e, enter_e;
    tree cond_stmt;
+  tree gimplify_stmt_list;
  
    enter_e = EDGE_SUCC (guard_bb, 0);
    enter_e->flags &= ~EDGE_FALLTHRU;
    enter_e->flags |= EDGE_FALSE_VALUE;
    bsi = bsi_last (guard_bb);
  
+  cond =
+    force_gimple_operand (cond, &gimplify_stmt_list, true,
+                         NULL_TREE);
    cond_stmt = build3 (COND_EXPR, void_type_node, cond,
                       NULL_TREE, NULL_TREE);
+  if (gimplify_stmt_list)
+    bsi_insert_after (&bsi, gimplify_stmt_list, BSI_NEW_STMT);
+
+  bsi = bsi_last (guard_bb);
    bsi_insert_after (&bsi, cond_stmt, BSI_NEW_STMT);
+
    /* Add new edge to connect guard block to the merge/loop-exit block.  */
    new_e = make_edge (guard_bb, exit_bb, EDGE_TRUE_VALUE);
    set_immediate_dominator (CDI_DOMINATORS, exit_bb, dom_bb);
@@ -948,7 +957,7 @@ slpeel_add_loop_guard (basic_block guard_bb, tree cond, basic_block exit_bb,
   */
  
  bool
-slpeel_can_duplicate_loop_p (struct loop *loop, edge e)
+slpeel_can_duplicate_loop_p (const struct loop *loop, const_edge e)
  {
    edge exit_e = single_exit (loop);
    edge entry_e = loop_preheader_edge (loop);
@@ -1007,12 +1016,89 @@ slpeel_verify_cfg_after_peeling (struct loop *first_loop,
  }
  #endif
  
+/* If the run time cost model check determines that vectorization is
+   not profitable and hence scalar loop should be generated then set
+   FIRST_NITERS to prologue peeled iterations. This will allow all the
+   iterations to be executed in the prologue peeled scalar loop.  */
+
+void
+set_prologue_iterations (basic_block bb_before_first_loop,
+                        tree first_niters,
+                        struct loop *loop,
+                        unsigned int th)
+{
+  edge e;
+  basic_block cond_bb, then_bb;
+  tree var, prologue_after_cost_adjust_name, stmt;
+  block_stmt_iterator bsi;
+  tree newphi;
+  edge e_true, e_false, e_fallthru;
+  tree cond_stmt;
+  tree gimplify_stmt_list;
+  tree cost_pre_condition = NULL_TREE;
+  tree scalar_loop_iters = 
+    unshare_expr (LOOP_VINFO_NITERS_UNCHANGED (loop_vec_info_for_loop (loop)));
+
+  e = single_pred_edge (bb_before_first_loop);
+  cond_bb = split_edge(e);
+
+  e = single_pred_edge (bb_before_first_loop);
+  then_bb = split_edge(e);
+  set_immediate_dominator (CDI_DOMINATORS, then_bb, cond_bb);
+
+  e_false = make_single_succ_edge (cond_bb, bb_before_first_loop,
+                                  EDGE_FALSE_VALUE);
+  set_immediate_dominator (CDI_DOMINATORS, bb_before_first_loop, cond_bb);
+
+  e_true = EDGE_PRED (then_bb, 0);
+  e_true->flags &= ~EDGE_FALLTHRU;
+  e_true->flags |= EDGE_TRUE_VALUE;
+
+  e_fallthru = EDGE_SUCC (then_bb, 0);
+
+  cost_pre_condition =
+    build2 (LE_EXPR, boolean_type_node, scalar_loop_iters, 
+           build_int_cst (TREE_TYPE (scalar_loop_iters), th));
+  cost_pre_condition =
+    force_gimple_operand (cost_pre_condition, &gimplify_stmt_list,
+                         true, NULL_TREE);
+  cond_stmt = build3 (COND_EXPR, void_type_node, cost_pre_condition,
+                     NULL_TREE, NULL_TREE);
+
+  bsi = bsi_last (cond_bb);
+  if (gimplify_stmt_list)
+    bsi_insert_after (&bsi, gimplify_stmt_list, BSI_NEW_STMT);
+
+  bsi = bsi_last (cond_bb);
+  bsi_insert_after (&bsi, cond_stmt, BSI_NEW_STMT);
+                                         
+  var = create_tmp_var (TREE_TYPE (scalar_loop_iters),
+                       "prologue_after_cost_adjust");
+  add_referenced_var (var);
+  prologue_after_cost_adjust_name = 
+    force_gimple_operand (scalar_loop_iters, &stmt, false, var);
+
+  bsi = bsi_last (then_bb);
+  if (stmt)
+    bsi_insert_after (&bsi, stmt, BSI_NEW_STMT);
+
+  newphi = create_phi_node (var, bb_before_first_loop);
+  add_phi_arg (newphi, prologue_after_cost_adjust_name, e_fallthru);
+  add_phi_arg (newphi, first_niters, e_false);
+
+  first_niters = PHI_RESULT (newphi);
+}
+
+
  /* Function slpeel_tree_peel_loop_to_edge.
  
     Peel the first (last) iterations of LOOP into a new prolog (epilog) loop
     that is placed on the entry (exit) edge E of LOOP. After this transformation
     we have two loops one after the other - first-loop iterates FIRST_NITERS
     times, and second-loop iterates the remainder NITERS - FIRST_NITERS times.
+   If the cost model indicates that it is profitable to emit a scalar 
+   loop instead of the vector one, then the prolog (epilog) loop will iterate
+   for the entire unchanged scalar iterations of the loop.
  
     Input:
     - LOOP: the loop to be peeled.
@@ -1027,6 +1113,13 @@ slpeel_verify_cfg_after_peeling (struct loop *first_loop,
          for updating the loop bound of the first-loop to FIRST_NITERS.  If it
          is false, the caller of this function may want to take care of this
          (this can be useful if we don't want new stmts added to first-loop).
+   - TH: cost model profitability threshold of iterations for vectorization.
+   - CHECK_PROFITABILITY: specify whether cost model check has not occured
+                          during versioning and hence needs to occur during
+                         prologue generation or whether cost model check 
+                         has not occured during prologue generation and hence
+                         needs to occur during epilogue generation.
+           
  
     Output:
     The function returns a pointer to the new loop-copy, or NULL if it failed
@@ -1048,11 +1141,11 @@ struct loop*
  slpeel_tree_peel_loop_to_edge (struct loop *loop, 
                                edge e, tree first_niters, 
                                tree niters, bool update_first_loop_count,
-                              unsigned int th)
+                              unsigned int th, bool check_profitability)
  {
    struct loop *new_loop = NULL, *first_loop, *second_loop;
    edge skip_e;
-  tree pre_condition;
+  tree pre_condition = NULL_TREE;
    bitmap definitions;
    basic_block bb_before_second_loop, bb_after_second_loop;
    basic_block bb_before_first_loop;
@@ -1060,6 +1153,7 @@ slpeel_tree_peel_loop_to_edge (struct loop *loop,
    basic_block new_exit_bb;
    edge exit_e = single_exit (loop);
    LOC loop_loc;
+  tree cost_pre_condition = NULL_TREE;
    
    if (!slpeel_can_duplicate_loop_p (loop, e))
      return NULL;
@@ -1116,32 +1210,124 @@ slpeel_tree_peel_loop_to_edge (struct loop *loop,
    rename_variables_in_loop (new_loop);
  
  
-  /* 2. Add the guard that controls whether the first loop is executed.
-        Resulting CFG would be:
+  /* 2.  Add the guard code in one of the following ways:
  
-        bb_before_first_loop:
-        if (FIRST_NITERS == 0) GOTO bb_before_second_loop
-                               GOTO first-loop
+     2.a Add the guard that controls whether the first loop is executed.
+         This occurs when this function is invoked for prologue or epilogiue
+        generation and when the cost model check can be done at compile time.
  
-        first_loop:
-        do {
-        } while ...
+         Resulting CFG would be:
  
-        bb_before_second_loop:
+         bb_before_first_loop:
+         if (FIRST_NITERS == 0) GOTO bb_before_second_loop
+                                GOTO first-loop
  
-        second_loop:
-        do {
-        } while ...
+         first_loop:
+         do {
+         } while ...
  
-        orig_exit_bb:
-   */
+         bb_before_second_loop:
+
+         second_loop:
+         do {
+         } while ...
+
+         orig_exit_bb:
+
+     2.b Add the cost model check that allows the prologue
+         to iterate for the entire unchanged scalar
+         iterations of the loop in the event that the cost
+         model indicates that the scalar loop is more
+         profitable than the vector one. This occurs when
+        this function is invoked for prologue generation
+        and the cost model check needs to be done at run
+        time.
+
+         Resulting CFG after prologue peeling would be:
+
+         if (scalar_loop_iterations <= th)
+           FIRST_NITERS = scalar_loop_iterations
+
+         bb_before_first_loop:
+         if (FIRST_NITERS == 0) GOTO bb_before_second_loop
+                                GOTO first-loop
+
+         first_loop:
+         do {
+         } while ...
+
+         bb_before_second_loop:
+
+         second_loop:
+         do {
+         } while ...
+
+         orig_exit_bb:
+
+     2.c Add the cost model check that allows the epilogue
+         to iterate for the entire unchanged scalar
+         iterations of the loop in the event that the cost
+         model indicates that the scalar loop is more
+         profitable than the vector one. This occurs when
+        this function is invoked for epilogue generation
+        and the cost model check needs to be done at run
+        time.
+
+         Resulting CFG after prologue peeling would be:
+
+         bb_before_first_loop:
+         if ((scalar_loop_iterations <= th)
+             ||
+             FIRST_NITERS == 0) GOTO bb_before_second_loop
+                                GOTO first-loop
+
+         first_loop:
+         do {
+         } while ...
+
+         bb_before_second_loop:
+
+         second_loop:
+         do {
+         } while ...
+
+         orig_exit_bb:
+  */
  
    bb_before_first_loop = split_edge (loop_preheader_edge (first_loop));
    bb_before_second_loop = split_edge (single_exit (first_loop));
  
-  pre_condition =
-    fold_build2 (LE_EXPR, boolean_type_node, first_niters, 
-       build_int_cst (TREE_TYPE (first_niters), th));
+  /* Epilogue peeling.  */
+  if (!update_first_loop_count)
+    {
+      pre_condition =
+       fold_build2 (LE_EXPR, boolean_type_node, first_niters, 
+                    build_int_cst (TREE_TYPE (first_niters), 0));
+      if (check_profitability)
+       {
+         tree scalar_loop_iters
+           = unshare_expr (LOOP_VINFO_NITERS_UNCHANGED
+                                       (loop_vec_info_for_loop (loop)));
+         cost_pre_condition = 
+           build2 (LE_EXPR, boolean_type_node, scalar_loop_iters, 
+                   build_int_cst (TREE_TYPE (scalar_loop_iters), th));
+
+         pre_condition = fold_build2 (TRUTH_OR_EXPR, boolean_type_node,
+                                      cost_pre_condition, pre_condition);
+       }
+    }
+
+  /* Prologue peeling.  */  
+  else
+    {
+      if (check_profitability)
+       set_prologue_iterations (bb_before_first_loop, first_niters,
+                                loop, th);
+
+      pre_condition =
+       fold_build2 (LE_EXPR, boolean_type_node, first_niters, 
+                    build_int_cst (TREE_TYPE (first_niters), 0));
+    }
  
    skip_e = slpeel_add_loop_guard (bb_before_first_loop, pre_condition,
                                    bb_before_second_loop, bb_before_first_loop);
@@ -1345,13 +1531,21 @@ new_stmt_vec_info (tree stmt, loop_vec_info loop_vinfo)
    STMT_VINFO_IN_PATTERN_P (res) = false;
    STMT_VINFO_RELATED_STMT (res) = NULL;
    STMT_VINFO_DATA_REF (res) = NULL;
-  if (TREE_CODE (stmt) == PHI_NODE)
+
+  STMT_VINFO_DR_BASE_ADDRESS (res) = NULL;
+  STMT_VINFO_DR_OFFSET (res) = NULL;
+  STMT_VINFO_DR_INIT (res) = NULL;
+  STMT_VINFO_DR_STEP (res) = NULL;
+  STMT_VINFO_DR_ALIGNED_TO (res) = NULL;
+
+  if (TREE_CODE (stmt) == PHI_NODE && is_loop_header_bb_p (bb_for_stmt (stmt)))
      STMT_VINFO_DEF_TYPE (res) = vect_unknown_def_type;
    else
      STMT_VINFO_DEF_TYPE (res) = vect_loop_def;
    STMT_VINFO_SAME_ALIGN_REFS (res) = VEC_alloc (dr_p, heap, 5);
    STMT_VINFO_INSIDE_OF_LOOP_COST (res) = 0;
    STMT_VINFO_OUTSIDE_OF_LOOP_COST (res) = 0;
+  STMT_SLP_TYPE (res) = 0;
    DR_GROUP_FIRST_DR (res) = NULL_TREE;
    DR_GROUP_NEXT_DR (res) = NULL_TREE;
    DR_GROUP_SIZE (res) = 0;
@@ -1364,6 +1558,20 @@ new_stmt_vec_info (tree stmt, loop_vec_info loop_vinfo)
  }
  
  
+/* Function bb_in_loop_p
+
+   Used as predicate for dfs order traversal of the loop bbs.  */
+
+static bool
+bb_in_loop_p (const_basic_block bb, const void *data)
+{
+  const struct loop *const loop = (const struct loop *)data;
+  if (flow_bb_inside_loop_p (loop, bb))
+    return true;
+  return false;
+}
+
+
  /* Function new_loop_vec_info.
  
     Create and initialize a new loop_vec_info struct for LOOP, as well as
@@ -1375,38 +1583,78 @@ new_loop_vec_info (struct loop *loop)
    loop_vec_info res;
    basic_block *bbs;
    block_stmt_iterator si;
-  unsigned int i;
+  unsigned int i, nbbs;
  
    res = (loop_vec_info) xcalloc (1, sizeof (struct _loop_vec_info));
+  LOOP_VINFO_LOOP (res) = loop;
  
    bbs = get_loop_body (loop);
  
-  /* Create stmt_info for all stmts in the loop.  */
+  /* Create/Update stmt_info for all stmts in the loop.  */
    for (i = 0; i < loop->num_nodes; i++)
      {
        basic_block bb = bbs[i];
        tree phi;
  
-      for (phi = phi_nodes (bb); phi; phi = PHI_CHAIN (phi))
-        {
-          stmt_ann_t ann = get_stmt_ann (phi);
-          set_stmt_info (ann, new_stmt_vec_info (phi, res));
-        }
-
-      for (si = bsi_start (bb); !bsi_end_p (si); bsi_next (&si))
+      /* BBs in a nested inner-loop will have been already processed (because 
+        we will have called vect_analyze_loop_form for any nested inner-loop).
+        Therefore, for stmts in an inner-loop we just want to update the 
+        STMT_VINFO_LOOP_VINFO field of their stmt_info to point to the new 
+        loop_info of the outer-loop we are currently considering to vectorize 
+        (instead of the loop_info of the inner-loop).
+        For stmts in other BBs we need to create a stmt_info from scratch.  */
+      if (bb->loop_father != loop)
         {
-         tree stmt = bsi_stmt (si);
-         stmt_ann_t ann;
+         /* Inner-loop bb.  */
+         gcc_assert (loop->inner && bb->loop_father == loop->inner);
+         for (phi = phi_nodes (bb); phi; phi = PHI_CHAIN (phi))
+           {
+             stmt_vec_info stmt_info = vinfo_for_stmt (phi);
+             loop_vec_info inner_loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
+             gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
+             STMT_VINFO_LOOP_VINFO (stmt_info) = res;
+           }
+         for (si = bsi_start (bb); !bsi_end_p (si); bsi_next (&si))
+          {
+             tree stmt = bsi_stmt (si);
+             stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
+             loop_vec_info inner_loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
+             gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
+             STMT_VINFO_LOOP_VINFO (stmt_info) = res;
+          }
+       }
+      else
+       {
+         /* bb in current nest.  */
+         for (phi = phi_nodes (bb); phi; phi = PHI_CHAIN (phi))
+           {
+             stmt_ann_t ann = get_stmt_ann (phi);
+             set_stmt_info (ann, new_stmt_vec_info (phi, res));
+           }
  
-         ann = stmt_ann (stmt);
-         set_stmt_info (ann, new_stmt_vec_info (stmt, res));
+         for (si = bsi_start (bb); !bsi_end_p (si); bsi_next (&si))
+           {
+             tree stmt = bsi_stmt (si);
+             stmt_ann_t ann = stmt_ann (stmt);
+             set_stmt_info (ann, new_stmt_vec_info (stmt, res));
+           }
         }
      }
  
-  LOOP_VINFO_LOOP (res) = loop;
+  /* CHECKME: We want to visit all BBs before their successors (except for 
+     latch blocks, for which this assertion wouldn't hold).  In the simple 
+     case of the loop forms we allow, a dfs order of the BBs would the same 
+     as reversed postorder traversal, so we are safe.  */
+
+   free (bbs);
+   bbs = XCNEWVEC (basic_block, loop->num_nodes);
+   nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p, 
+                             bbs, loop->num_nodes, loop);
+   gcc_assert (nbbs == loop->num_nodes);
+
    LOOP_VINFO_BBS (res) = bbs;
-  LOOP_VINFO_EXIT_COND (res) = NULL;
    LOOP_VINFO_NITERS (res) = NULL;
+  LOOP_VINFO_NITERS_UNCHANGED (res) = NULL;
    LOOP_VINFO_COST_MODEL_MIN_ITERS (res) = 0;
    LOOP_VINFO_VECTORIZABLE_P (res) = 0;
    LOOP_PEELING_FOR_ALIGNMENT (res) = 0;
@@ -1414,8 +1662,13 @@ new_loop_vec_info (struct loop *loop)
    LOOP_VINFO_DATAREFS (res) = VEC_alloc (data_reference_p, heap, 10);
    LOOP_VINFO_DDRS (res) = VEC_alloc (ddr_p, heap, 10 * 10);
    LOOP_VINFO_UNALIGNED_DR (res) = NULL;
-  LOOP_VINFO_MAY_MISALIGN_STMTS (res)
-    = VEC_alloc (tree, heap, PARAM_VALUE (PARAM_VECT_MAX_VERSION_CHECKS));
+  LOOP_VINFO_MAY_MISALIGN_STMTS (res) =
+    VEC_alloc (tree, heap, PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS));
+  LOOP_VINFO_MAY_ALIAS_DDRS (res) =
+    VEC_alloc (ddr_p, heap, PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
+  LOOP_VINFO_STRIDED_STORES (res) = VEC_alloc (tree, heap, 10);
+  LOOP_VINFO_SLP_INSTANCES (res) = VEC_alloc (slp_instance, heap, 10);
+  LOOP_VINFO_SLP_UNROLLING_FACTOR (res) = 1;
  
    return res;
  }
@@ -1427,13 +1680,15 @@ new_loop_vec_info (struct loop *loop)
     stmts in the loop.  */
  
  void
-destroy_loop_vec_info (loop_vec_info loop_vinfo)
+destroy_loop_vec_info (loop_vec_info loop_vinfo, bool clean_stmts)
  {
    struct loop *loop;
    basic_block *bbs;
    int nbbs;
    block_stmt_iterator si;
    int j;
+  VEC (slp_instance, heap) *slp_instances;
+  slp_instance instance;
  
    if (!loop_vinfo)
      return;
@@ -1443,6 +1698,18 @@ destroy_loop_vec_info (loop_vec_info loop_vinfo)
    bbs = LOOP_VINFO_BBS (loop_vinfo);
    nbbs = loop->num_nodes;
  
+  if (!clean_stmts)
+    {
+      free (LOOP_VINFO_BBS (loop_vinfo));
+      free_data_refs (LOOP_VINFO_DATAREFS (loop_vinfo));
+      free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
+      VEC_free (tree, heap, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));
+
+      free (loop_vinfo);
+      loop->aux = NULL;
+      return;
+    }
+
    for (j = 0; j < nbbs; j++)
      {
        basic_block bb = bbs[j];
@@ -1495,6 +1762,11 @@ destroy_loop_vec_info (loop_vec_info loop_vinfo)
    free_data_refs (LOOP_VINFO_DATAREFS (loop_vinfo));
    free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
    VEC_free (tree, heap, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));
+  VEC_free (ddr_p, heap, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo));
+  slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
+  for (j = 0; VEC_iterate (slp_instance, slp_instances, j, instance); j++)
+    vect_free_slp_tree (SLP_INSTANCE_TREE (instance));
+  VEC_free (slp_instance, heap, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
  
    free (loop_vinfo);
    loop->aux = NULL;
@@ -1507,7 +1779,7 @@ destroy_loop_vec_info (loop_vec_info loop_vinfo)
     on ALIGNMENT bit boundary.  */
  
  bool 
-vect_can_force_dr_alignment_p (tree decl, unsigned int alignment)
+vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
  {
    if (TREE_CODE (decl) != VAR_DECL)
      return false;
@@ -1521,12 +1793,9 @@ vect_can_force_dr_alignment_p (tree decl, unsigned int alignment)
    if (TREE_STATIC (decl))
      return (alignment <= MAX_OFILE_ALIGNMENT);
    else
-    /* This is not 100% correct.  The absolute correct stack alignment
-       is STACK_BOUNDARY.  We're supposed to hope, but not assume, that
-       PREFERRED_STACK_BOUNDARY is honored by all translation units.
-       However, until someone implements forced stack alignment, SSE
-       isn't really usable without this.  */  
-    return (alignment <= PREFERRED_STACK_BOUNDARY); 
+    /* This used to be PREFERRED_STACK_BOUNDARY, however, that is not 100%
+       correct until someone implements forced stack alignment.  */
+    return (alignment <= STACK_BOUNDARY); 
  }
  
  
@@ -1586,22 +1855,103 @@ get_vectype_for_scalar_type (tree scalar_type)
  enum dr_alignment_support
  vect_supportable_dr_alignment (struct data_reference *dr)
  {
-  tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr)));
+  tree stmt = DR_STMT (dr);
+  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
+  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
    enum machine_mode mode = (int) TYPE_MODE (vectype);
+  struct loop *vect_loop = LOOP_VINFO_LOOP (STMT_VINFO_LOOP_VINFO (stmt_info));
+  bool nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
+  bool invariant_in_outerloop = false;
  
    if (aligned_access_p (dr))
      return dr_aligned;
  
+  if (nested_in_vect_loop)
+    {
+      tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
+      invariant_in_outerloop =
+       (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
+    }
+
    /* Possibly unaligned access.  */
-  
+
+  /* We can choose between using the implicit realignment scheme (generating
+     a misaligned_move stmt) and the explicit realignment scheme (generating
+     aligned loads with a REALIGN_LOAD). There are two variants to the explicit
+     realignment scheme: optimized, and unoptimized.
+     We can optimize the realignment only if the step between consecutive
+     vector loads is equal to the vector size.  Since the vector memory
+     accesses advance in steps of VS (Vector Size) in the vectorized loop, it
+     is guaranteed that the misalignment amount remains the same throughout the
+     execution of the vectorized loop.  Therefore, we can create the
+     "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
+     at the loop preheader.
+
+     However, in the case of outer-loop vectorization, when vectorizing a
+     memory access in the inner-loop nested within the LOOP that is now being
+     vectorized, while it is guaranteed that the misalignment of the
+     vectorized memory access will remain the same in different outer-loop
+     iterations, it is *not* guaranteed that is will remain the same throughout
+     the execution of the inner-loop.  This is because the inner-loop advances
+     with the original scalar step (and not in steps of VS).  If the inner-loop
+     step happens to be a multiple of VS, then the misalignment remains fixed
+     and we can use the optimized realignment scheme.  For example:
+
+      for (i=0; i<N; i++)
+        for (j=0; j<M; j++)
+          s += a[i+j];
+
+     When vectorizing the i-loop in the above example, the step between
+     consecutive vector loads is 1, and so the misalignment does not remain
+     fixed across the execution of the inner-loop, and the realignment cannot
+     be optimized (as illustrated in the following pseudo vectorized loop):
+
+      for (i=0; i<N; i+=4)
+        for (j=0; j<M; j++){
+          vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
+                         // when j is {0,1,2,3,4,5,6,7,...} respectively.
+                         // (assuming that we start from an aligned address).
+          }
+
+     We therefore have to use the unoptimized realignment scheme:
+
+      for (i=0; i<N; i+=4)
+          for (j=k; j<M; j+=4)
+          vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
+                           // that the misalignment of the initial address is
+                           // 0).
+
+     The loop can then be vectorized as follows:
+
+      for (k=0; k<4; k++){
+        rt = get_realignment_token (&vp[k]);
+        for (i=0; i<N; i+=4){
+          v1 = vp[i+k];
+          for (j=k; j<M; j+=4){
+            v2 = vp[i+j+VS-1];
+            va = REALIGN_LOAD <v1,v2,rt>;
+            vs += va;
+            v1 = v2;
+          }
+        }
+    } */
+
    if (DR_IS_READ (dr))
      {
-      if (optab_handler (vec_realign_load_optab, mode)->insn_code != CODE_FOR_nothing
+      if (optab_handler (vec_realign_load_optab, mode)->insn_code != 
+                                                            CODE_FOR_nothing
           && (!targetm.vectorize.builtin_mask_for_load
               || targetm.vectorize.builtin_mask_for_load ()))
-       return dr_unaligned_software_pipeline;
+       {
+           if (nested_in_vect_loop
+               && TREE_INT_CST_LOW (DR_STEP (dr)) != UNITS_PER_SIMD_WORD)
+             return dr_explicit_realign;
+           else
+             return dr_explicit_realign_optimized;
+       }
  
-      if (optab_handler (movmisalign_optab, mode)->insn_code != CODE_FOR_nothing)
+      if (optab_handler (movmisalign_optab, mode)->insn_code != 
+                                                            CODE_FOR_nothing)
         /* Can't software pipeline the loads, but can at least do them.  */
         return dr_unaligned_supported;
      }
@@ -1714,8 +2064,6 @@ vect_is_simple_use (tree operand, loop_vec_info loop_vinfo, tree *def_stmt,
      {
      case PHI_NODE:
        *def = PHI_RESULT (*def_stmt);
-      gcc_assert (*dt == vect_induction_def || *dt == vect_reduction_def
-                 || *dt == vect_invariant_def);
        break;
  
      case GIMPLE_MODIFY_STMT:
@@ -1756,6 +2104,8 @@ supportable_widening_operation (enum tree_code code, tree stmt, tree vectype,
                                  enum tree_code *code1, enum tree_code *code2)
  {
    stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
+  loop_vec_info loop_info = STMT_VINFO_LOOP_VINFO (stmt_info);
+  struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
    bool ordered_p;
    enum machine_mode vec_mode;
    enum insn_code icode1, icode2;
@@ -1778,9 +2128,15 @@ supportable_widening_operation (enum tree_code code, tree stmt, tree vectype,
       Some targets can take advantage of this and generate more efficient code.
       For example, targets like Altivec, that support widen_mult using a sequence
       of {mult_even,mult_odd} generate the following vectors:
-        vect1: [res1,res3,res5,res7], vect2: [res2,res4,res6,res8].  */
+        vect1: [res1,res3,res5,res7], vect2: [res2,res4,res6,res8].
  
-   if (STMT_VINFO_RELEVANT (stmt_info) == vect_used_by_reduction)
+     When vectorizaing outer-loops, we execute the inner-loop sequentially
+     (each vectorized inner-loop iteration contributes to VF outer-loop 
+     iterations in parallel). We therefore don't allow to change the order 
+     of the computation in the inner-loop during outer-loop vectorization.  */
+
+   if (STMT_VINFO_RELEVANT (stmt_info) == vect_used_by_reduction
+       && !nested_in_vect_loop_p (vect_loop, stmt))
       ordered_p = false;
     else
       ordered_p = true;
@@ -1898,7 +2254,7 @@ supportable_widening_operation (enum tree_code code, tree stmt, tree vectype,
  
  bool
  supportable_narrowing_operation (enum tree_code code,
-                                tree stmt, tree vectype,
+                                const_tree stmt, const_tree vectype,
                                  enum tree_code *code1)
  {
    enum machine_mode vec_mode;
@@ -2004,8 +2360,10 @@ reduction_code_for_scalar_code (enum tree_code code,
     Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.  */
  
  tree
-vect_is_simple_reduction (struct loop *loop, tree phi)
+vect_is_simple_reduction (loop_vec_info loop_info, tree phi)
  {
+  struct loop *loop = (bb_for_stmt (phi))->loop_father;
+  struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
    edge latch_e = loop_latch_edge (loop);
    tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
    tree def_stmt, def1, def2;
@@ -2018,6 +2376,8 @@ vect_is_simple_reduction (struct loop *loop, tree phi)
    imm_use_iterator imm_iter;
    use_operand_p use_p;
  
+  gcc_assert (loop == vect_loop || flow_loop_nested_p (vect_loop, loop));
+
    name = PHI_RESULT (phi);
    nloop_uses = 0;
    FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
@@ -2129,8 +2489,16 @@ vect_is_simple_reduction (struct loop *loop, tree phi)
        return NULL_TREE;
      }
  
+  /* Generally, when vectorizing a reduction we change the order of the
+     computation.  This may change the behavior of the program in some
+     cases, so we need to check that this is ok.  One exception is when 
+     vectorizing an outer-loop: the inner-loop is executed sequentially,
+     and therefore vectorizing reductions in the inner-loop durint 
+     outer-loop vectorization is safe.  */
+
    /* CHECKME: check for !flag_finite_math_only too?  */
-  if (SCALAR_FLOAT_TYPE_P (type) && !flag_unsafe_math_optimizations)
+  if (SCALAR_FLOAT_TYPE_P (type) && !flag_associative_math
+      && !nested_in_vect_loop_p (vect_loop, def_stmt)) 
      {
        /* Changing the order of operations changes the semantics.  */
        if (vect_print_dump_info (REPORT_DETAILS))
@@ -2140,7 +2508,8 @@ vect_is_simple_reduction (struct loop *loop, tree phi)
          }
        return NULL_TREE;
      }
-  else if (INTEGRAL_TYPE_P (type) && TYPE_OVERFLOW_TRAPS (type))
+  else if (INTEGRAL_TYPE_P (type) && TYPE_OVERFLOW_TRAPS (type)
+          && !nested_in_vect_loop_p (vect_loop, def_stmt))
      {
        /* Changing the order of operations changes the semantics.  */
        if (vect_print_dump_info (REPORT_DETAILS))
@@ -2179,13 +2548,16 @@ vect_is_simple_reduction (struct loop *loop, tree phi)
  
  
    /* Check that one def is the reduction def, defined by PHI,
-     the other def is either defined in the loop by a GIMPLE_MODIFY_STMT,
-     or it's an induction (defined by some phi node).  */
+     the other def is either defined in the loop ("vect_loop_def"),
+     or it's an induction (defined by a loop-header phi-node).  */
  
    if (def2 == phi
        && flow_bb_inside_loop_p (loop, bb_for_stmt (def1))
        && (TREE_CODE (def1) == GIMPLE_MODIFY_STMT 
-         || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1)) == vect_induction_def))
+         || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1)) == vect_induction_def
+         || (TREE_CODE (def1) == PHI_NODE 
+             && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1)) == vect_loop_def
+             && !is_loop_header_bb_p (bb_for_stmt (def1)))))
      {
        if (vect_print_dump_info (REPORT_DETAILS))
          {
@@ -2197,7 +2569,10 @@ vect_is_simple_reduction (struct loop *loop, tree phi)
    else if (def1 == phi
            && flow_bb_inside_loop_p (loop, bb_for_stmt (def2))
            && (TREE_CODE (def2) == GIMPLE_MODIFY_STMT 
-              || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2)) == vect_induction_def))
+              || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2)) == vect_induction_def
+              || (TREE_CODE (def2) == PHI_NODE
+                  && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2)) == vect_loop_def
+                  && !is_loop_header_bb_p (bb_for_stmt (def2)))))
      {
        /* Swap operands (just for simplicity - so that the rest of the code
          can assume that the reduction variable is always the last (second)
@@ -2336,7 +2711,7 @@ vectorize_loops (void)
        if (!loop)
         continue;
        loop_vinfo = loop->aux;
-      destroy_loop_vec_info (loop_vinfo);
+      destroy_loop_vec_info (loop_vinfo, true);
        loop->aux = NULL;
      }