Add file omitted from last checkin.

[pf3gnuchains/gcc-fork.git] / gcc / tree-vectorizer.c
diff --git a/gcc/tree-vectorizer.c b/gcc/tree-vectorizer.c

index 20c867c..79a7461 100644 (file)
--- a/gcc/tree-vectorizer.c
+++ b/gcc/tree-vectorizer.c
@@ -147,24 +147,8 @@ along with GCC; see the file COPYING3.  If not see
  #include "tree-pass.h"
  
  /*************************************************************************
-  Simple Loop Peeling Utilities
- *************************************************************************/
-static void slpeel_update_phis_for_duplicate_loop 
-  (struct loop *, struct loop *, bool after);
-static void slpeel_update_phi_nodes_for_guard1 
-  (edge, struct loop *, bool, basic_block *, bitmap *); 
-static void slpeel_update_phi_nodes_for_guard2 
-  (edge, struct loop *, bool, basic_block *);
-static edge slpeel_add_loop_guard (basic_block, tree, basic_block, basic_block);
-
-static void rename_use_op (use_operand_p);
-static void rename_variables_in_bb (basic_block);
-static void rename_variables_in_loop (struct loop *);
-
-/*************************************************************************
    General Vectorization Utilities
   *************************************************************************/
-static void vect_set_dump_settings (void);
  
  /* vect_dump will be set to stderr or dump_file if exist.  */
  FILE *vect_dump;
@@ -241,7 +225,7 @@ rename_variables_in_bb (basic_block bb)
  
  /* Renames variables in new generated LOOP.  */
  
-static void
+void
  rename_variables_in_loop (struct loop *loop)
  {
    unsigned i;
@@ -806,7 +790,7 @@ slpeel_make_loop_iterate_ntimes (struct loop *loop, tree niters)
  /* Given LOOP this function generates a new copy of it and puts it 
     on E which is either the entry or exit of LOOP.  */
  
-static struct loop *
+struct loop *
  slpeel_tree_duplicate_loop_to_edge_cfg (struct loop *loop, edge e)
  {
    struct loop *new_loop;
@@ -871,6 +855,7 @@ slpeel_tree_duplicate_loop_to_edge_cfg (struct loop *loop, edge e)
    if (at_exit) /* Add the loop copy at exit.  */
      {
        redirect_edge_and_branch_force (e, new_loop->header);
+      PENDING_STMT (e) = NULL;
        set_immediate_dominator (CDI_DOMINATORS, new_loop->header, e->src);
        if (was_imm_dom)
         set_immediate_dominator (CDI_DOMINATORS, exit_dest, new_loop->header);
@@ -888,6 +873,7 @@ slpeel_tree_duplicate_loop_to_edge_cfg (struct loop *loop, edge e)
         new_exit_e = EDGE_SUCC (new_loop->header, 1); 
  
        redirect_edge_and_branch_force (new_exit_e, loop->header);
+      PENDING_STMT (new_exit_e) = NULL;
        set_immediate_dominator (CDI_DOMINATORS, loop->header,
                                new_exit_e->src);
  
@@ -901,6 +887,7 @@ slpeel_tree_duplicate_loop_to_edge_cfg (struct loop *loop, edge e)
         }    
  
        redirect_edge_and_branch_force (entry_e, new_loop->header);
+      PENDING_STMT (entry_e) = NULL;
        set_immediate_dominator (CDI_DOMINATORS, new_loop->header, preheader);
      }
  
@@ -918,20 +905,29 @@ slpeel_tree_duplicate_loop_to_edge_cfg (struct loop *loop, edge e)
  
  static edge
  slpeel_add_loop_guard (basic_block guard_bb, tree cond, basic_block exit_bb,
-                       basic_block dom_bb)
+                      basic_block dom_bb)
  {
    block_stmt_iterator bsi;
    edge new_e, enter_e;
    tree cond_stmt;
+  tree gimplify_stmt_list;
  
    enter_e = EDGE_SUCC (guard_bb, 0);
    enter_e->flags &= ~EDGE_FALLTHRU;
    enter_e->flags |= EDGE_FALSE_VALUE;
    bsi = bsi_last (guard_bb);
  
+  cond =
+    force_gimple_operand (cond, &gimplify_stmt_list, true,
+                         NULL_TREE);
    cond_stmt = build3 (COND_EXPR, void_type_node, cond,
                       NULL_TREE, NULL_TREE);
+  if (gimplify_stmt_list)
+    bsi_insert_after (&bsi, gimplify_stmt_list, BSI_NEW_STMT);
+
+  bsi = bsi_last (guard_bb);
    bsi_insert_after (&bsi, cond_stmt, BSI_NEW_STMT);
+
    /* Add new edge to connect guard block to the merge/loop-exit block.  */
    new_e = make_edge (guard_bb, exit_bb, EDGE_TRUE_VALUE);
    set_immediate_dominator (CDI_DOMINATORS, exit_bb, dom_bb);
@@ -1007,12 +1003,89 @@ slpeel_verify_cfg_after_peeling (struct loop *first_loop,
  }
  #endif
  
+/* If the run time cost model check determines that vectorization is
+   not profitable and hence scalar loop should be generated then set
+   FIRST_NITERS to prologue peeled iterations. This will allow all the
+   iterations to be executed in the prologue peeled scalar loop.
+
+   Two new blocks, COND_BB and THEN_BB, are created by splitting the
+   single predecessor edge of BB_BEFORE_FIRST_LOOP twice.  COND_BB is
+   terminated by the test "unchanged scalar iteration count of LOOP <= TH".
+   The statements that compute the unchanged scalar iteration count into
+   a new temporary are placed in THEN_BB, which is entered on the true
+   edge.  A PHI node created in BB_BEFORE_FIRST_LOOP merges that value
+   (arriving over THEN_BB's successor edge) with the incoming FIRST_NITERS
+   (arriving over the false edge that leaves COND_BB directly).
+
+   NOTE(review): FIRST_NITERS is passed by value, so the final assignment
+   of the PHI result to it is not visible to the caller -- confirm whether
+   the parameter should be a pointer or the PHI result should be
+   returned.  */
+
+void
+set_prologue_iterations (basic_block bb_before_first_loop,
+                        tree first_niters,
+                        struct loop *loop,
+                        unsigned int th)
+{
+  edge e;
+  basic_block cond_bb, then_bb;
+  tree var, prologue_after_cost_adjust_name, stmt;
+  block_stmt_iterator bsi;
+  tree newphi;
+  edge e_true, e_false, e_fallthru;
+  tree cond_stmt;
+  tree gimplify_stmt_list;
+  tree cost_pre_condition = NULL_TREE;
+  tree scalar_loop_iters = 
+    unshare_expr (LOOP_VINFO_NITERS_UNCHANGED (loop_vec_info_for_loop (loop)));
+  /* Create COND_BB on the single edge entering BB_BEFORE_FIRST_LOOP.  */
+  e = single_pred_edge (bb_before_first_loop);
+  cond_bb = split_edge(e);
+  /* Split the (new) single entering edge again to create THEN_BB.  */
+  e = single_pred_edge (bb_before_first_loop);
+  then_bb = split_edge(e);
+  set_immediate_dominator (CDI_DOMINATORS, then_bb, cond_bb);
+  /* Add the direct false edge from COND_BB to BB_BEFORE_FIRST_LOOP.  */
+  e_false = make_single_succ_edge (cond_bb, bb_before_first_loop,
+                                  EDGE_FALSE_VALUE);
+  set_immediate_dominator (CDI_DOMINATORS, bb_before_first_loop, cond_bb);
+  /* The edge entering THEN_BB becomes the true branch.  */
+  e_true = EDGE_PRED (then_bb, 0);
+  e_true->flags &= ~EDGE_FALLTHRU;
+  e_true->flags |= EDGE_TRUE_VALUE;
+
+  e_fallthru = EDGE_SUCC (then_bb, 0);
+  /* Build and gimplify the test SCALAR_LOOP_ITERS <= TH.  */
+  cost_pre_condition =
+    build2 (LE_EXPR, boolean_type_node, scalar_loop_iters, 
+           build_int_cst (TREE_TYPE (scalar_loop_iters), th));
+  cost_pre_condition =
+    force_gimple_operand (cost_pre_condition, &gimplify_stmt_list,
+                         true, NULL_TREE);
+  cond_stmt = build3 (COND_EXPR, void_type_node, cost_pre_condition,
+                     NULL_TREE, NULL_TREE);
+  /* Append any gimplification statements, then the COND_EXPR, to COND_BB.  */
+  bsi = bsi_last (cond_bb);
+  if (gimplify_stmt_list)
+    bsi_insert_after (&bsi, gimplify_stmt_list, BSI_NEW_STMT);
+
+  bsi = bsi_last (cond_bb);
+  bsi_insert_after (&bsi, cond_stmt, BSI_NEW_STMT);
+  /* Gimplify SCALAR_LOOP_ITERS using a new temporary; stmts go to THEN_BB.  */
+  var = create_tmp_var (TREE_TYPE (scalar_loop_iters),
+                       "prologue_after_cost_adjust");
+  add_referenced_var (var);
+  prologue_after_cost_adjust_name = 
+    force_gimple_operand (scalar_loop_iters, &stmt, false, var);
+
+  bsi = bsi_last (then_bb);
+  if (stmt)
+    bsi_insert_after (&bsi, stmt, BSI_NEW_STMT);
+  /* Merge the two candidate iteration counts in BB_BEFORE_FIRST_LOOP.  */
+  newphi = create_phi_node (var, bb_before_first_loop);
+  add_phi_arg (newphi, prologue_after_cost_adjust_name, e_fallthru);
+  add_phi_arg (newphi, first_niters, e_false);
+  /* NOTE(review): this only updates the local copy of FIRST_NITERS.  */
+  first_niters = PHI_RESULT (newphi);
+}
+
+
  /* Function slpeel_tree_peel_loop_to_edge.
  
     Peel the first (last) iterations of LOOP into a new prolog (epilog) loop
     that is placed on the entry (exit) edge E of LOOP. After this transformation
     we have two loops one after the other - first-loop iterates FIRST_NITERS
     times, and second-loop iterates the remainder NITERS - FIRST_NITERS times.
+   If the cost model indicates that it is profitable to emit a scalar 
+   loop instead of the vector one, then the prolog (epilog) loop will iterate
+   for the entire unchanged scalar iterations of the loop.
  
     Input:
     - LOOP: the loop to be peeled.
@@ -1027,6 +1100,13 @@ slpeel_verify_cfg_after_peeling (struct loop *first_loop,
          for updating the loop bound of the first-loop to FIRST_NITERS.  If it
          is false, the caller of this function may want to take care of this
          (this can be useful if we don't want new stmts added to first-loop).
+   - TH: cost model profitability threshold of iterations for vectorization.
+   - CHECK_PROFITABILITY: specify whether cost model check has not occurred
+                          during versioning and hence needs to occur during
+                         prologue generation or whether cost model check 
+                         has not occurred during prologue generation and hence
+                         needs to occur during epilogue generation.
+           
  
     Output:
     The function returns a pointer to the new loop-copy, or NULL if it failed
@@ -1048,11 +1128,11 @@ struct loop*
  slpeel_tree_peel_loop_to_edge (struct loop *loop, 
                                edge e, tree first_niters, 
                                tree niters, bool update_first_loop_count,
-                              unsigned int th)
+                              unsigned int th, bool check_profitability)
  {
    struct loop *new_loop = NULL, *first_loop, *second_loop;
    edge skip_e;
-  tree pre_condition;
+  tree pre_condition = NULL_TREE;
    bitmap definitions;
    basic_block bb_before_second_loop, bb_after_second_loop;
    basic_block bb_before_first_loop;
@@ -1060,6 +1140,7 @@ slpeel_tree_peel_loop_to_edge (struct loop *loop,
    basic_block new_exit_bb;
    edge exit_e = single_exit (loop);
    LOC loop_loc;
+  tree cost_pre_condition = NULL_TREE;
    
    if (!slpeel_can_duplicate_loop_p (loop, e))
      return NULL;
@@ -1116,32 +1197,124 @@ slpeel_tree_peel_loop_to_edge (struct loop *loop,
    rename_variables_in_loop (new_loop);
  
  
-  /* 2. Add the guard that controls whether the first loop is executed.
-        Resulting CFG would be:
+  /* 2.  Add the guard code in one of the following ways:
  
-        bb_before_first_loop:
-        if (FIRST_NITERS == 0) GOTO bb_before_second_loop
-                               GOTO first-loop
+     2.a Add the guard that controls whether the first loop is executed.
+         This occurs when this function is invoked for prologue or epilogue
+        generation and when the cost model check can be done at compile time.
  
-        first_loop:
-        do {
-        } while ...
+         Resulting CFG would be:
  
-        bb_before_second_loop:
+         bb_before_first_loop:
+         if (FIRST_NITERS == 0) GOTO bb_before_second_loop
+                                GOTO first-loop
  
-        second_loop:
-        do {
-        } while ...
+         first_loop:
+         do {
+         } while ...
  
-        orig_exit_bb:
-   */
+         bb_before_second_loop:
+
+         second_loop:
+         do {
+         } while ...
+
+         orig_exit_bb:
+
+     2.b Add the cost model check that allows the prologue
+         to iterate for the entire unchanged scalar
+         iterations of the loop in the event that the cost
+         model indicates that the scalar loop is more
+         profitable than the vector one. This occurs when
+        this function is invoked for prologue generation
+        and the cost model check needs to be done at run
+        time.
+
+         Resulting CFG after prologue peeling would be:
+
+         if (scalar_loop_iterations <= th)
+           FIRST_NITERS = scalar_loop_iterations
+
+         bb_before_first_loop:
+         if (FIRST_NITERS == 0) GOTO bb_before_second_loop
+                                GOTO first-loop
+
+         first_loop:
+         do {
+         } while ...
+
+         bb_before_second_loop:
+
+         second_loop:
+         do {
+         } while ...
+
+         orig_exit_bb:
+
+     2.c Add the cost model check that allows the epilogue
+         to iterate for the entire unchanged scalar
+         iterations of the loop in the event that the cost
+         model indicates that the scalar loop is more
+         profitable than the vector one. This occurs when
+        this function is invoked for epilogue generation
+        and the cost model check needs to be done at run
+        time.
+
+         Resulting CFG after epilogue peeling would be:
+
+         bb_before_first_loop:
+         if ((scalar_loop_iterations <= th)
+             ||
+             FIRST_NITERS == 0) GOTO bb_before_second_loop
+                                GOTO first-loop
+
+         first_loop:
+         do {
+         } while ...
+
+         bb_before_second_loop:
+
+         second_loop:
+         do {
+         } while ...
+
+         orig_exit_bb:
+  */
  
    bb_before_first_loop = split_edge (loop_preheader_edge (first_loop));
    bb_before_second_loop = split_edge (single_exit (first_loop));
  
-  pre_condition =
-    fold_build2 (LE_EXPR, boolean_type_node, first_niters, 
-       build_int_cst (TREE_TYPE (first_niters), th));
+  /* Epilogue peeling.  */
+  if (!update_first_loop_count)
+    {
+      pre_condition =
+       fold_build2 (LE_EXPR, boolean_type_node, first_niters, 
+                    build_int_cst (TREE_TYPE (first_niters), 0));
+      if (check_profitability)
+       {
+         tree scalar_loop_iters
+           = unshare_expr (LOOP_VINFO_NITERS_UNCHANGED
+                                       (loop_vec_info_for_loop (loop)));
+         cost_pre_condition = 
+           build2 (LE_EXPR, boolean_type_node, scalar_loop_iters, 
+                   build_int_cst (TREE_TYPE (scalar_loop_iters), th));
+
+         pre_condition = fold_build2 (TRUTH_OR_EXPR, boolean_type_node,
+                                      cost_pre_condition, pre_condition);
+       }
+    }
+
+  /* Prologue peeling.  */  
+  else
+    {
+      if (check_profitability)
+       set_prologue_iterations (bb_before_first_loop, first_niters,
+                                loop, th);
+
+      pre_condition =
+       fold_build2 (LE_EXPR, boolean_type_node, first_niters, 
+                    build_int_cst (TREE_TYPE (first_niters), 0));
+    }
  
    skip_e = slpeel_add_loop_guard (bb_before_first_loop, pre_condition,
                                    bb_before_second_loop, bb_before_first_loop);
@@ -1345,6 +1518,13 @@ new_stmt_vec_info (tree stmt, loop_vec_info loop_vinfo)
    STMT_VINFO_IN_PATTERN_P (res) = false;
    STMT_VINFO_RELATED_STMT (res) = NULL;
    STMT_VINFO_DATA_REF (res) = NULL;
+
+  STMT_VINFO_DR_BASE_ADDRESS (res) = NULL;
+  STMT_VINFO_DR_OFFSET (res) = NULL;
+  STMT_VINFO_DR_INIT (res) = NULL;
+  STMT_VINFO_DR_STEP (res) = NULL;
+  STMT_VINFO_DR_ALIGNED_TO (res) = NULL;
+
    if (TREE_CODE (stmt) == PHI_NODE && is_loop_header_bb_p (bb_for_stmt (stmt)))
      STMT_VINFO_DEF_TYPE (res) = vect_unknown_def_type;
    else
@@ -1352,6 +1532,7 @@ new_stmt_vec_info (tree stmt, loop_vec_info loop_vinfo)
    STMT_VINFO_SAME_ALIGN_REFS (res) = VEC_alloc (dr_p, heap, 5);
    STMT_VINFO_INSIDE_OF_LOOP_COST (res) = 0;
    STMT_VINFO_OUTSIDE_OF_LOOP_COST (res) = 0;
+  STMT_SLP_TYPE (res) = 0;
    DR_GROUP_FIRST_DR (res) = NULL_TREE;
    DR_GROUP_NEXT_DR (res) = NULL_TREE;
    DR_GROUP_SIZE (res) = 0;
@@ -1364,6 +1545,22 @@ new_stmt_vec_info (tree stmt, loop_vec_info loop_vinfo)
  }
  
  
+/* Free stmt vectorization related info.
+
+   Releases the stmt_vec_info attached to STMT: frees its vector of
+   same-alignment data references, frees the record itself, and resets
+   the info pointer stored in STMT's annotation to NULL.  Does nothing
+   if STMT has no stmt_vec_info.  */
+
+void
+free_stmt_vec_info (tree stmt)
+{
+  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
+
+  if (!stmt_info)
+    return;
+
+  VEC_free (dr_p, heap, STMT_VINFO_SAME_ALIGN_REFS (stmt_info));
+  free (stmt_info);
+  set_stmt_info (stmt_ann (stmt), NULL);
+}
+
+
  /* Function bb_in_loop_p
  
     Used as predicate for dfs order traversal of the loop bbs.  */
@@ -1371,7 +1568,7 @@ new_stmt_vec_info (tree stmt, loop_vec_info loop_vinfo)
  static bool
  bb_in_loop_p (const_basic_block bb, const void *data)
  {
-  struct loop *loop = (struct loop *)data;
+  const struct loop *const loop = (const struct loop *)data;
    if (flow_bb_inside_loop_p (loop, bb))
      return true;
    return false;
@@ -1460,6 +1657,7 @@ new_loop_vec_info (struct loop *loop)
  
    LOOP_VINFO_BBS (res) = bbs;
    LOOP_VINFO_NITERS (res) = NULL;
+  LOOP_VINFO_NITERS_UNCHANGED (res) = NULL;
    LOOP_VINFO_COST_MODEL_MIN_ITERS (res) = 0;
    LOOP_VINFO_VECTORIZABLE_P (res) = 0;
    LOOP_PEELING_FOR_ALIGNMENT (res) = 0;
@@ -1471,7 +1669,9 @@ new_loop_vec_info (struct loop *loop)
      VEC_alloc (tree, heap, PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS));
    LOOP_VINFO_MAY_ALIAS_DDRS (res) =
      VEC_alloc (ddr_p, heap, PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
-
+  LOOP_VINFO_STRIDED_STORES (res) = VEC_alloc (tree, heap, 10);
+  LOOP_VINFO_SLP_INSTANCES (res) = VEC_alloc (slp_instance, heap, 10);
+  LOOP_VINFO_SLP_UNROLLING_FACTOR (res) = 1;
  
    return res;
  }
@@ -1490,6 +1690,8 @@ destroy_loop_vec_info (loop_vec_info loop_vinfo, bool clean_stmts)
    int nbbs;
    block_stmt_iterator si;
    int j;
+  VEC (slp_instance, heap) *slp_instances;
+  slp_instance instance;
  
    if (!loop_vinfo)
      return;
@@ -1515,21 +1717,13 @@ destroy_loop_vec_info (loop_vec_info loop_vinfo, bool clean_stmts)
      {
        basic_block bb = bbs[j];
        tree phi;
-      stmt_vec_info stmt_info;
  
        for (phi = phi_nodes (bb); phi; phi = PHI_CHAIN (phi))
-        {
-          stmt_ann_t ann = stmt_ann (phi);
-
-          stmt_info = vinfo_for_stmt (phi);
-          free (stmt_info);
-          set_stmt_info (ann, NULL);
-        }
+        free_stmt_vec_info (phi);
  
        for (si = bsi_start (bb); !bsi_end_p (si); )
         {
           tree stmt = bsi_stmt (si);
-         stmt_ann_t ann = stmt_ann (stmt);
           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  
           if (stmt_info)
@@ -1547,9 +1741,7 @@ destroy_loop_vec_info (loop_vec_info loop_vinfo, bool clean_stmts)
                 }
                         
               /* Free stmt_vec_info.  */
-             VEC_free (dr_p, heap, STMT_VINFO_SAME_ALIGN_REFS (stmt_info));
-             free (stmt_info);
-             set_stmt_info (ann, NULL);
+             free_stmt_vec_info (stmt);
  
               /* Remove dead "pattern stmts".  */
               if (remove_stmt_p)
@@ -1564,6 +1756,11 @@ destroy_loop_vec_info (loop_vec_info loop_vinfo, bool clean_stmts)
    free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
    VEC_free (tree, heap, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));
    VEC_free (ddr_p, heap, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo));
+  slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
+  for (j = 0; VEC_iterate (slp_instance, slp_instances, j, instance); j++)
+    vect_free_slp_tree (SLP_INSTANCE_TREE (instance));
+  VEC_free (slp_instance, heap, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
+  VEC_free (tree, heap, LOOP_VINFO_STRIDED_STORES (loop_vinfo));
  
    free (loop_vinfo);
    loop->aux = NULL;
@@ -1576,7 +1773,7 @@ destroy_loop_vec_info (loop_vec_info loop_vinfo, bool clean_stmts)
     on ALIGNMENT bit boundary.  */
  
  bool 
-vect_can_force_dr_alignment_p (tree decl, unsigned int alignment)
+vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
  {
    if (TREE_CODE (decl) != VAR_DECL)
      return false;
@@ -1590,12 +1787,9 @@ vect_can_force_dr_alignment_p (tree decl, unsigned int alignment)
    if (TREE_STATIC (decl))
      return (alignment <= MAX_OFILE_ALIGNMENT);
    else
-    /* This is not 100% correct.  The absolute correct stack alignment
-       is STACK_BOUNDARY.  We're supposed to hope, but not assume, that
-       PREFERRED_STACK_BOUNDARY is honored by all translation units.
-       However, until someone implements forced stack alignment, SSE
-       isn't really usable without this.  */  
-    return (alignment <= PREFERRED_STACK_BOUNDARY); 
+    /* This used to be PREFERRED_STACK_BOUNDARY, however, that is not 100%
+       correct until someone implements forced stack alignment.  */
+    return (alignment <= STACK_BOUNDARY); 
  }
  
  
@@ -1655,21 +1849,103 @@ get_vectype_for_scalar_type (tree scalar_type)
  enum dr_alignment_support
  vect_supportable_dr_alignment (struct data_reference *dr)
  {
-  tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr)));
+  tree stmt = DR_STMT (dr);
+  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
+  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
    enum machine_mode mode = (int) TYPE_MODE (vectype);
+  struct loop *vect_loop = LOOP_VINFO_LOOP (STMT_VINFO_LOOP_VINFO (stmt_info));
+  bool nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
+  bool invariant_in_outerloop = false;
  
    if (aligned_access_p (dr))
      return dr_aligned;
  
+  if (nested_in_vect_loop)
+    {
+      tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
+      invariant_in_outerloop =
+       (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
+    }
+
    /* Possibly unaligned access.  */
+
+  /* We can choose between using the implicit realignment scheme (generating
+     a misaligned_move stmt) and the explicit realignment scheme (generating
+     aligned loads with a REALIGN_LOAD). There are two variants to the explicit
+     realignment scheme: optimized, and unoptimized.
+     We can optimize the realignment only if the step between consecutive
+     vector loads is equal to the vector size.  Since the vector memory
+     accesses advance in steps of VS (Vector Size) in the vectorized loop, it
+     is guaranteed that the misalignment amount remains the same throughout the
+     execution of the vectorized loop.  Therefore, we can create the
+     "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
+     at the loop preheader.
+
+     However, in the case of outer-loop vectorization, when vectorizing a
+     memory access in the inner-loop nested within the LOOP that is now being
+     vectorized, while it is guaranteed that the misalignment of the
+     vectorized memory access will remain the same in different outer-loop
+     iterations, it is *not* guaranteed that it will remain the same throughout
+     the execution of the inner-loop.  This is because the inner-loop advances
+     with the original scalar step (and not in steps of VS).  If the inner-loop
+     step happens to be a multiple of VS, then the misalignment remains fixed
+     and we can use the optimized realignment scheme.  For example:
+
+      for (i=0; i<N; i++)
+        for (j=0; j<M; j++)
+          s += a[i+j];
+
+     When vectorizing the i-loop in the above example, the step between
+     consecutive vector loads is 1, and so the misalignment does not remain
+     fixed across the execution of the inner-loop, and the realignment cannot
+     be optimized (as illustrated in the following pseudo vectorized loop):
+
+      for (i=0; i<N; i+=4)
+        for (j=0; j<M; j++){
+          vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
+                         // when j is {0,1,2,3,4,5,6,7,...} respectively.
+                         // (assuming that we start from an aligned address).
+          }
+
+     We therefore have to use the unoptimized realignment scheme:
+
+      for (i=0; i<N; i+=4)
+          for (j=k; j<M; j+=4)
+          vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
+                           // that the misalignment of the initial address is
+                           // 0).
+
+     The loop can then be vectorized as follows:
+
+      for (k=0; k<4; k++){
+        rt = get_realignment_token (&vp[k]);
+        for (i=0; i<N; i+=4){
+          v1 = vp[i+k];
+          for (j=k; j<M; j+=4){
+            v2 = vp[i+j+VS-1];
+            va = REALIGN_LOAD <v1,v2,rt>;
+            vs += va;
+            v1 = v2;
+          }
+        }
+    } */
+
    if (DR_IS_READ (dr))
      {
-      if (optab_handler (vec_realign_load_optab, mode)->insn_code != CODE_FOR_nothing
+      if (optab_handler (vec_realign_load_optab, mode)->insn_code != 
+                                                            CODE_FOR_nothing
           && (!targetm.vectorize.builtin_mask_for_load
               || targetm.vectorize.builtin_mask_for_load ()))
-       return dr_unaligned_software_pipeline;
+       {
+           if (nested_in_vect_loop
+               && TREE_INT_CST_LOW (DR_STEP (dr)) != UNITS_PER_SIMD_WORD)
+             return dr_explicit_realign;
+           else
+             return dr_explicit_realign_optimized;
+       }
  
-      if (optab_handler (movmisalign_optab, mode)->insn_code != CODE_FOR_nothing)
+      if (optab_handler (movmisalign_optab, mode)->insn_code != 
+                                                            CODE_FOR_nothing)
         /* Can't software pipeline the loads, but can at least do them.  */
         return dr_unaligned_supported;
      }
@@ -1720,7 +1996,13 @@ vect_is_simple_use (tree operand, loop_vec_info loop_vinfo, tree *def_stmt,
        *dt = vect_invariant_def;
        return true;
     }
-    
+
+  if (TREE_CODE (operand) == PAREN_EXPR)
+    {
+      if (vect_print_dump_info (REPORT_DETAILS))
+        fprintf (vect_dump, "non-associatable copy.");
+      operand = TREE_OPERAND (operand, 0);
+    }
    if (TREE_CODE (operand) != SSA_NAME)
      {
        if (vect_print_dump_info (REPORT_DETAILS))
@@ -1890,8 +2172,7 @@ supportable_widening_operation (enum tree_code code, tree stmt, tree vectype,
          }
        break;
  
-    case NOP_EXPR:
-    case CONVERT_EXPR:
+    CASE_CONVERT:
        if (BYTES_BIG_ENDIAN)
          {
            c1 = VEC_UNPACK_HI_EXPR;
@@ -1930,13 +2211,13 @@ supportable_widening_operation (enum tree_code code, tree stmt, tree vectype,
    if (code == FIX_TRUNC_EXPR)
      {
        /* The signedness is determined from output operand.  */
-      optab1 = optab_for_tree_code (c1, type);
-      optab2 = optab_for_tree_code (c2, type);
+      optab1 = optab_for_tree_code (c1, type, optab_default);
+      optab2 = optab_for_tree_code (c2, type, optab_default);
      }
    else
      {
-      optab1 = optab_for_tree_code (c1, vectype);
-      optab2 = optab_for_tree_code (c2, vectype);
+      optab1 = optab_for_tree_code (c1, vectype, optab_default);
+      optab2 = optab_for_tree_code (c2, vectype, optab_default);
      }
  
    if (!optab1 || !optab2)
@@ -1972,7 +2253,7 @@ supportable_widening_operation (enum tree_code code, tree stmt, tree vectype,
  
  bool
  supportable_narrowing_operation (enum tree_code code,
-                                tree stmt, tree vectype,
+                                const_tree stmt, const_tree vectype,
                                  enum tree_code *code1)
  {
    enum machine_mode vec_mode;
@@ -1985,8 +2266,7 @@ supportable_narrowing_operation (enum tree_code code,
  
    switch (code)
      {
-    case NOP_EXPR:
-    case CONVERT_EXPR:
+    CASE_CONVERT:
        c1 = VEC_PACK_TRUNC_EXPR;
        break;
  
@@ -2005,9 +2285,9 @@ supportable_narrowing_operation (enum tree_code code,
  
    if (code == FIX_TRUNC_EXPR)
      /* The signedness is determined from output operand.  */
-    optab1 = optab_for_tree_code (c1, type);
+    optab1 = optab_for_tree_code (c1, type, optab_default);
    else
-    optab1 = optab_for_tree_code (c1, vectype);
+    optab1 = optab_for_tree_code (c1, vectype, optab_default);
  
    if (!optab1)
      return false;
@@ -2060,7 +2340,7 @@ reduction_code_for_scalar_code (enum tree_code code,
  
  /* Function vect_is_simple_reduction
  
-   Detect a cross-iteration def-use cucle that represents a simple
+   Detect a cross-iteration def-use cycle that represents a simple
     reduction computation. We look for the following pattern:
  
     loop_header:
@@ -2215,7 +2495,7 @@ vect_is_simple_reduction (loop_vec_info loop_info, tree phi)
       outer-loop vectorization is safe.  */
  
    /* CHECKME: check for !flag_finite_math_only too?  */
-  if (SCALAR_FLOAT_TYPE_P (type) && !flag_unsafe_math_optimizations
+  if (SCALAR_FLOAT_TYPE_P (type) && !flag_associative_math
        && !nested_in_vect_loop_p (vect_loop, def_stmt)) 
      {
        /* Changing the order of operations changes the semantics.  */
@@ -2485,8 +2765,10 @@ gate_increase_alignment (void)
    return flag_section_anchors && flag_tree_vectorize;
  }
  
-struct tree_opt_pass pass_ipa_increase_alignment = 
+struct simple_ipa_opt_pass pass_ipa_increase_alignment = 
  {
+ {
+  SIMPLE_IPA_PASS,
    "increase_alignment",                        /* name */
    gate_increase_alignment,             /* gate */
    increase_alignment,                  /* execute */
@@ -2498,6 +2780,6 @@ struct tree_opt_pass pass_ipa_increase_alignment =
    0,                                   /* properties_provided */
    0,                                   /* properties_destroyed */
    0,                                   /* todo_flags_start */
-  0,                                   /* todo_flags_finish */
-  0                                    /* letter */
+  0                                    /* todo_flags_finish */
+ }
  };