X-Git-Url: http://git.sourceforge.jp/view?a=blobdiff_plain;f=gcc%2Ftree-ssa-loop-ivopts.c;h=7556f53d3e491b0b2df7e51609bbcdfb24a28ac3;hb=3b01567f28b62f8d753edb3a490064eeeabb5420;hp=bda640f3e0eafb11b834b423d38cc006a80cfb9d;hpb=134c053efb725c8dfa8238c3329ea0439f96df09;p=pf3gnuchains%2Fgcc-fork.git

diff --git a/gcc/tree-ssa-loop-ivopts.c b/gcc/tree-ssa-loop-ivopts.c
index bda640f3e0e..7556f53d3e4 100644
--- a/gcc/tree-ssa-loop-ivopts.c
+++ b/gcc/tree-ssa-loop-ivopts.c
@@ -1,5 +1,5 @@
 /* Induction variable optimizations.
-   Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010
+   Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011
    Free Software Foundation, Inc.
 
 This file is part of GCC.
@@ -70,14 +70,12 @@ along with GCC; see the file COPYING3.  If not see
 #include "tm_p.h"
 #include "basic-block.h"
 #include "output.h"
-#include "diagnostic.h"
 #include "tree-pretty-print.h"
 #include "gimple-pretty-print.h"
 #include "tree-flow.h"
 #include "tree-dump.h"
 #include "timevar.h"
 #include "cfgloop.h"
-#include "expr.h"
 #include "tree-pass.h"
 #include "ggc.h"
 #include "insn-config.h"
@@ -91,14 +89,38 @@ along with GCC; see the file COPYING3.  If not see
 #include "langhooks.h"
 #include "tree-affine.h"
 #include "target.h"
+#include "tree-inline.h"
+#include "tree-ssa-propagate.h"
+
+/* FIXME: add_cost and zero_cost defined in exprmed.h conflict with local uses.
+ */
+#include "expmed.h"
+#undef add_cost
+#undef zero_cost
+
+/* FIXME: Expressions are expanded to RTL in this pass to determine the
+   cost of different addressing modes.  This should be moved to a TBD
+   interface between the GIMPLE and RTL worlds.  */
+#include "expr.h"
 
 /* The infinite cost.  */
 #define INFTY 10000000
 
-/* The expected number of loop iterations.  TODO -- use profiling instead of
-   this.  */
 #define AVG_LOOP_NITER(LOOP) 5
 
+/* Returns the expected number of loop iterations for LOOP.
+   The average trip count is computed from profile data if it
+   exists. */
+
+static inline HOST_WIDE_INT
+avg_loop_niter (struct loop *loop)
+{
+  HOST_WIDE_INT niter = max_stmt_executions_int (loop, false);
+  if (niter == -1)
+    return AVG_LOOP_NITER (loop);
+
+  return niter;
+}
 
 /* Representation of the induction variable.  */
 struct iv
@@ -154,6 +176,8 @@ struct cost_pair
   tree value;		/* For final value elimination, the expression for
 			   the final value of the iv.  For iv elimination,
 			   the new bound to compare with.  */
+  enum tree_code comp;	/* For iv elimination, the comparison.  */
+  int inv_expr_id;      /* Loop invariant expression id.  */
 };
 
 /* Use.  */
@@ -208,6 +232,14 @@ struct iv_cand
 			   biv.  */
 };
 
+/* Loop invariant expression hashtable entry.  */
+struct iv_inv_expr_ent
+{
+  tree expr;
+  int id;
+  hashval_t hash;
+};
+
 /* The data used by the induction variable optimizations.  */
 
 typedef struct iv_use *iv_use_p;
@@ -235,6 +267,13 @@ struct ivopts_data
   /* The array of information for the ssa names.  */
   struct version_info *version_info;
 
+  /* The hashtable of loop invariant expressions created
+     by ivopt.  */
+  htab_t inv_expr_tab;
+
+  /* Loop invariant expression id.  */
+  int inv_expr_id;
+
   /* The bitmap of indices in version_info whose value was changed.  */
   bitmap relevant;
 
@@ -256,6 +295,12 @@ struct ivopts_data
 
   /* Are we optimizing for speed?  */
   bool speed;
+
+  /* Whether the loop body includes any function calls.  */
+  bool body_includes_call;
+
+  /* Whether the loop body can only be exited via single exit.  */
+  bool loop_single_exit_p;
 };
 
 /* An assignment of iv candidates to uses.  */
@@ -292,6 +337,13 @@ struct iv_ca
   /* Number of times each invariant is used.  */
   unsigned *n_invariant_uses;
 
+  /* The array holding the number of uses of each loop
+     invariant expressions created by ivopt.  */
+  unsigned *used_inv_expr;
+
+  /* The number of created loop invariants.  */
+  unsigned num_used_inv_expr;
+
   /* Total cost of the assignment.  */
   comp_cost cost;
 };
@@ -335,6 +387,8 @@ struct iv_ca_delta
 
 static VEC(tree,heap) *decl_rtl_to_reset;
 
+static comp_cost force_expr_to_var_cost (tree, bool);
+
 /* Number of uses recorded in DATA.  */
 
 static inline unsigned
@@ -513,6 +567,19 @@ dump_cand (FILE *file, struct iv_cand *cand)
       return;
     }
 
+  if (cand->var_before)
+    {
+      fprintf (file, "  var_before ");
+      print_generic_expr (file, cand->var_before, TDF_SLIM);
+      fprintf (file, "\n");
+    }
+  if (cand->var_after)
+    {
+      fprintf (file, "  var_after ");
+      print_generic_expr (file, cand->var_after, TDF_SLIM);
+      fprintf (file, "\n");
+    }
+
   switch (cand->pos)
     {
     case IP_NORMAL:
@@ -707,14 +774,13 @@ contains_abnormal_ssa_name_p (tree expr)
   return false;
 }
 
-/*  Returns tree describing number of iterations determined from
+/*  Returns the structure describing number of iterations determined from
     EXIT of DATA->current_loop, or NULL if something goes wrong.  */
 
-static tree
+static struct tree_niter_desc *
 niter_for_exit (struct ivopts_data *data, edge exit)
 {
-  struct tree_niter_desc desc;
-  tree niter;
+  struct tree_niter_desc *desc;
   void **slot;
 
   if (!data->niters)
@@ -727,32 +793,31 @@ niter_for_exit (struct ivopts_data *data, edge exit)
 
   if (!slot)
     {
-      /* Try to determine number of iterations.  We must know it
-	 unconditionally (i.e., without possibility of # of iterations
-	 being zero).  Also, we cannot safely work with ssa names that
-	 appear in phi nodes on abnormal edges, so that we do not create
-	 overlapping life ranges for them (PR 27283).  */
-      if (number_of_iterations_exit (data->current_loop,
-				     exit, &desc, true)
-	  && integer_zerop (desc.may_be_zero)
-     	  && !contains_abnormal_ssa_name_p (desc.niter))
-	niter = desc.niter;
-      else
-	niter = NULL_TREE;
-
-      *pointer_map_insert (data->niters, exit) = niter;
+      /* Try to determine number of iterations.  We cannot safely work with ssa
+         names that appear in phi nodes on abnormal edges, so that we do not
+         create overlapping life ranges for them (PR 27283).  */
+      desc = XNEW (struct tree_niter_desc);
+      if (!number_of_iterations_exit (data->current_loop,
+				      exit, desc, true)
+     	  || contains_abnormal_ssa_name_p (desc->niter))
+	{
+	  XDELETE (desc);
+	  desc = NULL;
+	}
+      slot = pointer_map_insert (data->niters, exit);
+      *slot = desc;
     }
   else
-    niter = (tree) *slot;
+    desc = (struct tree_niter_desc *) *slot;
 
-  return niter;
+  return desc;
 }
 
-/* Returns tree describing number of iterations determined from
+/* Returns the structure describing number of iterations determined from
    single dominating exit of DATA->current_loop, or NULL if something
    goes wrong.  */
 
-static tree
+static struct tree_niter_desc *
 niter_for_single_dom_exit (struct ivopts_data *data)
 {
   edge exit = single_dom_exit (data->current_loop);
@@ -763,6 +828,30 @@ niter_for_single_dom_exit (struct ivopts_data *data)
   return niter_for_exit (data, exit);
 }
 
+/* Hash table equality function for expressions.  */
+
+static int
+htab_inv_expr_eq (const void *ent1, const void *ent2)
+{
+  const struct iv_inv_expr_ent *expr1 =
+      (const struct iv_inv_expr_ent *)ent1;
+  const struct iv_inv_expr_ent *expr2 =
+      (const struct iv_inv_expr_ent *)ent2;
+
+  return expr1->hash == expr2->hash
+	 && operand_equal_p (expr1->expr, expr2->expr, 0);
+}
+
+/* Hash function for loop invariant expressions.  */
+
+static hashval_t
+htab_inv_expr_hash (const void *ent)
+{
+  const struct iv_inv_expr_ent *expr =
+      (const struct iv_inv_expr_ent *)ent;
+  return expr->hash;
+}
+
 /* Initializes data structures used by the iv optimization pass, stored
    in DATA.  */
 
@@ -777,6 +866,9 @@ tree_ssa_iv_optimize_init (struct ivopts_data *data)
   data->niters = NULL;
   data->iv_uses = VEC_alloc (iv_use_p, heap, 20);
   data->iv_candidates = VEC_alloc (iv_cand_p, heap, 20);
+  data->inv_expr_tab = htab_create (10, htab_inv_expr_hash,
+                                    htab_inv_expr_eq, free);
+  data->inv_expr_id = 0;
   decl_rtl_to_reset = VEC_alloc (tree, heap, 20);
 }
 
@@ -810,7 +902,7 @@ determine_base_object (tree expr)
       if (!base)
 	return expr;
 
-      if (TREE_CODE (base) == INDIRECT_REF)
+      if (TREE_CODE (base) == MEM_REF)
 	return determine_base_object (TREE_OPERAND (base, 0));
 
       return fold_convert (ptr_type_node,
@@ -939,7 +1031,7 @@ find_bivs (struct ivopts_data *data)
       if (step)
 	{
 	  if (POINTER_TYPE_P (type))
-	    step = fold_convert (sizetype, step);
+	    step = convert_to_ptrofftype (step);
 	  else
 	    step = fold_convert (type, step);
 	}
@@ -1014,6 +1106,12 @@ find_givs_in_stmt_scev (struct ivopts_data *data, gimple stmt, affine_iv *iv)
       || contains_abnormal_ssa_name_p (iv->step))
     return false;
 
+  /* If STMT could throw, then do not consider STMT as defining a GIV.  
+     While this will suppress optimizations, we can not safely delete this
+     GIV and associated statements, even if it appears it is not used.  */
+  if (stmt_could_throw_p (stmt))
+    return false;
+
   return true;
 }
 
@@ -1072,12 +1170,17 @@ find_induction_variables (struct ivopts_data *data)
 
   if (dump_file && (dump_flags & TDF_DETAILS))
     {
-      tree niter = niter_for_single_dom_exit (data);
+      struct tree_niter_desc *niter = niter_for_single_dom_exit (data);
 
       if (niter)
 	{
 	  fprintf (dump_file, "  number of iterations ");
-	  print_generic_expr (dump_file, niter, TDF_SLIM);
+	  print_generic_expr (dump_file, niter->niter, TDF_SLIM);
+	  if (!integer_zerop (niter->may_be_zero))
+	    {
+	      fprintf (dump_file, "; zero if ");
+	      print_generic_expr (dump_file, niter->may_be_zero, TDF_SLIM);
+	    }
 	  fprintf (dump_file, "\n\n");
     	};
 
@@ -1356,10 +1459,6 @@ idx_find_step (tree base, tree *idx, void *data)
   tree step, iv_base, iv_step, lbound, off;
   struct loop *loop = dta->ivopts_data->current_loop;
 
-  if (TREE_CODE (base) == MISALIGNED_INDIRECT_REF
-      || TREE_CODE (base) == ALIGN_INDIRECT_REF)
-    return false;
-
   /* If base is a component ref, require that the offset of the reference
      be invariant.  */
   if (TREE_CODE (base) == COMPONENT_REF)
@@ -1412,7 +1511,7 @@ idx_find_step (tree base, tree *idx, void *data)
     }
   else
     /* The step for pointer arithmetics already is 1 byte.  */
-    step = build_int_cst (sizetype, 1);
+    step = size_one_node;
 
   iv_base = iv->base;
   iv_step = iv->step;
@@ -1537,7 +1636,8 @@ may_be_unaligned_p (tree ref, tree step)
   base = get_inner_reference (ref, &bitsize, &bitpos, &toffset, &mode,
 			      &unsignedp, &volatilep, true);
   base_type = TREE_TYPE (base);
-  base_align = TYPE_ALIGN (base_type);
+  base_align = get_object_alignment (base);
+  base_align = MAX (base_align, TYPE_ALIGN (base_type));
 
   if (mode != BLKmode)
     {
@@ -1561,7 +1661,7 @@ may_be_unaligned_p (tree ref, tree step)
 
 /* Return true if EXPR may be non-addressable.   */
 
-static bool
+bool
 may_be_nonaddressable_p (tree expr)
 {
   switch (TREE_CODE (expr))
@@ -1605,7 +1705,7 @@ may_be_nonaddressable_p (tree expr)
 static void
 find_interesting_uses_address (struct ivopts_data *data, gimple stmt, tree *op_p)
 {
-  tree base = *op_p, step = build_int_cst (sizetype, 0);
+  tree base = *op_p, step = size_zero_node;
   struct iv *civ;
   struct ifs_ivopts_data ifs_ivopts_data;
 
@@ -1636,6 +1736,16 @@ find_interesting_uses_address (struct ivopts_data *data, gimple stmt, tree *op_p
 	  TMR_BASE (base) = civ->base;
 	  step = civ->step;
 	}
+      if (TMR_INDEX2 (base)
+	  && TREE_CODE (TMR_INDEX2 (base)) == SSA_NAME)
+	{
+	  civ = get_iv (data, TMR_INDEX2 (base));
+	  if (!civ)
+	    goto fail;
+
+	  TMR_INDEX2 (base) = civ->base;
+	  step = civ->step;
+	}
       if (TMR_INDEX (base)
 	  && TREE_CODE (TMR_INDEX (base)) == SSA_NAME)
 	{
@@ -1663,15 +1773,12 @@ find_interesting_uses_address (struct ivopts_data *data, gimple stmt, tree *op_p
     {
       ifs_ivopts_data.ivopts_data = data;
       ifs_ivopts_data.stmt = stmt;
-      ifs_ivopts_data.step = build_int_cst (sizetype, 0);
+      ifs_ivopts_data.step = size_zero_node;
       if (!for_each_index (&base, idx_find_step, &ifs_ivopts_data)
 	  || integer_zerop (ifs_ivopts_data.step))
 	goto fail;
       step = ifs_ivopts_data.step;
 
-      gcc_assert (TREE_CODE (base) != ALIGN_INDIRECT_REF);
-      gcc_assert (TREE_CODE (base) != MISALIGNED_INDIRECT_REF);
-
       /* Check that the base expression is addressable.  This needs
 	 to be done after substituting bases of IVs into it.  */
       if (may_be_nonaddressable_p (base))
@@ -1691,9 +1798,11 @@ find_interesting_uses_address (struct ivopts_data *data, gimple stmt, tree *op_p
 	  tree *ref = &TREE_OPERAND (base, 0);
 	  while (handled_component_p (*ref))
 	    ref = &TREE_OPERAND (*ref, 0);
-	  if (TREE_CODE (*ref) == INDIRECT_REF)
+	  if (TREE_CODE (*ref) == MEM_REF)
 	    {
-	      tree tem = gimple_fold_indirect_ref (TREE_OPERAND (*ref, 0));
+	      tree tem = fold_binary (MEM_REF, TREE_TYPE (*ref),
+				      TREE_OPERAND (*ref, 0),
+				      TREE_OPERAND (*ref, 1));
 	      if (tem)
 		*ref = tem;
 	    }
@@ -1827,7 +1936,7 @@ find_interesting_uses_outside (struct ivopts_data *data, edge exit)
       phi = gsi_stmt (psi);
       def = PHI_ARG_DEF_FROM_EDGE (phi, exit);
       if (is_gimple_reg (def))
-	find_interesting_uses_op (data, def);
+        find_interesting_uses_op (data, def);
     }
 }
 
@@ -2015,7 +2124,8 @@ strip_offset_1 (tree expr, bool inside_addr, bool top_compref,
       expr = build_fold_addr_expr (op0);
       return fold_convert (orig_type, expr);
 
-    case INDIRECT_REF:
+    case MEM_REF:
+      /* ???  Offset operand?  */
       inside_addr = false;
       break;
 
@@ -2108,7 +2218,10 @@ add_candidate_1 (struct ivopts_data *data,
   struct iv_cand *cand = NULL;
   tree type, orig_type;
 
-  if (base)
+  /* For non-original variables, make sure their values are computed in a type
+     that does not invoke undefined behavior on overflows (since in general,
+     we cannot prove that these induction variables are non-wrapping).  */
+  if (pos != IP_ORIGINAL)
     {
       orig_type = TREE_TYPE (base);
       type = generic_type_for (orig_type);
@@ -2143,7 +2256,9 @@ add_candidate_1 (struct ivopts_data *data,
 	continue;
 
       if (operand_equal_p (base, cand->iv->base, 0)
-	  && operand_equal_p (step, cand->iv->step, 0))
+	  && operand_equal_p (step, cand->iv->step, 0)
+          && (TYPE_PRECISION (TREE_TYPE (base))
+              == TYPE_PRECISION (TREE_TYPE (cand->iv->base))))
 	break;
     }
 
@@ -2552,12 +2667,13 @@ infinite_cost_p (comp_cost cost)
 
 /* Sets cost of (USE, CANDIDATE) pair to COST and record that it depends
    on invariants DEPENDS_ON and that the value used in expressing it
-   is VALUE.  */
+   is VALUE, and in case of iv elimination the comparison operator is COMP.  */
 
 static void
 set_use_iv_cost (struct ivopts_data *data,
 		 struct iv_use *use, struct iv_cand *cand,
-		 comp_cost cost, bitmap depends_on, tree value)
+		 comp_cost cost, bitmap depends_on, tree value,
+		 enum tree_code comp, int inv_expr_id)
 {
   unsigned i, s;
 
@@ -2573,6 +2689,8 @@ set_use_iv_cost (struct ivopts_data *data,
       use->cost_map[cand->id].cost = cost;
       use->cost_map[cand->id].depends_on = depends_on;
       use->cost_map[cand->id].value = value;
+      use->cost_map[cand->id].comp = comp;
+      use->cost_map[cand->id].inv_expr_id = inv_expr_id;
       return;
     }
 
@@ -2592,6 +2710,8 @@ found:
   use->cost_map[i].cost = cost;
   use->cost_map[i].depends_on = depends_on;
   use->cost_map[i].value = value;
+  use->cost_map[i].comp = comp;
+  use->cost_map[i].inv_expr_id = inv_expr_id;
 }
 
 /* Gets cost of (USE, CANDIDATE) pair.  */
@@ -2640,7 +2760,7 @@ seq_cost (rtx seq, bool speed)
     {
       set = single_set (seq);
       if (set)
-	cost += rtx_cost (set, SET,speed);
+	cost += set_src_cost (SET_SRC (set), speed);
       else
 	cost++;
     }
@@ -2744,7 +2864,7 @@ computation_cost (tree expr, bool speed)
   unsigned cost;
   /* Avoid using hard regs in ways which may be unsupported.  */
   int regno = LAST_VIRTUAL_REGISTER + 1;
-  struct cgraph_node *node = cgraph_node (current_function_decl);
+  struct cgraph_node *node = cgraph_get_node (current_function_decl);
   enum node_frequency real_frequency = node->frequency;
 
   node->frequency = NODE_FREQUENCY_NORMAL;
@@ -2761,6 +2881,8 @@ computation_cost (tree expr, bool speed)
   if (MEM_P (rslt))
     cost += address_cost (XEXP (rslt, 0), TYPE_MODE (type),
 			  TYPE_ADDR_SPACE (type), speed);
+  else if (!REG_P (rslt))
+    cost += set_src_cost (rslt, speed);
 
   return cost;
 }
@@ -2776,26 +2898,6 @@ var_at_stmt (struct loop *loop, struct iv_cand *cand, gimple stmt)
     return cand->var_before;
 }
 
-/* Return the most significant (sign) bit of T.  Similar to tree_int_cst_msb,
-   but the bit is determined from TYPE_PRECISION, not MODE_BITSIZE.  */
-
-int
-tree_int_cst_sign_bit (const_tree t)
-{
-  unsigned bitno = TYPE_PRECISION (TREE_TYPE (t)) - 1;
-  unsigned HOST_WIDE_INT w;
-
-  if (bitno < HOST_BITS_PER_WIDE_INT)
-    w = TREE_INT_CST_LOW (t);
-  else
-    {
-      w = TREE_INT_CST_HIGH (t);
-      bitno -= HOST_BITS_PER_WIDE_INT;
-    }
-
-  return (w >> bitno) & 1;
-}
-
 /* If A is (TYPE) BA and B is (TYPE) BB, and the types of BA and BB have the
    same precision that is at least as wide as the precision of TYPE, stores
    BA to A and BB to B, and returns the type of BA.  Otherwise, returns the
@@ -2933,6 +3035,20 @@ get_computation (struct loop *loop, struct iv_use *use, struct iv_cand *cand)
   return get_computation_at (loop, use, cand, use->stmt);
 }
 
+/* Adjust the cost COST for being in loop setup rather than loop body.
+   If we're optimizing for space, the loop setup overhead is constant;
+   if we're optimizing for speed, amortize it over the per-iteration cost.  */
+static unsigned
+adjust_setup_cost (struct ivopts_data *data, unsigned cost)
+{
+  if (cost == INFTY)
+    return cost;
+  else if (optimize_loop_for_speed_p (data->current_loop))
+    return cost / avg_loop_niter (data->current_loop);
+  else
+    return cost;
+}
+
 /* Returns cost of addition in MODE.  */
 
 static unsigned
@@ -3141,9 +3257,8 @@ get_address_cost (bool symbol_present, bool var_present,
   if (!data)
     {
       HOST_WIDE_INT i;
-      HOST_WIDE_INT start = BIGGEST_ALIGNMENT / BITS_PER_UNIT;
-      HOST_WIDE_INT rat, off;
-      int old_cse_not_expected;
+      HOST_WIDE_INT rat, off = 0;
+      int old_cse_not_expected, width;
       unsigned sym_p, var_p, off_p, rat_p, add_c;
       rtx seq, addr, base;
       rtx reg0, reg1;
@@ -3152,33 +3267,40 @@ get_address_cost (bool symbol_present, bool var_present,
 
       reg1 = gen_raw_REG (address_mode, LAST_VIRTUAL_REGISTER + 1);
 
+      width = GET_MODE_BITSIZE (address_mode) - 1;
+      if (width > (HOST_BITS_PER_WIDE_INT - 1))
+	width = HOST_BITS_PER_WIDE_INT - 1;
       addr = gen_rtx_fmt_ee (PLUS, address_mode, reg1, NULL_RTX);
-      for (i = start; i <= 1 << 20; i <<= 1)
+
+      for (i = width; i >= 0; i--)
 	{
-	  XEXP (addr, 1) = gen_int_mode (i, address_mode);
-	  if (!memory_address_addr_space_p (mem_mode, addr, as))
+	  off = -((HOST_WIDE_INT) 1 << i);
+	  XEXP (addr, 1) = gen_int_mode (off, address_mode);
+	  if (memory_address_addr_space_p (mem_mode, addr, as))
 	    break;
 	}
-      data->max_offset = i == start ? 0 : i >> 1;
-      off = data->max_offset;
+      data->min_offset = (i == -1? 0 : off);
 
-      for (i = start; i <= 1 << 20; i <<= 1)
+      for (i = width; i >= 0; i--)
 	{
-	  XEXP (addr, 1) = gen_int_mode (-i, address_mode);
-	  if (!memory_address_addr_space_p (mem_mode, addr, as))
+	  off = ((HOST_WIDE_INT) 1 << i) - 1;
+	  XEXP (addr, 1) = gen_int_mode (off, address_mode);
+	  if (memory_address_addr_space_p (mem_mode, addr, as))
 	    break;
 	}
-      data->min_offset = i == start ? 0 : -(i >> 1);
+      if (i == -1)
+        off = 0;
+      data->max_offset = off;
 
       if (dump_file && (dump_flags & TDF_DETAILS))
 	{
 	  fprintf (dump_file, "get_address_cost:\n");
-	  fprintf (dump_file, "  min offset %s %d\n",
+	  fprintf (dump_file, "  min offset %s " HOST_WIDE_INT_PRINT_DEC "\n",
 		   GET_MODE_NAME (mem_mode),
-		   (int) data->min_offset);
-	  fprintf (dump_file, "  max offset %s %d\n",
+		   data->min_offset);
+	  fprintf (dump_file, "  max offset %s " HOST_WIDE_INT_PRINT_DEC "\n",
 		   GET_MODE_NAME (mem_mode),
-		   (int) data->max_offset);
+		   data->max_offset);
 	}
 
       rat = 1;
@@ -3379,6 +3501,42 @@ get_address_cost (bool symbol_present, bool var_present,
   return new_cost (cost + acost, complexity);
 }
 
+ /* Calculate the SPEED or size cost of shiftadd EXPR in MODE.  MULT is the
+    the EXPR operand holding the shift.  COST0 and COST1 are the costs for
+    calculating the operands of EXPR.  Returns true if successful, and returns
+    the cost in COST.  */
+
+static bool
+get_shiftadd_cost (tree expr, enum machine_mode mode, comp_cost cost0,
+                   comp_cost cost1, tree mult, bool speed, comp_cost *cost)
+{
+  comp_cost res;
+  tree op1 = TREE_OPERAND (expr, 1);
+  tree cst = TREE_OPERAND (mult, 1);
+  tree multop = TREE_OPERAND (mult, 0);
+  int m = exact_log2 (int_cst_value (cst));
+  int maxm = MIN (BITS_PER_WORD, GET_MODE_BITSIZE (mode));
+  int sa_cost;
+
+  if (!(m >= 0 && m < maxm))
+    return false;
+
+  sa_cost = (TREE_CODE (expr) != MINUS_EXPR
+             ? shiftadd_cost[speed][mode][m]
+             : (mult == op1
+                ? shiftsub1_cost[speed][mode][m]
+                : shiftsub0_cost[speed][mode][m]));
+  res = new_cost (sa_cost, 0);
+  res = add_costs (res, mult == op1 ? cost0 : cost1);
+
+  STRIP_NOPS (multop);
+  if (!is_gimple_val (multop))
+    res = add_costs (res, force_expr_to_var_cost (multop, speed));
+
+  *cost = res;
+  return true;
+}
+
 /* Estimates cost of forcing expression EXPR into a variable.  */
 
 static comp_cost
@@ -3415,9 +3573,7 @@ force_expr_to_var_cost (tree expr, bool speed)
 	  symbol_cost[i] = computation_cost (addr, i) + 1;
 
 	  address_cost[i]
-	    = computation_cost (build2 (POINTER_PLUS_EXPR, type,
-					addr,
-					build_int_cst (sizetype, 2000)), i) + 1;
+	    = computation_cost (fold_build_pointer_plus_hwi (addr, 2000), i) + 1;
 	  if (dump_file && (dump_flags & TDF_DETAILS))
 	    {
 	      fprintf (dump_file, "force_expr_to_var_cost %s costs:\n", i ? "speed" : "size");
@@ -3504,6 +3660,21 @@ force_expr_to_var_cost (tree expr, bool speed)
     case MINUS_EXPR:
     case NEGATE_EXPR:
       cost = new_cost (add_cost (mode, speed), 0);
+      if (TREE_CODE (expr) != NEGATE_EXPR)
+        {
+          tree mult = NULL_TREE;
+          comp_cost sa_cost;
+          if (TREE_CODE (op1) == MULT_EXPR)
+            mult = op1;
+          else if (TREE_CODE (op0) == MULT_EXPR)
+            mult = op0;
+
+          if (mult != NULL_TREE
+              && cst_and_fits_in_hwi (TREE_OPERAND (mult, 1))
+              && get_shiftadd_cost (expr, mode, cost0, cost1, mult, speed,
+                                    &sa_cost))
+            return sa_cost;
+        }
       break;
 
     case MULT_EXPR:
@@ -3689,6 +3860,153 @@ difference_cost (struct ivopts_data *data,
   return force_var_cost (data, aff_combination_to_tree (&aff_e1), depends_on);
 }
 
+/* Returns true if AFF1 and AFF2 are identical.  */
+
+static bool
+compare_aff_trees (aff_tree *aff1, aff_tree *aff2)
+{
+  unsigned i;
+
+  if (aff1->n != aff2->n)
+    return false;
+
+  for (i = 0; i < aff1->n; i++)
+    {
+      if (double_int_cmp (aff1->elts[i].coef, aff2->elts[i].coef, 0) != 0)
+        return false;
+
+      if (!operand_equal_p (aff1->elts[i].val, aff2->elts[i].val, 0))
+        return false;
+    }
+  return true;
+}
+
+/* Stores EXPR in DATA->inv_expr_tab, and assigns it an inv_expr_id.  */
+
+static int
+get_expr_id (struct ivopts_data *data, tree expr)
+{
+  struct iv_inv_expr_ent ent;
+  struct iv_inv_expr_ent **slot;
+
+  ent.expr = expr;
+  ent.hash = iterative_hash_expr (expr, 0);
+  slot = (struct iv_inv_expr_ent **) htab_find_slot (data->inv_expr_tab,
+                                                     &ent, INSERT);
+  if (*slot)
+    return (*slot)->id;
+
+  *slot = XNEW (struct iv_inv_expr_ent);
+  (*slot)->expr = expr;
+  (*slot)->hash = ent.hash;
+  (*slot)->id = data->inv_expr_id++;
+  return (*slot)->id;
+}
+
+/* Returns the pseudo expr id if expression UBASE - RATIO * CBASE
+   requires a new compiler generated temporary.  Returns -1 otherwise.
+   ADDRESS_P is a flag indicating if the expression is for address
+   computation.  */
+
+static int
+get_loop_invariant_expr_id (struct ivopts_data *data, tree ubase,
+                            tree cbase, HOST_WIDE_INT ratio,
+                            bool address_p)
+{
+  aff_tree ubase_aff, cbase_aff;
+  tree expr, ub, cb;
+
+  STRIP_NOPS (ubase);
+  STRIP_NOPS (cbase);
+  ub = ubase;
+  cb = cbase;
+
+  if ((TREE_CODE (ubase) == INTEGER_CST)
+      && (TREE_CODE (cbase) == INTEGER_CST))
+    return -1;
+
+  /* Strips the constant part. */
+  if (TREE_CODE (ubase) == PLUS_EXPR
+      || TREE_CODE (ubase) == MINUS_EXPR
+      || TREE_CODE (ubase) == POINTER_PLUS_EXPR)
+    {
+      if (TREE_CODE (TREE_OPERAND (ubase, 1)) == INTEGER_CST)
+        ubase = TREE_OPERAND (ubase, 0);
+    }
+
+  /* Strips the constant part. */
+  if (TREE_CODE (cbase) == PLUS_EXPR
+      || TREE_CODE (cbase) == MINUS_EXPR
+      || TREE_CODE (cbase) == POINTER_PLUS_EXPR)
+    {
+      if (TREE_CODE (TREE_OPERAND (cbase, 1)) == INTEGER_CST)
+        cbase = TREE_OPERAND (cbase, 0);
+    }
+
+  if (address_p)
+    {
+      if (((TREE_CODE (ubase) == SSA_NAME)
+           || (TREE_CODE (ubase) == ADDR_EXPR
+               && is_gimple_min_invariant (ubase)))
+          && (TREE_CODE (cbase) == INTEGER_CST))
+        return -1;
+
+      if (((TREE_CODE (cbase) == SSA_NAME)
+           || (TREE_CODE (cbase) == ADDR_EXPR
+               && is_gimple_min_invariant (cbase)))
+          && (TREE_CODE (ubase) == INTEGER_CST))
+        return -1;
+    }
+
+  if (ratio == 1)
+    {
+      if(operand_equal_p (ubase, cbase, 0))
+        return -1;
+
+      if (TREE_CODE (ubase) == ADDR_EXPR
+          && TREE_CODE (cbase) == ADDR_EXPR)
+        {
+          tree usym, csym;
+
+          usym = TREE_OPERAND (ubase, 0);
+          csym = TREE_OPERAND (cbase, 0);
+          if (TREE_CODE (usym) == ARRAY_REF)
+            {
+              tree ind = TREE_OPERAND (usym, 1);
+              if (TREE_CODE (ind) == INTEGER_CST
+                  && host_integerp (ind, 0)
+                  && TREE_INT_CST_LOW (ind) == 0)
+                usym = TREE_OPERAND (usym, 0);
+            }
+          if (TREE_CODE (csym) == ARRAY_REF)
+            {
+              tree ind = TREE_OPERAND (csym, 1);
+              if (TREE_CODE (ind) == INTEGER_CST
+                  && host_integerp (ind, 0)
+                  && TREE_INT_CST_LOW (ind) == 0)
+                csym = TREE_OPERAND (csym, 0);
+            }
+          if (operand_equal_p (usym, csym, 0))
+            return -1;
+        }
+      /* Now do more complex comparison  */
+      tree_to_aff_combination (ubase, TREE_TYPE (ubase), &ubase_aff);
+      tree_to_aff_combination (cbase, TREE_TYPE (cbase), &cbase_aff);
+      if (compare_aff_trees (&ubase_aff, &cbase_aff))
+        return -1;
+    }
+
+  tree_to_aff_combination (ub, TREE_TYPE (ub), &ubase_aff);
+  tree_to_aff_combination (cb, TREE_TYPE (cb), &cbase_aff);
+
+  aff_combination_scale (&cbase_aff, shwi_to_double_int (-1 * ratio));
+  aff_combination_add (&ubase_aff, &cbase_aff);
+  expr = aff_combination_to_tree (&ubase_aff);
+  return get_expr_id (data, expr);
+}
+
+
+
 /* Determines the cost of the computation by that USE is expressed
    from induction variable CAND.  If ADDRESS_P is true, we just need
    to create an address from it, otherwise we want to get it into
@@ -3701,7 +4019,8 @@ static comp_cost
 get_computation_cost_at (struct ivopts_data *data,
 			 struct iv_use *use, struct iv_cand *cand,
 			 bool address_p, bitmap *depends_on, gimple at,
-			 bool *can_autoinc)
+			 bool *can_autoinc,
+                         int *inv_expr_id)
 {
   tree ubase = use->iv->base, ustep = use->iv->step;
   tree cbase, cstep;
@@ -3729,7 +4048,11 @@ get_computation_cost_at (struct ivopts_data *data,
       return infinite_cost;
     }
 
-  if (address_p)
+  if (address_p
+      || (use->iv->base_object
+	  && cand->iv->base_object
+	  && POINTER_TYPE_P (TREE_TYPE (use->iv->base_object))
+	  && POINTER_TYPE_P (TREE_TYPE (cand->iv->base_object))))
     {
       /* Do not try to express address of an object with computation based
 	 on address of a different object.  This may cause problems in rtl
@@ -3770,6 +4093,8 @@ get_computation_cost_at (struct ivopts_data *data,
   STRIP_NOPS (cbase);
   ctype = TREE_TYPE (cbase);
 
+  stmt_is_after_inc = stmt_after_increment (data->current_loop, cand, at);
+
   /* use = ubase + ratio * (var - cbase).  If either cbase is a constant
      or ratio == 1, it is better to handle this like
 
@@ -3784,13 +4109,31 @@ get_computation_cost_at (struct ivopts_data *data,
 			      ubase, build_int_cst (utype, 0),
 			      &symbol_present, &var_present, &offset,
 			      depends_on);
+      cost.cost /= avg_loop_niter (data->current_loop);
     }
   else if (ratio == 1)
     {
+      tree real_cbase = cbase;
+
+      /* Check to see if any adjustment is needed.  */
+      if (cstepi == 0 && stmt_is_after_inc)
+        {
+          aff_tree real_cbase_aff;
+          aff_tree cstep_aff;
+
+          tree_to_aff_combination (cbase, TREE_TYPE (real_cbase),
+                                   &real_cbase_aff);
+          tree_to_aff_combination (cstep, TREE_TYPE (cstep), &cstep_aff);
+
+          aff_combination_add (&real_cbase_aff, &cstep_aff);
+          real_cbase = aff_combination_to_tree (&real_cbase_aff);
+        }
+
       cost = difference_cost (data,
-			      ubase, cbase,
+			      ubase, real_cbase,
 			      &symbol_present, &var_present, &offset,
 			      depends_on);
+      cost.cost /= avg_loop_niter (data->current_loop);
     }
   else if (address_p
 	   && !POINTER_TYPE_P (ctype)
@@ -3804,21 +4147,31 @@ get_computation_cost_at (struct ivopts_data *data,
 			      ubase, cbase,
 			      &symbol_present, &var_present, &offset,
 			      depends_on);
+      cost.cost /= avg_loop_niter (data->current_loop);
     }
   else
     {
       cost = force_var_cost (data, cbase, depends_on);
-      cost.cost += add_cost (TYPE_MODE (ctype), data->speed);
       cost = add_costs (cost,
 			difference_cost (data,
 					 ubase, build_int_cst (utype, 0),
 					 &symbol_present, &var_present,
 					 &offset, depends_on));
+      cost.cost /= avg_loop_niter (data->current_loop);
+      cost.cost += add_cost (TYPE_MODE (ctype), data->speed);
+    }
+
+  if (inv_expr_id)
+    {
+      *inv_expr_id =
+          get_loop_invariant_expr_id (data, ubase, cbase, ratio, address_p);
+      /* Clear depends on.  */
+      if (*inv_expr_id != -1 && depends_on && *depends_on)
+        bitmap_clear (*depends_on);
     }
 
   /* If we are after the increment, the value of the candidate is higher by
      one iteration.  */
-  stmt_is_after_inc = stmt_after_increment (data->current_loop, cand, at);
   if (stmt_is_after_inc)
     offset -= ratio * cstepi;
 
@@ -3845,8 +4198,8 @@ get_computation_cost_at (struct ivopts_data *data,
   /* Symbol + offset should be compile-time computable so consider that they
       are added once to the variable, if present.  */
   if (var_present && (symbol_present || offset))
-    cost.cost += add_cost (TYPE_MODE (ctype), speed)
-		 / AVG_LOOP_NITER (data->current_loop);
+    cost.cost += adjust_setup_cost (data,
+				    add_cost (TYPE_MODE (ctype), speed));
 
   /* Having offset does not affect runtime cost in case it is added to
      symbol, but it increases complexity.  */
@@ -3858,6 +4211,7 @@ get_computation_cost_at (struct ivopts_data *data,
   aratio = ratio > 0 ? ratio : -ratio;
   if (aratio != 1)
     cost.cost += multiply_by_cost (aratio, TYPE_MODE (ctype), speed);
+  return cost;
 
 fallback:
   if (can_autoinc)
@@ -3871,7 +4225,7 @@ fallback:
       return infinite_cost;
 
     if (address_p)
-      comp = build1 (INDIRECT_REF, TREE_TYPE (TREE_TYPE (comp)), comp);
+      comp = build_simple_mem_ref (comp);
 
     return new_cost (computation_cost (comp, speed), 0);
   }
@@ -3887,11 +4241,12 @@ fallback:
 static comp_cost
 get_computation_cost (struct ivopts_data *data,
 		      struct iv_use *use, struct iv_cand *cand,
-		      bool address_p, bitmap *depends_on, bool *can_autoinc)
+		      bool address_p, bitmap *depends_on,
+                      bool *can_autoinc, int *inv_expr_id)
 {
   return get_computation_cost_at (data,
 				  use, cand, address_p, depends_on, use->stmt,
-				  can_autoinc);
+				  can_autoinc, inv_expr_id);
 }
 
 /* Determines cost of basing replacement of USE on CAND in a generic
@@ -3903,6 +4258,7 @@ determine_use_iv_cost_generic (struct ivopts_data *data,
 {
   bitmap depends_on;
   comp_cost cost;
+  int inv_expr_id = -1;
 
   /* The simple case first -- if we need to express value of the preserved
      original biv, the cost is 0.  This also prevents us from counting the
@@ -3911,12 +4267,16 @@ determine_use_iv_cost_generic (struct ivopts_data *data,
   if (cand->pos == IP_ORIGINAL
       && cand->incremented_at == use->stmt)
     {
-      set_use_iv_cost (data, use, cand, zero_cost, NULL, NULL_TREE);
+      set_use_iv_cost (data, use, cand, zero_cost, NULL, NULL_TREE,
+                       ERROR_MARK, -1);
       return true;
     }
 
-  cost = get_computation_cost (data, use, cand, false, &depends_on, NULL);
-  set_use_iv_cost (data, use, cand, cost, depends_on, NULL_TREE);
+  cost = get_computation_cost (data, use, cand, false, &depends_on,
+                               NULL, &inv_expr_id);
+
+  set_use_iv_cost (data, use, cand, cost, depends_on, NULL_TREE, ERROR_MARK,
+                   inv_expr_id);
 
   return !infinite_cost_p (cost);
 }
@@ -3929,8 +4289,9 @@ determine_use_iv_cost_address (struct ivopts_data *data,
 {
   bitmap depends_on;
   bool can_autoinc;
+  int inv_expr_id = -1;
   comp_cost cost = get_computation_cost (data, use, cand, true, &depends_on,
-					 &can_autoinc);
+					 &can_autoinc, &inv_expr_id);
 
   if (cand->ainc_use == use)
     {
@@ -3942,7 +4303,8 @@ determine_use_iv_cost_address (struct ivopts_data *data,
       else if (cand->pos == IP_AFTER_USE || cand->pos == IP_BEFORE_USE)
 	cost = infinite_cost;
     }
-  set_use_iv_cost (data, use, cand, cost, depends_on, NULL_TREE);
+  set_use_iv_cost (data, use, cand, cost, depends_on, NULL_TREE, ERROR_MARK,
+                   inv_expr_id);
 
   return !infinite_cost_p (cost);
 }
@@ -3982,15 +4344,20 @@ iv_period (struct iv *iv)
 
   gcc_assert (step && TREE_CODE (step) == INTEGER_CST);
 
-  /* Period of the iv is gcd (step, type range).  Since type range is power
-     of two, it suffices to determine the maximum power of two that divides
-     step.  */
-  pow2div = num_ending_zeros (step);
   type = unsigned_type_for (TREE_TYPE (step));
+  /* Period of the iv is lcm (step, type_range)/step -1,
+     i.e., N*type_range/step - 1. Since type range is power
+     of two, N == (step >> num_of_ending_zeros_binary (step),
+     so the final result is
+
+       (type_range >> num_of_ending_zeros_binary (step)) - 1
+
+  */
+  pow2div = num_ending_zeros (step);
 
   period = build_low_bits_mask (type,
-				(TYPE_PRECISION (type)
-				 - tree_low_cst (pow2div, 1)));
+                                (TYPE_PRECISION (type)
+                                 - tree_low_cst (pow2div, 1)));
 
   return period;
 }
@@ -4012,18 +4379,264 @@ iv_elimination_compare (struct ivopts_data *data, struct iv_use *use)
   return (exit->flags & EDGE_TRUE_VALUE ? EQ_EXPR : NE_EXPR);
 }
 
+static tree
+strip_wrap_conserving_type_conversions (tree exp)
+{
+  while (tree_ssa_useless_type_conversion (exp)
+	 && (nowrap_type_p (TREE_TYPE (exp))
+	     == nowrap_type_p (TREE_TYPE (TREE_OPERAND (exp, 0)))))
+    exp = TREE_OPERAND (exp, 0);
+  return exp;
+}
+
+/* Walk the SSA form and check whether E == WHAT.  Fairly simplistic, we
+   check for an exact match.  */
+
+static bool
+expr_equal_p (tree e, tree what)
+{
+  gimple stmt;
+  enum tree_code code;
+
+  e = strip_wrap_conserving_type_conversions (e);
+  what = strip_wrap_conserving_type_conversions (what);
+
+  code = TREE_CODE (what);
+  if (TREE_TYPE (e) != TREE_TYPE (what))
+    return false;
+
+  if (operand_equal_p (e, what, 0))
+    return true;
+
+  if (TREE_CODE (e) != SSA_NAME)
+    return false;
+
+  stmt = SSA_NAME_DEF_STMT (e);
+  if (gimple_code (stmt) != GIMPLE_ASSIGN
+      || gimple_assign_rhs_code (stmt) != code)
+    return false;
+
+  switch (get_gimple_rhs_class (code))
+    {
+    case GIMPLE_BINARY_RHS:
+      if (!expr_equal_p (gimple_assign_rhs2 (stmt), TREE_OPERAND (what, 1)))
+	return false;
+      /* Fallthru.  */
+
+    case GIMPLE_UNARY_RHS:
+    case GIMPLE_SINGLE_RHS:
+      return expr_equal_p (gimple_assign_rhs1 (stmt), TREE_OPERAND (what, 0));
+    default:
+      return false;
+    }
+}
+
+/* Returns true if we can prove that BASE - OFFSET does not overflow.  For now,
+   we only detect the situation that BASE = SOMETHING + OFFSET, where the
+   calculation is performed in non-wrapping type.
+
+   TODO: More generally, we could test for the situation that
+	 BASE = SOMETHING + OFFSET' and OFFSET is between OFFSET' and zero.
+	 This would require knowing the sign of OFFSET.
+
+	 Also, we only look for the first addition in the computation of BASE.
+	 More complex analysis would be better, but introducing it just for
+	 this optimization seems like an overkill.  */
+
+static bool
+difference_cannot_overflow_p (tree base, tree offset)
+{
+  enum tree_code code;
+  tree e1, e2;
+
+  if (!nowrap_type_p (TREE_TYPE (base)))
+    return false;
+
+  base = expand_simple_operations (base);
+
+  if (TREE_CODE (base) == SSA_NAME)
+    {
+      gimple stmt = SSA_NAME_DEF_STMT (base);
+
+      if (gimple_code (stmt) != GIMPLE_ASSIGN)
+	return false;
+
+      code = gimple_assign_rhs_code (stmt);
+      if (get_gimple_rhs_class (code) != GIMPLE_BINARY_RHS)
+	return false;
+
+      e1 = gimple_assign_rhs1 (stmt);
+      e2 = gimple_assign_rhs2 (stmt);
+    }
+  else
+    {
+      code = TREE_CODE (base);
+      if (get_gimple_rhs_class (code) != GIMPLE_BINARY_RHS)
+	return false;
+      e1 = TREE_OPERAND (base, 0);
+      e2 = TREE_OPERAND (base, 1);
+    }
+
+  /* TODO: deeper inspection may be necessary to prove the equality.  */
+  switch (code)
+    {
+    case PLUS_EXPR:
+      return expr_equal_p (e1, offset) || expr_equal_p (e2, offset);
+    case POINTER_PLUS_EXPR:
+      return expr_equal_p (e2, offset);
+
+    default:
+      return false;
+    }
+}
+
+/* Tries to replace loop exit by one formulated in terms of a LT_EXPR
+   comparison with CAND.  NITER describes the number of iterations of
+   the loops.  If successful, the comparison in COMP_P is altered accordingly.
+
+   We aim to handle the following situation:
+
+   sometype *base, *p;
+   int a, b, i;
+
+   i = a;
+   p = p_0 = base + a;
+
+   do
+     {
+       bla (*p);
+       p++;
+       i++;
+     }
+   while (i < b);
+
+   Here, the number of iterations of the loop is (a + 1 > b) ? 0 : b - a - 1.
+   We aim to optimize this to
+
+   p = p_0 = base + a;
+   do
+     {
+       bla (*p);
+       p++;
+     }
+   while (p < p_0 - a + b);
+
+   This preserves the correctness, since the pointer arithmetics does not
+   overflow.  More precisely:
+
+   1) if a + 1 <= b, then p_0 - a + b is the final value of p, hence there is no
+      overflow in computing it or the values of p.
+   2) if a + 1 > b, then we need to verify that the expression p_0 - a does not
+      overflow.  To prove this, we use the fact that p_0 = base + a.  */
+
+static bool
+iv_elimination_compare_lt (struct ivopts_data *data,
+                           struct iv_cand *cand, enum tree_code *comp_p,
+			   struct tree_niter_desc *niter)
+{
+  tree cand_type, a, b, mbz, nit_type = TREE_TYPE (niter->niter), offset;
+  struct affine_tree_combination nit, tmpa, tmpb;
+  enum tree_code comp;
+  HOST_WIDE_INT step;
+
+  /* We need to know that the candidate induction variable does not overflow.
+     While more complex analysis may be used to prove this, for now just
+     check that the variable appears in the original program and that it
+     is computed in a type that guarantees no overflows.  */
+  cand_type = TREE_TYPE (cand->iv->base);
+  if (cand->pos != IP_ORIGINAL || !nowrap_type_p (cand_type))
+    return false;
+
+  /* Make sure that the loop iterates till the loop bound is hit, as otherwise
+     the calculation of the BOUND could overflow, making the comparison
+     invalid.  */
+  if (!data->loop_single_exit_p)
+    return false;
+
+  /* We need to be able to decide whether candidate is increasing or decreasing
+     in order to choose the right comparison operator.  */
+  if (!cst_and_fits_in_hwi (cand->iv->step))
+    return false;
+  step = int_cst_value (cand->iv->step);
+
+  /* Check that the number of iterations matches the expected pattern:
+     a + 1 > b ? 0 : b - a - 1.  */
+  mbz = niter->may_be_zero;
+  if (TREE_CODE (mbz) == GT_EXPR)
+    {
+      /* Handle a + 1 > b.  */
+      tree op0 = TREE_OPERAND (mbz, 0);
+      if (TREE_CODE (op0) == PLUS_EXPR && integer_onep (TREE_OPERAND (op0, 1)))
+	{
+	  a = TREE_OPERAND (op0, 0);
+	  b = TREE_OPERAND (mbz, 1);
+	}
+      else
+	return false;
+    }
+  else if (TREE_CODE (mbz) == LT_EXPR)
+    {
+      tree op1 = TREE_OPERAND (mbz, 1);
+
+      /* Handle b < a + 1.  */
+      if (TREE_CODE (op1) == PLUS_EXPR && integer_onep (TREE_OPERAND (op1, 1)))
+        {
+          a = TREE_OPERAND (op1, 0);
+          b = TREE_OPERAND (mbz, 0);
+        }
+      else
+	return false;
+    }
+  else
+    return false;
+
+  /* Expected number of iterations is B - A - 1.  Check that it matches
+     the actual number, i.e., that B - A - NITER = 1.  */
+  tree_to_aff_combination (niter->niter, nit_type, &nit);
+  tree_to_aff_combination (fold_convert (nit_type, a), nit_type, &tmpa);
+  tree_to_aff_combination (fold_convert (nit_type, b), nit_type, &tmpb);
+  aff_combination_scale (&nit, double_int_minus_one);
+  aff_combination_scale (&tmpa, double_int_minus_one);
+  aff_combination_add (&tmpb, &tmpa);
+  aff_combination_add (&tmpb, &nit);
+  if (tmpb.n != 0 || !double_int_equal_p (tmpb.offset, double_int_one))
+    return false;
+
+  /* Finally, check that CAND->IV->BASE - CAND->IV->STEP * A does not
+     overflow.  */
+  offset = fold_build2 (MULT_EXPR, TREE_TYPE (cand->iv->step),
+			cand->iv->step,
+			fold_convert (TREE_TYPE (cand->iv->step), a));
+  if (!difference_cannot_overflow_p (cand->iv->base, offset))
+    return false;
+
+  /* Determine the new comparison operator.  */
+  comp = step < 0 ? GT_EXPR : LT_EXPR;
+  if (*comp_p == NE_EXPR)
+    *comp_p = comp;
+  else if (*comp_p == EQ_EXPR)
+    *comp_p = invert_tree_comparison (comp, false);
+  else
+    gcc_unreachable ();
+
+  return true;
+}
+
 /* Check whether it is possible to express the condition in USE by comparison
-   of candidate CAND.  If so, store the value compared with to BOUND.  */
+   of candidate CAND.  If so, store the value compared with to BOUND, and the
+   comparison operator to COMP.  */
 
 static bool
 may_eliminate_iv (struct ivopts_data *data,
-		  struct iv_use *use, struct iv_cand *cand, tree *bound)
+		  struct iv_use *use, struct iv_cand *cand, tree *bound,
+		  enum tree_code *comp)
 {
   basic_block ex_bb;
   edge exit;
-  tree nit, period;
+  tree period;
   struct loop *loop = data->current_loop;
   aff_tree bnd;
+  struct tree_niter_desc *desc = NULL;
 
   if (TREE_CODE (cand->iv->step) != INTEGER_CST)
     return false;
@@ -4042,8 +4655,8 @@ may_eliminate_iv (struct ivopts_data *data,
   if (flow_bb_inside_loop_p (loop, exit->dest))
     return false;
 
-  nit = niter_for_exit (data, exit);
-  if (!nit)
+  desc = niter_for_exit (data, exit);
+  if (!desc)
     return false;
 
   /* Determine whether we can use the variable to test the exit condition.
@@ -4052,39 +4665,89 @@ may_eliminate_iv (struct ivopts_data *data,
   period = iv_period (cand->iv);
 
   /* If the number of iterations is constant, compare against it directly.  */
-  if (TREE_CODE (nit) == INTEGER_CST)
-    {
-      if (!tree_int_cst_lt (nit, period))
-	return false;
+  if (TREE_CODE (desc->niter) == INTEGER_CST)
+    {
+      /* See cand_value_at.  */
+      if (stmt_after_increment (loop, cand, use->stmt))
+        {
+          if (!tree_int_cst_lt (desc->niter, period))
+            return false;
+        }
+      else
+        {
+          if (tree_int_cst_lt (period, desc->niter))
+            return false;
+        }
     }
 
   /* If not, and if this is the only possible exit of the loop, see whether
      we can get a conservative estimate on the number of iterations of the
      entire loop and compare against that instead.  */
-  else if (loop_only_exit_p (loop, exit))
+  else
     {
       double_int period_value, max_niter;
-      if (!estimated_loop_iterations (loop, true, &max_niter))
-	return false;
-      period_value = tree_to_double_int (period);
-      if (double_int_ucmp (max_niter, period_value) >= 0)
-	return false;
-    }
-
-  /* Otherwise, punt.  */
-  else
-    return false;
 
-  cand_value_at (loop, cand, use->stmt, nit, &bnd);
+      max_niter = desc->max;
+      if (stmt_after_increment (loop, cand, use->stmt))
+        max_niter = double_int_add (max_niter, double_int_one);
+      period_value = tree_to_double_int (period);
+      if (double_int_ucmp (max_niter, period_value) > 0)
+        {
+          /* See if we can take advantage of infered loop bound information.  */
+          if (data->loop_single_exit_p)
+            {
+              if (!estimated_loop_iterations (loop, true, &max_niter))
+                return false;
+              /* The loop bound is already adjusted by adding 1.  */
+              if (double_int_ucmp (max_niter, period_value) > 0)
+                return false;
+            }
+          else
+            return false;
+        }
+    }
+
+  cand_value_at (loop, cand, use->stmt, desc->niter, &bnd);
 
   *bound = aff_combination_to_tree (&bnd);
+  *comp = iv_elimination_compare (data, use);
+
   /* It is unlikely that computing the number of iterations using division
      would be more profitable than keeping the original induction variable.  */
   if (expression_expensive_p (*bound))
     return false;
+
+  /* Sometimes, it is possible to handle the situation that the number of
+     iterations may be zero unless additional assumtions by using <
+     instead of != in the exit condition.
+
+     TODO: we could also calculate the value MAY_BE_ZERO ? 0 : NITER and
+	   base the exit condition on it.  However, that is often too
+	   expensive.  */
+  if (!integer_zerop (desc->may_be_zero))
+    return iv_elimination_compare_lt (data, cand, comp, desc);
+
   return true;
 }
 
+ /* Calculates the cost of BOUND, if it is a PARM_DECL.  A PARM_DECL must
+    be copied, if is is used in the loop body and DATA->body_includes_call.  */
+
+static int
+parm_decl_cost (struct ivopts_data *data, tree bound)
+{
+  tree sbound = bound;
+  STRIP_NOPS (sbound);
+
+  if (TREE_CODE (sbound) == SSA_NAME
+      && TREE_CODE (SSA_NAME_VAR (sbound)) == PARM_DECL
+      && gimple_nop_p (SSA_NAME_DEF_STMT (sbound))
+      && data->body_includes_call)
+    return COSTS_N_INSNS (1);
+
+  return 0;
+}
+
 /* Determines cost of basing replacement of USE on CAND in a condition.  */
 
 static bool
@@ -4094,24 +4757,42 @@ determine_use_iv_cost_condition (struct ivopts_data *data,
   tree bound = NULL_TREE;
   struct iv *cmp_iv;
   bitmap depends_on_elim = NULL, depends_on_express = NULL, depends_on;
-  comp_cost elim_cost, express_cost, cost;
+  comp_cost elim_cost, express_cost, cost, bound_cost;
   bool ok;
+  int elim_inv_expr_id = -1, express_inv_expr_id = -1, inv_expr_id;
   tree *control_var, *bound_cst;
+  enum tree_code comp = ERROR_MARK;
 
   /* Only consider real candidates.  */
   if (!cand->iv)
     {
-      set_use_iv_cost (data, use, cand, infinite_cost, NULL, NULL_TREE);
+      set_use_iv_cost (data, use, cand, infinite_cost, NULL, NULL_TREE,
+		       ERROR_MARK, -1);
       return false;
     }
 
   /* Try iv elimination.  */
-  if (may_eliminate_iv (data, use, cand, &bound))
+  if (may_eliminate_iv (data, use, cand, &bound, &comp))
     {
       elim_cost = force_var_cost (data, bound, &depends_on_elim);
+      if (elim_cost.cost == 0)
+        elim_cost.cost = parm_decl_cost (data, bound);
+      else if (TREE_CODE (bound) == INTEGER_CST)
+        elim_cost.cost = 0;
+      /* If we replace a loop condition 'i < n' with 'p < base + n',
+	 depends_on_elim will have 'base' and 'n' set, which implies
+	 that both 'base' and 'n' will be live during the loop.	 More likely,
+	 'base + n' will be loop invariant, resulting in only one live value
+	 during the loop.  So in that case we clear depends_on_elim and set
+        elim_inv_expr_id instead.  */
+      if (depends_on_elim && bitmap_count_bits (depends_on_elim) > 1)
+	{
+	  elim_inv_expr_id = get_expr_id (data, bound);
+	  bitmap_clear (depends_on_elim);
+	}
       /* The bound is a loop invariant, so it will be only computed
 	 once.  */
-      elim_cost.cost /= AVG_LOOP_NITER (data->current_loop);
+      elim_cost.cost = adjust_setup_cost (data, elim_cost.cost);
     }
   else
     elim_cost = infinite_cost;
@@ -4135,16 +4816,26 @@ determine_use_iv_cost_condition (struct ivopts_data *data,
     elim_cost.cost -= 1;
 
   express_cost = get_computation_cost (data, use, cand, false,
-				       &depends_on_express, NULL);
+				       &depends_on_express, NULL,
+                                       &express_inv_expr_id);
   fd_ivopts_data = data;
   walk_tree (&cmp_iv->base, find_depends, &depends_on_express, NULL);
 
+  /* Count the cost of the original bound as well.  */
+  bound_cost = force_var_cost (data, *bound_cst, NULL);
+  if (bound_cost.cost == 0)
+    bound_cost.cost = parm_decl_cost (data, *bound_cst);
+  else if (TREE_CODE (*bound_cst) == INTEGER_CST)
+    bound_cost.cost = 0;
+  express_cost.cost += bound_cost.cost;
+
   /* Choose the better approach, preferring the eliminated IV. */
   if (compare_costs (elim_cost, express_cost) <= 0)
     {
       cost = elim_cost;
       depends_on = depends_on_elim;
       depends_on_elim = NULL;
+      inv_expr_id = elim_inv_expr_id;
     }
   else
     {
@@ -4152,9 +4843,11 @@ determine_use_iv_cost_condition (struct ivopts_data *data,
       depends_on = depends_on_express;
       depends_on_express = NULL;
       bound = NULL_TREE;
+      comp = ERROR_MARK;
+      inv_expr_id = express_inv_expr_id;
     }
 
-  set_use_iv_cost (data, use, cand, cost, depends_on, bound);
+  set_use_iv_cost (data, use, cand, cost, depends_on, bound, comp, inv_expr_id);
 
   if (depends_on_elim)
     BITMAP_FREE (depends_on_elim);
@@ -4202,7 +4895,7 @@ autoinc_possible_for_pair (struct ivopts_data *data, struct iv_use *use,
     return false;
 
   cost = get_computation_cost (data, use, cand, true, &depends_on,
-			       &can_autoinc);
+			       &can_autoinc, NULL);
 
   BITMAP_FREE (depends_on);
 
@@ -4326,6 +5019,8 @@ determine_use_iv_costs (struct ivopts_data *data)
 	      if (use->cost_map[j].depends_on)
 		bitmap_print (dump_file,
 			      use->cost_map[j].depends_on, "","");
+              if (use->cost_map[j].inv_expr_id != -1)
+                fprintf (dump_file, " inv_expr:%d", use->cost_map[j].inv_expr_id);
 	      fprintf (dump_file, "\n");
 	    }
 
@@ -4356,9 +5051,14 @@ determine_iv_cost (struct ivopts_data *data, struct iv_cand *cand)
 
   base = cand->iv->base;
   cost_base = force_var_cost (data, base, NULL);
+  /* It will be exceptional that the iv register happens to be initialized with
+     the proper value at no cost.  In general, there will at least be a regcopy
+     or a const set.  */
+  if (cost_base.cost == 0)
+    cost_base.cost = COSTS_N_INSNS (1);
   cost_step = add_cost (TYPE_MODE (TREE_TYPE (base)), data->speed);
 
-  cost = cost_step + cost_base.cost / AVG_LOOP_NITER (current_loop);
+  cost = cost_step + adjust_setup_cost (data, cost_base.cost);
 
   /* Prefer the original ivs unless we may gain something by replacing it.
      The reason is to make debugging simpler; so this is not relevant for
@@ -4411,7 +5111,8 @@ ivopts_global_cost_for_size (struct ivopts_data *data, unsigned size)
 {
   /* We add size to the cost, so that we prefer eliminating ivs
      if possible.  */
-  return size + estimate_reg_pressure_cost (size, data->regs_used, data->speed);
+  return size + estimate_reg_pressure_cost (size, data->regs_used, data->speed,
+					    data->body_includes_call);
 }
 
 /* For each size of the induction variable set determine the penalty.  */
@@ -4426,30 +5127,11 @@ determine_set_costs (struct ivopts_data *data)
   struct loop *loop = data->current_loop;
   bitmap_iterator bi;
 
-  /* We use the following model (definitely improvable, especially the
-     cost function -- TODO):
-
-     We estimate the number of registers available (using MD data), name it A.
-
-     We estimate the number of registers used by the loop, name it U.  This
-     number is obtained as the number of loop phi nodes (not counting virtual
-     registers and bivs) + the number of variables from outside of the loop.
-
-     We set a reserve R (free regs that are used for temporary computations,
-     etc.).  For now the reserve is a constant 3.
-
-     Let I be the number of induction variables.
-
-     -- if U + I + R <= A, the cost is I * SMALL_COST (just not to encourage
-	make a lot of ivs without a reason).
-     -- if A - R < U + I <= A, the cost is I * PRES_COST
-     -- if U + I > A, the cost is I * PRES_COST and
-        number of uses * SPILL_COST * (U + I - A) / (U + I) is added.  */
-
   if (dump_file && (dump_flags & TDF_DETAILS))
     {
       fprintf (dump_file, "Global costs:\n");
       fprintf (dump_file, "  target_avail_regs %d\n", target_avail_regs);
+      fprintf (dump_file, "  target_clobbered_regs %d\n", target_clobbered_regs);
       fprintf (dump_file, "  target_reg_cost %d\n", target_reg_cost[data->speed]);
       fprintf (dump_file, "  target_spill_cost %d\n", target_spill_cost[data->speed]);
     }
@@ -4519,14 +5201,26 @@ cheaper_cost_pair (struct cost_pair *a, struct cost_pair *b)
   return false;
 }
 
+
+/* Returns candidate by that USE is expressed in IVS.  */
+
+static struct cost_pair *
+iv_ca_cand_for_use (struct iv_ca *ivs, struct iv_use *use)
+{
+  return ivs->cand_for_use[use->id];
+}
+
 /* Computes the cost field of IVS structure.  */
 
 static void
 iv_ca_recount_cost (struct ivopts_data *data, struct iv_ca *ivs)
 {
   comp_cost cost = ivs->cand_use_cost;
+
   cost.cost += ivs->cand_cost;
-  cost.cost += ivopts_global_cost_for_size (data, ivs->n_regs);
+
+  cost.cost += ivopts_global_cost_for_size (data,
+                                            ivs->n_regs + ivs->num_used_inv_expr);
 
   ivs->cost = cost;
 }
@@ -4546,7 +5240,7 @@ iv_ca_set_remove_invariants (struct iv_ca *ivs, bitmap invs)
     {
       ivs->n_invariant_uses[iid]--;
       if (ivs->n_invariant_uses[iid] == 0)
-	ivs->n_regs--;
+        ivs->n_regs--;
     }
 }
 
@@ -4583,6 +5277,13 @@ iv_ca_set_no_cp (struct ivopts_data *data, struct iv_ca *ivs,
   ivs->cand_use_cost = sub_costs (ivs->cand_use_cost, cp->cost);
 
   iv_ca_set_remove_invariants (ivs, cp->depends_on);
+
+  if (cp->inv_expr_id != -1)
+    {
+      ivs->used_inv_expr[cp->inv_expr_id]--;
+      if (ivs->used_inv_expr[cp->inv_expr_id] == 0)
+        ivs->num_used_inv_expr--;
+    }
   iv_ca_recount_cost (data, ivs);
 }
 
@@ -4601,7 +5302,7 @@ iv_ca_set_add_invariants (struct iv_ca *ivs, bitmap invs)
     {
       ivs->n_invariant_uses[iid]++;
       if (ivs->n_invariant_uses[iid] == 1)
-	ivs->n_regs++;
+        ivs->n_regs++;
     }
 }
 
@@ -4640,19 +5341,28 @@ iv_ca_set_cp (struct ivopts_data *data, struct iv_ca *ivs,
 
       ivs->cand_use_cost = add_costs (ivs->cand_use_cost, cp->cost);
       iv_ca_set_add_invariants (ivs, cp->depends_on);
+
+      if (cp->inv_expr_id != -1)
+        {
+          ivs->used_inv_expr[cp->inv_expr_id]++;
+          if (ivs->used_inv_expr[cp->inv_expr_id] == 1)
+            ivs->num_used_inv_expr++;
+        }
       iv_ca_recount_cost (data, ivs);
     }
 }
 
 /* Extend set IVS by expressing USE by some of the candidates in it
-   if possible.  */
+   if possible. All important candidates will be considered
+   if IMPORTANT_CANDIDATES is true.  */
 
 static void
 iv_ca_add_use (struct ivopts_data *data, struct iv_ca *ivs,
-	       struct iv_use *use)
+	       struct iv_use *use, bool important_candidates)
 {
   struct cost_pair *best_cp = NULL, *cp;
   bitmap_iterator bi;
+  bitmap cands;
   unsigned i;
 
   gcc_assert (ivs->upto >= use->id);
@@ -4663,9 +5373,12 @@ iv_ca_add_use (struct ivopts_data *data, struct iv_ca *ivs,
       ivs->bad_uses++;
     }
 
-  EXECUTE_IF_SET_IN_BITMAP (ivs->cands, 0, i, bi)
+  cands = (important_candidates ? data->important_candidates : ivs->cands);
+  EXECUTE_IF_SET_IN_BITMAP (cands, 0, i, bi)
     {
-      cp = get_use_iv_cost (data, use, iv_cand (data, i));
+      struct iv_cand *cand = iv_cand (data, i);
+
+      cp = get_use_iv_cost (data, use, cand);
 
       if (cheaper_cost_pair (cp, best_cp))
 	best_cp = cp;
@@ -4745,14 +5458,6 @@ iv_ca_delta_join (struct iv_ca_delta *l1, struct iv_ca_delta *l2)
   return l1;
 }
 
-/* Returns candidate by that USE is expressed in IVS.  */
-
-static struct cost_pair *
-iv_ca_cand_for_use (struct iv_ca *ivs, struct iv_use *use)
-{
-  return ivs->cand_for_use[use->id];
-}
-
 /* Reverse the list of changes DELTA, forming the inverse to it.  */
 
 static struct iv_ca_delta *
@@ -4850,6 +5555,8 @@ iv_ca_new (struct ivopts_data *data)
   nw->cand_cost = 0;
   nw->n_invariant_uses = XCNEWVEC (unsigned, data->max_inv_id + 1);
   nw->cost = zero_cost;
+  nw->used_inv_expr = XCNEWVEC (unsigned, data->inv_expr_id + 1);
+  nw->num_used_inv_expr = 0;
 
   return nw;
 }
@@ -4863,6 +5570,7 @@ iv_ca_free (struct iv_ca **ivs)
   free ((*ivs)->n_cand_uses);
   BITMAP_FREE ((*ivs)->cands);
   free ((*ivs)->n_invariant_uses);
+  free ((*ivs)->used_inv_expr);
   free (*ivs);
   *ivs = NULL;
 }
@@ -4876,8 +5584,21 @@ iv_ca_dump (struct ivopts_data *data, FILE *file, struct iv_ca *ivs)
   unsigned i;
   comp_cost cost = iv_ca_cost (ivs);
 
-  fprintf (file, "  cost %d (complexity %d)\n", cost.cost, cost.complexity);
-  bitmap_print (file, ivs->cands, "  candidates ","\n");
+  fprintf (file, "  cost: %d (complexity %d)\n", cost.cost, cost.complexity);
+  fprintf (file, "  cand_cost: %d\n  cand_use_cost: %d (complexity %d)\n",
+           ivs->cand_cost, ivs->cand_use_cost.cost, ivs->cand_use_cost.complexity);
+  bitmap_print (file, ivs->cands, "  candidates: ","\n");
+
+   for (i = 0; i < ivs->upto; i++)
+    {
+      struct iv_use *use = iv_use (data, i);
+      struct cost_pair *cp = iv_ca_cand_for_use (ivs, use);
+      if (cp)
+        fprintf (file, "   use:%d --> iv_cand:%d, cost=(%d,%d)\n",
+                 use->id, cp->cand->id, cp->cost.cost, cp->cost.complexity);
+      else
+        fprintf (file, "   use:%d --> ??\n", use->id);
+    }
 
   for (i = 1; i <= data->max_inv_id; i++)
     if (ivs->n_invariant_uses[i])
@@ -4885,17 +5606,18 @@ iv_ca_dump (struct ivopts_data *data, FILE *file, struct iv_ca *ivs)
 	fprintf (file, "%s%d", pref, i);
 	pref = ", ";
       }
-  fprintf (file, "\n");
+  fprintf (file, "\n\n");
 }
 
 /* Try changing candidate in IVS to CAND for each use.  Return cost of the
    new set, and store differences in DELTA.  Number of induction variables
-   in the new set is stored to N_IVS.  */
+   in the new set is stored to N_IVS. MIN_NCAND is a flag. When it is true
+   the function will try to find a solution with mimimal iv candidates.  */
 
 static comp_cost
 iv_ca_extend (struct ivopts_data *data, struct iv_ca *ivs,
 	      struct iv_cand *cand, struct iv_ca_delta **delta,
-	      unsigned *n_ivs)
+	      unsigned *n_ivs, bool min_ncand)
 {
   unsigned i;
   comp_cost cost;
@@ -4916,11 +5638,11 @@ iv_ca_extend (struct ivopts_data *data, struct iv_ca *ivs,
       if (!new_cp)
 	continue;
 
-      if (!iv_ca_has_deps (ivs, new_cp))
+      if (!min_ncand && !iv_ca_has_deps (ivs, new_cp))
 	continue;
 
-      if (!cheaper_cost_pair (new_cp, old_cp))
-	continue;
+      if (!min_ncand && !cheaper_cost_pair (new_cp, old_cp))
+        continue;
 
       *delta = iv_ca_delta_add (use, old_cp, new_cp, *delta);
     }
@@ -4971,8 +5693,9 @@ iv_ca_narrow (struct ivopts_data *data, struct iv_ca *ivs,
 	      cp = get_use_iv_cost (data, use, cnd);
 	      if (!cp)
 		continue;
+
 	      if (!iv_ca_has_deps (ivs, cp))
-		continue;
+                continue; 
 
 	      if (!cheaper_cost_pair (cp, new_cp))
 		continue;
@@ -5069,11 +5792,13 @@ iv_ca_prune (struct ivopts_data *data, struct iv_ca *ivs,
 }
 
 /* Tries to extend the sets IVS in the best possible way in order
-   to express the USE.  */
+   to express the USE.  If ORIGINALP is true, prefer candidates from
+   the original set of IVs, otherwise favor important candidates not
+   based on any memory object.  */
 
 static bool
 try_add_cand_for (struct ivopts_data *data, struct iv_ca *ivs,
-		  struct iv_use *use)
+		  struct iv_use *use, bool originalp)
 {
   comp_cost best_cost, act_cost;
   unsigned i;
@@ -5082,17 +5807,26 @@ try_add_cand_for (struct ivopts_data *data, struct iv_ca *ivs,
   struct iv_ca_delta *best_delta = NULL, *act_delta;
   struct cost_pair *cp;
 
-  iv_ca_add_use (data, ivs, use);
+  iv_ca_add_use (data, ivs, use, false);
   best_cost = iv_ca_cost (ivs);
 
   cp = iv_ca_cand_for_use (ivs, use);
+  if (!cp)
+    {
+      ivs->upto--;
+      ivs->bad_uses--;
+      iv_ca_add_use (data, ivs, use, true);
+      best_cost = iv_ca_cost (ivs);
+      cp = iv_ca_cand_for_use (ivs, use);
+    }
   if (cp)
     {
       best_delta = iv_ca_delta_add (use, NULL, cp, NULL);
       iv_ca_set_no_cp (data, ivs, use);
     }
 
-  /* First try important candidates not based on any memory object.  Only if
+  /* If ORIGINALP is true, try to find the original IV for the use.  Otherwise
+     first try important candidates not based on any memory object.  Only if
      this fails, try the specific ones.  Rationale -- in loops with many
      variables the best choice often is to use just one generic biv.  If we
      added here many ivs specific to the uses, the optimization algorithm later
@@ -5104,18 +5838,22 @@ try_add_cand_for (struct ivopts_data *data, struct iv_ca *ivs,
     {
       cand = iv_cand (data, i);
 
-      if (cand->iv->base_object != NULL_TREE)
+      if (originalp && cand->pos !=IP_ORIGINAL)
 	continue;
 
-      if (iv_ca_cand_used_p (ivs, cand))
+      if (!originalp && cand->iv->base_object != NULL_TREE)
 	continue;
 
+      if (iv_ca_cand_used_p (ivs, cand))
+        continue;
+
       cp = get_use_iv_cost (data, use, cand);
       if (!cp)
 	continue;
 
       iv_ca_set_cp (data, ivs, use, cp);
-      act_cost = iv_ca_extend (data, ivs, cand, &act_delta, NULL);
+      act_cost = iv_ca_extend (data, ivs, cand, &act_delta, NULL,
+                               true);
       iv_ca_set_no_cp (data, ivs, use);
       act_delta = iv_ca_delta_add (use, NULL, cp, act_delta);
 
@@ -5140,15 +5878,20 @@ try_add_cand_for (struct ivopts_data *data, struct iv_ca *ivs,
 	    continue;
 
 	  /* Already tried this.  */
-	  if (cand->important && cand->iv->base_object == NULL_TREE)
-	    continue;
+	  if (cand->important)
+	    {
+	      if (originalp && cand->pos == IP_ORIGINAL)
+		continue;
+	      if (!originalp && cand->iv->base_object == NULL_TREE)
+		continue;
+	    }
 
 	  if (iv_ca_cand_used_p (ivs, cand))
 	    continue;
 
 	  act_delta = NULL;
 	  iv_ca_set_cp (data, ivs, use, cp);
-	  act_cost = iv_ca_extend (data, ivs, cand, &act_delta, NULL);
+	  act_cost = iv_ca_extend (data, ivs, cand, &act_delta, NULL, true);
 	  iv_ca_set_no_cp (data, ivs, use);
 	  act_delta = iv_ca_delta_add (use, iv_ca_cand_for_use (ivs, use),
 				       cp, act_delta);
@@ -5175,13 +5918,13 @@ try_add_cand_for (struct ivopts_data *data, struct iv_ca *ivs,
 /* Finds an initial assignment of candidates to uses.  */
 
 static struct iv_ca *
-get_initial_solution (struct ivopts_data *data)
+get_initial_solution (struct ivopts_data *data, bool originalp)
 {
   struct iv_ca *ivs = iv_ca_new (data);
   unsigned i;
 
   for (i = 0; i < n_iv_uses (data); i++)
-    if (!try_add_cand_for (data, ivs, iv_use (data, i)))
+    if (!try_add_cand_for (data, ivs, iv_use (data, i), originalp))
       {
 	iv_ca_free (&ivs);
 	return NULL;
@@ -5208,7 +5951,7 @@ try_improve_iv_set (struct ivopts_data *data, struct iv_ca *ivs)
       if (iv_ca_cand_used_p (ivs, cand))
 	continue;
 
-      acost = iv_ca_extend (data, ivs, cand, &act_delta, &n_ivs);
+      acost = iv_ca_extend (data, ivs, cand, &act_delta, &n_ivs, false);
       if (!act_delta)
 	continue;
 
@@ -5253,14 +5996,12 @@ try_improve_iv_set (struct ivopts_data *data, struct iv_ca *ivs)
    solution and remove the unused ivs while this improves the cost.  */
 
 static struct iv_ca *
-find_optimal_iv_set (struct ivopts_data *data)
+find_optimal_iv_set_1 (struct ivopts_data *data, bool originalp)
 {
-  unsigned i;
   struct iv_ca *set;
-  struct iv_use *use;
 
   /* Get the initial solution.  */
-  set = get_initial_solution (data);
+  set = get_initial_solution (data, originalp);
   if (!set)
     {
       if (dump_file && (dump_flags & TDF_DETAILS))
@@ -5283,11 +6024,46 @@ find_optimal_iv_set (struct ivopts_data *data)
 	}
     }
 
+  return set;
+}
+
+static struct iv_ca *
+find_optimal_iv_set (struct ivopts_data *data)
+{
+  unsigned i;
+  struct iv_ca *set, *origset;
+  struct iv_use *use;
+  comp_cost cost, origcost;
+
+  /* Determine the cost based on a strategy that starts with original IVs,
+     and try again using a strategy that prefers candidates not based
+     on any IVs.  */
+  origset = find_optimal_iv_set_1 (data, true);
+  set = find_optimal_iv_set_1 (data, false);
+
+  if (!origset && !set)
+    return NULL;
+
+  origcost = origset ? iv_ca_cost (origset) : infinite_cost;
+  cost = set ? iv_ca_cost (set) : infinite_cost;
+
   if (dump_file && (dump_flags & TDF_DETAILS))
     {
-      comp_cost cost = iv_ca_cost (set);
-      fprintf (dump_file, "Final cost %d (complexity %d)\n\n", cost.cost, cost.complexity);
+      fprintf (dump_file, "Original cost %d (complexity %d)\n\n",
+	       origcost.cost, origcost.complexity);
+      fprintf (dump_file, "Final cost %d (complexity %d)\n\n",
+	       cost.cost, cost.complexity);
+    }
+
+  /* Choose the one with the best cost.  */
+  if (compare_costs (origcost, cost) <= 0)
+    {
+      if (set)
+	iv_ca_free (&set);
+      set = origset;
     }
+  else if (origset)
+    iv_ca_free (&origset);
 
   for (i = 0; i < n_iv_uses (data); i++)
     {
@@ -5335,7 +6111,6 @@ create_new_iv (struct ivopts_data *data, struct iv_cand *cand)
 
       /* Rewrite the increment so that it uses var_before directly.  */
       find_interesting_uses_op (data, cand->var_after)->selected = cand;
-
       return;
     }
 
@@ -5363,8 +6138,18 @@ create_new_ivs (struct ivopts_data *data, struct iv_ca *set)
       cand = iv_cand (data, i);
       create_new_iv (data, cand);
     }
-}
 
+  if (dump_file && (dump_flags & TDF_DETAILS))
+    {
+      fprintf (dump_file, "\nSelected IV set: \n");
+      EXECUTE_IF_SET_IN_BITMAP (set->cands, 0, i, bi)
+        {
+          cand = iv_cand (data, i);
+          dump_cand (dump_file, cand);
+        }
+      fprintf (dump_file, "\n");
+    }
+}
 
 /* Rewrites USE (definition of iv used in a nonlinear expression)
    using candidate CAND.  */
@@ -5385,35 +6170,24 @@ rewrite_use_nonlinear_expr (struct ivopts_data *data,
   if (cand->pos == IP_ORIGINAL
       && cand->incremented_at == use->stmt)
     {
-      tree step, ctype, utype;
-      enum tree_code incr_code = PLUS_EXPR, old_code;
+      enum tree_code stmt_code;
 
       gcc_assert (is_gimple_assign (use->stmt));
       gcc_assert (gimple_assign_lhs (use->stmt) == cand->var_after);
 
-      step = cand->iv->step;
-      ctype = TREE_TYPE (step);
-      utype = TREE_TYPE (cand->var_after);
-      if (TREE_CODE (step) == NEGATE_EXPR)
-	{
-	  incr_code = MINUS_EXPR;
-	  step = TREE_OPERAND (step, 0);
-	}
-
       /* Check whether we may leave the computation unchanged.
 	 This is the case only if it does not rely on other
 	 computations in the loop -- otherwise, the computation
 	 we rely upon may be removed in remove_unused_ivs,
 	 thus leading to ICE.  */
-      old_code = gimple_assign_rhs_code (use->stmt);
-      if (old_code == PLUS_EXPR
-	  || old_code == MINUS_EXPR
-	  || old_code == POINTER_PLUS_EXPR)
+      stmt_code = gimple_assign_rhs_code (use->stmt);
+      if (stmt_code == PLUS_EXPR
+	  || stmt_code == MINUS_EXPR
+	  || stmt_code == POINTER_PLUS_EXPR)
 	{
 	  if (gimple_assign_rhs1 (use->stmt) == cand->var_before)
 	    op = gimple_assign_rhs2 (use->stmt);
-	  else if (old_code != MINUS_EXPR
-		   && gimple_assign_rhs2 (use->stmt) == cand->var_before)
+	  else if (gimple_assign_rhs2 (use->stmt) == cand->var_before)
 	    op = gimple_assign_rhs1 (use->stmt);
 	  else
 	    op = NULL_TREE;
@@ -5421,24 +6195,13 @@ rewrite_use_nonlinear_expr (struct ivopts_data *data,
       else
 	op = NULL_TREE;
 
-      if (op
-	  && (TREE_CODE (op) == INTEGER_CST
-	      || operand_equal_p (op, step, 0)))
+      if (op && expr_invariant_in_loop_p (data->current_loop, op))
 	return;
-
-      /* Otherwise, add the necessary computations to express
-	 the iv.  */
-      op = fold_convert (ctype, cand->var_before);
-      comp = fold_convert (utype,
-			   build2 (incr_code, ctype, op,
-				   unshare_expr (step)));
-    }
-  else
-    {
-      comp = get_computation (data->current_loop, use, cand);
-      gcc_assert (comp != NULL_TREE);
     }
 
+  comp = get_computation (data->current_loop, use, cand);
+  gcc_assert (comp != NULL_TREE);
+
   switch (gimple_code (use->stmt))
     {
     case GIMPLE_PHI:
@@ -5460,12 +6223,31 @@ rewrite_use_nonlinear_expr (struct ivopts_data *data,
       gcc_unreachable ();
     }
 
-  op = force_gimple_operand_gsi (&bsi, comp, false, SSA_NAME_VAR (tgt),
-				 true, GSI_SAME_STMT);
+  if (!valid_gimple_rhs_p (comp)
+      || (gimple_code (use->stmt) != GIMPLE_PHI
+	  /* We can't allow re-allocating the stmt as it might be pointed
+	     to still.  */
+	  && (get_gimple_rhs_num_ops (TREE_CODE (comp))
+	      >= gimple_num_ops (gsi_stmt (bsi)))))
+    {
+      comp = force_gimple_operand_gsi (&bsi, comp, true, NULL_TREE,
+				       true, GSI_SAME_STMT);
+      if (POINTER_TYPE_P (TREE_TYPE (tgt)))
+	{
+	  duplicate_ssa_name_ptr_info (comp, SSA_NAME_PTR_INFO (tgt));
+	  /* As this isn't a plain copy we have to reset alignment
+	     information.  */
+	  if (SSA_NAME_PTR_INFO (comp))
+	    {
+	      SSA_NAME_PTR_INFO (comp)->align = 1;
+	      SSA_NAME_PTR_INFO (comp)->misalign = 0;
+	    }
+	}
+    }
 
   if (gimple_code (use->stmt) == GIMPLE_PHI)
     {
-      ass = gimple_build_assign (tgt, op);
+      ass = gimple_build_assign (tgt, comp);
       gsi_insert_before (&bsi, ass, GSI_SAME_STMT);
 
       bsi = gsi_for_stmt (use->stmt);
@@ -5473,62 +6255,92 @@ rewrite_use_nonlinear_expr (struct ivopts_data *data,
     }
   else
     {
-      gimple_assign_set_rhs_from_tree (&bsi, op);
+      gimple_assign_set_rhs_from_tree (&bsi, comp);
       use->stmt = gsi_stmt (bsi);
     }
 }
 
-/* Replaces ssa name in index IDX by its basic variable.  Callback for
-   for_each_index.  */
+/* Performs a peephole optimization to reorder the iv update statement with
+   a mem ref to enable instruction combining in later phases. The mem ref uses
+   the iv value before the update, so the reordering transformation requires
+   adjustment of the offset. CAND is the selected IV_CAND.
 
-static bool
-idx_remove_ssa_names (tree base, tree *idx,
-		      void *data ATTRIBUTE_UNUSED)
-{
-  tree *op;
+   Example:
 
-  if (TREE_CODE (*idx) == SSA_NAME)
-    *idx = SSA_NAME_VAR (*idx);
+   t = MEM_REF (base, iv1, 8, 16);  // base, index, stride, offset
+   iv2 = iv1 + 1;
 
-  if (TREE_CODE (base) == ARRAY_REF || TREE_CODE (base) == ARRAY_RANGE_REF)
-    {
-      op = &TREE_OPERAND (base, 2);
-      if (*op
-	  && TREE_CODE (*op) == SSA_NAME)
-	*op = SSA_NAME_VAR (*op);
-      op = &TREE_OPERAND (base, 3);
-      if (*op
-	  && TREE_CODE (*op) == SSA_NAME)
-	*op = SSA_NAME_VAR (*op);
-    }
+   if (t < val)      (1)
+     goto L;
+   goto Head;
 
-  return true;
-}
 
-/* Unshares REF and replaces ssa names inside it by their basic variables.  */
+   directly propagating t over to (1) will introduce overlapping live range
+   thus increase register pressure. This peephole transform it into:
 
-static tree
-unshare_and_remove_ssa_names (tree ref)
-{
-  ref = unshare_expr (ref);
-  for_each_index (&ref, idx_remove_ssa_names, NULL);
-
-  return ref;
-}
 
-/* Copies the reference information from OLD_REF to NEW_REF.  */
+   iv2 = iv1 + 1;
+   t = MEM_REF (base, iv2, 8, 8);
+   if (t < val)
+     goto L;
+   goto Head;
+*/
 
 static void
-copy_ref_info (tree new_ref, tree old_ref)
+adjust_iv_update_pos (struct iv_cand *cand, struct iv_use *use)
 {
-  if (TREE_CODE (old_ref) == TARGET_MEM_REF)
-    copy_mem_ref_info (new_ref, old_ref);
-  else
+  tree var_after;
+  gimple iv_update, stmt;
+  basic_block bb;
+  gimple_stmt_iterator gsi, gsi_iv;
+
+  if (cand->pos != IP_NORMAL)
+    return;
+
+  var_after = cand->var_after;
+  iv_update = SSA_NAME_DEF_STMT (var_after);
+
+  bb = gimple_bb (iv_update);
+  gsi = gsi_last_nondebug_bb (bb);
+  stmt = gsi_stmt (gsi);
+
+  /* Only handle conditional statement for now.  */
+  if (gimple_code (stmt) != GIMPLE_COND)
+    return;
+
+  gsi_prev_nondebug (&gsi);
+  stmt = gsi_stmt (gsi);
+  if (stmt != iv_update)
+    return;
+
+  gsi_prev_nondebug (&gsi);
+  if (gsi_end_p (gsi))
+    return;
+
+  stmt = gsi_stmt (gsi);
+  if (gimple_code (stmt) != GIMPLE_ASSIGN)
+    return;
+
+  if (stmt != use->stmt)
+    return;
+
+  if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
+    return;
+
+  if (dump_file && (dump_flags & TDF_DETAILS))
     {
-      TMR_ORIGINAL (new_ref) = unshare_and_remove_ssa_names (old_ref);
-      TREE_SIDE_EFFECTS (new_ref) = TREE_SIDE_EFFECTS (old_ref);
-      TREE_THIS_VOLATILE (new_ref) = TREE_THIS_VOLATILE (old_ref);
+      fprintf (dump_file, "Reordering \n");
+      print_gimple_stmt (dump_file, iv_update, 0, 0);
+      print_gimple_stmt (dump_file, use->stmt, 0, 0);
+      fprintf (dump_file, "\n");
     }
+
+  gsi = gsi_for_stmt (use->stmt);
+  gsi_iv = gsi_for_stmt (iv_update);
+  gsi_move_before (&gsi_iv, &gsi);
+
+  cand->pos = IP_BEFORE_USE;
+  cand->incremented_at = use->stmt;
 }
 
 /* Rewrites USE (address that is an iv) using candidate CAND.  */
@@ -5540,9 +6352,10 @@ rewrite_use_address (struct ivopts_data *data,
   aff_tree aff;
   gimple_stmt_iterator bsi = gsi_for_stmt (use->stmt);
   tree base_hint = NULL_TREE;
-  tree ref;
+  tree ref, iv;
   bool ok;
 
+  adjust_iv_update_pos (cand, use);
   ok = get_computation_aff (data->current_loop, use, cand, use->stmt, &aff);
   gcc_assert (ok);
   unshare_aff_combination (&aff);
@@ -5561,8 +6374,10 @@ rewrite_use_address (struct ivopts_data *data,
   if (cand->iv->base_object)
     base_hint = var_at_stmt (data->current_loop, cand, use->stmt);
 
-  ref = create_mem_ref (&bsi, TREE_TYPE (*use->op_p), &aff, base_hint,
-			data->speed);
+  iv = var_at_stmt (data->current_loop, cand, use->stmt);
+  ref = create_mem_ref (&bsi, TREE_TYPE (*use->op_p), &aff,
+			reference_alias_ptr_type (*use->op_p),
+			iv, base_hint, data->speed);
   copy_ref_info (ref, *use->op_p);
   *use->op_p = ref;
 }
@@ -5587,7 +6402,12 @@ rewrite_use_compare (struct ivopts_data *data,
       tree var_type = TREE_TYPE (var);
       gimple_seq stmts;
 
-      compare = iv_elimination_compare (data, use);
+      if (dump_file && (dump_flags & TDF_DETAILS))
+        {
+          fprintf (dump_file, "Replacing exit test: ");
+          print_gimple_stmt (dump_file, use->stmt, 0, TDF_SLIM);
+        }
+      compare = cp->comp;
       bound = unshare_expr (fold_convert (var_type, bound));
       op = force_gimple_operand (bound, &stmts, true, NULL_TREE);
       if (stmts)
@@ -5688,6 +6508,19 @@ remove_unused_ivs (struct ivopts_data *data)
   BITMAP_FREE (toremove);
 }
 
+/* Frees memory occupied by struct tree_niter_desc in *VALUE. Callback
+   for pointer_map_traverse.  */
+
+static bool
+free_tree_niter_desc (const void *key ATTRIBUTE_UNUSED, void **value,
+                      void *data ATTRIBUTE_UNUSED)
+{
+  struct tree_niter_desc *const niter = (struct tree_niter_desc *) *value;
+
+  free (niter);
+  return true;
+}
+
 /* Frees data allocated by the optimization of a single loop.  */
 
 static void
@@ -5699,6 +6532,7 @@ free_loop_data (struct ivopts_data *data)
 
   if (data->niters)
     {
+      pointer_map_traverse (data->niters, free_tree_niter_desc, NULL);
       pointer_map_destroy (data->niters);
       data->niters = NULL;
     }
@@ -5708,8 +6542,7 @@ free_loop_data (struct ivopts_data *data)
       struct version_info *info;
 
       info = ver_info (data, i);
-      if (info->iv)
-	free (info->iv);
+      free (info->iv);
       info->iv = NULL;
       info->has_nonlin_use = false;
       info->preserve_biv = false;
@@ -5736,8 +6569,7 @@ free_loop_data (struct ivopts_data *data)
     {
       struct iv_cand *cand = iv_cand (data, i);
 
-      if (cand->iv)
-	free (cand->iv);
+      free (cand->iv);
       if (cand->depends_on)
 	BITMAP_FREE (cand->depends_on);
       free (cand);
@@ -5753,10 +6585,13 @@ free_loop_data (struct ivopts_data *data)
 
   data->max_inv_id = 0;
 
-  for (i = 0; VEC_iterate (tree, decl_rtl_to_reset, i, obj); i++)
+  FOR_EACH_VEC_ELT (tree, decl_rtl_to_reset, i, obj)
     SET_DECL_RTL (obj, NULL_RTX);
 
   VEC_truncate (tree, decl_rtl_to_reset, 0);
+
+  htab_empty (data->inv_expr_tab);
+  data->inv_expr_id = 0;
 }
 
 /* Finalizes data structures used by the iv optimization pass.  LOOPS is the
@@ -5773,6 +6608,26 @@ tree_ssa_iv_optimize_finalize (struct ivopts_data *data)
   VEC_free (tree, heap, decl_rtl_to_reset);
   VEC_free (iv_use_p, heap, data->iv_uses);
   VEC_free (iv_cand_p, heap, data->iv_candidates);
+  htab_delete (data->inv_expr_tab);
+}
+
+/* Returns true if the loop body BODY includes any function calls.  */
+
+static bool
+loop_body_includes_call (basic_block *body, unsigned num_nodes)
+{
+  gimple_stmt_iterator gsi;
+  unsigned i;
+
+  for (i = 0; i < num_nodes; i++)
+    for (gsi = gsi_start_bb (body[i]); !gsi_end_p (gsi); gsi_next (&gsi))
+      {
+	gimple stmt = gsi_stmt (gsi);
+	if (is_gimple_call (stmt)
+	    && !is_inexpensive_builtin (gimple_call_fndecl (stmt)))
+	  return true;
+      }
+  return false;
 }
 
 /* Optimizes the LOOP.  Returns true if anything changed.  */
@@ -5782,7 +6637,7 @@ tree_ssa_iv_optimize_loop (struct ivopts_data *data, struct loop *loop)
 {
   bool changed = false;
   struct iv_ca *iv_ca;
-  edge exit;
+  edge exit = single_dom_exit (loop);
   basic_block *body;
 
   gcc_assert (!data->niters);
@@ -5793,7 +6648,6 @@ tree_ssa_iv_optimize_loop (struct ivopts_data *data, struct loop *loop)
     {
       fprintf (dump_file, "Processing loop %d\n", loop->num);
 
-      exit = single_dom_exit (loop);
       if (exit)
 	{
 	  fprintf (dump_file, "  single exit %d -> %d, exit condition ",
@@ -5806,9 +6660,12 @@ tree_ssa_iv_optimize_loop (struct ivopts_data *data, struct loop *loop)
     }
 
   body = get_loop_body (loop);
+  data->body_includes_call = loop_body_includes_call (body, loop->num_nodes);
   renumber_gimple_stmt_uids_in_blocks (body, loop->num_nodes);
   free (body);
 
+  data->loop_single_exit_p = exit != NULL && loop_only_exit_p (loop, exit);
+
   /* For each ssa name determines whether it behaves as an induction variable
      in some loop.  */
   if (!find_induction_variables (data))