(c_sizeof, build_c_cast): Set TREE_OVERFLOW in addition

[pf3gnuchains/gcc-fork.git] / gcc / expmed.c
diff --git a/gcc/expmed.c b/gcc/expmed.c

index 375b9f6..6fb8579 100644 (file)
--- a/gcc/expmed.c
+++ b/gcc/expmed.c
@@ -1,6 +1,6 @@
  /* Medium-level subroutines: convert bit-field store and extract
     and shifts, multiplies and divides to rtl instructions.
-   Copyright (C) 1987, 1988, 1989, 1992 Free Software Foundation, Inc.
+   Copyright (C) 1987, 1988, 1989, 1992, 1993 Free Software Foundation, Inc.
  
  This file is part of GNU CC.
  
@@ -49,59 +49,102 @@ int mult_is_very_cheap;
  
  static int sdiv_pow2_cheap, smod_pow2_cheap;
  
-/* Cost of various pieces of RTL.  */
-static int add_cost, shift_cost, mult_cost, negate_cost, lea_cost;
+/* For compilers that support multiple targets with different word sizes,
+   MAX_BITS_PER_WORD contains the biggest value of BITS_PER_WORD.  An example
+   is the H8/300(H) compiler.  */
+
+#ifndef MAX_BITS_PER_WORD
+#define MAX_BITS_PER_WORD BITS_PER_WORD
+#endif
  
-/* Max scale factor for scaled address in lea instruction.  */
-static int lea_max_mul;
+/* Cost of various pieces of RTL.  */
+static int add_cost, mult_cost, negate_cost, zero_cost;
+static int shift_cost[MAX_BITS_PER_WORD];
+static int shiftadd_cost[MAX_BITS_PER_WORD];
+static int shiftsub_cost[MAX_BITS_PER_WORD];
  
  void
  init_expmed ()
  {
-  char *free_point = (char *) oballoc (1);
+  char *free_point;
    /* This is "some random pseudo register" for purposes of calling recog
       to see what insns exist.  */
    rtx reg = gen_rtx (REG, word_mode, FIRST_PSEUDO_REGISTER);
-  rtx pow2 = GEN_INT (32);
-  rtx lea;
-  HOST_WIDE_INT i;
+  rtx shift_insn, shiftadd_insn, shiftsub_insn;
    int dummy;
+  int m;
  
+  start_sequence ();
+
+  /* Since we are on the permanent obstack, we must be sure we save this
+     spot AFTER we call start_sequence, since it will reuse the rtl it
+     makes.  */
+
+  free_point = (char *) oballoc (0);
+
+  zero_cost = rtx_cost (const0_rtx, 0);
    add_cost = rtx_cost (gen_rtx (PLUS, word_mode, reg, reg), SET);
-  shift_cost = rtx_cost (gen_rtx (LSHIFT, word_mode, reg,
-                                 /* Using a constant gives better
-                                    estimate of typical costs.
-                                    1 or 2 might have quirks.  */
-                                 GEN_INT (3)), SET);
+
+  shift_insn = emit_insn (gen_rtx (SET, VOIDmode, reg,
+                                  gen_rtx (ASHIFT, word_mode, reg,
+                                           const0_rtx)));
+
+  shiftadd_insn = emit_insn (gen_rtx (SET, VOIDmode, reg,
+                                     gen_rtx (PLUS, word_mode,
+                                              gen_rtx (MULT, word_mode,
+                                                       reg, const0_rtx),
+                                              reg)));
+
+  shiftsub_insn = emit_insn (gen_rtx (SET, VOIDmode, reg,
+                                     gen_rtx (MINUS, word_mode,
+                                              gen_rtx (MULT, word_mode,
+                                                        reg, const0_rtx),
+                                               reg)));
+
+  init_recog ();
+
+  shift_cost[0] = 0;
+  shiftadd_cost[0] = shiftsub_cost[0] = add_cost;
+
+  for (m = 1; m < BITS_PER_WORD; m++)
+    {
+      shift_cost[m] = shiftadd_cost[m] = shiftsub_cost[m] = 32000;
+
+      XEXP (SET_SRC (PATTERN (shift_insn)), 1) = GEN_INT (m);
+      if (recog (PATTERN (shift_insn), shift_insn, &dummy) >= 0)
+       shift_cost[m] = rtx_cost (SET_SRC (PATTERN (shift_insn)), SET);
+
+      XEXP (XEXP (SET_SRC (PATTERN (shiftadd_insn)), 0), 1)
+       = GEN_INT ((HOST_WIDE_INT) 1 << m);
+      if (recog (PATTERN (shiftadd_insn), shiftadd_insn, &dummy) >= 0)
+       shiftadd_cost[m] = rtx_cost (SET_SRC (PATTERN (shiftadd_insn)), SET);
+
+      XEXP (XEXP (SET_SRC (PATTERN (shiftsub_insn)), 0), 1)
+       = GEN_INT ((HOST_WIDE_INT) 1 << m);
+      if (recog (PATTERN (shiftsub_insn), shiftsub_insn, &dummy) >= 0)
+       shiftsub_cost[m] = rtx_cost (SET_SRC (PATTERN (shiftsub_insn)), SET);
+    }
+
    mult_cost = rtx_cost (gen_rtx (MULT, word_mode, reg, reg), SET);
+  /* For gcc 2.4 keep MULT_COST small to avoid really slow searches
+     in synth_mult.  */
+  mult_cost = MIN (12 * add_cost, mult_cost);
    negate_cost = rtx_cost (gen_rtx (NEG, word_mode, reg), SET);
  
    /* 999999 is chosen to avoid any plausible faster special case.  */
    mult_is_very_cheap
      = (rtx_cost (gen_rtx (MULT, word_mode, reg, GEN_INT (999999)), SET)
-       < rtx_cost (gen_rtx (LSHIFT, word_mode, reg, GEN_INT (7)), SET));
+       < rtx_cost (gen_rtx (ASHIFT, word_mode, reg, GEN_INT (7)), SET));
  
    sdiv_pow2_cheap
-    = rtx_cost (gen_rtx (DIV, word_mode, reg, pow2), SET) <= 2 * add_cost;
+    = (rtx_cost (gen_rtx (DIV, word_mode, reg, GEN_INT (32)), SET)
+       <= 2 * add_cost);
    smod_pow2_cheap
-    = rtx_cost (gen_rtx (MOD, word_mode, reg, pow2), SET) <= 2 * add_cost;
-
-  init_recog ();
-  for (i = 2;; i <<= 1)
-    {
-      lea = gen_rtx (SET, VOIDmode, reg,
-                    gen_rtx (PLUS, word_mode,
-                             gen_rtx (MULT, word_mode, reg, GEN_INT (i)),
-                             reg));
-      /* Using 0 as second argument is not quite right,
-        but what else is there to do?  */
-      if (recog (lea, 0, &dummy) < 0)
-       break;
-      lea_max_mul = i;
-      lea_cost = rtx_cost (SET_SRC (lea), SET);
-    }
+    = (rtx_cost (gen_rtx (MOD, word_mode, reg, GEN_INT (32)), SET)
+       <= 2 * add_cost);
  
    /* Free the objects we just allocated.  */
+  end_sequence ();
    obfree (free_point);
  }
  
@@ -648,8 +691,13 @@ store_split_bit_field (op0, bitsize, bitpos, value, align)
  
    if (GET_MODE (value) != VOIDmode)
      value = convert_to_mode (word_mode, value, 1);
+
+  if (GET_CODE (value) == CONST_DOUBLE
+      && (part1 = gen_lowpart_common (word_mode, value)) != 0)
+    value = part1;
+
    if (CONSTANT_P (value) && GET_CODE (value) != CONST_INT)
-    value = copy_to_reg (value);
+    value = copy_to_mode_reg (word_mode, value);
  
    /* Split the value into two parts:
       PART1 gets that which goes in the first word; PART2 the other.  */
@@ -1225,6 +1273,16 @@ extract_fixed_bit_field (tmode, op0, offset, bitsize, bitpos,
  
        total_bits = GET_MODE_BITSIZE (mode);
  
+      /* Make sure bitpos is valid for the chosen mode.  Adjust BITPOS to
+        be be in the range 0 to total_bits-1, and put any excess bytes in
+        OFFSET.  */
+      if (bitpos >= total_bits)
+       {
+         offset += (bitpos / total_bits) * (total_bits / BITS_PER_UNIT);
+         bitpos -= ((bitpos / total_bits) * (total_bits / BITS_PER_UNIT)
+                    * BITS_PER_UNIT);
+       }
+
        /* Get ref to an aligned byte, halfword, or word containing the field.
          Adjust BITPOS to be position within a word,
          and OFFSET to be the offset of that word.
@@ -1716,272 +1774,261 @@ expand_shift (code, mode, shifted, amount, target, unsignedp)
    return temp;
  }
  \f
-enum alg_code { alg_add, alg_subtract, alg_compound };
+enum alg_code { alg_zero, alg_m, alg_shift,
+                 alg_add_t_m2, alg_sub_t_m2,
+                 alg_add_factor, alg_sub_factor,
+                 alg_add_t2_m, alg_sub_t2_m,
+                 alg_add, alg_subtract, alg_factor, alg_shiftop };
  
  /* This structure records a sequence of operations.
     `ops' is the number of operations recorded.
     `cost' is their total cost.
     The operations are stored in `op' and the corresponding
-   integer coefficients in `coeff'.
-   These are the operations:
-   alg_add       Add to the total the multiplicand times the coefficient.
-   alg_subtract  Subtract the multiplicand times the coefficient.
-   alg_compound  This coefficient plus or minus the following one
-                 is multiplied into the total.  The following operation
-                 is alg_add or alg_subtract to indicate whether to add
-                or subtract the two coefficients.  */
+   logarithms of the integer coefficients in `log'.
  
-#ifndef MAX_BITS_PER_WORD
-#define MAX_BITS_PER_WORD BITS_PER_WORD
-#endif
+   These are the operations:
+   alg_zero            total := 0;
+   alg_m               total := multiplicand;
+   alg_shift           total := total * coeff
+   alg_add_t_m2                total := total + multiplicand * coeff;
+   alg_sub_t_m2                total := total - multiplicand * coeff;
+   alg_add_factor      total := total * coeff + total;
+   alg_sub_factor      total := total * coeff - total;
+   alg_add_t2_m                total := total * coeff + multiplicand;
+   alg_sub_t2_m                total := total * coeff - multiplicand;
+
+   The first operand must be either alg_zero or alg_m.  */
  
  struct algorithm
  {
-  int cost;
-  unsigned int ops;
+  short cost;
+  short ops;
+  /* The size of the OP and LOG fields are not directly related to the
+     word size, but the worst-case algorithms will be if we have few
+     consecutive ones or zeros, i.e., a multiplicand like 10101010101...
+     In that case we will generate shift-by-2, add, shift-by-2, add,...,
+     in total wordsize operations.  */
    enum alg_code op[MAX_BITS_PER_WORD];
-  unsigned HOST_WIDE_INT coeff[MAX_BITS_PER_WORD];
+  char log[MAX_BITS_PER_WORD];
  };
  
  /* Compute and return the best algorithm for multiplying by T.
-   Assume that add insns cost ADD_COST and shifts cost SHIFT_COST.
-   Return cost -1 if would cost more than MAX_COST.  */
+   The algorithm must cost less than cost_limit
+   If retval.cost >= COST_LIMIT, no algorithm was found and all
+   other field of the returned struct are undefined.  */
  
  static struct algorithm
-synth_mult (t, add_cost, shift_cost, max_cost)
+synth_mult (t, cost_limit)
       unsigned HOST_WIDE_INT t;
-     int add_cost, shift_cost;
-     int max_cost;
+     int cost_limit;
  {
-  int m, n;
+  int m;
    struct algorithm *best_alg
      = (struct algorithm *)alloca (sizeof (struct algorithm));
    struct algorithm *alg_in
      = (struct algorithm *)alloca (sizeof (struct algorithm));
    unsigned int cost;
+  unsigned HOST_WIDE_INT q;
  
-  /* No matter what happens, we want to return a valid algorithm.  */
-  best_alg->cost = max_cost;
-  best_alg->ops = 0;
+  /* Indicate that no algorithm is yet found.  If no algorithm
+     is found, this value will be returned and indicate failure.  */
+  best_alg->cost = cost_limit;
  
-  /* Is t an exponent of 2, so we can just do a shift?  */
+  if (cost_limit <= 0)
+    return *best_alg;
  
-  if ((t & -t) == t)
+  /* t == 1 can be done in zero cost.  */
+  if (t == 1)
      {
-      if (t > 1)
-       {
-         if (max_cost >= shift_cost)
-           {
-             best_alg->cost = shift_cost;
-             best_alg->ops = 1;
-             best_alg->op[0] = alg_add;
-             best_alg->coeff[0] = t;
-           }
-         else
-           best_alg->cost = -1;
-       }
-      else if (t == 1)
-       {
-         if (max_cost >= 0)
-           best_alg->cost = 0;
-       }
-      else
-       best_alg->cost = 0;
-
+      best_alg->ops = 1;
+      best_alg->cost = 0;
+      best_alg->op[0] = alg_m;
        return *best_alg;
      }
  
-  /* If MAX_COST just permits as little as an addition (or less), we won't
-     succeed in synthesizing an algorithm for t.  Return immediately with
-     an indication of failure.  */
-  if (max_cost <= add_cost)
-    {
-      best_alg->cost = -1;
-      return *best_alg;
-    }
+  /* t == 0 sometimes has a cost.  If it does and it exceeds our limit,
+     fail now.  */
  
-  /* Look for factors of t of the form
-     t = q(2**m +- 1), 2 <= m <= floor(log2(t)) - 1.
-     If we find such a factor, we can multiply by t using an algorithm that
-     multiplies by q, shift the result by m and add/subtract it to itself.  */
-
-  for (m = floor_log2 (t) - 1; m >= 2; m--)
+  else if (t == 0)
      {
-      HOST_WIDE_INT m_exp_2 = (HOST_WIDE_INT) 1 << m;
-      HOST_WIDE_INT d;
-
-      d = m_exp_2 + 1;
-      if (t % d == 0)
+      if (zero_cost >= cost_limit)
+       return *best_alg;
+      else
         {
-         HOST_WIDE_INT q = t / d;
-
-         cost = add_cost + shift_cost * 2;
-
-         *alg_in = synth_mult (q, add_cost, shift_cost,
-                               MIN (max_cost, best_alg->cost) - cost);
-
-         if (alg_in->cost >= 0)
-           {
-             cost += alg_in->cost;
-
-             if (cost < best_alg->cost)
-               {
-                 struct algorithm *x;
-                 x = alg_in;
-                 alg_in = best_alg;
-                 best_alg = x;
-                 best_alg->coeff[best_alg->ops] = m_exp_2;
-                 best_alg->op[best_alg->ops++] = alg_compound;
-                 best_alg->coeff[best_alg->ops] = 1;
-                 best_alg->op[best_alg->ops++] = alg_add;
-                 best_alg->cost = cost;
-               }
-           }
+         best_alg->ops = 1;
+         best_alg->cost = zero_cost;
+         best_alg->op[0] = alg_zero;
+         return *best_alg;
         }
+    }
  
-      d = m_exp_2 - 1;
-      if (t % d == 0)
-       {
-         HOST_WIDE_INT q = t / d;
-
-         cost = add_cost + shift_cost * 2;
+  /* If we have a group of zero bits at the low-order part of T, try
+     multiplying by the remaining bits and then doing a shift.  */
  
-         *alg_in = synth_mult (q, add_cost, shift_cost,
-                               MIN (max_cost, best_alg->cost) - cost);
+  if ((t & 1) == 0)
+    {
+      m = floor_log2 (t & -t); /* m = number of low zero bits */
+      q = t >> m;
+      cost = shift_cost[m];
+      if (cost < cost_limit)
+       {
+         *alg_in = synth_mult (q, cost_limit - cost);
  
-         if (alg_in->cost >= 0)
+         cost += alg_in->cost;
+         if (cost < best_alg->cost)
             {
-             cost += alg_in->cost;
-
-             if (cost < best_alg->cost)
-               {
-                 struct algorithm *x;
-                 x = alg_in;
-                 alg_in = best_alg;
-                 best_alg = x;
-                 best_alg->coeff[best_alg->ops] = m_exp_2;
-                 best_alg->op[best_alg->ops++] = alg_compound;
-                 best_alg->coeff[best_alg->ops] = 1;
-                 best_alg->op[best_alg->ops++] = alg_subtract;
-                 best_alg->cost = cost;
-               }
+             struct algorithm *x;
+             x = alg_in, alg_in = best_alg, best_alg = x;
+             best_alg->log[best_alg->ops] = m;
+             best_alg->op[best_alg->ops++] = alg_shift;
+             best_alg->cost = cost_limit = cost;
             }
         }
      }
  
-  /* Try load effective address instructions, i.e. do a*3, a*5, a*9.  */
-
-  {
-    HOST_WIDE_INT q;
-    HOST_WIDE_INT w;
-
-    q = t & -t;                        /* get out lsb */
-    w = (t - q) & -(t - q);    /* get out next lsb */
-
-    if (w / q <= lea_max_mul)
-      {
-       cost = lea_cost + (q != 1 ? shift_cost : 0);
-
-       *alg_in = synth_mult (t - q - w, add_cost, shift_cost,
-                             MIN (max_cost, best_alg->cost) - cost);
-
-       if (alg_in->cost >= 0)
-         {
-           cost += alg_in->cost;
-
-           /* Use <= to prefer this method to the factoring method
-              when the cost appears the same, because this method
-              uses fewer temporary registers.  */
-           if (cost <= best_alg->cost)
-             {
-               struct algorithm *x;
-               x = alg_in;
-               alg_in = best_alg;
-               best_alg = x;
-               best_alg->coeff[best_alg->ops] = w;
-               best_alg->op[best_alg->ops++] = alg_add;
-               best_alg->coeff[best_alg->ops] = q;
-               best_alg->op[best_alg->ops++] = alg_add;
-               best_alg->cost = cost;
-             }
-         }
-      }
-  }
-
-  /* Now, use the good old method to add or subtract at the leftmost
-     1-bit.  */
-
+  /* If we have an odd number, add or subtract one.  */
+  if ((t & 1) != 0)
    {
-    HOST_WIDE_INT q;
-    HOST_WIDE_INT w;
+    unsigned HOST_WIDE_INT w;
  
-    q = t & -t;                        /* get out lsb */
-    for (w = q; (w & t) != 0; w <<= 1)
+    for (w = 1; (w & t) != 0; w <<= 1)
        ;
-    if ((w > q << 1)
-       /* Reject the case where t has only two bits.
+    if (w > 2
+       /* Reject the case where t is 3.
            Thus we prefer addition in that case.  */
-       && !(t < w && w == q << 2))
+       && t != 3)
        {
-       /* There are many bits in a row.  Make 'em by subtraction.  */
+       /* T ends with ...111.  Multiply by (T + 1) and subtract 1.  */
  
         cost = add_cost;
-       if (q != 1)
-         cost += shift_cost;
+       *alg_in = synth_mult (t + 1, cost_limit - cost);
  
-       *alg_in = synth_mult (t + q, add_cost, shift_cost,
-                             MIN (max_cost, best_alg->cost) - cost);
-
-       if (alg_in->cost >= 0)
+       cost += alg_in->cost;
+       if (cost < best_alg->cost)
           {
-           cost += alg_in->cost;
-
-           /* Use <= to prefer this method to the factoring method
-              when the cost appears the same, because this method
-              uses fewer temporary registers.  */
-           if (cost <= best_alg->cost)
-             {
-               struct algorithm *x;
-               x = alg_in;
-               alg_in = best_alg;
-               best_alg = x;
-               best_alg->coeff[best_alg->ops] = q;
-               best_alg->op[best_alg->ops++] = alg_subtract;
-               best_alg->cost = cost;
-             }
+           struct algorithm *x;
+           x = alg_in, alg_in = best_alg, best_alg = x;
+           best_alg->log[best_alg->ops] = 0;
+           best_alg->op[best_alg->ops++] = alg_sub_t_m2;
+           best_alg->cost = cost_limit = cost;
           }
        }
      else
        {
-       /* There's only one bit at the left.  Make it by addition.  */
+       /* T ends with ...01 or ...011.  Multiply by (T - 1) and add 1.  */
  
         cost = add_cost;
-       if (q != 1)
-         cost += shift_cost;
-
-       *alg_in = synth_mult (t - q, add_cost, shift_cost,
-                             MIN (max_cost, best_alg->cost) - cost);
+       *alg_in = synth_mult (t - 1, cost_limit - cost);
  
-       if (alg_in->cost >= 0)
+       cost += alg_in->cost;
+       if (cost < best_alg->cost)
           {
-           cost += alg_in->cost;
-
-           if (cost <= best_alg->cost)
-             {
-               struct algorithm *x;
-               x = alg_in;
-               alg_in = best_alg;
-               best_alg = x;
-               best_alg->coeff[best_alg->ops] = q;
-               best_alg->op[best_alg->ops++] = alg_add;
-               best_alg->cost = cost;
-             }
+           struct algorithm *x;
+           x = alg_in, alg_in = best_alg, best_alg = x;
+           best_alg->log[best_alg->ops] = 0;
+           best_alg->op[best_alg->ops++] = alg_add_t_m2;
+           best_alg->cost = cost_limit = cost;
           }
        }
    }
  
-  if (best_alg->cost >= max_cost)
-    best_alg->cost = -1;
+  /* Look for factors of t of the form
+     t = q(2**m +- 1), 2 <= m <= floor(log2(t - 1)).
+     If we find such a factor, we can multiply by t using an algorithm that
+     multiplies by q, shift the result by m and add/subtract it to itself.
+
+     We search for large factors first and loop down, even if large factors
+     are less probable than small; if we find a large factor we will find a
+     good sequence quickly, and therefore be able to prune (by decreasing
+     COST_LIMIT) the search.  */
+
+  for (m = floor_log2 (t - 1); m >= 2; m--)
+    {
+      unsigned HOST_WIDE_INT d;
+
+      d = ((unsigned HOST_WIDE_INT) 1 << m) + 1;
+      if (t % d == 0 && t > d)
+       {
+         cost = MIN (shiftadd_cost[m], add_cost + shift_cost[m]);
+         *alg_in = synth_mult (t / d, cost_limit - cost);
+
+         cost += alg_in->cost;
+         if (cost < best_alg->cost)
+           {
+             struct algorithm *x;
+             x = alg_in, alg_in = best_alg, best_alg = x;
+             best_alg->log[best_alg->ops] = m;
+             best_alg->op[best_alg->ops++] = alg_add_factor;
+             best_alg->cost = cost_limit = cost;
+           }
+       }
+
+      d = ((unsigned HOST_WIDE_INT) 1 << m) - 1;
+      if (t % d == 0 && t > d)
+       {
+         cost = MIN (shiftsub_cost[m], add_cost + shift_cost[m]);
+         *alg_in = synth_mult (t / d, cost_limit - cost);
+
+         cost += alg_in->cost;
+         if (cost < best_alg->cost)
+           {
+             struct algorithm *x;
+             x = alg_in, alg_in = best_alg, best_alg = x;
+             best_alg->log[best_alg->ops] = m;
+             best_alg->op[best_alg->ops++] = alg_sub_factor;
+             best_alg->cost = cost_limit = cost;
+           }
+       }
+    }
+
+  /* Try shift-and-add (load effective address) instructions,
+     i.e. do a*3, a*5, a*9.  */
+  if ((t & 1) != 0)
+    {
+      q = t - 1;
+      q = q & -q;
+      m = exact_log2 (q);
+      if (m >= 0)
+       {
+         cost = shiftadd_cost[m];
+         *alg_in = synth_mult ((t - 1) >> m, cost_limit - cost);
+
+         cost += alg_in->cost;
+         if (cost < best_alg->cost)
+           {
+             struct algorithm *x;
+             x = alg_in, alg_in = best_alg, best_alg = x;
+             best_alg->log[best_alg->ops] = m;
+             best_alg->op[best_alg->ops++] = alg_add_t2_m;
+             best_alg->cost = cost_limit = cost;
+           }
+       }
+
+      q = t + 1;
+      q = q & -q;
+      m = exact_log2 (q);
+      if (m >= 0)
+       {
+         cost = shiftsub_cost[m];
+         *alg_in = synth_mult ((t + 1) >> m, cost_limit - cost);
+
+         cost += alg_in->cost;
+         if (cost < best_alg->cost)
+           {
+             struct algorithm *x;
+             x = alg_in, alg_in = best_alg, best_alg = x;
+             best_alg->log[best_alg->ops] = m;
+             best_alg->op[best_alg->ops++] = alg_sub_t2_m;
+             best_alg->cost = cost_limit = cost;
+           }
+       }
+    }
+
+  /* If we are getting a too long sequence for `struct algorithm'
+     to record, store a fake cost to make this search fail.  */
+  if (best_alg->ops == MAX_BITS_PER_WORD)
+    best_alg->cost = cost_limit;
+
    return *best_alg;
  }
  \f
@@ -2016,13 +2063,15 @@ expand_mult (mode, op0, op1, target, unsignedp)
       produce a smaller program when -O is not used.
       But this causes such a terrible slowdown sometimes
       that it seems better to use synth_mult always.  */
+
    if (GET_CODE (const_op1) == CONST_INT && ! mult_is_very_cheap)
      {
        struct algorithm alg;
        struct algorithm neg_alg;
        int negate = 0;
-      HOST_WIDE_INT absval = INTVAL (op1);
-      rtx last;
+      HOST_WIDE_INT val = INTVAL (op1);
+      HOST_WIDE_INT val_so_far;
+      rtx insn;
  
        /* Try to do the computation two ways: multiply by the negative of OP1
          and then negate, or do the multiplication directly.  The latter is
@@ -2031,21 +2080,19 @@ expand_mult (mode, op0, op1, target, unsignedp)
          has a factor of 2**m +/- 1, while the negated value does not or
          vice versa.  */
  
-      alg = synth_mult (absval, add_cost, shift_cost, mult_cost);
-      neg_alg = synth_mult (- absval, add_cost, shift_cost,
-                           (alg.cost >= 0 ? alg.cost : mult_cost)
+      alg = synth_mult (val, mult_cost);
+      neg_alg = synth_mult (- val,
+                           (alg.cost < mult_cost ? alg.cost : mult_cost)
                             - negate_cost);
  
-      if (neg_alg.cost >= 0 && neg_alg.cost + negate_cost < alg.cost)
-       alg = neg_alg, negate = 1, absval = - absval;
+      if (neg_alg.cost + negate_cost < alg.cost)
+       alg = neg_alg, negate = 1;
  
-      if (alg.cost >= 0)
+      if (alg.cost < mult_cost)
         {
-         /* If we found something, it must be cheaper than multiply.
-            So use it.  */
-         int opno = 0;
+         /* We found something cheaper than a multiply insn.  */
+         int opno;
           rtx accum, tem;
-         int factors_seen = 0;
  
           op0 = protect_from_queue (op0, 0);
  
@@ -2054,119 +2101,113 @@ expand_mult (mode, op0, op1, target, unsignedp)
           if (GET_CODE (op0) == MEM)
             op0 = force_reg (mode, op0);
  
-         if (alg.ops == 0)
-           accum = copy_to_mode_reg (mode, op0);
-         else
+         /* ACCUM starts out either as OP0 or as a zero, depending on
+            the first operation.  */
+
+         if (alg.op[0] == alg_zero)
             {
-             /* 1 if this is the last in a series of adds and subtracts.  */
-             int last = (1 == alg.ops || alg.op[1] == alg_compound);
-             int log = floor_log2 (alg.coeff[0]);
-             if (! factors_seen && ! last)
-               log -= floor_log2 (alg.coeff[1]);
-
-             if (alg.op[0] != alg_add)
-               abort ();
-             accum = expand_shift (LSHIFT_EXPR, mode, op0,
-                                   build_int_2 (log, 0), NULL_RTX, 0);
+             accum = copy_to_mode_reg (mode, const0_rtx);
+             val_so_far = 0;
             }
-   
-         while (++opno < alg.ops)
+         else if (alg.op[0] == alg_m)
             {
-             int log = floor_log2 (alg.coeff[opno]);
-             /* 1 if this is the last in a series of adds and subtracts.  */
-             int last = (opno + 1 == alg.ops
-                         || alg.op[opno + 1] == alg_compound);
-
-             /* If we have not yet seen any separate factors (alg_compound)
-                then turn op0<<a1 + op0<<a2 + op0<<a3... into
-                (op0<<(a1-a2) + op0)<<(a2-a3) + op0...  */
+             accum  = copy_to_mode_reg (mode, op0);
+             val_so_far = 1;
+           }
+         else
+           abort ();
+
+         for (opno = 1; opno < alg.ops; opno++)
+           {
+             int log = alg.log[opno];
+             rtx shift_subtarget = preserve_subexpressions_p () ? 0 : accum;
+             rtx add_target = opno == alg.ops - 1 && target != 0 ? target : 0;
+
               switch (alg.op[opno])
                 {
-               case alg_add:
-                 if (factors_seen)
-                   {
-                     tem = expand_shift (LSHIFT_EXPR, mode, op0,
-                                         build_int_2 (log, 0), NULL_RTX, 0);
-                     accum = force_operand (gen_rtx (PLUS, mode, accum, tem),
-                                            accum);
-                   }
-                 else
-                   {
-                     if (! last)
-                       log -= floor_log2 (alg.coeff[opno + 1]);
-                     accum = force_operand (gen_rtx (PLUS, mode, accum, op0),
-                                            accum);
-                     accum = expand_shift (LSHIFT_EXPR, mode, accum,
-                                           build_int_2 (log, 0), accum, 0);
-                   }
+               case alg_shift:
+                 accum = expand_shift (LSHIFT_EXPR, mode, accum,
+                                       build_int_2 (log, 0), NULL_RTX, 0);
+                 val_so_far <<= log;
                   break;
  
-               case alg_subtract:
-                 if (factors_seen)
-                   {
-                     tem = expand_shift (LSHIFT_EXPR, mode, op0,
-                                         build_int_2 (log, 0), NULL_RTX, 0);
-                     accum = force_operand (gen_rtx (MINUS, mode, accum, tem),
-                                            accum);
-                   }
-                 else
-                   {
-                     if (! last)
-                       log -= floor_log2 (alg.coeff[opno + 1]);
-                     accum = force_operand (gen_rtx (MINUS, mode, accum, op0),
-                                            accum);
-                     accum = expand_shift (LSHIFT_EXPR, mode, accum,
-                                           build_int_2 (log, 0), accum, 0);
-                   }
+               case alg_add_t_m2:
+                 tem = expand_shift (LSHIFT_EXPR, mode, op0,
+                                     build_int_2 (log, 0), NULL_RTX, 0);
+                 accum = force_operand (gen_rtx (PLUS, mode, accum, tem),
+                                        add_target ? add_target : accum);
+                 val_so_far += (HOST_WIDE_INT) 1 << log;
+                 break;
  
+               case alg_sub_t_m2:
+                 tem = expand_shift (LSHIFT_EXPR, mode, op0,
+                                     build_int_2 (log, 0), NULL_RTX, 0);
+                 accum = force_operand (gen_rtx (MINUS, mode, accum, tem),
+                                        add_target ? add_target : accum);
+                 val_so_far -= (HOST_WIDE_INT) 1 << log;
                   break;
  
-               case alg_compound:
-                 factors_seen = 1;
+               case alg_add_t2_m:
+                 accum = expand_shift (LSHIFT_EXPR, mode, accum,
+                                       build_int_2 (log, 0), accum, 0);
+                 accum = force_operand (gen_rtx (PLUS, mode, accum, op0),
+                                        add_target ? add_target : accum);
+                 val_so_far = (val_so_far << log) + 1;
+                 break;
+
+               case alg_sub_t2_m:
+                 accum = expand_shift (LSHIFT_EXPR, mode, accum,
+                                       build_int_2 (log, 0), accum, 0);
+                 accum = force_operand (gen_rtx (MINUS, mode, accum, op0),
+                                        add_target ? add_target : accum);
+                 val_so_far = (val_so_far << log) - 1;
+                 break;
+
+               case alg_add_factor:
                   tem = expand_shift (LSHIFT_EXPR, mode, accum,
                                       build_int_2 (log, 0), NULL_RTX, 0);
+                 accum = force_operand (gen_rtx (PLUS, mode, accum, tem),
+                                        add_target ? add_target : accum);
+                 val_so_far += val_so_far << log;
+                 break;
  
-                 log = floor_log2 (alg.coeff[opno + 1]);
-                 accum = expand_shift (LSHIFT_EXPR, mode, accum,
-                                       build_int_2 (log, 0), NULL_RTX, 0);
-                 opno++;
-                 if (alg.op[opno] == alg_add)
-                   accum = force_operand (gen_rtx (PLUS, mode, tem, accum),
-                                          tem);
-                 else
-                   accum = force_operand (gen_rtx (MINUS, mode, tem, accum),
-                                          tem);
-               }
-           }
+               case alg_sub_factor:
+                 tem = expand_shift (LSHIFT_EXPR, mode, accum,
+                                     build_int_2 (log, 0), NULL_RTX, 0);
+                 accum = force_operand (gen_rtx (MINUS, mode, tem, accum),
+                                        add_target ? add_target : tem);
+                 val_so_far = (val_so_far << log) - val_so_far;
+                 break;
  
-         /* Write a REG_EQUAL note on the last insn so that we can cse 
-            multiplication sequences.  We need not do this if we were
-            multiplying by a power of two, since only one insn would have
-            been generated.
+               default:
+                 abort ();;
+               }
  
-            ??? We could also write REG_EQUAL notes on the last insn of
-            each sequence that uses a single temporary, but it is not
-            clear how to calculate the partial product so far.
+             /* Write a REG_EQUAL note on the last insn so that we can cse
+                multiplication sequences.  */
  
-            Torbjorn: Can you do this?  */
+             insn = get_last_insn ();
+             REG_NOTES (insn)
+               = gen_rtx (EXPR_LIST, REG_EQUAL,
+                          gen_rtx (MULT, mode, op0, GEN_INT (val_so_far)),
+                          REG_NOTES (insn));
+           }
  
-         if (exact_log2 (absval) < 0)
+         if (negate)
             {
-             last = get_last_insn ();
-             REG_NOTES (last)
-               = gen_rtx (EXPR_LIST, REG_EQUAL,
-                          gen_rtx (MULT, mode, op0, 
-                                   negate ? GEN_INT (absval) : op1),
-                          REG_NOTES (last));
+             val_so_far = - val_so_far;
+             accum = expand_unop (mode, neg_optab, accum, target, 0);
             }
  
-         return (negate ? expand_unop (mode, neg_optab, accum, target, 0)
-                 : accum);
+         if (val != val_so_far)
+           abort ();
+
+         return accum;
         }
      }
  
    /* This used to use umul_optab if unsigned,
-     but I think that for non-widening multiply there is no difference
+     but for non-widening multiply there is no difference
       between signed and unsigned.  */
    op0 = expand_binop (mode, smul_optab,
                       op0, op1, target, unsignedp, OPTAB_LIB_WIDEN);
@@ -2334,14 +2375,6 @@ expand_divmod (rem_flag, code, mode, op0, op1, target, unsignedp)
      case TRUNC_DIV_EXPR:
        if (log >= 0 && ! unsignedp)
         {
-         if (! can_clobber_op0)
-           {
-             adjusted_op0 = copy_to_suggested_reg (adjusted_op0, target,
-                                                   compute_mode);
-             /* Copy op0 to a reg, since emit_cmp_insn will call emit_queue
-                which will screw up mem refs for autoincrements.  */
-             op0 = force_reg (compute_mode, op0);
-           }
           /* Here we need to add OP1-1 if OP0 is negative, 0 otherwise.
              This can be computed without jumps by arithmetically shifting
              OP0 right LOG-1 places and then shifting right logically
@@ -2350,17 +2383,33 @@ expand_divmod (rem_flag, code, mode, op0, op1, target, unsignedp)
           if (log == 1 || BRANCH_COST >= 3)
             {
               rtx temp = gen_reg_rtx (compute_mode);
+             if (! can_clobber_op0)
+               /* Copy op0 to a reg, to play safe,
+                  since this is done in the other path.  */
+               op0 = force_reg (compute_mode, op0);
               temp = copy_to_suggested_reg (adjusted_op0, temp, compute_mode);
               temp = expand_shift (RSHIFT_EXPR, compute_mode, temp,
                                    build_int_2 (log - 1, 0), NULL_RTX, 0);
               temp = expand_shift (RSHIFT_EXPR, compute_mode, temp,
                                    build_int_2 (size - log, 0),
                                    temp, 1);
-             expand_inc (adjusted_op0, temp);
+             /* We supply 0 as the target to make a new pseudo
+                for the value; that helps loop.c optimize the result.  */
+             adjusted_op0 = expand_binop (compute_mode, add_optab,
+                                          adjusted_op0, temp,
+                                          0, 0, OPTAB_LIB_WIDEN);
             }
           else
             {
               rtx label = gen_label_rtx ();
+             if (! can_clobber_op0)
+               {
+                 adjusted_op0 = copy_to_suggested_reg (adjusted_op0, target,
+                                                       compute_mode);
+                 /* Copy op0 to a reg, since emit_cmp_insn will call emit_queue
+                    which will screw up mem refs for autoincrements.  */
+                 op0 = force_reg (compute_mode, op0);
+               }
               emit_cmp_insn (adjusted_op0, const0_rtx, GE, 
                              NULL_RTX, compute_mode, 0, 0);
               emit_jump_insn (gen_bge (label));
@@ -2940,7 +2989,7 @@ emit_store_flag (target, code, op0, op1, mode, unsignedp, normalizep)
              conversion now.  */
           if (target_mode != compare_mode)
             {
-             convert_move (target, op0);
+             convert_move (target, op0, 0);
               return target;
             }
           else
@@ -2960,7 +3009,7 @@ emit_store_flag (target, code, op0, op1, mode, unsignedp, normalizep)
       comparison with zero.  Don't do any of these cases if branches are
       very cheap.  */
  
-  if (BRANCH_COST >= 0
+  if (BRANCH_COST > 0
        && GET_MODE_CLASS (mode) == MODE_INT && (code == EQ || code == NE)
        && op1 != const0_rtx)
      {