* calls.c (expand_call): When copying unaligned values into a register,

[pf3gnuchains/gcc-fork.git] / gcc / expmed.c
diff --git a/gcc/expmed.c b/gcc/expmed.c

index faa4843..65fb007 100644 (file)
--- a/gcc/expmed.c
+++ b/gcc/expmed.c
@@ -1,6 +1,6 @@
  /* Medium-level subroutines: convert bit-field store and extract
     and shifts, multiplies and divides to rtl instructions.
-   Copyright (C) 1987, 1988, 1989, 1992, 1993 Free Software Foundation, Inc.
+   Copyright (C) 1987, 88, 89, 92-6, 1997 Free Software Foundation, Inc.
  
  This file is part of GNU CC.
  
@@ -16,7 +16,8 @@ GNU General Public License for more details.
  
  You should have received a copy of the GNU General Public License
  along with GNU CC; see the file COPYING.  If not, write to
-the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.  */
+the Free Software Foundation, 59 Temple Place - Suite 330,
+Boston, MA 02111-1307, USA.  */
  
  
  #include "config.h"
@@ -30,18 +31,18 @@ the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.  */
  #include "real.h"
  #include "recog.h"
  
-static rtx extract_split_bit_field ();
-static rtx extract_fixed_bit_field ();
-static void store_split_bit_field ();
-static void store_fixed_bit_field ();
-static rtx mask_rtx ();
-static rtx lshift_value ();
+static void store_fixed_bit_field      PROTO((rtx, int, int, int, rtx, int));
+static void store_split_bit_field      PROTO((rtx, int, int, rtx, int));
+static rtx extract_fixed_bit_field     PROTO((enum machine_mode, rtx, int,
+                                              int, int, rtx, int, int));
+static rtx mask_rtx                    PROTO((enum machine_mode, int,
+                                              int, int));
+static rtx lshift_value                        PROTO((enum machine_mode, rtx,
+                                              int, int));
+static rtx extract_split_bit_field     PROTO((rtx, int, int, int, int));
  
  #define CEIL(x,y) (((x) + (y) - 1) / (y))
  
-/* Non-zero means multiply instructions are cheaper than shifts.  */
-int mult_is_very_cheap;
-
  /* Non-zero means divides or modulus operations are relatively cheap for
     powers of two, so don't use branches; emit the operation instead. 
     Usually, this will mean that the MD file will emit non-branch
@@ -49,11 +50,28 @@ int mult_is_very_cheap;
  
  static int sdiv_pow2_cheap, smod_pow2_cheap;
  
-/* Cost of various pieces of RTL.  */
-static int add_cost, mult_cost, negate_cost, zero_cost;
-static int shift_cost[BITS_PER_WORD];
-static int shiftadd_cost[BITS_PER_WORD];
-static int shiftsub_cost[BITS_PER_WORD];
+#ifndef SLOW_UNALIGNED_ACCESS
+#define SLOW_UNALIGNED_ACCESS STRICT_ALIGNMENT
+#endif
+
+/* For compilers that support multiple targets with different word sizes,
+   MAX_BITS_PER_WORD contains the biggest value of BITS_PER_WORD.  An example
+   is the H8/300(H) compiler.  */
+
+#ifndef MAX_BITS_PER_WORD
+#define MAX_BITS_PER_WORD BITS_PER_WORD
+#endif
+
+/* Cost of various pieces of RTL.  Note that some of these are indexed by shift count,
+   and some by mode.  */
+static int add_cost, negate_cost, zero_cost;
+static int shift_cost[MAX_BITS_PER_WORD];
+static int shiftadd_cost[MAX_BITS_PER_WORD];
+static int shiftsub_cost[MAX_BITS_PER_WORD];
+static int mul_cost[NUM_MACHINE_MODES];
+static int div_cost[NUM_MACHINE_MODES];
+static int mul_widen_cost[NUM_MACHINE_MODES];
+static int mul_highpart_cost[NUM_MACHINE_MODES];
  
  void
  init_expmed ()
@@ -61,10 +79,11 @@ init_expmed ()
    char *free_point;
    /* This is "some random pseudo register" for purposes of calling recog
       to see what insns exist.  */
-  rtx reg = gen_rtx (REG, word_mode, FIRST_PSEUDO_REGISTER);
+  rtx reg = gen_rtx (REG, word_mode, 10000);
    rtx shift_insn, shiftadd_insn, shiftsub_insn;
    int dummy;
    int m;
+  enum machine_mode mode, wider_mode;
  
    start_sequence ();
  
@@ -74,7 +93,7 @@ init_expmed ()
  
    free_point = (char *) oballoc (0);
  
-  zero_cost = rtx_cost (const0_rtx);
+  zero_cost = rtx_cost (const0_rtx, 0);
    add_cost = rtx_cost (gen_rtx (PLUS, word_mode, reg, reg), SET);
  
    shift_insn = emit_insn (gen_rtx (SET, VOIDmode, reg,
@@ -117,14 +136,8 @@ init_expmed ()
         shiftsub_cost[m] = rtx_cost (SET_SRC (PATTERN (shiftsub_insn)), SET);
      }
  
-  mult_cost = rtx_cost (gen_rtx (MULT, word_mode, reg, reg), SET);
    negate_cost = rtx_cost (gen_rtx (NEG, word_mode, reg), SET);
  
-  /* 999999 is chosen to avoid any plausible faster special case.  */
-  mult_is_very_cheap
-    = (rtx_cost (gen_rtx (MULT, word_mode, reg, GEN_INT (999999)), SET)
-       < rtx_cost (gen_rtx (ASHIFT, word_mode, reg, GEN_INT (7)), SET));
-
    sdiv_pow2_cheap
      = (rtx_cost (gen_rtx (DIV, word_mode, reg, GEN_INT (32)), SET)
         <= 2 * add_cost);
@@ -132,6 +145,32 @@ init_expmed ()
      = (rtx_cost (gen_rtx (MOD, word_mode, reg, GEN_INT (32)), SET)
         <= 2 * add_cost);
  
+  for (mode = GET_CLASS_NARROWEST_MODE (MODE_INT);
+       mode != VOIDmode;
+       mode = GET_MODE_WIDER_MODE (mode))
+    {
+      reg = gen_rtx (REG, mode, 10000);
+      div_cost[(int) mode] = rtx_cost (gen_rtx (UDIV, mode, reg, reg), SET);
+      mul_cost[(int) mode] = rtx_cost (gen_rtx (MULT, mode, reg, reg), SET);
+      wider_mode = GET_MODE_WIDER_MODE (mode);
+      if (wider_mode != VOIDmode)
+       {
+         mul_widen_cost[(int) wider_mode]
+           = rtx_cost (gen_rtx (MULT, wider_mode,
+                                gen_rtx (ZERO_EXTEND, wider_mode, reg),
+                                gen_rtx (ZERO_EXTEND, wider_mode, reg)),
+                       SET);
+         mul_highpart_cost[(int) mode]
+           = rtx_cost (gen_rtx (TRUNCATE, mode,
+                                gen_rtx (LSHIFTRT, wider_mode,
+                                         gen_rtx (MULT, wider_mode,
+                                                  gen_rtx (ZERO_EXTEND, wider_mode, reg),
+                                                  gen_rtx (ZERO_EXTEND, wider_mode, reg)),
+                                         GEN_INT (GET_MODE_BITSIZE (mode)))),
+                       SET);
+       }
+    }
+
    /* Free the objects we just allocated.  */
    end_sequence ();
    obfree (free_point);
@@ -146,21 +185,12 @@ negate_rtx (mode, x)
       enum machine_mode mode;
       rtx x;
  {
-  if (GET_CODE (x) == CONST_INT)
-    {
-      HOST_WIDE_INT val = - INTVAL (x);
-      if (GET_MODE_BITSIZE (mode) < HOST_BITS_PER_WIDE_INT)
-       {
-         /* Sign extend the value from the bits that are significant.  */
-         if (val & ((HOST_WIDE_INT) 1 << (GET_MODE_BITSIZE (mode) - 1)))
-           val |= (HOST_WIDE_INT) (-1) << GET_MODE_BITSIZE (mode);
-         else
-           val &= ((HOST_WIDE_INT) 1 << GET_MODE_BITSIZE (mode)) - 1;
-       }
-      return GEN_INT (val);
-    }
-  else
-    return expand_unop (GET_MODE (x), neg_optab, x, NULL_RTX, 0);
+  rtx result = simplify_unary_operation (NEG, mode, x, mode);
+
+  if (result == 0)
+    result = expand_unop (mode, neg_optab, x, NULL_RTX, 0);
+
+  return result;
  }
  \f
  /* Generate code to store value from rtx VALUE
@@ -215,13 +245,13 @@ store_bit_field (str_rtx, bitsize, bitnum, fieldmode, value, align, total_size)
        op0 = SUBREG_REG (op0);
      }
  
-#if BYTES_BIG_ENDIAN
    /* If OP0 is a register, BITPOS must count within a word.
       But as we have it, it counts within whatever size OP0 now has.
       On a bigendian machine, these are not the same, so convert.  */
-  if (GET_CODE (op0) != MEM && unit > GET_MODE_BITSIZE (GET_MODE (op0)))
+  if (BYTES_BIG_ENDIAN
+      && GET_CODE (op0) != MEM
+      && unit > GET_MODE_BITSIZE (GET_MODE (op0)))
      bitpos += unit - GET_MODE_BITSIZE (GET_MODE (op0));
-#endif
  
    value = protect_from_queue (value, 0);
  
@@ -231,17 +261,22 @@ store_bit_field (str_rtx, bitsize, bitnum, fieldmode, value, align, total_size)
    /* Note that the adjustment of BITPOS above has no effect on whether
       BITPOS is 0 in a REG bigger than a word.  */
    if (GET_MODE_SIZE (fieldmode) >= UNITS_PER_WORD
-      && (! STRICT_ALIGNMENT || GET_CODE (op0) != MEM)
+      && (GET_CODE (op0) != MEM
+         || ! SLOW_UNALIGNED_ACCESS
+         || (offset * BITS_PER_UNIT % bitsize == 0
+             && align % GET_MODE_SIZE (fieldmode) == 0))
        && bitpos == 0 && bitsize == GET_MODE_BITSIZE (fieldmode))
      {
        /* Storing in a full-word or multi-word field in a register
          can be done with just SUBREG.  */
        if (GET_MODE (op0) != fieldmode)
-       if (GET_CODE (op0) == REG)
-         op0 = gen_rtx (SUBREG, fieldmode, op0, offset);
-       else
-         op0 = change_address (op0, fieldmode,
-                               plus_constant (XEXP (op0, 0), offset));
+       {
+         if (GET_CODE (op0) == REG)
+           op0 = gen_rtx (SUBREG, fieldmode, op0, offset);
+         else
+           op0 = change_address (op0, fieldmode,
+                                 plus_constant (XEXP (op0, 0), offset));
+       }
        emit_move_insn (op0, value);
        return value;
      }
@@ -250,11 +285,7 @@ store_bit_field (str_rtx, bitsize, bitnum, fieldmode, value, align, total_size)
       can be done with a movestrict instruction.  */
  
    if (GET_CODE (op0) != MEM
-#if BYTES_BIG_ENDIAN
-      && bitpos + bitsize == unit
-#else
-      && bitpos == 0
-#endif
+      && (BYTES_BIG_ENDIAN ? bitpos + bitsize == unit : bitpos == 0)
        && bitsize == GET_MODE_BITSIZE (fieldmode)
        && (GET_MODE (op0) == fieldmode
           || (movstrict_optab->handlers[(int) fieldmode].insn_code
@@ -288,7 +319,10 @@ store_bit_field (str_rtx, bitsize, bitnum, fieldmode, value, align, total_size)
        /* Here we transfer the words of the field
          in the order least significant first.
          This is because the most significant word is the one which may
-        be less than full.  */
+        be less than full.
+        However, only do that if the value is not BLKmode.  */
+
+      int backwards = WORDS_BIG_ENDIAN && fieldmode != BLKmode;
  
        int nwords = (bitsize + (BITS_PER_WORD - 1)) / BITS_PER_WORD;
        int i;
@@ -304,14 +338,17 @@ store_bit_field (str_rtx, bitsize, bitnum, fieldmode, value, align, total_size)
         {
           /* If I is 0, use the low-order word in both field and target;
              if I is 1, use the next to lowest word; and so on.  */
-         int wordnum = (WORDS_BIG_ENDIAN ? nwords - i - 1 : i);
-         int bit_offset = (WORDS_BIG_ENDIAN
+         int wordnum = (backwards ? nwords - i - 1 : i);
+         int bit_offset = (backwards
                             ? MAX (bitsize - (i + 1) * BITS_PER_WORD, 0)
                             : i * BITS_PER_WORD);
           store_bit_field (op0, MIN (BITS_PER_WORD,
                                      bitsize - i * BITS_PER_WORD),
                            bitnum + bit_offset, word_mode,
-                          operand_subword_force (value, wordnum, fieldmode),
+                          operand_subword_force (value, wordnum,
+                                                 (GET_MODE (value) == VOIDmode
+                                                  ? fieldmode
+                                                  : GET_MODE (value))),
                            align, total_size);
         }
        return value;
@@ -336,15 +373,30 @@ store_bit_field (str_rtx, bitsize, bitnum, fieldmode, value, align, total_size)
        op0 = protect_from_queue (op0, 1);
      }
  
+  /* If VALUE is a floating-point mode, access it as an integer of the
+     corresponding size.  This can occur on a machine with 64 bit registers
+     that uses SFmode for float.  This can also occur for unaligned float
+     structure fields.  */
+  if (GET_MODE_CLASS (GET_MODE (value)) == MODE_FLOAT)
+    {
+      if (GET_CODE (value) != REG)
+       value = copy_to_reg (value);
+      value = gen_rtx (SUBREG, word_mode, value, 0);
+    }
+
    /* Now OFFSET is nonzero only if OP0 is memory
       and is therefore always measured in bytes.  */
  
  #ifdef HAVE_insv
    if (HAVE_insv
+      && GET_MODE (value) != BLKmode
        && !(bitsize == 1 && GET_CODE (value) == CONST_INT)
        /* Ensure insv's size is wide enough for this field.  */
        && (GET_MODE_BITSIZE (insn_operand_mode[(int) CODE_FOR_insv][3])
-         >= bitsize))
+         >= bitsize)
+      && ! ((GET_CODE (op0) == REG || GET_CODE (op0) == SUBREG)
+           && (bitsize + bitpos
+               > GET_MODE_BITSIZE (insn_operand_mode[(int) CODE_FOR_insv][3]))))
      {
        int xbitpos = bitpos;
        rtx value1;
@@ -357,13 +409,13 @@ store_bit_field (str_rtx, bitsize, bitnum, fieldmode, value, align, total_size)
        int save_volatile_ok = volatile_ok;
        volatile_ok = 1;
  
-      /* If this machine's insv can only insert into a register, or if we
-        are to force MEMs into a register, copy OP0 into a register and
-        save it back later.  */
+      /* If this machine's insv can only insert into a register, copy OP0
+        into a register and save it back later.  */
+      /* This used to check flag_force_mem, but that was a serious
+        de-optimization now that flag_force_mem is enabled by -O2.  */
        if (GET_CODE (op0) == MEM
-         && (flag_force_mem
-             || ! ((*insn_operand_predicate[(int) CODE_FOR_insv][0])
-                   (op0, VOIDmode))))
+         && ! ((*insn_operand_predicate[(int) CODE_FOR_insv][0])
+               (op0, VOIDmode)))
         {
           rtx tempreg;
           enum machine_mode bestmode;
@@ -381,7 +433,8 @@ store_bit_field (str_rtx, bitsize, bitnum, fieldmode, value, align, total_size)
           else
             bestmode = GET_MODE (op0);
  
-         if (bestmode == VOIDmode)
+         if (bestmode == VOIDmode
+             || (SLOW_UNALIGNED_ACCESS && GET_MODE_SIZE (bestmode) > align))
             goto insv_loses;
  
           /* Adjust address to point to the containing unit of that mode.  */
@@ -409,22 +462,23 @@ store_bit_field (str_rtx, bitsize, bitnum, fieldmode, value, align, total_size)
        /* If xop0 is a register, we need it in MAXMODE
          to make it acceptable to the format of insv.  */
        if (GET_CODE (xop0) == SUBREG)
-       PUT_MODE (xop0, maxmode);
+       /* We can't just change the mode, because this might clobber op0,
+          and we will need the original value of op0 if insv fails.  */
+       xop0 = gen_rtx (SUBREG, maxmode, SUBREG_REG (xop0), SUBREG_WORD (xop0));
        if (GET_CODE (xop0) == REG && GET_MODE (xop0) != maxmode)
         xop0 = gen_rtx (SUBREG, maxmode, xop0, 0);
  
        /* On big-endian machines, we count bits from the most significant.
          If the bit field insn does not, we must invert.  */
  
-#if BITS_BIG_ENDIAN != BYTES_BIG_ENDIAN
-      xbitpos = unit - bitsize - xbitpos;
-#endif
+      if (BITS_BIG_ENDIAN != BYTES_BIG_ENDIAN)
+       xbitpos = unit - bitsize - xbitpos;
+
        /* We have been counting XBITPOS within UNIT.
          Count instead within the size of the register.  */
-#if BITS_BIG_ENDIAN
-      if (GET_CODE (xop0) != MEM)
+      if (BITS_BIG_ENDIAN && GET_CODE (xop0) != MEM)
         xbitpos += GET_MODE_BITSIZE (maxmode) - unit;
-#endif
+
        unit = GET_MODE_BITSIZE (maxmode);
  
        /* Convert VALUE to maxmode (which insv insn wants) in VALUE1.  */
@@ -503,10 +557,9 @@ store_fixed_bit_field (op0, offset, bitsize, bitpos, value, struct_align)
    int all_zero = 0;
    int all_one = 0;
  
-  /* Add OFFSET to OP0's address (if it is in memory)
-     and if a single byte contains the whole bit field
-     change OP0 to a byte.  */
-
+  if (! SLOW_UNALIGNED_ACCESS)
+    struct_align = BIGGEST_ALIGNMENT / BITS_PER_UNIT;
+    
    /* There is a case not handled here:
       a structure with a known alignment of just a halfword
       and a field split across two aligned halfwords within the structure.
@@ -521,7 +574,8 @@ store_fixed_bit_field (op0, offset, bitsize, bitpos, value, struct_align)
        /* Special treatment for a bit field split across two registers.  */
        if (bitsize + bitpos > BITS_PER_WORD)
         {
-         store_split_bit_field (op0, bitsize, bitpos, value, BITS_PER_WORD);
+         store_split_bit_field (op0, bitsize, bitpos,
+                                value, BITS_PER_WORD);
           return;
         }
      }
@@ -539,13 +593,24 @@ store_fixed_bit_field (op0, offset, bitsize, bitpos, value, struct_align)
         {
           /* The only way this should occur is if the field spans word
              boundaries.  */
-         store_split_bit_field (op0, bitsize, bitpos + offset * BITS_PER_UNIT,
+         store_split_bit_field (op0,
+                                bitsize, bitpos + offset * BITS_PER_UNIT,
                                  value, struct_align);
           return;
         }
  
        total_bits = GET_MODE_BITSIZE (mode);
  
+      /* Make sure bitpos is valid for the chosen mode.  Adjust BITPOS to
+        be in the range 0 to total_bits-1, and put any excess bytes in
+        OFFSET.  */
+      if (bitpos >= total_bits)
+       {
+         offset += (bitpos / total_bits) * (total_bits / BITS_PER_UNIT);
+         bitpos -= ((bitpos / total_bits) * (total_bits / BITS_PER_UNIT)
+                    * BITS_PER_UNIT);
+       }
+
        /* Get ref to an aligned byte, halfword, or word containing the field.
          Adjust BITPOS to be position within a word,
          and OFFSET to be the offset of that word.
@@ -564,13 +629,12 @@ store_fixed_bit_field (op0, offset, bitsize, bitpos, value, struct_align)
       BITPOS is the starting bit number within OP0.
       (OP0's mode may actually be narrower than MODE.)  */
  
-#if BYTES_BIG_ENDIAN
-  /* BITPOS is the distance between our msb
-     and that of the containing datum.
-     Convert it to the distance from the lsb.  */
+  if (BYTES_BIG_ENDIAN)
+      /* BITPOS is the distance between our msb
+        and that of the containing datum.
+        Convert it to the distance from the lsb.  */
+      bitpos = total_bits - bitsize - bitpos;
  
-  bitpos = total_bits - bitsize - bitpos;
-#endif
    /* Now BITPOS is always the distance between our lsb
       and that of OP0.  */
  
@@ -600,17 +664,6 @@ store_fixed_bit_field (op0, offset, bitsize, bitpos, value, struct_align)
  
        if (GET_MODE (value) != mode)
         {
-         /* If VALUE is a floating-point mode, access it as an integer
-            of the corresponding size, then convert it.  This can occur on
-            a machine with 64 bit registers that uses SFmode for float.  */
-         if (GET_MODE_CLASS (GET_MODE (value)) == MODE_FLOAT)
-           {
-             if (GET_CODE (value) != REG)
-               value = copy_to_reg (value);
-             value
-               = gen_rtx (SUBREG, word_mode, value, 0);
-           }
-
           if ((GET_CODE (value) == REG || GET_CODE (value) == SUBREG)
               && GET_MODE_SIZE (mode) < GET_MODE_SIZE (GET_MODE (value)))
             value = gen_lowpart (mode, value);
@@ -651,12 +704,16 @@ store_fixed_bit_field (op0, offset, bitsize, bitpos, value, struct_align)
      emit_move_insn (op0, temp);
  }
  \f
-/* Store a bit field that is split across two words.
+/* Store a bit field that is split across multiple accessible memory objects.
  
-   OP0 is the REG, SUBREG or MEM rtx for the first of the two words.
+   OP0 is the REG, SUBREG or MEM rtx for the first of the objects.
     BITSIZE is the field width; BITPOS the position of its first bit
     (within the word).
-   VALUE is the value to store.  */
+   VALUE is the value to store.
+   ALIGN is the known alignment of OP0, measured in bytes.
+   This is also the size of the memory objects to be used.
+
+   This does not yet handle fields wider than BITS_PER_WORD.  */
  
  static void
  store_split_bit_field (op0, bitsize, bitpos, value, align)
@@ -665,94 +722,127 @@ store_split_bit_field (op0, bitsize, bitpos, value, align)
       rtx value;
       int align;
  {
-  /* BITSIZE_1 is size of the part in the first word.  */
-  int bitsize_1 = BITS_PER_WORD - bitpos % BITS_PER_WORD;
-  /* BITSIZE_2 is size of the rest (in the following word).  */
-  int bitsize_2 = bitsize - bitsize_1;
-  rtx part1, part2;
-  int unit = GET_CODE (op0) == MEM ? BITS_PER_UNIT : BITS_PER_WORD;
-  int offset = bitpos / unit;
-  rtx word;
-
-  /* The field must span exactly one word boundary.  */
-  if (bitpos / BITS_PER_WORD != (bitpos + bitsize - 1) / BITS_PER_WORD - 1)
-    abort ();
-
-  if (GET_MODE (value) != VOIDmode)
-    value = convert_to_mode (word_mode, value, 1);
+  int unit;
+  int bitsdone = 0;
  
-  if (GET_CODE (value) == CONST_DOUBLE
-      && (part1 = gen_lowpart_common (word_mode, value)) != 0)
-    value = part1;
+  /* Make sure UNIT isn't larger than BITS_PER_WORD; we can only handle that
+     much at a time.  */
+  if (GET_CODE (op0) == REG || GET_CODE (op0) == SUBREG)
+    unit = BITS_PER_WORD;
+  else
+    unit = MIN (align * BITS_PER_UNIT, BITS_PER_WORD);
  
+  /* If VALUE is a constant other than a CONST_INT, get it into a register in
+     WORD_MODE.  If we can do this using gen_lowpart_common, do so.  Note
+     that VALUE might be a floating-point constant.  */
    if (CONSTANT_P (value) && GET_CODE (value) != CONST_INT)
-    value = copy_to_mode_reg (word_mode, value);
-
-  /* Split the value into two parts:
-     PART1 gets that which goes in the first word; PART2 the other.  */
-#if BYTES_BIG_ENDIAN
-  /* PART1 gets the more significant part.  */
-  if (GET_CODE (value) == CONST_INT)
-    {
-      part1 = GEN_INT ((unsigned HOST_WIDE_INT) (INTVAL (value)) >> bitsize_2);
-      part2 = GEN_INT ((unsigned HOST_WIDE_INT) (INTVAL (value))
-                      & (((HOST_WIDE_INT) 1 << bitsize_2) - 1));
-    }
-  else
      {
-      part1 = extract_fixed_bit_field (word_mode, value, 0, bitsize_1,
-                                      BITS_PER_WORD - bitsize, NULL_RTX, 1,
-                                      BITS_PER_WORD);
-      part2 = extract_fixed_bit_field (word_mode, value, 0, bitsize_2,
-                                      BITS_PER_WORD - bitsize_2, NULL_RTX, 1,
-                                      BITS_PER_WORD);
-    }
-#else
-  /* PART1 gets the less significant part.  */
-  if (GET_CODE (value) == CONST_INT)
-    {
-      part1 = GEN_INT ((unsigned HOST_WIDE_INT) (INTVAL (value))
-                      & (((HOST_WIDE_INT) 1 << bitsize_1) - 1));
-      part2 = GEN_INT ((unsigned HOST_WIDE_INT) (INTVAL (value)) >> bitsize_1);
+      rtx word = gen_lowpart_common (word_mode, value);
+
+      if (word && (value != word))
+       value = word;
+      else
+       value = gen_lowpart_common (word_mode,
+                                   force_reg (GET_MODE (value) != VOIDmode
+                                              ? GET_MODE (value)
+                                              : word_mode, value));
      }
-  else
+
+  while (bitsdone < bitsize)
      {
-      part1 = extract_fixed_bit_field (word_mode, value, 0, bitsize_1, 0,
-                                      NULL_RTX, 1, BITS_PER_WORD);
-      part2 = extract_fixed_bit_field (word_mode, value, 0, bitsize_2,
-                                      bitsize_1, NULL_RTX, 1, BITS_PER_WORD);
-    }
-#endif
+      int thissize;
+      rtx part, word;
+      int thispos;
+      int offset;
  
-  /* Store PART1 into the first word.  If OP0 is a MEM, pass OP0 and the
-     offset computed above.  Otherwise, get the proper word and pass an
-     offset of zero.  */
-  word = (GET_CODE (op0) == MEM ? op0
-         : operand_subword (op0, offset, 1, GET_MODE (op0)));
-  if (word == 0)
-    abort ();
+      offset = (bitpos + bitsdone) / unit;
+      thispos = (bitpos + bitsdone) % unit;
  
-  store_fixed_bit_field (word, GET_CODE (op0) == MEM ? offset : 0,
-                        bitsize_1, bitpos % unit, part1, align);
+      /* THISSIZE must not overrun a word boundary.  Otherwise,
+        store_fixed_bit_field will call us again, and we will mutually
+        recurse forever.  */
+      thissize = MIN (bitsize - bitsdone, BITS_PER_WORD);
+      thissize = MIN (thissize, unit - thispos);
  
-  /* Offset op0 by 1 word to get to the following one.  */
-  if (GET_CODE (op0) == SUBREG)
-    word = operand_subword (SUBREG_REG (op0), SUBREG_WORD (op0) + offset + 1,
-                           1, VOIDmode);
-  else if (GET_CODE (op0) == MEM)
-    word = op0;
-  else
-    word = operand_subword (op0, offset + 1, 1, GET_MODE (op0));
+      if (BYTES_BIG_ENDIAN)
+       {
+         int total_bits;
  
-  if (word == 0)
-    abort ();
+         /* We must do an endian conversion exactly the same way as it is
+            done in extract_bit_field, so that the two calls to
+            extract_fixed_bit_field will have comparable arguments.  */
+         if (GET_CODE (value) != MEM || GET_MODE (value) == BLKmode)
+           total_bits = BITS_PER_WORD;
+         else
+           total_bits = GET_MODE_BITSIZE (GET_MODE (value));
+
+         /* Fetch successively less significant portions.  */
+         if (GET_CODE (value) == CONST_INT)
+           part = GEN_INT (((unsigned HOST_WIDE_INT) (INTVAL (value))
+                            >> (bitsize - bitsdone - thissize))
+                           & (((HOST_WIDE_INT) 1 << thissize) - 1));
+         else
+           /* The args are chosen so that the last part includes the
+              lsb.  Give extract_bit_field the value it needs (with
+              endianness compensation) to fetch the piece we want.
+
+              ??? We have no idea what the alignment of VALUE is, so
+              we have to use a guess.  */
+           part
+             = extract_fixed_bit_field
+               (word_mode, value, 0, thissize,
+                total_bits - bitsize + bitsdone, NULL_RTX, 1,
+                GET_MODE (value) == VOIDmode
+                ? UNITS_PER_WORD
+                : (GET_MODE (value) == BLKmode
+                   ? 1
+                   : GET_MODE_ALIGNMENT (GET_MODE (value)) / BITS_PER_UNIT));
+       }
+      else
+       {
+         /* Fetch successively more significant portions.  */
+         if (GET_CODE (value) == CONST_INT)
+           part = GEN_INT (((unsigned HOST_WIDE_INT) (INTVAL (value))
+                            >> bitsdone)
+                           & (((HOST_WIDE_INT) 1 << thissize) - 1));
+         else
+           part
+             = extract_fixed_bit_field
+               (word_mode, value, 0, thissize, bitsdone, NULL_RTX, 1,
+                GET_MODE (value) == VOIDmode
+                ? UNITS_PER_WORD
+                : (GET_MODE (value) == BLKmode
+                   ? 1
+                   : GET_MODE_ALIGNMENT (GET_MODE (value)) / BITS_PER_UNIT));
+       }
+
+      /* If OP0 is a register, then handle OFFSET here.
+
+        When handling multiword bitfields, extract_bit_field may pass
+        down a word_mode SUBREG of a larger REG for a bitfield that actually
+        crosses a word boundary.  Thus, for a SUBREG, we must find
+        the current word starting from the base register.  */
+      if (GET_CODE (op0) == SUBREG)
+       {
+         word = operand_subword_force (SUBREG_REG (op0),
+                                       SUBREG_WORD (op0) + offset,
+                                       GET_MODE (SUBREG_REG (op0)));
+         offset = 0;
+       }
+      else if (GET_CODE (op0) == REG)
+       {
+         word = operand_subword_force (op0, offset, GET_MODE (op0));
+         offset = 0;
+       }
+      else
+       word = op0;
  
-  /* Store PART2 into the second word.  */
-  store_fixed_bit_field (word,
-                        (GET_CODE (op0) == MEM
-                         ? CEIL (offset + 1, UNITS_PER_WORD) * UNITS_PER_WORD
-                         : 0),
-                        bitsize_2, 0, part2, align);
+      /* OFFSET is in UNITs, and UNIT is in bits.
+         store_fixed_bit_field wants offset in bytes.  */
+      store_fixed_bit_field (word, offset * unit / BITS_PER_UNIT,
+                            thissize, thispos, part, align);
+      bitsdone += thissize;
+    }
  }
  \f
  /* Generate code to extract a byte-field from STR_RTX
@@ -795,9 +885,6 @@ extract_bit_field (str_rtx, bitsize, bitnum, unsignedp,
    rtx spec_target = target;
    rtx spec_target_subreg = 0;
  
-  if (GET_CODE (str_rtx) == MEM && ! MEM_IN_STRUCT_P (str_rtx))
-    abort ();
-
    /* Discount the part of the structure before the desired byte.
       We need to know how many bytes are safe to reference after it.  */
    if (total_size >= 0)
@@ -808,41 +895,67 @@ extract_bit_field (str_rtx, bitsize, bitnum, unsignedp,
      tmode = mode;
    while (GET_CODE (op0) == SUBREG)
      {
+      int outer_size = GET_MODE_BITSIZE (GET_MODE (op0));
+      int inner_size = GET_MODE_BITSIZE (GET_MODE (SUBREG_REG (op0)));
+
        offset += SUBREG_WORD (op0);
+
+      if (BYTES_BIG_ENDIAN && (outer_size < inner_size))
+       {
+         bitpos += inner_size - outer_size;
+         if (bitpos > unit)
+           {
+             offset += (bitpos / unit);
+             bitpos %= unit;
+           }
+       }
+
        op0 = SUBREG_REG (op0);
      }
+
+  /* ??? We currently assume TARGET is at least as big as BITSIZE.
+     If that's wrong, the solution is to test for it and set TARGET to 0
+     if needed.  */
    
-#if BYTES_BIG_ENDIAN
    /* If OP0 is a register, BITPOS must count within a word.
       But as we have it, it counts within whatever size OP0 now has.
       On a bigendian machine, these are not the same, so convert.  */
-  if (GET_CODE (op0) != MEM && unit > GET_MODE_BITSIZE (GET_MODE (op0)))
+  if (BYTES_BIG_ENDIAN &&
+      GET_CODE (op0) != MEM
+      && unit > GET_MODE_BITSIZE (GET_MODE (op0)))
      bitpos += unit - GET_MODE_BITSIZE (GET_MODE (op0));
-#endif
  
    /* Extracting a full-word or multi-word value
-     from a structure in a register.
+     from a structure in a register or aligned memory.
       This can be done with just SUBREG.
       So too extracting a subword value in
       the least significant part of the register.  */
  
-  if (GET_CODE (op0) == REG
+  if (((GET_CODE (op0) == REG
+       && TRULY_NOOP_TRUNCATION (GET_MODE_BITSIZE (mode),
+                                 GET_MODE_BITSIZE (GET_MODE (op0))))
+       || (GET_CODE (op0) == MEM
+          && (! SLOW_UNALIGNED_ACCESS
+              || (offset * BITS_PER_UNIT % bitsize == 0
+                  && align * BITS_PER_UNIT % bitsize == 0))))
        && ((bitsize >= BITS_PER_WORD && bitsize == GET_MODE_BITSIZE (mode)
            && bitpos % BITS_PER_WORD == 0)
           || (mode_for_size (bitsize, GET_MODE_CLASS (tmode), 0) != BLKmode
-#if BYTES_BIG_ENDIAN
-             && bitpos + bitsize == BITS_PER_WORD
-#else
-             && bitpos == 0
-#endif
-             )))
+             && (BYTES_BIG_ENDIAN
+                 ? bitpos + bitsize == BITS_PER_WORD
+                 : bitpos == 0))))
      {
        enum machine_mode mode1
         = mode_for_size (bitsize, GET_MODE_CLASS (tmode), 0);
  
        if (mode1 != GET_MODE (op0))
-       op0 = gen_rtx (SUBREG, mode1, op0, offset);
-
+       {
+         if (GET_CODE (op0) == REG)
+           op0 = gen_rtx (SUBREG, mode1, op0, offset);
+         else
+           op0 = change_address (op0, mode1,
+                                 plus_constant (XEXP (op0, 0), offset));
+       }
        if (mode1 != mode)
         return convert_to_mode (tmode, op0, unsignedp);
        return op0;
@@ -863,11 +976,18 @@ extract_bit_field (str_rtx, bitsize, bitnum, unsignedp,
        if (target == 0 || GET_CODE (target) != REG)
         target = gen_reg_rtx (mode);
  
+      /* Indicate for flow that the entire target reg is being set.  */
+      emit_insn (gen_rtx (CLOBBER, VOIDmode, target));
+
        for (i = 0; i < nwords; i++)
         {
           /* If I is 0, use the low-order word in both field and target;
              if I is 1, use the next to lowest word; and so on.  */
-         int wordnum = (WORDS_BIG_ENDIAN ? nwords - i - 1 : i);
+         /* Word number in TARGET to use.  */
+         int wordnum = (WORDS_BIG_ENDIAN
+                        ? GET_MODE_SIZE (GET_MODE (target)) / UNITS_PER_WORD - i - 1
+                        : i);
+         /* Offset from start of field in OP0.  */
           int bit_offset = (WORDS_BIG_ENDIAN
                             ? MAX (0, bitsize - (i + 1) * BITS_PER_WORD)
                             : i * BITS_PER_WORD);
@@ -886,7 +1006,32 @@ extract_bit_field (str_rtx, bitsize, bitnum, unsignedp,
             emit_move_insn (target_part, result_part);
         }
  
-      return target;
+      if (unsignedp)
+       {
+         /* Unless we've filled TARGET, the upper regs in a multi-reg value
+            need to be zero'd out.  */
+         if (GET_MODE_SIZE (GET_MODE (target)) > nwords * UNITS_PER_WORD)
+           {
+             int i,total_words;
+
+             total_words = GET_MODE_SIZE (GET_MODE (target)) / UNITS_PER_WORD;
+             for (i = nwords; i < total_words; i++)
+               {
+                 int wordnum = WORDS_BIG_ENDIAN ? total_words - i - 1 : i;
+                 rtx target_part = operand_subword (target, wordnum, 1, VOIDmode);
+                 emit_move_insn (target_part, const0_rtx);
+               }
+           }
+         return target;
+       }
+
+      /* Signed bit field: sign-extend with two arithmetic shifts.  */
+      target = expand_shift (LSHIFT_EXPR, mode, target,
+                            build_int_2 (GET_MODE_BITSIZE (mode) - bitsize, 0),
+                            NULL_RTX, 0);
+      return expand_shift (RSHIFT_EXPR, mode, target,
+                          build_int_2 (GET_MODE_BITSIZE (mode) - bitsize, 0),
+                          NULL_RTX, 0);
      }
    
    /* From here on we know the desired field is smaller than a word
@@ -917,7 +1062,10 @@ extract_bit_field (str_rtx, bitsize, bitnum, unsignedp,
  #ifdef HAVE_extzv
        if (HAVE_extzv
           && (GET_MODE_BITSIZE (insn_operand_mode[(int) CODE_FOR_extzv][0])
-             >= bitsize))
+             >= bitsize)
+         && ! ((GET_CODE (op0) == REG || GET_CODE (op0) == SUBREG)
+               && (bitsize + bitpos
+                   > GET_MODE_BITSIZE (insn_operand_mode[(int) CODE_FOR_extzv][0]))))
         {
           int xbitpos = bitpos, xoffset = offset;
           rtx bitsize_rtx, bitpos_rtx;
@@ -958,7 +1106,8 @@ extract_bit_field (str_rtx, bitsize, bitnum, unsignedp,
                   else
                     bestmode = GET_MODE (xop0);
  
-                 if (bestmode == VOIDmode)
+                 if (bestmode == VOIDmode
+                     || (SLOW_UNALIGNED_ACCESS && GET_MODE_SIZE (bestmode) > align))
                     goto extzv_loses;
  
                   /* Compute offset as multiple of this unit,
@@ -991,14 +1140,13 @@ extract_bit_field (str_rtx, bitsize, bitnum, unsignedp,
  
           /* On big-endian machines, we count bits from the most significant.
              If the bit field insn does not, we must invert.  */
-#if BITS_BIG_ENDIAN != BYTES_BIG_ENDIAN
-         xbitpos = unit - bitsize - xbitpos;
-#endif
+         if (BITS_BIG_ENDIAN != BYTES_BIG_ENDIAN)
+           xbitpos = unit - bitsize - xbitpos;
+
           /* Now convert from counting within UNIT to counting in MAXMODE.  */
-#if BITS_BIG_ENDIAN
-         if (GET_CODE (xop0) != MEM)
+         if (BITS_BIG_ENDIAN && GET_CODE (xop0) != MEM)
             xbitpos += GET_MODE_BITSIZE (maxmode) - unit;
-#endif
+
           unit = GET_MODE_BITSIZE (maxmode);
  
           if (xtarget == 0
@@ -1055,7 +1203,10 @@ extract_bit_field (str_rtx, bitsize, bitnum, unsignedp,
  #ifdef HAVE_extv
        if (HAVE_extv
           && (GET_MODE_BITSIZE (insn_operand_mode[(int) CODE_FOR_extv][0])
-             >= bitsize))
+             >= bitsize)
+         && ! ((GET_CODE (op0) == REG || GET_CODE (op0) == SUBREG)
+               && (bitsize + bitpos
+                   > GET_MODE_BITSIZE (insn_operand_mode[(int) CODE_FOR_extv][0]))))
         {
           int xbitpos = bitpos, xoffset = offset;
           rtx bitsize_rtx, bitpos_rtx;
@@ -1091,7 +1242,8 @@ extract_bit_field (str_rtx, bitsize, bitnum, unsignedp,
                   else
                     bestmode = GET_MODE (xop0);
  
-                 if (bestmode == VOIDmode)
+                 if (bestmode == VOIDmode
+                     || (SLOW_UNALIGNED_ACCESS && GET_MODE_SIZE (bestmode) > align))
                     goto extv_loses;
  
                   /* Compute offset as multiple of this unit,
@@ -1122,15 +1274,14 @@ extract_bit_field (str_rtx, bitsize, bitnum, unsignedp,
  
           /* On big-endian machines, we count bits from the most significant.
              If the bit field insn does not, we must invert.  */
-#if BITS_BIG_ENDIAN != BYTES_BIG_ENDIAN
-         xbitpos = unit - bitsize - xbitpos;
-#endif
+         if (BITS_BIG_ENDIAN != BYTES_BIG_ENDIAN)
+           xbitpos = unit - bitsize - xbitpos;
+
           /* XBITPOS counts within a size of UNIT.
              Adjust to count within a size of MAXMODE.  */
-#if BITS_BIG_ENDIAN
-         if (GET_CODE (xop0) != MEM)
+         if (BITS_BIG_ENDIAN && GET_CODE (xop0) != MEM)
             xbitpos += (GET_MODE_BITSIZE (maxmode) - unit);
-#endif
+
           unit = GET_MODE_BITSIZE (maxmode);
  
           if (xtarget == 0
@@ -1284,12 +1435,14 @@ extract_fixed_bit_field (tmode, op0, offset, bitsize, bitpos,
  
    mode = GET_MODE (op0);
  
-#if BYTES_BIG_ENDIAN
-  /* BITPOS is the distance between our msb and that of OP0.
-     Convert it to the distance from the lsb.  */
+  if (BYTES_BIG_ENDIAN)
+    {
+      /* BITPOS is the distance between our msb and that of OP0.
+        Convert it to the distance from the lsb.  */
+
+      bitpos = total_bits - bitsize - bitpos;
+    }
  
-  bitpos = total_bits - bitsize - bitpos;
-#endif
    /* Now BITPOS is always the distance between the field's lsb and that of OP0.
       We have reduced the big-endian case to the little-endian case.  */
  
@@ -1320,7 +1473,7 @@ extract_fixed_bit_field (tmode, op0, offset, bitsize, bitpos,
  #ifdef SLOW_ZERO_EXTEND
           /* Always generate an `and' if
              we just zero-extended op0 and SLOW_ZERO_EXTEND, since it
-            will combine fruitfully with the zero-extend. */
+            will combine fruitfully with the zero-extend.  */
           || tmode != mode
  #endif
  #endif
@@ -1366,7 +1519,8 @@ extract_fixed_bit_field (tmode, op0, offset, bitsize, bitpos,
  /* Return a constant integer (CONST_INT or CONST_DOUBLE) mask value
     of mode MODE with BITSIZE ones followed by BITPOS zeros, or the
     complement of that if COMPLEMENT.  The mask is truncated if
-   necessary to the width of mode MODE.  */
+   necessary to the width of mode MODE.  The mask is zero-extended if
+   BITSIZE+BITPOS is too small for MODE.  */
  
  static rtx
  mask_rtx (mode, bitpos, bitsize, complement)
@@ -1438,65 +1592,98 @@ lshift_value (mode, value, bitpos, bitsize)
  
     OP0 is the REG, SUBREG or MEM rtx for the first of the two words.
     BITSIZE is the field width; BITPOS, position of its first bit, in the word.
-   UNSIGNEDP is 1 if should zero-extend the contents; else sign-extend.  */
+   UNSIGNEDP is 1 if should zero-extend the contents; else sign-extend.
+
+   ALIGN is the known alignment of OP0, measured in bytes.
+   This is also the size of the memory objects to be used.  */
  
  static rtx
  extract_split_bit_field (op0, bitsize, bitpos, unsignedp, align)
       rtx op0;
       int bitsize, bitpos, unsignedp, align;
  {
-  /* BITSIZE_1 is size of the part in the first word.  */
-  int bitsize_1 = BITS_PER_WORD - bitpos % BITS_PER_WORD;
-  /* BITSIZE_2 is size of the rest (in the following word).  */
-  int bitsize_2 = bitsize - bitsize_1;
-  rtx part1, part2, result;
-  int unit = GET_CODE (op0) == MEM ? BITS_PER_UNIT : BITS_PER_WORD;
-  int offset = bitpos / unit;
-  rtx word;
- 
-  /* The field must span exactly one word boundary.  */
-  if (bitpos / BITS_PER_WORD != (bitpos + bitsize - 1) / BITS_PER_WORD - 1)
-    abort ();
+  int unit;
+  int bitsdone = 0;
+  rtx result;
+  int first = 1;
  
-  /* Get the part of the bit field from the first word.  If OP0 is a MEM,
-     pass OP0 and the offset computed above.  Otherwise, get the proper
-     word and pass an offset of zero.  */
-  word = (GET_CODE (op0) == MEM ? op0
-         : operand_subword_force (op0, offset, GET_MODE (op0)));
-  part1 = extract_fixed_bit_field (word_mode, word,
-                                  GET_CODE (op0) == MEM ? offset : 0,
-                                  bitsize_1, bitpos % unit, NULL_RTX,
-                                  1, align);
-
-  /* Offset op0 by 1 word to get to the following one.  */
-  if (GET_CODE (op0) == SUBREG)
-    word = operand_subword_force (SUBREG_REG (op0),
-                                 SUBREG_WORD (op0) + offset + 1, VOIDmode);
-  else if (GET_CODE (op0) == MEM)
-    word = op0;
+  /* Make sure UNIT isn't larger than BITS_PER_WORD; we can only handle that
+     much at a time.  */
+  if (GET_CODE (op0) == REG || GET_CODE (op0) == SUBREG)
+    unit = BITS_PER_WORD;
    else
-    word = operand_subword_force (op0, offset + 1, GET_MODE (op0));
-
-  /* Get the part of the bit field from the second word.  */
-  part2 = extract_fixed_bit_field (word_mode, word,
-                                  (GET_CODE (op0) == MEM
-                                   ? CEIL (offset + 1, UNITS_PER_WORD) * UNITS_PER_WORD
-                                   : 0),
-                                  bitsize_2, 0, NULL_RTX, 1, align);
-
-  /* Shift the more significant part up to fit above the other part.  */
-#if BYTES_BIG_ENDIAN
-  part1 = expand_shift (LSHIFT_EXPR, word_mode, part1,
-                       build_int_2 (bitsize_2, 0), 0, 1);
-#else
-  part2 = expand_shift (LSHIFT_EXPR, word_mode, part2,
-                       build_int_2 (bitsize_1, 0), 0, 1);
-#endif
+    unit = MIN (align * BITS_PER_UNIT, BITS_PER_WORD);
+
+  while (bitsdone < bitsize)
+    {
+      int thissize;
+      rtx part, word;
+      int thispos;
+      int offset;
+
+      offset = (bitpos + bitsdone) / unit;
+      thispos = (bitpos + bitsdone) % unit;
+
+      /* THISSIZE must not overrun a word boundary.  Otherwise,
+        extract_fixed_bit_field will call us again, and we will mutually
+        recurse forever.  */
+      thissize = MIN (bitsize - bitsdone, BITS_PER_WORD);
+      thissize = MIN (thissize, unit - thispos);
+
+      /* If OP0 is a register, then handle OFFSET here.
+
+        When handling multiword bitfields, extract_bit_field may pass
+        down a word_mode SUBREG of a larger REG for a bitfield that actually
+        crosses a word boundary.  Thus, for a SUBREG, we must find
+        the current word starting from the base register.  */
+      if (GET_CODE (op0) == SUBREG)
+       {
+         word = operand_subword_force (SUBREG_REG (op0),
+                                       SUBREG_WORD (op0) + offset,
+                                       GET_MODE (SUBREG_REG (op0)));
+         offset = 0;
+       }
+      else if (GET_CODE (op0) == REG)
+       {
+         word = operand_subword_force (op0, offset, GET_MODE (op0));
+         offset = 0;
+       }
+      else
+       word = op0;
+
+      /* Extract the parts in bit-counting order,
+        whose meaning is determined by BYTES_BIG_ENDIAN.
+        OFFSET is in UNITs, and UNIT is in bits.
+        extract_fixed_bit_field wants offset in bytes.  */
+      part = extract_fixed_bit_field (word_mode, word,
+                                     offset * unit / BITS_PER_UNIT,
+                                     thissize, thispos, 0, 1, align);
+      bitsdone += thissize;
+
+      /* Shift this part into place for the result.  */
+      if (BYTES_BIG_ENDIAN)
+       {
+         if (bitsize != bitsdone)
+           part = expand_shift (LSHIFT_EXPR, word_mode, part,
+                                build_int_2 (bitsize - bitsdone, 0), 0, 1);
+       }
+      else
+       {
+         if (bitsdone != thissize)
+           part = expand_shift (LSHIFT_EXPR, word_mode, part,
+                                build_int_2 (bitsdone - thissize, 0), 0, 1);
+       }
  
-  /* Combine the two parts with bitwise or.  This works
-     because we extracted both parts as unsigned bit fields.  */
-  result = expand_binop (word_mode, ior_optab, part1, part2, NULL_RTX, 1,
-                        OPTAB_LIB_WIDEN);
+      if (first)
+       result = part;
+      else
+       /* Combine the parts with bitwise or.  This works
+          because we extracted each part as an unsigned bit field.  */
+       result = expand_binop (word_mode, ior_optab, part, result, NULL_RTX, 1,
+                              OPTAB_LIB_WIDEN);
+
+      first = 0;
+    }
  
    /* Unsigned bit field: we are done.  */
    if (unsignedp)
@@ -1562,6 +1749,14 @@ expand_shift (code, mode, shifted, amount, target, unsignedp)
  
    op1 = expand_expr (amount, NULL_RTX, VOIDmode, 0);
  
+#ifdef SHIFT_COUNT_TRUNCATED
+  if (SHIFT_COUNT_TRUNCATED
+      && GET_CODE (op1) == CONST_INT
+      && (unsigned HOST_WIDE_INT) INTVAL (op1) >= GET_MODE_BITSIZE (mode))
+    op1 = GEN_INT ((unsigned HOST_WIDE_INT) INTVAL (op1)
+                  % GET_MODE_BITSIZE (mode));
+#endif
+
    if (op1 == const0_rtx)
      return shifted;
  
@@ -1583,8 +1778,7 @@ expand_shift (code, mode, shifted, amount, target, unsignedp)
             continue;
           else if (methods == OPTAB_LIB_WIDEN)
             {
-             /* If we are rotating by a constant that is valid and
-                we have been unable to open-code this by a rotation,
+             /* If we have been unable to open-code this by a rotation,
                  do it as the IOR of two shifts.  I.e., to rotate A
                  by N bits, compute (A << N) | ((unsigned) A >> (C - N))
                  where C is the bitsize of A.
@@ -1596,25 +1790,25 @@ expand_shift (code, mode, shifted, amount, target, unsignedp)
                  this extremely unlikely lossage to avoid complicating the
                  code below.  */
  
-             if (GET_CODE (op1) == CONST_INT && INTVAL (op1) > 0
-                 && INTVAL (op1) < GET_MODE_BITSIZE (mode))
-               {
-                 rtx subtarget = target == shifted ? 0 : target;
-                 rtx temp1;
-                 tree other_amount
-                   = build_int_2 (GET_MODE_BITSIZE (mode) - INTVAL (op1), 0);
-
-                 shifted = force_reg (mode, shifted);
-
-                 temp = expand_shift (left ? LSHIFT_EXPR : RSHIFT_EXPR,
-                                      mode, shifted, amount, subtarget, 1);
-                 temp1 = expand_shift (left ? RSHIFT_EXPR : LSHIFT_EXPR,
-                                       mode, shifted, other_amount, 0, 1);
-                 return expand_binop (mode, ior_optab, temp, temp1, target,
-                                      unsignedp, methods);
-               }
-             else
-               methods = OPTAB_LIB;
+             rtx subtarget = target == shifted ? 0 : target;
+             rtx temp1;
+             tree type = TREE_TYPE (amount);
+             tree new_amount = make_tree (type, op1);
+             tree other_amount
+               = fold (build (MINUS_EXPR, type,
+                              convert (type,
+                                       build_int_2 (GET_MODE_BITSIZE (mode),
+                                                    0)),
+                              amount));
+
+             shifted = force_reg (mode, shifted);
+
+             temp = expand_shift (left ? LSHIFT_EXPR : RSHIFT_EXPR,
+                                  mode, shifted, new_amount, subtarget, 1);
+             temp1 = expand_shift (left ? RSHIFT_EXPR : LSHIFT_EXPR,
+                                   mode, shifted, other_amount, 0, 1);
+             return expand_binop (mode, ior_optab, temp, temp1, target,
+                                  unsignedp, methods);
             }
  
           temp = expand_binop (mode,
@@ -1634,14 +1828,9 @@ expand_shift (code, mode, shifted, amount, target, unsignedp)
                                  target, unsignedp, methods);
         }
        else if (unsignedp)
-       {
-         temp = expand_binop (mode,
-                              left ? lshl_optab : lshr_optab,
-                              shifted, op1, target, unsignedp, methods);
-         if (temp == 0 && left)
-           temp = expand_binop (mode, ashl_optab,
-                                shifted, op1, target, unsignedp, methods);
-       }
+       temp = expand_binop (mode,
+                            left ? ashl_optab : lshr_optab,
+                            shifted, op1, target, unsignedp, methods);
  
        /* Do arithmetic shifts.
          Also, if we are going to widen the operand, we can just as well
@@ -1663,99 +1852,10 @@ expand_shift (code, mode, shifted, amount, target, unsignedp)
                                shifted, op1, target, unsignedp, methods1);
         }
  
-#ifdef HAVE_extzv
-      /* We can do a logical (unsigned) right shift with a bit-field
-        extract insn.  But first check if one of the above methods worked.  */
-      if (temp != 0)
-       return temp;
-
-      if (unsignedp && code == RSHIFT_EXPR && ! BITS_BIG_ENDIAN && HAVE_extzv)
-       {
-         enum machine_mode output_mode
-           = insn_operand_mode[(int) CODE_FOR_extzv][0];
-
-         if ((methods == OPTAB_DIRECT && mode == output_mode)
-             || (methods == OPTAB_WIDEN
-                 && GET_MODE_SIZE (mode) < GET_MODE_SIZE (output_mode)))
-           {
-             rtx shifted1 = convert_to_mode (output_mode,
-                                             protect_from_queue (shifted, 0),
-                                             1);
-             enum machine_mode length_mode
-               = insn_operand_mode[(int) CODE_FOR_extzv][2];
-             enum machine_mode pos_mode
-               = insn_operand_mode[(int) CODE_FOR_extzv][3];
-             rtx target1 = 0;
-             rtx last = get_last_insn ();
-             rtx width;
-             rtx xop1 = op1;
-             rtx pat;
-
-             if (target != 0)
-               target1 = protect_from_queue (target, 1);
-
-             /* We define extract insns as having OUTPUT_MODE in a register
-                and the mode of operand 1 in memory.  Since we want
-                OUTPUT_MODE, we will always force the operand into a
-                register.  At some point we might want to support MEM
-                directly. */
-             shifted1 = force_reg (output_mode, shifted1);
-
-             /* If we don't have or cannot use a suggested target,
-                make a place for the result, in the proper mode.  */
-             if (methods == OPTAB_WIDEN || target1 == 0
-                 || ! ((*insn_operand_predicate[(int) CODE_FOR_extzv][0])
-                       (target1, output_mode)))
-               target1 = gen_reg_rtx (output_mode);
-
-             xop1 = protect_from_queue (xop1, 0);
-             xop1 = convert_to_mode (pos_mode, xop1,
-                                     TREE_UNSIGNED (TREE_TYPE (amount)));
-
-             /* If this machine's extzv insists on a register for
-                operand 3 (position), arrange for that.  */
-             if (! ((*insn_operand_predicate[(int) CODE_FOR_extzv][3])
-                    (xop1, pos_mode)))
-               xop1 = force_reg (pos_mode, xop1);
-
-             /* WIDTH gets the width of the bit field to extract:
-                wordsize minus # bits to shift by.  */
-             if (GET_CODE (xop1) == CONST_INT)
-               width = GEN_INT (GET_MODE_BITSIZE (mode) - INTVAL (op1));
-             else
-               {
-                 /* Now get the width in the proper mode.  */
-                 op1 = protect_from_queue (op1, 0);
-                 width = convert_to_mode (length_mode, op1,
-                                          TREE_UNSIGNED (TREE_TYPE (amount)));
-
-                 width = expand_binop (length_mode, sub_optab,
-                                       GEN_INT (GET_MODE_BITSIZE (mode)),
-                                       width, NULL_RTX, 0, OPTAB_LIB_WIDEN);
-               }
-
-             /* If this machine's extzv insists on a register for
-                operand 2 (length), arrange for that.  */
-             if (! ((*insn_operand_predicate[(int) CODE_FOR_extzv][2])
-                    (width, length_mode)))
-               width = force_reg (length_mode, width);
-
-             /* Now extract with WIDTH, omitting OP1 least sig bits.  */
-             pat = gen_extzv (target1, shifted1, width, xop1);
-             if (pat)
-               {
-                 emit_insn (pat);
-                 temp = convert_to_mode (mode, target1, 1);
-               }
-             else
-               delete_insns_since (last);
-           }
-
-         /* Can also do logical shift with signed bit-field extract
-            followed by inserting the bit-field at a different position.
-            That strategy is not yet implemented.  */
-       }
-#endif /* HAVE_extzv */
+      /* We used to try extzv here for logical right shifts, but that was
+        only useful for one machine, the VAX, and caused poor code 
+        generation there for lshrdi3, so the code was deleted and a
+        define_expand for lshrsi3 was added to vax.md.  */
      }
  
    if (temp == 0)
@@ -1788,10 +1888,6 @@ enum alg_code { alg_zero, alg_m, alg_shift,
  
     The first operand must be either alg_zero or alg_m.  */
  
-#ifndef MAX_BITS_PER_WORD
-#define MAX_BITS_PER_WORD BITS_PER_WORD
-#endif
-
  struct algorithm
  {
    short cost;
@@ -1810,51 +1906,53 @@ struct algorithm
     If retval.cost >= COST_LIMIT, no algorithm was found and all
     other field of the returned struct are undefined.  */
  
-static struct algorithm
-synth_mult (t, cost_limit)
+static void
+synth_mult (alg_out, t, cost_limit)
+     struct algorithm *alg_out;
       unsigned HOST_WIDE_INT t;
       int cost_limit;
  {
    int m;
-  struct algorithm *best_alg
-    = (struct algorithm *)alloca (sizeof (struct algorithm));
-  struct algorithm *alg_in
-    = (struct algorithm *)alloca (sizeof (struct algorithm));
+  struct algorithm *alg_in, *best_alg;
    unsigned int cost;
    unsigned HOST_WIDE_INT q;
  
    /* Indicate that no algorithm is yet found.  If no algorithm
       is found, this value will be returned and indicate failure.  */
-  best_alg->cost = cost_limit;
+  alg_out->cost = cost_limit;
  
    if (cost_limit <= 0)
-    return *best_alg;
+    return;
  
    /* t == 1 can be done in zero cost.  */
    if (t == 1)
      {
-      best_alg->ops = 1;
-      best_alg->cost = 0;
-      best_alg->op[0] = alg_m;
-      return *best_alg;
+      alg_out->ops = 1;
+      alg_out->cost = 0;
+      alg_out->op[0] = alg_m;
+      return;
      }
  
    /* t == 0 sometimes has a cost.  If it does and it exceeds our limit,
       fail now.  */
-
-  else if (t == 0)
+  if (t == 0)
      {
        if (zero_cost >= cost_limit)
-       return *best_alg;
+       return;
        else
         {
-         best_alg->ops = 1;
-         best_alg->cost = zero_cost;
-         best_alg->op[0] = alg_zero;
-         return *best_alg;
+         alg_out->ops = 1;
+         alg_out->cost = zero_cost;
+         alg_out->op[0] = alg_zero;
+         return;
         }
      }
  
+  /* We'll be needing a couple extra algorithm structures now.  */
+
+  alg_in = (struct algorithm *)alloca (sizeof (struct algorithm));
+  best_alg = (struct algorithm *)alloca (sizeof (struct algorithm));
+
    /* If we have a group of zero bits at the low-order part of T, try
       multiplying by the remaining bits and then doing a shift.  */
  
@@ -1863,18 +1961,61 @@ synth_mult (t, cost_limit)
        m = floor_log2 (t & -t); /* m = number of low zero bits */
        q = t >> m;
        cost = shift_cost[m];
+      synth_mult (alg_in, q, cost_limit - cost);
+
+      cost += alg_in->cost;
        if (cost < cost_limit)
         {
-         *alg_in = synth_mult (q, cost_limit - cost);
+         struct algorithm *x;
+         x = alg_in, alg_in = best_alg, best_alg = x;
+         best_alg->log[best_alg->ops] = m;
+         best_alg->op[best_alg->ops] = alg_shift;
+         cost_limit = cost;
+       }
+    }
+
+  /* If we have an odd number, add or subtract one.  */
+  if ((t & 1) != 0)
+    {
+      unsigned HOST_WIDE_INT w;
+
+      for (w = 1; (w & t) != 0; w <<= 1)
+       ;
+      if (w > 2
+         /* Reject the case where t is 3.
+            Thus we prefer addition in that case.  */
+         && t != 3)
+       {
+         /* T ends with ...111.  Multiply by (T + 1) and subtract 1.  */
+
+         cost = add_cost;
+         synth_mult (alg_in, t + 1, cost_limit - cost);
  
           cost += alg_in->cost;
-         if (cost < best_alg->cost)
+         if (cost < cost_limit)
             {
               struct algorithm *x;
               x = alg_in, alg_in = best_alg, best_alg = x;
-             best_alg->log[best_alg->ops] = m;
-             best_alg->op[best_alg->ops++] = alg_shift;
-             best_alg->cost = cost_limit = cost;
+             best_alg->log[best_alg->ops] = 0;
+             best_alg->op[best_alg->ops] = alg_sub_t_m2;
+             cost_limit = cost;
+           }
+       }
+      else
+       {
+         /* T ends with ...01 or ...011.  Multiply by (T - 1) and add 1.  */
+
+         cost = add_cost;
+         synth_mult (alg_in, t - 1, cost_limit - cost);
+
+         cost += alg_in->cost;
+         if (cost < cost_limit)
+           {
+             struct algorithm *x;
+             x = alg_in, alg_in = best_alg, best_alg = x;
+             best_alg->log[best_alg->ops] = 0;
+             best_alg->op[best_alg->ops] = alg_add_t_m2;
+             cost_limit = cost;
             }
         }
      }
@@ -1897,34 +2038,37 @@ synth_mult (t, cost_limit)
        if (t % d == 0 && t > d)
         {
           cost = MIN (shiftadd_cost[m], add_cost + shift_cost[m]);
-         *alg_in = synth_mult (t / d, cost_limit - cost);
+         synth_mult (alg_in, t / d, cost_limit - cost);
  
           cost += alg_in->cost;
-         if (cost < best_alg->cost)
+         if (cost < cost_limit)
             {
               struct algorithm *x;
               x = alg_in, alg_in = best_alg, best_alg = x;
               best_alg->log[best_alg->ops] = m;
-             best_alg->op[best_alg->ops++] = alg_add_factor;
-             best_alg->cost = cost_limit = cost;
+             best_alg->op[best_alg->ops] = alg_add_factor;
+             cost_limit = cost;
             }
+         /* Other factors will have been taken care of in the recursion.  */
+         break;
         }
  
        d = ((unsigned HOST_WIDE_INT) 1 << m) - 1;
        if (t % d == 0 && t > d)
         {
           cost = MIN (shiftsub_cost[m], add_cost + shift_cost[m]);
-         *alg_in = synth_mult (t / d, cost_limit - cost);
+         synth_mult (alg_in, t / d, cost_limit - cost);
  
           cost += alg_in->cost;
-         if (cost < best_alg->cost)
+         if (cost < cost_limit)
             {
               struct algorithm *x;
               x = alg_in, alg_in = best_alg, best_alg = x;
               best_alg->log[best_alg->ops] = m;
-             best_alg->op[best_alg->ops++] = alg_sub_factor;
-             best_alg->cost = cost_limit = cost;
+             best_alg->op[best_alg->ops] = alg_sub_factor;
+             cost_limit = cost;
             }
+         break;
         }
      }
  
@@ -1938,16 +2082,16 @@ synth_mult (t, cost_limit)
        if (m >= 0)
         {
           cost = shiftadd_cost[m];
-         *alg_in = synth_mult ((t - 1) >> m, cost_limit - cost);
+         synth_mult (alg_in, (t - 1) >> m, cost_limit - cost);
  
           cost += alg_in->cost;
-         if (cost < best_alg->cost)
+         if (cost < cost_limit)
             {
               struct algorithm *x;
               x = alg_in, alg_in = best_alg, best_alg = x;
               best_alg->log[best_alg->ops] = m;
-             best_alg->op[best_alg->ops++] = alg_add_t2_m;
-             best_alg->cost = cost_limit = cost;
+             best_alg->op[best_alg->ops] = alg_add_t2_m;
+             cost_limit = cost;
             }
         }
  
@@ -1957,78 +2101,39 @@ synth_mult (t, cost_limit)
        if (m >= 0)
         {
           cost = shiftsub_cost[m];
-         *alg_in = synth_mult ((t + 1) >> m, cost_limit - cost);
+         synth_mult (alg_in, (t + 1) >> m, cost_limit - cost);
  
           cost += alg_in->cost;
-         if (cost < best_alg->cost)
+         if (cost < cost_limit)
             {
               struct algorithm *x;
               x = alg_in, alg_in = best_alg, best_alg = x;
               best_alg->log[best_alg->ops] = m;
-             best_alg->op[best_alg->ops++] = alg_sub_t2_m;
-             best_alg->cost = cost_limit = cost;
+             best_alg->op[best_alg->ops] = alg_sub_t2_m;
+             cost_limit = cost;
             }
         }
      }
  
-  /* Now, use the simple method of adding or subtracting at the leftmost
-     1-bit.  */
-  {
-    unsigned HOST_WIDE_INT w;
-
-    q = t & -t;                        /* get out lsb */
-    for (w = q; (w & t) != 0; w <<= 1)
-      ;
-    if ((w > q << 1)
-       /* Reject the case where t has only two bits.
-          Thus we prefer addition in that case.  */
-       && !(t < w && w == q << 2))
-      {
-       /* There are many bits in a row.  Make 'em by subtraction.  */
-
-       m = exact_log2 (q);
-
-       /* Don't use shiftsub_cost here, this operation
-          scales wrong operand.  */
-       cost = add_cost + shift_cost[m];
-       *alg_in = synth_mult (t + q, cost_limit - cost);
-
-       cost += alg_in->cost;
-       if (cost < best_alg->cost)
-         {
-           struct algorithm *x;
-           x = alg_in, alg_in = best_alg, best_alg = x;
-           best_alg->log[best_alg->ops] = m;
-           best_alg->op[best_alg->ops++] = alg_sub_t_m2;
-           best_alg->cost = cost_limit = cost;
-         }
-      }
-    else
-      {
-       /* There's only one or two bit at the left.  Make it by addition.  */
-
-       m = exact_log2 (q);
-       cost = MIN (shiftadd_cost[m], add_cost + shift_cost[m]);
-       *alg_in = synth_mult (t - q, cost_limit - cost);
-
-       cost += alg_in->cost;
-       if (cost < best_alg->cost)
-         {
-           struct algorithm *x;
-           x = alg_in, alg_in = best_alg, best_alg = x;
-           best_alg->log[best_alg->ops] = m;
-           best_alg->op[best_alg->ops++] = alg_add_t_m2;
-           best_alg->cost = cost_limit = cost;
-         }
-      }
-  }
+  /* If cost_limit has not decreased since we stored it in alg_out->cost,
+     we have not found any algorithm.  */
+  if (cost_limit == alg_out->cost)
+    return;
  
    /* If we are getting a too long sequence for `struct algorithm'
-     to record, store a fake cost to make this search fail.  */
+     to record, make this search fail.  */
    if (best_alg->ops == MAX_BITS_PER_WORD)
-    best_alg->cost = cost_limit;
-
-  return *best_alg;
+    return;
+
+  /* Copy the algorithm from temporary space to the space at alg_out.
+     We avoid using structure assignment because the majority of
+     best_alg is normally undefined, and this is a critical function.  */
+  alg_out->ops = best_alg->ops + 1;
+  alg_out->cost = cost_limit;
+  bcopy ((char *) best_alg->op, (char *) alg_out->op,
+        alg_out->ops * sizeof *alg_out->op);
+  bcopy ((char *) best_alg->log, (char *) alg_out->log,
+        alg_out->ops * sizeof *alg_out->log);
  }
  \f
  /* Perform a multiplication and return an rtx for the result.
@@ -2047,45 +2152,64 @@ expand_mult (mode, op0, op1, target, unsignedp)
  {
    rtx const_op1 = op1;
  
+  /* synth_mult does an `unsigned int' multiply.  As long as the mode is
+     less than or equal in size to `unsigned int' this doesn't matter.
+     If the mode is larger than `unsigned int', then synth_mult works only
+     if the constant value exactly fits in an `unsigned int' without any
+     truncation.  This means that multiplying by negative values does
+     not work; results are off by 2^32 on a 32 bit machine.  */
+
    /* If we are multiplying in DImode, it may still be a win
       to try to work with shifts and adds.  */
    if (GET_CODE (op1) == CONST_DOUBLE
        && GET_MODE_CLASS (GET_MODE (op1)) == MODE_INT
-      && HOST_BITS_PER_INT <= BITS_PER_WORD)
-    {
-      if ((CONST_DOUBLE_HIGH (op1) == 0 && CONST_DOUBLE_LOW (op1) >= 0)
-         || (CONST_DOUBLE_HIGH (op1) == -1 && CONST_DOUBLE_LOW (op1) < 0))
-       const_op1 = GEN_INT (CONST_DOUBLE_LOW (op1));
-    }
+      && HOST_BITS_PER_INT >= BITS_PER_WORD
+      && CONST_DOUBLE_HIGH (op1) == 0)
+    const_op1 = GEN_INT (CONST_DOUBLE_LOW (op1));
+  else if (HOST_BITS_PER_INT < GET_MODE_BITSIZE (mode)
+          && GET_CODE (op1) == CONST_INT
+          && INTVAL (op1) < 0)
+    const_op1 = 0;
  
    /* We used to test optimize here, on the grounds that it's better to
       produce a smaller program when -O is not used.
       But this causes such a terrible slowdown sometimes
       that it seems better to use synth_mult always.  */
  
-  if (GET_CODE (const_op1) == CONST_INT && ! mult_is_very_cheap)
+  if (const_op1 && GET_CODE (const_op1) == CONST_INT)
      {
        struct algorithm alg;
-      struct algorithm neg_alg;
-      int negate = 0;
+      struct algorithm alg2;
        HOST_WIDE_INT val = INTVAL (op1);
        HOST_WIDE_INT val_so_far;
        rtx insn;
+      int mult_cost;
+      enum {basic_variant, negate_variant, add_variant} variant = basic_variant;
+
+      /* Try to do the computation three ways: multiply by the negative of OP1
+        and then negate, do the multiplication directly, or do multiplication
+        by OP1 - 1.  */
  
-      /* Try to do the computation two ways: multiply by the negative of OP1
-        and then negate, or do the multiplication directly.  The latter is
-        usually faster for positive numbers and the former for negative
-        numbers, but the opposite can be faster if the original value
-        has a factor of 2**m +/- 1, while the negated value does not or
-        vice versa.  */
+      mult_cost = rtx_cost (gen_rtx (MULT, mode, op0, op1), SET);
+      mult_cost = MIN (12 * add_cost, mult_cost);
  
-      alg = synth_mult (val, mult_cost);
-      neg_alg = synth_mult (- val,
-                           (alg.cost < mult_cost ? alg.cost : mult_cost)
-                           - negate_cost);
+      synth_mult (&alg, val, mult_cost);
  
-      if (neg_alg.cost + negate_cost < alg.cost)
-       alg = neg_alg, negate = 1;
+      /* This works only if the inverted value actually fits in an
+        `unsigned int' */
+      if (HOST_BITS_PER_INT >= GET_MODE_BITSIZE (mode))
+       {
+         synth_mult (&alg2, - val,
+                     (alg.cost < mult_cost ? alg.cost : mult_cost) - negate_cost);
+         if (alg2.cost + negate_cost < alg.cost)
+           alg = alg2, variant = negate_variant;
+       }
+
+      /* This proves very useful for division-by-constant.  */
+      synth_mult (&alg2, val - 1,
+                 (alg.cost < mult_cost ? alg.cost : mult_cost) - add_cost);
+      if (alg2.cost + add_cost < alg.cost)
+       alg = alg2, variant = add_variant;
  
        if (alg.cost < mult_cost)
         {
@@ -2110,7 +2234,7 @@ expand_mult (mode, op0, op1, target, unsignedp)
             }
           else if (alg.op[0] == alg_m)
             {
-             accum  = copy_to_mode_reg (mode, op0);
+             accum = copy_to_mode_reg (mode, op0);
               val_so_far = 1;
             }
           else
@@ -2119,9 +2243,13 @@ expand_mult (mode, op0, op1, target, unsignedp)
           for (opno = 1; opno < alg.ops; opno++)
             {
               int log = alg.log[opno];
-             rtx shift_subtarget = preserve_subexpressions_p () ? 0 : accum;
-             rtx add_target = opno == alg.ops - 1 && target != 0 ? target : 0;
-
+             int preserve = preserve_subexpressions_p ();
+             rtx shift_subtarget = preserve ? 0 : accum;
+             rtx add_target
+               = (opno == alg.ops - 1 && target != 0 && variant != add_variant
+                 ? target : 0);
+             rtx accum_target = preserve ? 0 : accum;
+             
               switch (alg.op[opno])
                 {
                 case alg_shift:
@@ -2134,7 +2262,7 @@ expand_mult (mode, op0, op1, target, unsignedp)
                   tem = expand_shift (LSHIFT_EXPR, mode, op0,
                                       build_int_2 (log, 0), NULL_RTX, 0);
                   accum = force_operand (gen_rtx (PLUS, mode, accum, tem),
-                                        add_target ? add_target : accum);
+                                        add_target ? add_target : accum_target);
                   val_so_far += (HOST_WIDE_INT) 1 << log;
                   break;
  
@@ -2142,23 +2270,25 @@ expand_mult (mode, op0, op1, target, unsignedp)
                   tem = expand_shift (LSHIFT_EXPR, mode, op0,
                                       build_int_2 (log, 0), NULL_RTX, 0);
                   accum = force_operand (gen_rtx (MINUS, mode, accum, tem),
-                                        add_target ? add_target : accum);
+                                        add_target ? add_target : accum_target);
                   val_so_far -= (HOST_WIDE_INT) 1 << log;
                   break;
  
                 case alg_add_t2_m:
                   accum = expand_shift (LSHIFT_EXPR, mode, accum,
-                                       build_int_2 (log, 0), accum, 0);
+                                       build_int_2 (log, 0), shift_subtarget,
+                                       0);
                   accum = force_operand (gen_rtx (PLUS, mode, accum, op0),
-                                        add_target ? add_target : accum);
+                                        add_target ? add_target : accum_target);
                   val_so_far = (val_so_far << log) + 1;
                   break;
  
                 case alg_sub_t2_m:
                   accum = expand_shift (LSHIFT_EXPR, mode, accum,
-                                       build_int_2 (log, 0), accum, 0);
+                                       build_int_2 (log, 0), shift_subtarget,
+                                       0);
                   accum = force_operand (gen_rtx (MINUS, mode, accum, op0),
-                                        add_target ? add_target : accum);
+                                        add_target ? add_target : accum_target);
                   val_so_far = (val_so_far << log) - 1;
                   break;
  
@@ -2166,7 +2296,7 @@ expand_mult (mode, op0, op1, target, unsignedp)
                   tem = expand_shift (LSHIFT_EXPR, mode, accum,
                                       build_int_2 (log, 0), NULL_RTX, 0);
                   accum = force_operand (gen_rtx (PLUS, mode, accum, tem),
-                                        add_target ? add_target : accum);
+                                        add_target ? add_target : accum_target);
                   val_so_far += val_so_far << log;
                   break;
  
@@ -2174,7 +2304,8 @@ expand_mult (mode, op0, op1, target, unsignedp)
                   tem = expand_shift (LSHIFT_EXPR, mode, accum,
                                       build_int_2 (log, 0), NULL_RTX, 0);
                   accum = force_operand (gen_rtx (MINUS, mode, tem, accum),
-                                        add_target ? add_target : tem);
+                                        (add_target ? add_target
+                                         : preserve ? 0 : tem));
                   val_so_far = (val_so_far << log) - val_so_far;
                   break;
  
@@ -2192,11 +2323,16 @@ expand_mult (mode, op0, op1, target, unsignedp)
                            REG_NOTES (insn));
             }
  
-         if (negate)
+         if (variant == negate_variant)
             {
               val_so_far = - val_so_far;
               accum = expand_unop (mode, neg_optab, accum, target, 0);
             }
+         else if (variant == add_variant)
+           {
+             val_so_far = val_so_far + 1;
+             accum = force_operand (gen_rtx (PLUS, mode, accum, op0), target);
+           }
  
           if (val != val_so_far)
             abort ();
@@ -2205,9 +2341,8 @@ expand_mult (mode, op0, op1, target, unsignedp)
         }
      }
  
-  /* This used to use umul_optab if unsigned,
-     but for non-widening multiply there is no difference
-     between signed and unsigned.  */
+  /* This used to use umul_optab if unsigned, but for non-widening multiply
+     there is no difference between signed and unsigned.  */
    op0 = expand_binop (mode, smul_optab,
                       op0, op1, target, unsignedp, OPTAB_LIB_WIDEN);
    if (op0 == 0)
@@ -2215,6 +2350,334 @@ expand_mult (mode, op0, op1, target, unsignedp)
    return op0;
  }
  \f
+/* Return the smallest n such that 2**n >= X.
+
+   The value is obtained as one more than floor_log2 of X - 1.  X is
+   unsigned, so X == 0 wraps around to the all-ones value before
+   floor_log2 sees it.  */
+
+int
+ceil_log2 (x)
+     unsigned HOST_WIDE_INT x;
+{
+  int floor_of_predecessor = floor_log2 (x - 1);
+
+  return 1 + floor_of_predecessor;
+}
+
+/* Choose a minimal N + 1 bit approximation to 1/D that can be used to
+   replace division by D, and put the least significant N bits of the result
+   in *MULTIPLIER_PTR and return the most significant bit.
+
+   The width of operations is N (should be <= HOST_BITS_PER_WIDE_INT), the
+   needed precision is in PRECISION (should be <= N).
+
+   PRECISION should be as small as possible so this function can choose
+   multiplier more freely.
+
+   The rounded-up logarithm of D is placed in *LGUP_PTR.  A shift count that
+   is to be used for a final right shift is placed in *POST_SHIFT_PTR.
+
+   Using this function, x/D will be equal to (x * m) >> (*POST_SHIFT_PTR),
+   where m is the full N + 1 bit multiplier.
+
+   This function aborts if ceil_log2 (D) exceeds N, or if N + ceil_log2 (D)
+   equals 2 * HOST_BITS_PER_WIDE_INT (callers are expected to handle that
+   case themselves).  */
+
+static
+unsigned HOST_WIDE_INT
+choose_multiplier (d, n, precision, multiplier_ptr, post_shift_ptr, lgup_ptr)
+     unsigned HOST_WIDE_INT d;
+     int n;
+     int precision;
+     unsigned HOST_WIDE_INT *multiplier_ptr;
+     int *post_shift_ptr;
+     int *lgup_ptr;
+{
+  unsigned HOST_WIDE_INT mhigh_hi, mhigh_lo;
+  unsigned HOST_WIDE_INT mlow_hi, mlow_lo;
+  int lgup, post_shift;
+  int pow, pow2;
+  unsigned HOST_WIDE_INT nh, nl, dummy1, dummy2;
+
+  /* lgup = ceil(log2(divisor)); */
+  lgup = ceil_log2 (d);
+
+  if (lgup > n)
+    abort ();
+
+  pow = n + lgup;
+  pow2 = n + lgup - precision;
+
+  if (pow == 2 * HOST_BITS_PER_WIDE_INT)
+    {
+      /* We could handle this with some effort, but this case is much better
+        handled directly with a scc insn, so rely on caller using that.  */
+      abort ();
+    }
+
+  /* mlow = 2^(N + lgup)/d.  The numerator is built as the double-word
+     value (nh, nl) because it may not fit in one HOST_WIDE_INT.  */
+  if (pow >= HOST_BITS_PER_WIDE_INT)
+    {
+      nh = (unsigned HOST_WIDE_INT) 1 << (pow - HOST_BITS_PER_WIDE_INT);
+      nl = 0;
+    }
+  else
+    {
+      nh = 0;
+      nl = (unsigned HOST_WIDE_INT) 1 << pow;
+    }
+  div_and_round_double (TRUNC_DIV_EXPR, 1, nl, nh, d, (HOST_WIDE_INT) 0,
+                       &mlow_lo, &mlow_hi, &dummy1, &dummy2);
+
+  /* mhigh = (2^(N + lgup) + 2^(N + lgup - precision))/d */
+  if (pow2 >= HOST_BITS_PER_WIDE_INT)
+    nh |= (unsigned HOST_WIDE_INT) 1 << (pow2 - HOST_BITS_PER_WIDE_INT);
+  else
+    nl |= (unsigned HOST_WIDE_INT) 1 << pow2;
+  div_and_round_double (TRUNC_DIV_EXPR, 1, nl, nh, d, (HOST_WIDE_INT) 0,
+                       &mhigh_lo, &mhigh_hi, &dummy1, &dummy2);
+
+  /* Sanity checks on the two quotients.  */
+  if (mhigh_hi && nh - d >= d)
+    abort ();
+  if (mhigh_hi > 1 || mlow_hi > 1)
+    abort ();
+  /* assert that mlow < mhigh.  */
+  if (! (mlow_hi < mhigh_hi || (mlow_hi == mhigh_hi && mlow_lo < mhigh_lo)))
+    abort ();
+
+  /* If precision == N, then mlow, mhigh exceed 2^N
+     (but they do not exceed 2^(N+1)).  */
+
+  /* Reduce to lowest terms: keep halving both bounds, and decrementing the
+     shift count, for as long as the halved mlow stays below the halved
+     mhigh.  */
+  for (post_shift = lgup; post_shift > 0; post_shift--)
+    {
+      unsigned HOST_WIDE_INT ml_lo
+       = (mlow_hi << (HOST_BITS_PER_WIDE_INT - 1)) | (mlow_lo >> 1);
+      unsigned HOST_WIDE_INT mh_lo
+       = (mhigh_hi << (HOST_BITS_PER_WIDE_INT - 1)) | (mhigh_lo >> 1);
+      if (ml_lo >= mh_lo)
+       break;
+
+      mlow_hi = 0;
+      mlow_lo = ml_lo;
+      mhigh_hi = 0;
+      mhigh_lo = mh_lo;
+    }
+
+  *post_shift_ptr = post_shift;
+  *lgup_ptr = lgup;
+  if (n < HOST_BITS_PER_WIDE_INT)
+    {
+      unsigned HOST_WIDE_INT mask = ((unsigned HOST_WIDE_INT) 1 << n) - 1;
+      *multiplier_ptr = mhigh_lo & mask;
+      /* Bit N of mhigh is set exactly when mhigh exceeds the N-bit mask;
+        a value equal to the mask still has bit N clear.  */
+      return mhigh_lo > mask;
+    }
+  else
+    {
+      *multiplier_ptr = mhigh_lo;
+      return mhigh_hi;
+    }
+}
+
+/* Compute the inverse of X mod 2**N, i.e., find Y such that X * Y is
+   congruent to 1 (mod 2**N).  X is assumed to be odd.  */
+
+static unsigned HOST_WIDE_INT
+invert_mod2n (x, n)
+     unsigned HOST_WIDE_INT x;
+     int n;
+{
+  unsigned HOST_WIDE_INT low_bits_mask;
+  unsigned HOST_WIDE_INT inverse;
+  int valid_bits;
+
+  /* Mask selecting the low N bits; shifting by the full word width is
+     avoided by special-casing N == HOST_BITS_PER_WIDE_INT.  */
+  if (n == HOST_BITS_PER_WIDE_INT)
+    low_bits_mask = ~(unsigned HOST_WIDE_INT) 0;
+  else
+    low_bits_mask = ((unsigned HOST_WIDE_INT) 1 << n) - 1;
+
+  /* For odd X the starting guess Y = X already satisfies
+     X * Y == 1 (mod 2^3).  Every refinement step doubles the number of
+     correct low-order bits, so iterate until at least N bits are valid.  */
+  inverse = x;
+  for (valid_bits = 3; valid_bits < n; valid_bits *= 2)
+    inverse = (inverse * (2 - x * inverse)) & low_bits_mask;  /* Modulo 2^N */
+
+  return inverse;
+}
+
+/* Emit code that corrects ADJ_OPERAND, the high half of the product
+   OP0 x OP1, when that product was computed with the wrong signedness
+   flavor.  If UNSIGNEDP is nonzero, a signed product is adjusted to become
+   unsigned (the correction terms are added); if UNSIGNEDP is zero, an
+   unsigned product is adjusted to become signed (the correction terms are
+   subtracted).
+
+   Two correction terms are applied: OP1 masked by OP0 shifted right by
+   the mode width minus one, and OP0 masked by OP1 shifted the same way.
+
+   The result is put in TARGET if that is convenient.
+
+   MODE is the mode of operation.  */
+
+rtx
+expand_mult_highpart_adjust (mode, adj_operand, op0, op1, target, unsignedp)
+     enum machine_mode mode;
+     register rtx adj_operand, op0, op1, target;
+     int unsignedp;
+{
+  enum rtx_code adj_code;
+  rtx sign_mask;
+  rtx correction;
+
+  if (unsignedp)
+    adj_code = PLUS;
+  else
+    adj_code = MINUS;
+
+  /* First term: (OP0 >> (bitsize - 1)) & OP1, accumulated into
+     ADJ_OPERAND itself.  */
+  sign_mask = expand_shift (RSHIFT_EXPR, mode, op0,
+                           build_int_2 (GET_MODE_BITSIZE (mode) - 1, 0),
+                           NULL_RTX, 0);
+  correction = expand_and (sign_mask, op1, NULL_RTX);
+  adj_operand
+    = force_operand (gen_rtx (adj_code, mode, adj_operand, correction),
+                    adj_operand);
+
+  /* Second term: (OP1 >> (bitsize - 1)) & OP0, with the final value
+     directed at TARGET.  */
+  sign_mask = expand_shift (RSHIFT_EXPR, mode, op1,
+                           build_int_2 (GET_MODE_BITSIZE (mode) - 1, 0),
+                           NULL_RTX, 0);
+  correction = expand_and (sign_mask, op0, NULL_RTX);
+
+  return force_operand (gen_rtx (adj_code, mode, adj_operand, correction),
+                       target);
+}
+
+/* Emit code to multiply OP0 and CNST1, putting the high half of the result
+   in TARGET if that is convenient, and return where the result is.  If the
+   operation can not be performed, 0 is returned.
+
+   MODE is the mode of operation and result.
+
+   UNSIGNEDP nonzero means unsigned multiply.
+
+   MAX_COST is the total allowed cost for the expanded RTL.
+
+   The strategies are tried in this order, each guarded by a cost check
+   against MAX_COST:
+     - for modes narrower than a word, a full multiply in the wider mode
+       via expand_mult followed by a right shift;
+     - a highpart multiply insn of the requested signedness;
+     - a highpart multiply insn of the opposite signedness, then adjust;
+     - a widening multiply of the requested signedness;
+     - a non-widening multiply in the wider mode;
+     - a widening multiply of the opposite signedness, then adjust.
+
+   Aborts if MODE is wider than HOST_BITS_PER_WIDE_INT.  */
+
+rtx
+expand_mult_highpart (mode, op0, cnst1, target, unsignedp, max_cost)
+     enum machine_mode mode;
+     register rtx op0, target;
+     unsigned HOST_WIDE_INT cnst1;
+     int unsignedp;
+     int max_cost;
+{
+  enum machine_mode wider_mode = GET_MODE_WIDER_MODE (mode);
+  optab mul_highpart_optab;
+  optab moptab;
+  rtx tem;
+  int size = GET_MODE_BITSIZE (mode);
+  rtx op1, wide_op1;
+
+  /* We can't support modes wider than HOST_BITS_PER_WIDE_INT.  */
+  if (size > HOST_BITS_PER_WIDE_INT)
+    abort ();
+
+  /* OP1 is the constant as a CONST_INT; WIDE_OP1 is the same constant
+     for use in WIDER_MODE, zero- or sign-extended per UNSIGNEDP.  */
+  op1 = GEN_INT (cnst1);
+
+  if (GET_MODE_BITSIZE (wider_mode) <= HOST_BITS_PER_INT)
+    wide_op1 = op1;
+  else
+    wide_op1
+      = immed_double_const (cnst1,
+                           (unsignedp
+                            ? (HOST_WIDE_INT) 0
+                            : -(cnst1 >> (HOST_BITS_PER_WIDE_INT - 1))),
+                           wider_mode);
+
+  /* expand_mult handles constant multiplication of word_mode
+     or narrower.  It does a poor job for large modes.  */
+  if (size < BITS_PER_WORD
+      && mul_cost[(int) wider_mode] + shift_cost[size-1] < max_cost)
+    {
+      /* We have to do this, since expand_binop doesn't do conversion for
+        multiply.  Maybe change expand_binop to handle widening multiply?  */
+      op0 = convert_to_mode (wider_mode, op0, unsignedp);
+
+      tem = expand_mult (wider_mode, op0, wide_op1, NULL_RTX, unsignedp);
+      tem = expand_shift (RSHIFT_EXPR, wider_mode, tem,
+                         build_int_2 (size, 0), NULL_RTX, 1);
+      return convert_modes (mode, wider_mode, tem, unsignedp);
+    }
+
+  if (target == 0)
+    target = gen_reg_rtx (mode);
+
+  /* Firstly, try using a multiplication insn that only generates the needed
+     high part of the product, and in the sign flavor of unsignedp.
+     The result is kept in TEM so that a failed attempt does not wipe out
+     TARGET for the strategies that follow.  */
+  if (mul_highpart_cost[(int) mode] < max_cost)
+    {
+      mul_highpart_optab = unsignedp ? umul_highpart_optab : smul_highpart_optab;
+      tem = expand_binop (mode, mul_highpart_optab,
+                         op0, wide_op1, target, unsignedp, OPTAB_DIRECT);
+      if (tem)
+       return tem;
+    }
+
+  /* Secondly, same as above, but use sign flavor opposite of unsignedp.
+     Need to adjust the result after the multiplication.  */
+  if (mul_highpart_cost[(int) mode] + 2 * shift_cost[size-1] + 4 * add_cost
+      < max_cost)
+    {
+      mul_highpart_optab = unsignedp ? smul_highpart_optab : umul_highpart_optab;
+      tem = expand_binop (mode, mul_highpart_optab,
+                         op0, wide_op1, target, unsignedp, OPTAB_DIRECT);
+      if (tem)
+       /* We used the wrong signedness.  Adjust the result.  */
+       return expand_mult_highpart_adjust (mode, tem, op0,
+                                           op1, tem, unsignedp);
+    }
+
+  /* Try widening multiplication.  */
+  moptab = unsignedp ? umul_widen_optab : smul_widen_optab;
+  if (moptab->handlers[(int) wider_mode].insn_code != CODE_FOR_nothing
+      && mul_widen_cost[(int) wider_mode] < max_cost)
+    {
+      op1 = force_reg (mode, op1);
+      goto try;
+    }
+
+  /* Try widening the mode and perform a non-widening multiplication.  */
+  moptab = smul_optab;
+  if (smul_optab->handlers[(int) wider_mode].insn_code != CODE_FOR_nothing
+      && mul_cost[(int) wider_mode] + shift_cost[size-1] < max_cost)
+    {
+      op1 = wide_op1;
+      goto try;
+    }
+
+  /* Try widening multiplication of opposite signedness, and adjust.  */
+  moptab = unsignedp ? smul_widen_optab : umul_widen_optab;
+  if (moptab->handlers[(int) wider_mode].insn_code != CODE_FOR_nothing
+      && (mul_widen_cost[(int) wider_mode]
+         + 2 * shift_cost[size-1] + 4 * add_cost < max_cost))
+    {
+      rtx regop1 = force_reg (mode, op1);
+      tem = expand_binop (wider_mode, moptab, op0, regop1,
+                         NULL_RTX, ! unsignedp, OPTAB_WIDEN);
+      if (tem != 0)
+       {
+         /* Extract the high half of the just generated product.  */
+         tem = expand_shift (RSHIFT_EXPR, wider_mode, tem,
+                             build_int_2 (size, 0), NULL_RTX, 1);
+         tem = convert_modes (mode, wider_mode, tem, unsignedp);
+         /* We used the wrong signedness.  Adjust the result.  */
+         return expand_mult_highpart_adjust (mode, tem, op0, op1,
+                                             target, unsignedp);
+       }
+    }
+
+  return 0;
+
+ try:
+  /* Pass NULL_RTX as target since TARGET has wrong mode.  */
+  tem = expand_binop (wider_mode, moptab, op0, op1,
+                     NULL_RTX, unsignedp, OPTAB_WIDEN);
+  if (tem == 0)
+    return 0;
+
+  /* Extract the high half of the just generated product.  */
+  if (mode == word_mode)
+    {
+      return gen_highpart (mode, tem);
+    }
+  else
+    {
+      tem = expand_shift (RSHIFT_EXPR, wider_mode, tem,
+                         build_int_2 (size, 0), NULL_RTX, 1);
+      return convert_modes (mode, wider_mode, tem, unsignedp);
+    }
+}
+\f
  /* Emit the code to divide OP0 by OP1, putting the result in TARGET
     if that is convenient, and returning where the result is.
     You may request either the quotient or the remainder as the result;
@@ -2232,6 +2695,8 @@ expand_mult (mode, op0, op1, target, unsignedp)
     But C doesn't use these operations, so their optimizations are
     left for later.  */
  
+/* Nonzero if X is zero or has exactly one bit set.  X is evaluated twice,
+   so it must not have side effects.  */
+#define EXACT_POWER_OF_2_OR_ZERO_P(x) (((x) & ((x) - 1)) == 0)
+
  rtx
  expand_divmod (rem_flag, code, mode, op0, op1, target, unsignedp)
       int rem_flag;
@@ -2240,69 +2705,69 @@ expand_divmod (rem_flag, code, mode, op0, op1, target, unsignedp)
       register rtx op0, op1, target;
       int unsignedp;
  {
-  register rtx result = 0;
    enum machine_mode compute_mode;
-  int log = -1;
+  register rtx tquotient;
+  rtx quotient = 0, remainder = 0;
+  rtx last;
    int size;
-  int can_clobber_op0;
-  int mod_insn_no_good = 0;
-  rtx adjusted_op0 = op0;
+  rtx insn, set;
    optab optab1, optab2;
+  int op1_is_constant, op1_is_pow2;
+  int max_cost, extra_cost;
+
+  op1_is_constant = GET_CODE (op1) == CONST_INT;
+  op1_is_pow2 = (op1_is_constant
+                && ((EXACT_POWER_OF_2_OR_ZERO_P (INTVAL (op1))
+                     || EXACT_POWER_OF_2_OR_ZERO_P (-INTVAL (op1)))));
+
+  /*
+     This is the structure of expand_divmod:
+
+     First comes code to fix up the operands so we can perform the operations
+     correctly and efficiently.
+
+     Second comes a switch statement with code specific for each rounding mode.
+     For some special operands this code emits all RTL for the desired
+     operation, for other cases, it generates only a quotient and stores it in
+     QUOTIENT.  The case for trunc division/remainder might leave quotient = 0,
+     to indicate that it has not done anything.
+
+     Last comes code that finishes the operation.  If QUOTIENT is set and
+     REM_FLAG is set, the remainder is computed as OP0 - QUOTIENT * OP1.  If
+     QUOTIENT is not set, it is computed using trunc rounding.
  
-  /* We shouldn't be called with op1 == const1_rtx, but some of the
+     We try to generate special code for division and remainder when OP1 is a
+     constant.  If |OP1| = 2**n we can use shifts and some other fast
+     operations.  For other values of OP1, we compute a carefully selected
+     fixed-point approximation m = 1/OP1, and generate code that multiplies OP0
+     by m.
+
+     In all cases but EXACT_DIV_EXPR, this multiplication requires the upper
+     half of the product.  Different strategies for generating the product are
+     implemented in expand_mult_highpart.
+
+     If what we actually want is the remainder, we generate that by another
+     by-constant multiplication and a subtraction.  */
+
+  /* We shouldn't be called with OP1 == const1_rtx, but some of the
       code below will malfunction if we are, so check here and handle
       the special case if so.  */
    if (op1 == const1_rtx)
      return rem_flag ? const0_rtx : op0;
  
-  /* Don't use the function value register as a target
-     since we have to read it as well as write it,
-     and function-inlining gets confused by this.  */
-  if (target && REG_P (target) && REG_FUNCTION_VALUE_P (target))
+  if (target
+      /* Don't use the function value register as a target
+        since we have to read it as well as write it,
+        and function-inlining gets confused by this.  */
+      && ((REG_P (target) && REG_FUNCTION_VALUE_P (target))
+         /* Don't clobber an operand while doing a multi-step calculation.  */
+         || ((rem_flag || op1_is_constant)
+             && (reg_mentioned_p (target, op0)
+                 || (GET_CODE (op0) == MEM && GET_CODE (target) == MEM)))
+         || reg_mentioned_p (target, op1)
+         || (GET_CODE (op1) == MEM && GET_CODE (target) == MEM)))
      target = 0;
  
-  /* Don't clobber an operand while doing a multi-step calculation.  */
-  if (target)
-    if ((rem_flag && (reg_mentioned_p (target, op0)
-                     || (GET_CODE (op0) == MEM && GET_CODE (target) == MEM)))
-       || reg_mentioned_p (target, op1)
-       || (GET_CODE (op1) == MEM && GET_CODE (target) == MEM))
-      target = 0;
-
-  can_clobber_op0 = (GET_CODE (op0) == REG && op0 == target);
-
-  if (GET_CODE (op1) == CONST_INT)
-    log = exact_log2 (INTVAL (op1));
-
-  /* If log is >= 0, we are dividing by 2**log, and will do it by shifting,
-     which is really floor-division.  Otherwise we will really do a divide,
-     and we assume that is trunc-division.
-
-     We must correct the dividend by adding or subtracting something
-     based on the divisor, in order to do the kind of rounding specified
-     by CODE.  The correction depends on what kind of rounding is actually
-     available, and that depends on whether we will shift or divide.
-
-     In many of these cases it is possible to perform the operation by a
-     clever series of logical operations (shifts and/or exclusive-ors).
-     Although avoiding the jump has the advantage that it extends the basic
-     block and allows further optimization, the branch-free code is normally
-     at least one instruction longer in the (most common) case where the
-     dividend is non-negative.  Performance measurements of the two
-     alternatives show that the branch-free code is slightly faster on the
-     IBM ROMP but slower on CISC processors (significantly slower on the
-     VAX).  Accordingly, the jump code has been retained.
-
-     On machines where the jump code is slower, the cost of a DIV or MOD
-     operation can be set small (less than twice that of an addition); in 
-     that case, we pretend that we don't have a power of two and perform
-     a normal division or modulus operation.  */
-
-  if ((code == TRUNC_MOD_EXPR || code == TRUNC_DIV_EXPR)
-      && ! unsignedp
-      && (rem_flag ? smod_pow2_cheap : sdiv_pow2_cheap))
-    log = -1;
-
    /* Get the mode in which to perform this computation.  Normally it will
       be MODE, but sometimes we can't do the desired operation in MODE.
       If so, pick a wider mode in which we can do the operation.  Convert
@@ -2318,9 +2783,14 @@ expand_divmod (rem_flag, code, mode, op0, op1, target, unsignedp)
       (either a division, modulus, or shift).  Finally, check for the smallest
       mode for which we can do the operation with a library call.  */
  
-  optab1 = (log >= 0 ? (unsignedp ? lshr_optab : ashr_optab)
+  /* We might want to refine this now that we have division-by-constant
+     optimization.  Since expand_mult_highpart tries so many variants, it is
+     not straightforward to generalize this.  Maybe we should make an array
+     of possible modes in init_expmed?  Save this for GCC 2.7.  */
+
+  optab1 = (op1_is_pow2 ? (unsignedp ? lshr_optab : ashr_optab)
             : (unsignedp ? udiv_optab : sdiv_optab));
-  optab2 = (log >= 0 ? optab1 : (unsignedp ? udivmod_optab : sdivmod_optab));
+  optab2 = (op1_is_pow2 ? optab1 : (unsignedp ? udivmod_optab : sdivmod_optab));
  
    for (compute_mode = mode; compute_mode != VOIDmode;
         compute_mode = GET_MODE_WIDER_MODE (compute_mode))
@@ -2335,286 +2805,917 @@ expand_divmod (rem_flag, code, mode, op0, op1, target, unsignedp)
           || optab2->handlers[(int) compute_mode].libfunc)
         break;
  
-  /* If we still couldn't find a mode, use MODE; we'll probably abort in
-     expand_binop.  */
+  /* If we still couldn't find a mode, use MODE, but we'll probably abort
+     in expand_binop.  */
    if (compute_mode == VOIDmode)
      compute_mode = mode;
  
+  if (target && GET_MODE (target) == compute_mode)
+    tquotient = target;
+  else
+    tquotient = gen_reg_rtx (compute_mode);
+
    size = GET_MODE_BITSIZE (compute_mode);
+#if 0
+  /* It should be possible to restrict the precision to GET_MODE_BITSIZE
+     (mode), and thereby get better code when OP1 is a constant.  Do that
+     later.  It will require going over all usages of SIZE below.  */
+  size = GET_MODE_BITSIZE (mode);
+#endif
+
+  max_cost = div_cost[(int) compute_mode]
+    - (rem_flag ? mul_cost[(int) compute_mode] + add_cost : 0);
  
-  /* Now convert to the best mode to use.  Show we made a copy of OP0
-     and hence we can clobber it (we cannot use a SUBREG to widen
-     something.  */
+  /* Now convert to the best mode to use.  */
    if (compute_mode != mode)
      {
-      adjusted_op0 = op0 = convert_to_mode (compute_mode, op0, unsignedp);
-      can_clobber_op0 = 1;
-      op1 = convert_to_mode (compute_mode, op1, unsignedp);
+      op0 = convert_modes (compute_mode, mode, op0, unsignedp);
+      op1 = convert_modes (compute_mode, mode, op1, unsignedp);
      }
  
-  /* If we are computing the remainder and one of the operands is a volatile
-     MEM, copy it into a register.  */
+  /* If one of the operands is a volatile MEM, copy it into a register.  */
  
-  if (rem_flag && GET_CODE (op0) == MEM && MEM_VOLATILE_P (op0))
-    adjusted_op0 = op0 = force_reg (compute_mode, op0), can_clobber_op0 = 1;
-  if (rem_flag && GET_CODE (op1) == MEM && MEM_VOLATILE_P (op1))
+  if (GET_CODE (op0) == MEM && MEM_VOLATILE_P (op0))
+    op0 = force_reg (compute_mode, op0);
+  if (GET_CODE (op1) == MEM && MEM_VOLATILE_P (op1))
      op1 = force_reg (compute_mode, op1);
  
-  /* If we are computing the remainder, op0 will be needed later to calculate
-     X - Y * (X / Y), therefore cannot be clobbered. */
-  if (rem_flag)
-    can_clobber_op0 = 0;
+  /* If we need the remainder or if OP1 is constant, we need to
+     put OP0 in a register in case it has any queued subexpressions.  */
+  if (rem_flag || op1_is_constant)
+    op0 = force_reg (compute_mode, op0);
  
-  if (target == 0 || GET_MODE (target) != compute_mode)
-    target = gen_reg_rtx (compute_mode);
+  last = get_last_insn ();
  
-  switch (code)
+  /* Promote floor rounding to trunc rounding for unsigned operations.  */
+  if (unsignedp)
      {
-    case TRUNC_MOD_EXPR:
-    case TRUNC_DIV_EXPR:
-      if (log >= 0 && ! unsignedp)
-       {
-         /* Here we need to add OP1-1 if OP0 is negative, 0 otherwise.
-            This can be computed without jumps by arithmetically shifting
-            OP0 right LOG-1 places and then shifting right logically
-            SIZE-LOG bits.  The resulting value is unconditionally added
-            to OP0.  */
-         if (log == 1 || BRANCH_COST >= 3)
-           {
-             rtx temp = gen_reg_rtx (compute_mode);
-             if (! can_clobber_op0)
-               /* Copy op0 to a reg, to play safe,
-                  since this is done in the other path.  */
-               op0 = force_reg (compute_mode, op0);
-             temp = copy_to_suggested_reg (adjusted_op0, temp, compute_mode);
-             temp = expand_shift (RSHIFT_EXPR, compute_mode, temp,
-                                  build_int_2 (log - 1, 0), NULL_RTX, 0);
-             temp = expand_shift (RSHIFT_EXPR, compute_mode, temp,
-                                  build_int_2 (size - log, 0),
-                                  temp, 1);
-             /* We supply 0 as the target to make a new pseudo
-                for the value; that helps loop.c optimize the result.  */
-             adjusted_op0 = expand_binop (compute_mode, add_optab,
-                                          adjusted_op0, temp,
-                                          0, 0, OPTAB_LIB_WIDEN);
-           }
-         else
-           {
-             rtx label = gen_label_rtx ();
-             if (! can_clobber_op0)
-               {
-                 adjusted_op0 = copy_to_suggested_reg (adjusted_op0, target,
-                                                       compute_mode);
-                 /* Copy op0 to a reg, since emit_cmp_insn will call emit_queue
-                    which will screw up mem refs for autoincrements.  */
-                 op0 = force_reg (compute_mode, op0);
-               }
-             emit_cmp_insn (adjusted_op0, const0_rtx, GE, 
-                            NULL_RTX, compute_mode, 0, 0);
-             emit_jump_insn (gen_bge (label));
-             expand_inc (adjusted_op0, plus_constant (op1, -1));
-             emit_label (label);
-           }
-         mod_insn_no_good = 1;
-       }
-      break;
+      if (code == FLOOR_DIV_EXPR)
+       code = TRUNC_DIV_EXPR;
+      if (code == FLOOR_MOD_EXPR)
+       code = TRUNC_MOD_EXPR;
+    }
+
+  if (op1 != const0_rtx)
+    switch (code)
+      {
+      case TRUNC_MOD_EXPR:
+      case TRUNC_DIV_EXPR:
+       if (op1_is_constant)
+         {
+           if (unsignedp)
+             {
+               unsigned HOST_WIDE_INT mh, ml;
+               int pre_shift, post_shift;
+               int dummy;
+               unsigned HOST_WIDE_INT d = INTVAL (op1);
+
+               if (EXACT_POWER_OF_2_OR_ZERO_P (d))
+                 {
+                   pre_shift = floor_log2 (d);
+                   if (rem_flag)
+                     {
+                       remainder =
+                         expand_binop (compute_mode, and_optab, op0,
+                                       GEN_INT (((HOST_WIDE_INT) 1 << pre_shift) - 1),
+                                       remainder, 1,
+                                       OPTAB_LIB_WIDEN);
+                       if (remainder)
+                         return gen_lowpart (mode, remainder);
+                     }
+                   quotient = expand_shift (RSHIFT_EXPR, compute_mode, op0,
+                                            build_int_2 (pre_shift, 0),
+                                            tquotient, 1);
+                 }
+               else if (size <= HOST_BITS_PER_WIDE_INT)
+                 {
+                   if (d >= ((unsigned HOST_WIDE_INT) 1 << (size - 1)))
+                     {
+                       /* Most significant bit of divisor is set; emit an scc
+                          insn.  */
+                       quotient = emit_store_flag (tquotient, GEU, op0, op1,
+                                                   compute_mode, 1, 1);
+                       if (quotient == 0)
+                         goto fail1;
+                     }
+                   else
+                     {
+                       /* Find a suitable multiplier and right shift count
+                          instead of multiplying with D.  */
+
+                       mh = choose_multiplier (d, size, size,
+                                               &ml, &post_shift, &dummy);
+
+                       /* If the suggested multiplier is more than SIZE bits,
+                          we can do better for even divisors, using an
+                          initial right shift.  */
+                       if (mh != 0 && (d & 1) == 0)
+                         {
+                           pre_shift = floor_log2 (d & -d);
+                           mh = choose_multiplier (d >> pre_shift, size,
+                                                   size - pre_shift,
+                                                   &ml, &post_shift, &dummy);
+                           if (mh)
+                             abort ();
+                         }
+                       else
+                         pre_shift = 0;
+
+                       if (mh != 0)
+                         {
+                           rtx t1, t2, t3, t4;
+
+                           extra_cost = (shift_cost[post_shift - 1]
+                                         + shift_cost[1] + 2 * add_cost);
+                           t1 = expand_mult_highpart (compute_mode, op0, ml,
+                                                      NULL_RTX, 1,
+                                                      max_cost - extra_cost);
+                           if (t1 == 0)
+                             goto fail1;
+                           t2 = force_operand (gen_rtx (MINUS, compute_mode,
+                                                        op0, t1),
+                                               NULL_RTX);
+                           t3 = expand_shift (RSHIFT_EXPR, compute_mode, t2,
+                                              build_int_2 (1, 0), NULL_RTX,1);
+                           t4 = force_operand (gen_rtx (PLUS, compute_mode,
+                                                        t1, t3),
+                                               NULL_RTX);
+                           quotient =
+                             expand_shift (RSHIFT_EXPR, compute_mode, t4,
+                                           build_int_2 (post_shift - 1, 0),
+                                           tquotient, 1);
+                         }
+                       else
+                         {
+                           rtx t1, t2;
+
+                           t1 = expand_shift (RSHIFT_EXPR, compute_mode, op0,
+                                              build_int_2 (pre_shift, 0),
+                                              NULL_RTX, 1);
+                           extra_cost = (shift_cost[pre_shift]
+                                         + shift_cost[post_shift]);
+                           t2 = expand_mult_highpart (compute_mode, t1, ml,
+                                                      NULL_RTX, 1,
+                                                      max_cost - extra_cost);
+                           if (t2 == 0)
+                             goto fail1;
+                           quotient =
+                             expand_shift (RSHIFT_EXPR, compute_mode, t2,
+                                           build_int_2 (post_shift, 0),
+                                           tquotient, 1);
+                         }
+                     }
+                 }
+               else            /* Too wide mode to use tricky code */
+                 break;
+
+               insn = get_last_insn ();
+               if (insn != last
+                   && (set = single_set (insn)) != 0
+                   && SET_DEST (set) == quotient)
+                 REG_NOTES (insn)
+                   = gen_rtx (EXPR_LIST, REG_EQUAL,
+                              gen_rtx (UDIV, compute_mode, op0, op1),
+                              REG_NOTES (insn));
+             }
+           else                /* TRUNC_DIV, signed */
+             {
+               unsigned HOST_WIDE_INT ml;
+               int lgup, post_shift;
+               HOST_WIDE_INT d = INTVAL (op1);
+               unsigned HOST_WIDE_INT abs_d = d >= 0 ? d : -d;
+
+               /* n rem d = n rem -d */
+               if (rem_flag && d < 0)
+                 {
+                   d = abs_d;
+                   op1 = GEN_INT (abs_d);
+                 }
+
+               if (d == 1)
+                 quotient = op0;
+               else if (d == -1)
+                 quotient = expand_unop (compute_mode, neg_optab, op0,
+                                         tquotient, 0);
+               else if (abs_d == (unsigned HOST_WIDE_INT) 1 << (size - 1))
+                 {
+                   /* This case is not handled correctly below.  */
+                   quotient = emit_store_flag (tquotient, EQ, op0, op1,
+                                               compute_mode, 1, 1);
+                   if (quotient == 0)
+                     goto fail1;
+                 }
+               else if (EXACT_POWER_OF_2_OR_ZERO_P (d)
+                        && (rem_flag ? smod_pow2_cheap : sdiv_pow2_cheap))
+                 ;
+               else if (EXACT_POWER_OF_2_OR_ZERO_P (abs_d))
+                 {
+                   lgup = floor_log2 (abs_d);
+                   if (abs_d != 2 && BRANCH_COST < 3)
+                     {
+                       rtx label = gen_label_rtx ();
+                       rtx t1;
+
+                       t1 = copy_to_mode_reg (compute_mode, op0);
+                       emit_cmp_insn (t1, const0_rtx, GE, 
+                                      NULL_RTX, compute_mode, 0, 0);
+                       emit_jump_insn (gen_bge (label));
+                       expand_inc (t1, GEN_INT (abs_d - 1));
+                       emit_label (label);
+                       quotient = expand_shift (RSHIFT_EXPR, compute_mode, t1,
+                                                build_int_2 (lgup, 0),
+                                                tquotient, 0);
+                     }
+                   else
+                     {
+                       rtx t1, t2, t3;
+                       t1 = expand_shift (RSHIFT_EXPR, compute_mode, op0,
+                                          build_int_2 (size - 1, 0),
+                                          NULL_RTX, 0);
+                       t2 = expand_shift (RSHIFT_EXPR, compute_mode, t1,
+                                          build_int_2 (size - lgup, 0),
+                                          NULL_RTX, 1);
+                       t3 = force_operand (gen_rtx (PLUS, compute_mode,
+                                                    op0, t2),
+                                           NULL_RTX);
+                       quotient = expand_shift (RSHIFT_EXPR, compute_mode, t3,
+                                                build_int_2 (lgup, 0),
+                                                tquotient, 0);
+                     }
+
+                   /* We have computed OP0 / abs(OP1).  If OP1 is negative, negate
+                      the quotient.  */
+                   if (d < 0)
+                     {
+                       insn = get_last_insn ();
+                       if (insn != last
+                           && (set = single_set (insn)) != 0
+                           && SET_DEST (set) == quotient)
+                         REG_NOTES (insn)
+                           = gen_rtx (EXPR_LIST, REG_EQUAL,
+                                      gen_rtx (DIV, compute_mode, op0,
+                                               GEN_INT (abs_d)),
+                                      REG_NOTES (insn));
+
+                       quotient = expand_unop (compute_mode, neg_optab,
+                                               quotient, quotient, 0);
+                     }
+                 }
+               else if (size <= HOST_BITS_PER_WIDE_INT)
+                 {
+                   choose_multiplier (abs_d, size, size - 1,
+                                      &ml, &post_shift, &lgup);
+                   if (ml < (unsigned HOST_WIDE_INT) 1 << (size - 1))
+                     {
+                       rtx t1, t2, t3;
+
+                       extra_cost = (shift_cost[post_shift]
+                                     + shift_cost[size - 1] + add_cost);
+                       t1 = expand_mult_highpart (compute_mode, op0, ml,
+                                                  NULL_RTX, 0,
+                                                  max_cost - extra_cost);
+                       if (t1 == 0)
+                         goto fail1;
+                       t2 = expand_shift (RSHIFT_EXPR, compute_mode, t1,
+                                          build_int_2 (post_shift, 0), NULL_RTX, 0);
+                       t3 = expand_shift (RSHIFT_EXPR, compute_mode, op0,
+                                          build_int_2 (size - 1, 0), NULL_RTX, 0);
+                       if (d < 0)
+                         quotient = force_operand (gen_rtx (MINUS, compute_mode, t3, t2),
+                                                   tquotient);
+                       else
+                         quotient = force_operand (gen_rtx (MINUS, compute_mode, t2, t3),
+                                                   tquotient);
+                     }
+                   else
+                     {
+                       rtx t1, t2, t3, t4;
+
+                       ml |= (~(unsigned HOST_WIDE_INT) 0) << (size - 1);
+                       extra_cost = (shift_cost[post_shift]
+                                     + shift_cost[size - 1] + 2 * add_cost);
+                       t1 = expand_mult_highpart (compute_mode, op0, ml,
+                                                  NULL_RTX, 0,
+                                                  max_cost - extra_cost);
+                       if (t1 == 0)
+                         goto fail1;
+                       t2 = force_operand (gen_rtx (PLUS, compute_mode, t1, op0),
+                                           NULL_RTX);
+                       t3 = expand_shift (RSHIFT_EXPR, compute_mode, t2,
+                                          build_int_2 (post_shift, 0), NULL_RTX, 0);
+                       t4 = expand_shift (RSHIFT_EXPR, compute_mode, op0,
+                                          build_int_2 (size - 1, 0), NULL_RTX, 0);
+                       if (d < 0)
+                         quotient = force_operand (gen_rtx (MINUS, compute_mode, t4, t3),
+                                                   tquotient);
+                       else
+                         quotient = force_operand (gen_rtx (MINUS, compute_mode, t3, t4),
+                                                   tquotient);
+                     }
+                 }
+               else            /* Too wide mode to use tricky code */
+                 break;
+
+               insn = get_last_insn ();
+               if (insn != last
+                   && (set = single_set (insn)) != 0
+                   && SET_DEST (set) == quotient)
+                 REG_NOTES (insn)
+                   = gen_rtx (EXPR_LIST, REG_EQUAL,
+                              gen_rtx (DIV, compute_mode, op0, op1),
+                              REG_NOTES (insn));
+             }
+           break;
+         }
+      fail1:
+       delete_insns_since (last);
+       break;
+
+      case FLOOR_DIV_EXPR:
+      case FLOOR_MOD_EXPR:
+      /* We will come here only for signed operations.  */
+       if (op1_is_constant && HOST_BITS_PER_WIDE_INT >= size)
+         {
+           unsigned HOST_WIDE_INT mh, ml;
+           int pre_shift, lgup, post_shift;
+           HOST_WIDE_INT d = INTVAL (op1);
+
+           if (d > 0)
+             {
+               /* We could just as easily deal with negative constants here,
+                  but it does not seem worth the trouble for GCC 2.6.  */
+               if (EXACT_POWER_OF_2_OR_ZERO_P (d))
+                 {
+                   pre_shift = floor_log2 (d);
+                   if (rem_flag)
+                     {
+                       remainder = expand_binop (compute_mode, and_optab, op0,
+                                                 GEN_INT (((HOST_WIDE_INT) 1 << pre_shift) - 1),
+                                                 remainder, 0, OPTAB_LIB_WIDEN);
+                       if (remainder)
+                         return gen_lowpart (mode, remainder);
+                     }
+                   quotient = expand_shift (RSHIFT_EXPR, compute_mode, op0,
+                                            build_int_2 (pre_shift, 0),
+                                            tquotient, 0);
+                 }
+               else
+                 {
+                   rtx t1, t2, t3, t4;
+
+                   mh = choose_multiplier (d, size, size - 1,
+                                           &ml, &post_shift, &lgup);
+                   if (mh)
+                     abort ();
+
+                   t1 = expand_shift (RSHIFT_EXPR, compute_mode, op0,
+                                      build_int_2 (size - 1, 0), NULL_RTX, 0);
+                   t2 = expand_binop (compute_mode, xor_optab, op0, t1,
+                                      NULL_RTX, 0, OPTAB_WIDEN);
+                   extra_cost = (shift_cost[post_shift]
+                                 + shift_cost[size - 1] + 2 * add_cost);
+                   t3 = expand_mult_highpart (compute_mode, t2, ml,
+                                              NULL_RTX, 1,
+                                              max_cost - extra_cost);
+                   if (t3 != 0)
+                     {
+                       t4 = expand_shift (RSHIFT_EXPR, compute_mode, t3,
+                                          build_int_2 (post_shift, 0),
+                                          NULL_RTX, 1);
+                       quotient = expand_binop (compute_mode, xor_optab,
+                                                t4, t1, tquotient, 0,
+                                                OPTAB_WIDEN);
+                     }
+                 }
+             }
+           else
+             {
+               rtx nsign, t1, t2, t3, t4;
+               t1 = force_operand (gen_rtx (PLUS, compute_mode,
+                                            op0, constm1_rtx), NULL_RTX);
+               t2 = expand_binop (compute_mode, ior_optab, op0, t1, NULL_RTX,
+                                  0, OPTAB_WIDEN);
+               nsign = expand_shift (RSHIFT_EXPR, compute_mode, t2,
+                                     build_int_2 (size - 1, 0), NULL_RTX, 0);
+               t3 = force_operand (gen_rtx (MINUS, compute_mode, t1, nsign),
+                                   NULL_RTX);
+               t4 = expand_divmod (0, TRUNC_DIV_EXPR, compute_mode, t3, op1,
+                                   NULL_RTX, 0);
+               if (t4)
+                 {
+                   rtx t5;
+                   t5 = expand_unop (compute_mode, one_cmpl_optab, nsign,
+                                     NULL_RTX, 0);
+                   quotient = force_operand (gen_rtx (PLUS, compute_mode,
+                                                      t4, t5),
+                                             tquotient);
+                 }
+             }
+         }
+
+       if (quotient != 0)
+         break;
+       delete_insns_since (last);
  
-    case FLOOR_DIV_EXPR:
-    case FLOOR_MOD_EXPR:
-      if (log < 0 && ! unsignedp)
+       /* Try using an instruction that produces both the quotient and
+          remainder, using truncation.  We can easily compensate the quotient
+          or remainder to get floor rounding, once we have the remainder.
+          Notice that we compute also the final remainder value here,
+          and return the result right away.  */
+       if (target == 0 || GET_MODE (target) != compute_mode)
+         target = gen_reg_rtx (compute_mode);
+
+       if (rem_flag)
+         {
+           remainder
+             = GET_CODE (target) == REG ? target : gen_reg_rtx (compute_mode);
+           quotient = gen_reg_rtx (compute_mode);
+         }
+       else
+         {
+           quotient
+             = GET_CODE (target) == REG ? target : gen_reg_rtx (compute_mode);
+           remainder = gen_reg_rtx (compute_mode);
+         }
+
+       if (expand_twoval_binop (sdivmod_optab, op0, op1,
+                                quotient, remainder, 0))
+         {
+           /* This could be computed with a branch-less sequence.
+              Save that for later.  */
+           rtx tem;
+           rtx label = gen_label_rtx ();
+           emit_cmp_insn (remainder, const0_rtx, EQ, NULL_RTX,
+                          compute_mode, 0, 0);
+           emit_jump_insn (gen_beq (label));
+           tem = expand_binop (compute_mode, xor_optab, op0, op1,
+                               NULL_RTX, 0, OPTAB_WIDEN);
+           emit_cmp_insn (tem, const0_rtx, GE, NULL_RTX, compute_mode, 0, 0);
+           emit_jump_insn (gen_bge (label));
+           expand_dec (quotient, const1_rtx);
+           expand_inc (remainder, op1);
+           emit_label (label);
+           return gen_lowpart (mode, rem_flag ? remainder : quotient);
+         }
+
+       /* No luck with division elimination or divmod.  Have to do it
+          by conditionally adjusting op0 *and* the result.  */
         {
-         rtx label = gen_label_rtx ();
-         if (! can_clobber_op0)
-           {
-             adjusted_op0 = copy_to_suggested_reg (adjusted_op0, target,
-                                                   compute_mode);
-             /* Copy op0 to a reg, since emit_cmp_insn will call emit_queue
-                which will screw up mem refs for autoincrements.  */
-             op0 = force_reg (compute_mode, op0);
-           }
-         emit_cmp_insn (adjusted_op0, const0_rtx, GE, 
-                        NULL_RTX, compute_mode, 0, 0);
-         emit_jump_insn (gen_bge (label));
-         expand_dec (adjusted_op0, op1);
+         rtx label1, label2, label3, label4, label5;
+         rtx adjusted_op0;
+         rtx tem;
+
+         quotient = gen_reg_rtx (compute_mode);
+         adjusted_op0 = copy_to_mode_reg (compute_mode, op0);
+         label1 = gen_label_rtx ();
+         label2 = gen_label_rtx ();
+         label3 = gen_label_rtx ();
+         label4 = gen_label_rtx ();
+         label5 = gen_label_rtx ();
+         emit_cmp_insn (op1, const0_rtx, LT, NULL_RTX, compute_mode, 0, 0);
+         emit_jump_insn (gen_blt (label2));
+         emit_cmp_insn (adjusted_op0, const0_rtx, LT, NULL_RTX,
+                        compute_mode, 0, 0);
+         emit_jump_insn (gen_blt (label1));
+         tem = expand_binop (compute_mode, sdiv_optab, adjusted_op0, op1,
+                             quotient, 0, OPTAB_LIB_WIDEN);
+         if (tem != quotient)
+           emit_move_insn (quotient, tem);
+         emit_jump_insn (gen_jump (label5));
+         emit_barrier ();
+         emit_label (label1);
           expand_inc (adjusted_op0, const1_rtx);
-         emit_label (label);
-         mod_insn_no_good = 1;
+         emit_jump_insn (gen_jump (label4));
+         emit_barrier ();
+         emit_label (label2);
+         emit_cmp_insn (adjusted_op0, const0_rtx, GT, NULL_RTX,
+                        compute_mode, 0, 0);
+         emit_jump_insn (gen_bgt (label3));
+         tem = expand_binop (compute_mode, sdiv_optab, adjusted_op0, op1,
+                             quotient, 0, OPTAB_LIB_WIDEN);
+         if (tem != quotient)
+           emit_move_insn (quotient, tem);
+         emit_jump_insn (gen_jump (label5));
+         emit_barrier ();
+         emit_label (label3);
+         expand_dec (adjusted_op0, const1_rtx);
+         emit_label (label4);
+         tem = expand_binop (compute_mode, sdiv_optab, adjusted_op0, op1,
+                             quotient, 0, OPTAB_LIB_WIDEN);
+         if (tem != quotient)
+           emit_move_insn (quotient, tem);
+         expand_dec (quotient, const1_rtx);
+         emit_label (label5);
         }
-      break;
+       break;
  
-    case CEIL_DIV_EXPR:
-    case CEIL_MOD_EXPR:
-      if (! can_clobber_op0)
-       {
-         adjusted_op0 = copy_to_suggested_reg (adjusted_op0, target,
-                                               compute_mode);
-         /* Copy op0 to a reg, since emit_cmp_insn will call emit_queue
-            which will screw up mem refs for autoincrements.  */
-         op0 = force_reg (compute_mode, op0);
-       }
-      if (log < 0)
-       {
-         rtx label = 0;
-         if (! unsignedp)
+      case CEIL_DIV_EXPR:
+      case CEIL_MOD_EXPR:
+       if (unsignedp)
+         {
+           if (op1_is_constant && EXACT_POWER_OF_2_OR_ZERO_P (INTVAL (op1)))
+             {
+               rtx t1, t2, t3;
+               unsigned HOST_WIDE_INT d = INTVAL (op1);
+               t1 = expand_shift (RSHIFT_EXPR, compute_mode, op0,
+                                  build_int_2 (floor_log2 (d), 0),
+                                  tquotient, 1);
+               t2 = expand_binop (compute_mode, and_optab, op0,
+                                  GEN_INT (d - 1),
+                                  NULL_RTX, 1, OPTAB_LIB_WIDEN);
+               t3 = gen_reg_rtx (compute_mode);
+               t3 = emit_store_flag (t3, NE, t2, const0_rtx,
+                                     compute_mode, 1, 1);
+               if (t3 == 0)
+                 {
+                   rtx lab;
+                   lab = gen_label_rtx ();
+                   emit_cmp_insn (t2, const0_rtx, EQ, NULL_RTX,
+                                  compute_mode, 0, 0);
+                   emit_jump_insn (gen_beq (lab));
+                   expand_inc (t1, const1_rtx);
+                   emit_label (lab);
+                   quotient = t1;
+                 }
+               else
+                 quotient = force_operand (gen_rtx (PLUS, compute_mode,
+                                                    t1, t3),
+                                           tquotient);
+               break;
+             }
+
+           /* Try using an instruction that produces both the quotient and
+              remainder, using truncation.  We can easily compensate the
+              quotient or remainder to get ceiling rounding, once we have the
+              remainder.  Notice that we compute also the final remainder
+              value here, and return the result right away.  */
+           if (target == 0 || GET_MODE (target) != compute_mode)
+             target = gen_reg_rtx (compute_mode);
+
+           if (rem_flag)
+             {
+               remainder = (GET_CODE (target) == REG
+                            ? target : gen_reg_rtx (compute_mode));
+               quotient = gen_reg_rtx (compute_mode);
+             }
+           else
+             {
+               quotient = (GET_CODE (target) == REG
+                           ? target : gen_reg_rtx (compute_mode));
+               remainder = gen_reg_rtx (compute_mode);
+             }
+
+           if (expand_twoval_binop (udivmod_optab, op0, op1, quotient,
+                                    remainder, 1))
+             {
+               /* This could be computed with a branch-less sequence.
+                  Save that for later.  */
+               rtx label = gen_label_rtx ();
+               emit_cmp_insn (remainder, const0_rtx, EQ, NULL_RTX,
+                              compute_mode, 0, 0);
+               emit_jump_insn (gen_beq (label));
+               expand_inc (quotient, const1_rtx);
+               expand_dec (remainder, op1);
+               emit_label (label);
+               return gen_lowpart (mode, rem_flag ? remainder : quotient);
+             }
+
+           /* No luck with division elimination or divmod.  Have to do it
+              by conditionally adjusting op0 *and* the result.  */
             {
-             label = gen_label_rtx ();
-             emit_cmp_insn (adjusted_op0, const0_rtx, LE, 
-                            NULL_RTX, compute_mode, 0, 0);
-             emit_jump_insn (gen_ble (label));
+             rtx label1, label2;
+             rtx adjusted_op0, tem;
+
+             quotient = gen_reg_rtx (compute_mode);
+             adjusted_op0 = copy_to_mode_reg (compute_mode, op0);
+             label1 = gen_label_rtx ();
+             label2 = gen_label_rtx ();
+             emit_cmp_insn (adjusted_op0, const0_rtx, NE, NULL_RTX,
+                            compute_mode, 0, 0);
+             emit_jump_insn (gen_bne (label1));
+             emit_move_insn  (quotient, const0_rtx);
+             emit_jump_insn (gen_jump (label2));
+             emit_barrier ();
+             emit_label (label1);
+             expand_dec (adjusted_op0, const1_rtx);
+             tem = expand_binop (compute_mode, udiv_optab, adjusted_op0, op1,
+                                 quotient, 1, OPTAB_LIB_WIDEN);
+             if (tem != quotient)
+               emit_move_insn (quotient, tem);
+             expand_inc (quotient, const1_rtx);
+             emit_label (label2);
             }
-         expand_inc (adjusted_op0, op1);
-         expand_dec (adjusted_op0, const1_rtx);
-         if (! unsignedp)
-           emit_label (label);
-       }
-      else
-       {
-         adjusted_op0 = expand_binop (compute_mode, add_optab,
-                                      adjusted_op0, plus_constant (op1, -1),
-                                      NULL_RTX, 0, OPTAB_LIB_WIDEN);
-       }
-      mod_insn_no_good = 1;
-      break;
-
-    case ROUND_DIV_EXPR:
-    case ROUND_MOD_EXPR:
-      if (! can_clobber_op0)
-       {
-         adjusted_op0 = copy_to_suggested_reg (adjusted_op0, target,
-                                               compute_mode);
-         /* Copy op0 to a reg, since emit_cmp_insn will call emit_queue
-            which will screw up mem refs for autoincrements.  */
-         op0 = force_reg (compute_mode, op0);
-       }
-      if (log < 0)
-       {
-         op1 = expand_shift (RSHIFT_EXPR, compute_mode, op1,
-                             integer_one_node, NULL_RTX, 0);
-         if (! unsignedp)
+         }
+       else /* signed */
+         {
+           if (op1_is_constant && EXACT_POWER_OF_2_OR_ZERO_P (INTVAL (op1))
+               && INTVAL (op1) >= 0)
+             {
+               /* This is extremely similar to the code for the unsigned case
+                  above.  For 2.7 we should merge these variants, but for
+                  2.6.1 I don't want to touch the code for unsigned since that
+                  gets used in C.  The signed case will only be used by other
+                  languages (Ada).  */
+
+               rtx t1, t2, t3;
+               unsigned HOST_WIDE_INT d = INTVAL (op1);
+               t1 = expand_shift (RSHIFT_EXPR, compute_mode, op0,
+                                  build_int_2 (floor_log2 (d), 0),
+                                  tquotient, 0);
+               t2 = expand_binop (compute_mode, and_optab, op0,
+                                  GEN_INT (d - 1),
+                                  NULL_RTX, 1, OPTAB_LIB_WIDEN);
+               t3 = gen_reg_rtx (compute_mode);
+               t3 = emit_store_flag (t3, NE, t2, const0_rtx,
+                                     compute_mode, 1, 1);
+               if (t3 == 0)
+                 {
+                   rtx lab;
+                   lab = gen_label_rtx ();
+                   emit_cmp_insn (t2, const0_rtx, EQ, NULL_RTX,
+                                  compute_mode, 0, 0);
+                   emit_jump_insn (gen_beq (lab));
+                   expand_inc (t1, const1_rtx);
+                   emit_label (lab);
+                   quotient = t1;
+                 }
+               else
+                 quotient = force_operand (gen_rtx (PLUS, compute_mode,
+                                                    t1, t3),
+                                           tquotient);
+               break;
+             }
+
+           /* Try using an instruction that produces both the quotient and
+              remainder, using truncation.  We can easily compensate the
+              quotient or remainder to get ceiling rounding, once we have the
+              remainder.  Notice that we compute also the final remainder
+              value here, and return the result right away.  */
+           if (target == 0 || GET_MODE (target) != compute_mode)
+             target = gen_reg_rtx (compute_mode);
+           if (rem_flag)
+             {
+               remainder= (GET_CODE (target) == REG
+                           ? target : gen_reg_rtx (compute_mode));
+               quotient = gen_reg_rtx (compute_mode);
+             }
+           else
+             {
+               quotient = (GET_CODE (target) == REG
+                           ? target : gen_reg_rtx (compute_mode));
+               remainder = gen_reg_rtx (compute_mode);
+             }
+
+           if (expand_twoval_binop (sdivmod_optab, op0, op1, quotient,
+                                    remainder, 0))
+             {
+               /* This could be computed with a branch-less sequence.
+                  Save that for later.  */
+               rtx tem;
+               rtx label = gen_label_rtx ();
+               emit_cmp_insn (remainder, const0_rtx, EQ, NULL_RTX,
+                              compute_mode, 0, 0);
+               emit_jump_insn (gen_beq (label));
+               tem = expand_binop (compute_mode, xor_optab, op0, op1,
+                                   NULL_RTX, 0, OPTAB_WIDEN);
+               emit_cmp_insn (tem, const0_rtx, LT, NULL_RTX,
+                              compute_mode, 0, 0);
+               emit_jump_insn (gen_blt (label));
+               expand_inc (quotient, const1_rtx);
+               expand_dec (remainder, op1);
+               emit_label (label);
+               return gen_lowpart (mode, rem_flag ? remainder : quotient);
+             }
+
+           /* No luck with division elimination or divmod.  Have to do it
+              by conditionally adjusting op0 *and* the result.  */
             {
-             if (BRANCH_COST >= 2)
-               {
-                 /* Negate OP1 if OP0 < 0.  Do this by computing a temporary
-                    that has all bits equal to the sign bit and exclusive
-                    or-ing it with OP1.  */
-                 rtx temp = gen_reg_rtx (compute_mode);
-                 temp = copy_to_suggested_reg (adjusted_op0, temp, compute_mode);
-                 temp = expand_shift (RSHIFT_EXPR, compute_mode, temp,
-                                      build_int_2 (size - 1, 0),
-                                      NULL_RTX, 0);
-                 op1 = expand_binop (compute_mode, xor_optab, op1, temp, op1,
-                                     unsignedp, OPTAB_LIB_WIDEN);
-               }
-             else
-               {
-                 rtx label = gen_label_rtx ();
-                 emit_cmp_insn (adjusted_op0, const0_rtx, GE, NULL_RTX,
-                                compute_mode, 0, 0);
-                 emit_jump_insn (gen_bge (label));
-                 expand_unop (compute_mode, neg_optab, op1, op1, 0);
-                 emit_label (label);
-               }
+             rtx label1, label2, label3, label4, label5;
+             rtx adjusted_op0;
+             rtx tem;
+
+             quotient = gen_reg_rtx (compute_mode);
+             adjusted_op0 = copy_to_mode_reg (compute_mode, op0);
+             label1 = gen_label_rtx ();
+             label2 = gen_label_rtx ();
+             label3 = gen_label_rtx ();
+             label4 = gen_label_rtx ();
+             label5 = gen_label_rtx ();
+             emit_cmp_insn (op1, const0_rtx, LT, NULL_RTX,
+                            compute_mode, 0, 0);
+             emit_jump_insn (gen_blt (label2));
+             emit_cmp_insn (adjusted_op0, const0_rtx, GT, NULL_RTX,
+                            compute_mode, 0, 0);
+             emit_jump_insn (gen_bgt (label1));
+             tem = expand_binop (compute_mode, sdiv_optab, adjusted_op0, op1,
+                                 quotient, 0, OPTAB_LIB_WIDEN);
+             if (tem != quotient)
+               emit_move_insn (quotient, tem);
+             emit_jump_insn (gen_jump (label5));
+             emit_barrier ();
+             emit_label (label1);
+             expand_dec (adjusted_op0, const1_rtx);
+             emit_jump_insn (gen_jump (label4));
+             emit_barrier ();
+             emit_label (label2);
+             emit_cmp_insn (adjusted_op0, const0_rtx, LT, NULL_RTX,
+                            compute_mode, 0, 0);
+             emit_jump_insn (gen_blt (label3));
+             tem = expand_binop (compute_mode, sdiv_optab, adjusted_op0, op1,
+                                 quotient, 0, OPTAB_LIB_WIDEN);
+             if (tem != quotient)
+               emit_move_insn (quotient, tem);
+             emit_jump_insn (gen_jump (label5));
+             emit_barrier ();
+             emit_label (label3);
+             expand_inc (adjusted_op0, const1_rtx);
+             emit_label (label4);
+             tem = expand_binop (compute_mode, sdiv_optab, adjusted_op0, op1,
+                                 quotient, 0, OPTAB_LIB_WIDEN);
+             if (tem != quotient)
+               emit_move_insn (quotient, tem);
+             expand_inc (quotient, const1_rtx);
+             emit_label (label5);
             }
-         expand_inc (adjusted_op0, op1);
-       }
-      else
-       {
-         op1 = GEN_INT (((HOST_WIDE_INT) 1 << log) / 2);
-         expand_inc (adjusted_op0, op1);
-       }
-      mod_insn_no_good = 1;
-      break;
-    }
+         }
+       break;
  
-  if (rem_flag && !mod_insn_no_good)
+      case EXACT_DIV_EXPR:
+       if (op1_is_constant && HOST_BITS_PER_WIDE_INT >= size)
+         {
+           HOST_WIDE_INT d = INTVAL (op1);
+           unsigned HOST_WIDE_INT ml;
+           int post_shift;
+           rtx t1;
+
+           post_shift = floor_log2 (d & -d);
+           ml = invert_mod2n (d >> post_shift, size);
+           t1 = expand_mult (compute_mode, op0, GEN_INT (ml), NULL_RTX,
+                             unsignedp);
+           quotient = expand_shift (RSHIFT_EXPR, compute_mode, t1,
+                                    build_int_2 (post_shift, 0),
+                                    NULL_RTX, unsignedp);
+
+           insn = get_last_insn ();
+           REG_NOTES (insn)
+             = gen_rtx (EXPR_LIST, REG_EQUAL,
+                        gen_rtx (unsignedp ? UDIV : DIV, compute_mode,
+                                 op0, op1),
+                        REG_NOTES (insn));
+         }
+       break;
+
+      case ROUND_DIV_EXPR:
+      case ROUND_MOD_EXPR:
+       if (unsignedp)
+         {
+           rtx tem;
+           rtx label;
+           label = gen_label_rtx ();
+           quotient = gen_reg_rtx (compute_mode);
+           remainder = gen_reg_rtx (compute_mode);
+           if (expand_twoval_binop (udivmod_optab, op0, op1, quotient, remainder, 1) == 0)
+             {
+               rtx tem;
+               quotient = expand_binop (compute_mode, udiv_optab, op0, op1,
+                                        quotient, 1, OPTAB_LIB_WIDEN);
+               tem = expand_mult (compute_mode, quotient, op1, NULL_RTX, 1);
+               remainder = expand_binop (compute_mode, sub_optab, op0, tem,
+                                         remainder, 1, OPTAB_LIB_WIDEN);
+             }
+           tem = plus_constant (op1, -1);
+           tem = expand_shift (RSHIFT_EXPR, compute_mode, tem,
+                               build_int_2 (1, 0), NULL_RTX, 1);
+           emit_cmp_insn (remainder, tem, LEU, NULL_RTX, compute_mode, 0, 0);
+           emit_jump_insn (gen_bleu (label));
+           expand_inc (quotient, const1_rtx);
+           expand_dec (remainder, op1);
+           emit_label (label);
+         }
+       else
+         {
+           rtx abs_rem, abs_op1, tem, mask;
+           rtx label;
+           label = gen_label_rtx ();
+           quotient = gen_reg_rtx (compute_mode);
+           remainder = gen_reg_rtx (compute_mode);
+           if (expand_twoval_binop (sdivmod_optab, op0, op1, quotient, remainder, 0) == 0)
+             {
+               rtx tem;
+               quotient = expand_binop (compute_mode, sdiv_optab, op0, op1,
+                                        quotient, 0, OPTAB_LIB_WIDEN);
+               tem = expand_mult (compute_mode, quotient, op1, NULL_RTX, 0);
+               remainder = expand_binop (compute_mode, sub_optab, op0, tem,
+                                         remainder, 0, OPTAB_LIB_WIDEN);
+             }
+           abs_rem = expand_abs (compute_mode, remainder, NULL_RTX, 0, 0);
+           abs_op1 = expand_abs (compute_mode, op1, NULL_RTX, 0, 0);
+           tem = expand_shift (LSHIFT_EXPR, compute_mode, abs_rem,
+                               build_int_2 (1, 0), NULL_RTX, 1);
+           emit_cmp_insn (tem, abs_op1, LTU, NULL_RTX, compute_mode, 0, 0);
+           emit_jump_insn (gen_bltu (label));
+           tem = expand_binop (compute_mode, xor_optab, op0, op1,
+                               NULL_RTX, 0, OPTAB_WIDEN);
+           mask = expand_shift (RSHIFT_EXPR, compute_mode, tem,
+                               build_int_2 (size - 1, 0), NULL_RTX, 0);
+           tem = expand_binop (compute_mode, xor_optab, mask, const1_rtx,
+                               NULL_RTX, 0, OPTAB_WIDEN);
+           tem = expand_binop (compute_mode, sub_optab, tem, mask,
+                               NULL_RTX, 0, OPTAB_WIDEN);
+           expand_inc (quotient, tem);
+           tem = expand_binop (compute_mode, xor_optab, mask, op1,
+                               NULL_RTX, 0, OPTAB_WIDEN);
+           tem = expand_binop (compute_mode, sub_optab, tem, mask,
+                               NULL_RTX, 0, OPTAB_WIDEN);
+           expand_dec (remainder, tem);
+           emit_label (label);
+         }
+       return gen_lowpart (mode, rem_flag ? remainder : quotient);
+      }
+
+  if (quotient == 0)
      {
-      /* Try to produce the remainder directly */
-      if (log >= 0)
-       result = expand_binop (compute_mode, and_optab, adjusted_op0,
-                              GEN_INT (((HOST_WIDE_INT) 1 << log) - 1),
-                              target, 1, OPTAB_LIB_WIDEN);
-      else
+      if (target && GET_MODE (target) != compute_mode)
+       target = 0;
+
+      if (rem_flag)
         {
-         /* See if we can do remainder without a library call.  */
-         result = sign_expand_binop (mode, umod_optab, smod_optab,
-                                     adjusted_op0, op1, target,
-                                     unsignedp, OPTAB_WIDEN);
-         if (result == 0)
+         /* Try to produce the remainder directly without a library call.  */
+         remainder = sign_expand_binop (compute_mode, umod_optab, smod_optab,
+                                        op0, op1, target,
+                                        unsignedp, OPTAB_WIDEN);
+         if (remainder == 0)
             {
               /* No luck there.  Can we do remainder and divide at once
                  without a library call?  */
-             result = gen_reg_rtx (compute_mode);
-             if (! expand_twoval_binop (unsignedp
-                                        ? udivmod_optab : sdivmod_optab,
-                                        adjusted_op0, op1,
-                                        NULL_RTX, result, unsignedp))
-               result = 0;
+             remainder = gen_reg_rtx (compute_mode);
+             if (! expand_twoval_binop ((unsignedp
+                                         ? udivmod_optab
+                                         : sdivmod_optab),
+                                        op0, op1,
+                                        NULL_RTX, remainder, unsignedp))
+               remainder = 0;
             }
+
+         if (remainder)
+           return gen_lowpart (mode, remainder);
         }
-    }
  
-  if (result)
-    return gen_lowpart (mode, result);
-
-  /* Produce the quotient.  */
-  if (log >= 0)
-    result = expand_shift (RSHIFT_EXPR, compute_mode, adjusted_op0,
-                          build_int_2 (log, 0), target, unsignedp);
-  else if (rem_flag && !mod_insn_no_good)
-    /* If producing quotient in order to subtract for remainder,
-       and a remainder subroutine would be ok,
-       don't use a divide subroutine.  */
-    result = sign_expand_binop (compute_mode, udiv_optab, sdiv_optab,
-                               adjusted_op0, op1, NULL_RTX, unsignedp,
-                               OPTAB_WIDEN);
-  else
-    {
-      /* Try a quotient insn, but not a library call.  */
-      result = sign_expand_binop (compute_mode, udiv_optab, sdiv_optab,
-                                 adjusted_op0, op1,
-                                 rem_flag ? NULL_RTX : target,
-                                 unsignedp, OPTAB_WIDEN);
-      if (result == 0)
+      /* Produce the quotient.  Try a quotient insn, but not a library call.
+        If we have a divmod in this mode, use it in preference to widening
+        the div (for this test we assume it will not fail). Note that optab2
+        is set to the one of the two optabs that the call below will use.  */
+      quotient
+       = sign_expand_binop (compute_mode, udiv_optab, sdiv_optab,
+                            op0, op1, rem_flag ? NULL_RTX : target,
+                            unsignedp,
+                            ((optab2->handlers[(int) compute_mode].insn_code
+                              != CODE_FOR_nothing)
+                             ? OPTAB_DIRECT : OPTAB_WIDEN));
+
+      if (quotient == 0)
         {
           /* No luck there.  Try a quotient-and-remainder insn,
              keeping the quotient alone.  */
-         result = gen_reg_rtx (mode);
+         quotient = gen_reg_rtx (compute_mode);
           if (! expand_twoval_binop (unsignedp ? udivmod_optab : sdivmod_optab,
-                                    adjusted_op0, op1,
-                                    result, NULL_RTX, unsignedp))
-           result = 0;
+                                    op0, op1,
+                                    quotient, NULL_RTX, unsignedp))
+           {
+             quotient = 0;
+             if (! rem_flag)
+               /* Still no luck.  If we are not computing the remainder,
+                  use a library call for the quotient.  */
+               quotient = sign_expand_binop (compute_mode,
+                                             udiv_optab, sdiv_optab,
+                                             op0, op1, target,
+                                             unsignedp, OPTAB_LIB_WIDEN);
+           }
         }
-
-      /* If still no luck, use a library call.  */
-      if (result == 0)
-       result = sign_expand_binop (compute_mode, udiv_optab, sdiv_optab,
-                                   adjusted_op0, op1,
-                                   rem_flag ? NULL_RTX : target,
-                                   unsignedp, OPTAB_LIB_WIDEN);
      }
  
-  /* If we really want the remainder, get it by subtraction.  */
    if (rem_flag)
      {
-      if (result == 0)
+      if (target && GET_MODE (target) != compute_mode)
+       target = 0;
+
+      if (quotient == 0)
         /* No divide instruction either.  Use library for remainder.  */
-       result = sign_expand_binop (compute_mode, umod_optab, smod_optab,
-                                   op0, op1, target,
-                                   unsignedp, OPTAB_LIB_WIDEN);
+       remainder = sign_expand_binop (compute_mode, umod_optab, smod_optab,
+                                      op0, op1, target,
+                                      unsignedp, OPTAB_LIB_WIDEN);
        else
         {
           /* We divided.  Now finish doing X - Y * (X / Y).  */
-         result = expand_mult (compute_mode, result, op1, target, unsignedp);
-         if (! result) abort ();
-         result = expand_binop (compute_mode, sub_optab, op0,
-                                result, target, unsignedp, OPTAB_LIB_WIDEN);
+         remainder = expand_mult (compute_mode, quotient, op1,
+                                  NULL_RTX, unsignedp);
+         remainder = expand_binop (compute_mode, sub_optab, op0,
+                                   remainder, target, unsignedp,
+                                   OPTAB_LIB_WIDEN);
         }
      }
  
-  if (result == 0)
-    abort ();
-
-  return gen_lowpart (mode, result);
+  return gen_lowpart (mode, rem_flag ? remainder : quotient);
  }
  \f
  /* Return a tree node with data type TYPE, describing the value of X.
@@ -2633,7 +3734,7 @@ make_tree (type, x)
      {
      case CONST_INT:
        t = build_int_2 (INTVAL (x),
-                      ! TREE_UNSIGNED (type) && INTVAL (x) >= 0 ? 0 : -1);
+                      TREE_UNSIGNED (type) || INTVAL (x) >= 0 ? 0 : -1);
        TREE_TYPE (t) = type;
        return t;
  
@@ -2780,7 +3881,7 @@ expand_and (op0, op1, target)
     to perform the operation.  It says to use zero-extension.
  
     NORMALIZEP is 1 if we should convert the result to be either zero
-   or one one.  Normalize is -1 if we should convert the result to be
+   or one.  Normalize is -1 if we should convert the result to be
     either zero or -1.  If NORMALIZEP is zero, the result will be left
     "raw" out of the scc insn.  */
  
@@ -2798,12 +3899,9 @@ emit_store_flag (target, code, op0, op1, mode, unsignedp, normalizep)
    enum machine_mode compare_mode;
    enum machine_mode target_mode = GET_MODE (target);
    rtx tem;
-  rtx last = 0;
+  rtx last = get_last_insn ();
    rtx pattern, comparison;
  
-  if (mode == VOIDmode)
-    mode = GET_MODE (op0);
-
    /* If one operand is constant, make it the second one.  Only do this
       if the other operand is not constant as well.  */
  
@@ -2816,9 +3914,12 @@ emit_store_flag (target, code, op0, op1, mode, unsignedp, normalizep)
        code = swap_condition (code);
      }
  
+  if (mode == VOIDmode)
+    mode = GET_MODE (op0);
+
    /* For some comparisons with 1 and -1, we can convert this to 
       comparisons with zero.  This will often produce more opportunities for
-     store-flag insns. */
+     store-flag insns.  */
  
    switch (code)
      {
@@ -2857,7 +3958,7 @@ emit_store_flag (target, code, op0, op1, mode, unsignedp, normalizep)
        && GET_MODE_CLASS (mode) == MODE_INT
        && (normalizep || STORE_FLAG_VALUE == 1
           || (GET_MODE_BITSIZE (mode) <= HOST_BITS_PER_WIDE_INT
-             && (STORE_FLAG_VALUE 
+             && ((STORE_FLAG_VALUE & GET_MODE_MASK (mode))
                   == (HOST_WIDE_INT) 1 << (GET_MODE_BITSIZE (mode) - 1)))))
      {
        subtarget = target;
@@ -2868,7 +3969,7 @@ emit_store_flag (target, code, op0, op1, mode, unsignedp, normalizep)
        if (GET_MODE_SIZE (target_mode) > GET_MODE_SIZE (mode))
         {
           op0 = protect_from_queue (op0, 0);
-         op0 = convert_to_mode (target_mode, op0, 0);
+         op0 = convert_modes (target_mode, mode, op0, 0);
           mode = target_mode;
         }
  
@@ -2876,9 +3977,11 @@ emit_store_flag (target, code, op0, op1, mode, unsignedp, normalizep)
         subtarget = 0;
  
        if (code == GE)
-       op0 = expand_unop (mode, one_cmpl_optab, op0, subtarget, 0);
+       op0 = expand_unop (mode, one_cmpl_optab, op0,
+                          ((STORE_FLAG_VALUE == 1 || normalizep)
+                           ? 0 : subtarget), 0);
  
-      if (normalizep || STORE_FLAG_VALUE == 1)
+      if (STORE_FLAG_VALUE == 1 || normalizep)
         /* If we are supposed to produce a 0/1 value, we want to do
            a logical shift from the sign bit to the low-order bit; for
            a -1/0 value, we do an arithmetic shift.  */
@@ -2887,7 +3990,7 @@ emit_store_flag (target, code, op0, op1, mode, unsignedp, normalizep)
                             subtarget, normalizep != -1);
  
        if (mode != target_mode)
-       op0 = convert_to_mode (target_mode, op0, 0);
+       op0 = convert_modes (target_mode, mode, op0, 0);
  
        return op0;
      }
@@ -2996,10 +4099,13 @@ emit_store_flag (target, code, op0, op1, mode, unsignedp, normalizep)
         }
      }
  
-  if (last)
-    delete_insns_since (last);
+  delete_insns_since (last);
  
-  subtarget = target_mode == mode ? target : 0;
+  /* If expensive optimizations, use different pseudo registers for each
+     insn, instead of reusing the same pseudo.  This leads to better CSE,
+     but slows down the compiler, since there are more pseudos.  */
+  subtarget = (!flag_expensive_optimizations
+              && (target_mode == mode)) ? target : NULL_RTX;
  
    /* If we reached here, we can't do this with a scc insn.  However, there
       are some comparisons that can be done directly.  For example, if
@@ -3046,7 +4152,7 @@ emit_store_flag (target, code, op0, op1, mode, unsignedp, normalizep)
         normalizep = STORE_FLAG_VALUE;
  
        else if (GET_MODE_BITSIZE (mode) <= HOST_BITS_PER_WIDE_INT
-              && (STORE_FLAG_VALUE
+              && ((STORE_FLAG_VALUE & GET_MODE_MASK (mode))
                    == (HOST_WIDE_INT) 1 << (GET_MODE_BITSIZE (mode) - 1)))
         ;
        else
@@ -3108,9 +4214,9 @@ emit_store_flag (target, code, op0, op1, mode, unsignedp, normalizep)
         tem = expand_unop (mode, ffs_optab, op0, subtarget, 1);
        else if (GET_MODE_SIZE (mode) < UNITS_PER_WORD)
         {
-         mode = word_mode;
           op0 = protect_from_queue (op0, 0);
-         tem = convert_to_mode (mode, op0, 1);
+         tem = convert_modes (word_mode, mode, op0, 1);
+         mode = word_mode;
         }
  
        if (tem != 0)
@@ -3144,21 +4250,65 @@ emit_store_flag (target, code, op0, op1, mode, unsignedp, normalizep)
    if (tem && normalizep)
      tem = expand_shift (RSHIFT_EXPR, mode, tem,
                         size_int (GET_MODE_BITSIZE (mode) - 1),
-                       tem, normalizep == 1);
+                       subtarget, normalizep == 1);
  
-  if (tem && GET_MODE (tem) != target_mode)
+  if (tem)
      {
-      convert_move (target, tem, 0);
-      tem = target;
+      if (GET_MODE (tem) != target_mode)
+       {
+         convert_move (target, tem, 0);
+         tem = target;
+       }
+      else if (!subtarget)
+       {
+         emit_move_insn (target, tem);
+         tem = target;
+       }
      }
-
-  if (tem == 0)
+  else
      delete_insns_since (last);
  
    return tem;
  }
-  emit_jump_insn ((*bcc_gen_fctn[(int) code]) (label));
+
+/* Like emit_store_flag, but always succeeds.  */
+
+rtx
+emit_store_flag_force (target, code, op0, op1, mode, unsignedp, normalizep)
+     rtx target;
+     enum rtx_code code;
+     rtx op0, op1;
+     enum machine_mode mode;
+     int unsignedp;
+     int normalizep;
+{
+  rtx tem, label;
+
+  /* First see if emit_store_flag can do the job.  */
+  tem = emit_store_flag (target, code, op0, op1, mode, unsignedp, normalizep);
+  if (tem != 0)
+    return tem;
+
+  if (normalizep == 0)
+    normalizep = 1;
+
+  /* If this failed, we have to do this with set/compare/jump/set code.  */
+
+  if (GET_CODE (target) != REG
+      || reg_mentioned_p (target, op0) || reg_mentioned_p (target, op1))
+    target = gen_reg_rtx (GET_MODE (target));
+
    emit_move_insn (target, const1_rtx);
+  tem = compare_from_rtx (op0, op1, code, unsignedp, mode, NULL_RTX, 0);
+  if (GET_CODE (tem) == CONST_INT)
+    return tem;
+
+  label = gen_label_rtx ();
+  if (bcc_gen_fctn[(int) code] == 0)
+    abort ();
+
+  emit_jump_insn ((*bcc_gen_fctn[(int) code]) (label));
+  emit_move_insn (target, const0_rtx);
    emit_label (label);
  
    return target;