./:

[pf3gnuchains/gcc-fork.git] / gcc / expmed.c
diff --git a/gcc/expmed.c b/gcc/expmed.c

index 84a709a..16f7415 100644 (file)
--- a/gcc/expmed.c
+++ b/gcc/expmed.c
@@ -1,7 +1,7 @@
  /* Medium-level subroutines: convert bit-field store and extract
     and shifts, multiplies and divides to rtl instructions.
     Copyright (C) 1987, 1988, 1989, 1992, 1993, 1994, 1995, 1996, 1997, 1998,
-   1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007
+   1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009
     Free Software Foundation, Inc.
  
  This file is part of GCC.
@@ -64,8 +64,8 @@ static rtx expand_sdiv_pow2 (enum machine_mode, rtx, HOST_WIDE_INT);
     Usually, this will mean that the MD file will emit non-branch
     sequences.  */
  
-static bool sdiv_pow2_cheap[NUM_MACHINE_MODES];
-static bool smod_pow2_cheap[NUM_MACHINE_MODES];
+static bool sdiv_pow2_cheap[2][NUM_MACHINE_MODES];
+static bool smod_pow2_cheap[2][NUM_MACHINE_MODES];
  
  #ifndef SLOW_UNALIGNED_ACCESS
  #define SLOW_UNALIGNED_ACCESS(MODE, ALIGN) STRICT_ALIGNMENT
@@ -98,17 +98,18 @@ static bool smod_pow2_cheap[NUM_MACHINE_MODES];
  
  /* Cost of various pieces of RTL.  Note that some of these are indexed by
     shift count and some by mode.  */
-static int zero_cost;
-static int add_cost[NUM_MACHINE_MODES];
-static int neg_cost[NUM_MACHINE_MODES];
-static int shift_cost[NUM_MACHINE_MODES][MAX_BITS_PER_WORD];
-static int shiftadd_cost[NUM_MACHINE_MODES][MAX_BITS_PER_WORD];
-static int shiftsub_cost[NUM_MACHINE_MODES][MAX_BITS_PER_WORD];
-static int mul_cost[NUM_MACHINE_MODES];
-static int sdiv_cost[NUM_MACHINE_MODES];
-static int udiv_cost[NUM_MACHINE_MODES];
-static int mul_widen_cost[NUM_MACHINE_MODES];
-static int mul_highpart_cost[NUM_MACHINE_MODES];
+static int zero_cost[2];
+static int add_cost[2][NUM_MACHINE_MODES];
+static int neg_cost[2][NUM_MACHINE_MODES];
+static int shift_cost[2][NUM_MACHINE_MODES][MAX_BITS_PER_WORD];
+static int shiftadd_cost[2][NUM_MACHINE_MODES][MAX_BITS_PER_WORD];
+static int shiftsub0_cost[2][NUM_MACHINE_MODES][MAX_BITS_PER_WORD];
+static int shiftsub1_cost[2][NUM_MACHINE_MODES][MAX_BITS_PER_WORD];
+static int mul_cost[2][NUM_MACHINE_MODES];
+static int sdiv_cost[2][NUM_MACHINE_MODES];
+static int udiv_cost[2][NUM_MACHINE_MODES];
+static int mul_widen_cost[2][NUM_MACHINE_MODES];
+static int mul_highpart_cost[2][NUM_MACHINE_MODES];
  
  void
  init_expmed (void)
@@ -130,22 +131,22 @@ init_expmed (void)
      struct rtx_def shift;      rtunion shift_fld1;
      struct rtx_def shift_mult; rtunion shift_mult_fld1;
      struct rtx_def shift_add;  rtunion shift_add_fld1;
-    struct rtx_def shift_sub;  rtunion shift_sub_fld1;
+    struct rtx_def shift_sub0; rtunion shift_sub0_fld1;
+    struct rtx_def shift_sub1; rtunion shift_sub1_fld1;
    } all;
  
    rtx pow2[MAX_BITS_PER_WORD];
    rtx cint[MAX_BITS_PER_WORD];
    int m, n;
    enum machine_mode mode, wider_mode;
+  int speed;
  
-  zero_cost = rtx_cost (const0_rtx, 0);
  
    for (m = 1; m < MAX_BITS_PER_WORD; m++)
      {
        pow2[m] = GEN_INT ((HOST_WIDE_INT) 1 << m);
        cint[m] = GEN_INT (m);
      }
-
    memset (&all, 0, sizeof all);
  
    PUT_CODE (&all.reg, REG);
@@ -202,65 +203,81 @@ init_expmed (void)
    XEXP (&all.shift_add, 0) = &all.shift_mult;
    XEXP (&all.shift_add, 1) = &all.reg;
  
-  PUT_CODE (&all.shift_sub, MINUS);
-  XEXP (&all.shift_sub, 0) = &all.shift_mult;
-  XEXP (&all.shift_sub, 1) = &all.reg;
+  PUT_CODE (&all.shift_sub0, MINUS);
+  XEXP (&all.shift_sub0, 0) = &all.shift_mult;
+  XEXP (&all.shift_sub0, 1) = &all.reg;
  
-  for (mode = GET_CLASS_NARROWEST_MODE (MODE_INT);
-       mode != VOIDmode;
-       mode = GET_MODE_WIDER_MODE (mode))
+  PUT_CODE (&all.shift_sub1, MINUS);
+  XEXP (&all.shift_sub1, 0) = &all.reg;
+  XEXP (&all.shift_sub1, 1) = &all.shift_mult;
+
+  for (speed = 0; speed < 2; speed++)
      {
-      PUT_MODE (&all.reg, mode);
-      PUT_MODE (&all.plus, mode);
-      PUT_MODE (&all.neg, mode);
-      PUT_MODE (&all.mult, mode);
-      PUT_MODE (&all.sdiv, mode);
-      PUT_MODE (&all.udiv, mode);
-      PUT_MODE (&all.sdiv_32, mode);
-      PUT_MODE (&all.smod_32, mode);
-      PUT_MODE (&all.wide_trunc, mode);
-      PUT_MODE (&all.shift, mode);
-      PUT_MODE (&all.shift_mult, mode);
-      PUT_MODE (&all.shift_add, mode);
-      PUT_MODE (&all.shift_sub, mode);
-
-      add_cost[mode] = rtx_cost (&all.plus, SET);
-      neg_cost[mode] = rtx_cost (&all.neg, SET);
-      mul_cost[mode] = rtx_cost (&all.mult, SET);
-      sdiv_cost[mode] = rtx_cost (&all.sdiv, SET);
-      udiv_cost[mode] = rtx_cost (&all.udiv, SET);
-
-      sdiv_pow2_cheap[mode] = (rtx_cost (&all.sdiv_32, SET)
-                              <= 2 * add_cost[mode]);
-      smod_pow2_cheap[mode] = (rtx_cost (&all.smod_32, SET)
-                              <= 4 * add_cost[mode]);
-
-      wider_mode = GET_MODE_WIDER_MODE (mode);
-      if (wider_mode != VOIDmode)
-       {
-         PUT_MODE (&all.zext, wider_mode);
-         PUT_MODE (&all.wide_mult, wider_mode);
-         PUT_MODE (&all.wide_lshr, wider_mode);
-         XEXP (&all.wide_lshr, 1) = GEN_INT (GET_MODE_BITSIZE (mode));
+      crtl->maybe_hot_insn_p = speed;
+      zero_cost[speed] = rtx_cost (const0_rtx, SET, speed);
  
-         mul_widen_cost[wider_mode] = rtx_cost (&all.wide_mult, SET);
-         mul_highpart_cost[mode] = rtx_cost (&all.wide_trunc, SET);
-       }
+      for (mode = GET_CLASS_NARROWEST_MODE (MODE_INT);
+          mode != VOIDmode;
+          mode = GET_MODE_WIDER_MODE (mode))
+       {
+         PUT_MODE (&all.reg, mode);
+         PUT_MODE (&all.plus, mode);
+         PUT_MODE (&all.neg, mode);
+         PUT_MODE (&all.mult, mode);
+         PUT_MODE (&all.sdiv, mode);
+         PUT_MODE (&all.udiv, mode);
+         PUT_MODE (&all.sdiv_32, mode);
+         PUT_MODE (&all.smod_32, mode);
+         PUT_MODE (&all.wide_trunc, mode);
+         PUT_MODE (&all.shift, mode);
+         PUT_MODE (&all.shift_mult, mode);
+         PUT_MODE (&all.shift_add, mode);
+         PUT_MODE (&all.shift_sub0, mode);
+         PUT_MODE (&all.shift_sub1, mode);
+
+         add_cost[speed][mode] = rtx_cost (&all.plus, SET, speed);
+         neg_cost[speed][mode] = rtx_cost (&all.neg, SET, speed);
+         mul_cost[speed][mode] = rtx_cost (&all.mult, SET, speed);
+         sdiv_cost[speed][mode] = rtx_cost (&all.sdiv, SET, speed);
+         udiv_cost[speed][mode] = rtx_cost (&all.udiv, SET, speed);
+
+         sdiv_pow2_cheap[speed][mode] = (rtx_cost (&all.sdiv_32, SET, speed)
+                                         <= 2 * add_cost[speed][mode]);
+         smod_pow2_cheap[speed][mode] = (rtx_cost (&all.smod_32, SET, speed)
+                                         <= 4 * add_cost[speed][mode]);
+
+         wider_mode = GET_MODE_WIDER_MODE (mode);
+         if (wider_mode != VOIDmode)
+           {
+             PUT_MODE (&all.zext, wider_mode);
+             PUT_MODE (&all.wide_mult, wider_mode);
+             PUT_MODE (&all.wide_lshr, wider_mode);
+             XEXP (&all.wide_lshr, 1) = GEN_INT (GET_MODE_BITSIZE (mode));
+
+             mul_widen_cost[speed][wider_mode]
+               = rtx_cost (&all.wide_mult, SET, speed);
+             mul_highpart_cost[speed][mode]
+               = rtx_cost (&all.wide_trunc, SET, speed);
+           }
  
-      shift_cost[mode][0] = 0;
-      shiftadd_cost[mode][0] = shiftsub_cost[mode][0] = add_cost[mode];
+         shift_cost[speed][mode][0] = 0;
+         shiftadd_cost[speed][mode][0] = shiftsub0_cost[speed][mode][0]
+           = shiftsub1_cost[speed][mode][0] = add_cost[speed][mode];
  
-      n = MIN (MAX_BITS_PER_WORD, GET_MODE_BITSIZE (mode));
-      for (m = 1; m < n; m++)
-       {
-         XEXP (&all.shift, 1) = cint[m];
-         XEXP (&all.shift_mult, 1) = pow2[m];
+         n = MIN (MAX_BITS_PER_WORD, GET_MODE_BITSIZE (mode));
+         for (m = 1; m < n; m++)
+           {
+             XEXP (&all.shift, 1) = cint[m];
+             XEXP (&all.shift_mult, 1) = pow2[m];
  
-         shift_cost[mode][m] = rtx_cost (&all.shift, SET);
-         shiftadd_cost[mode][m] = rtx_cost (&all.shift_add, SET);
-         shiftsub_cost[mode][m] = rtx_cost (&all.shift_sub, SET);
+             shift_cost[speed][mode][m] = rtx_cost (&all.shift, SET, speed);
+             shiftadd_cost[speed][mode][m] = rtx_cost (&all.shift_add, SET, speed);
+             shiftsub0_cost[speed][mode][m] = rtx_cost (&all.shift_sub0, SET, speed);
+             shiftsub1_cost[speed][mode][m] = rtx_cost (&all.shift_sub1, SET, speed);
+           }
         }
      }
+  default_rtl_profile ();
  }
  
  /* Return an rtx representing minus the value of X.
@@ -521,6 +538,9 @@ store_bit_field_1 (rtx str_rtx, unsigned HOST_WIDE_INT bitsize,
           != CODE_FOR_nothing))
      {
        int icode = optab_handler (movstrict_optab, fieldmode)->insn_code;
+      rtx insn;
+      rtx start = get_last_insn ();
+      rtx arg0 = op0;
  
        /* Get appropriate low part of the value being stored.  */
        if (GET_CODE (value) == CONST_INT || REG_P (value))
@@ -541,16 +561,20 @@ store_bit_field_1 (rtx str_rtx, unsigned HOST_WIDE_INT bitsize,
           gcc_assert (GET_MODE (SUBREG_REG (op0)) == fieldmode
                       || GET_MODE_CLASS (fieldmode) == MODE_INT
                       || GET_MODE_CLASS (fieldmode) == MODE_PARTIAL_INT);
-         op0 = SUBREG_REG (op0);
+         arg0 = SUBREG_REG (op0);
         }
  
-      emit_insn (GEN_FCN (icode)
-                (gen_rtx_SUBREG (fieldmode, op0,
+      insn = (GEN_FCN (icode)
+                (gen_rtx_SUBREG (fieldmode, arg0,
                                   (bitnum % BITS_PER_WORD) / BITS_PER_UNIT
                                   + (offset * UNITS_PER_WORD)),
                                   value));
-
-      return true;
+      if (insn)
+       {
+         emit_insn (insn);
+         return true;
+       }
+      delete_insns_since (start);
      }
  
    /* Handle fields bigger than a word.  */
@@ -733,6 +757,16 @@ store_bit_field_1 (rtx str_rtx, unsigned HOST_WIDE_INT bitsize,
        if (pat)
         {
           emit_insn (pat);
+
+         /* If the mode of the insertion is wider than the mode of the
+            target register we created a paradoxical subreg for the
+            target.  Truncate the paradoxical subreg of the target to
+            itself properly.  */
+         if (!TRULY_NOOP_TRUNCATION (GET_MODE_BITSIZE (GET_MODE (op0)),
+                                     GET_MODE_BITSIZE (op_mode))
+             && (REG_P (xop0)
+                 || GET_CODE (xop0) == SUBREG))
+             convert_move (op0, xop0, true);
           return true;
         }
        delete_insns_since (last);
@@ -934,13 +968,7 @@ store_fixed_bit_field (rtx op0, unsigned HOST_WIDE_INT offset,
                       && bitpos + bitsize != GET_MODE_BITSIZE (mode));
  
        if (GET_MODE (value) != mode)
-       {
-         if ((REG_P (value) || GET_CODE (value) == SUBREG)
-             && GET_MODE_SIZE (mode) < GET_MODE_SIZE (GET_MODE (value)))
-           value = gen_lowpart (mode, value);
-         else
-           value = convert_to_mode (mode, value, 1);
-       }
+       value = convert_to_mode (mode, value, 1);
  
        if (must_and)
         value = expand_binop (mode, and_optab, value,
@@ -976,7 +1004,10 @@ store_fixed_bit_field (rtx op0, unsigned HOST_WIDE_INT offset,
      }
  
    if (op0 != temp)
-    emit_move_insn (op0, temp);
+    {
+      op0 = copy_rtx (op0);
+      emit_move_insn (op0, temp);
+    }
  }
  \f
  /* Store a bit field that is split across multiple accessible memory objects.
@@ -1195,7 +1226,7 @@ extract_bit_field_1 (rtx str_rtx, unsigned HOST_WIDE_INT bitsize,
  
        for (; new_mode != VOIDmode ; new_mode = GET_MODE_WIDER_MODE (new_mode))
         if (GET_MODE_NUNITS (new_mode) == nunits
-           && GET_MODE_INNER (new_mode) == tmode
+           && GET_MODE_SIZE (new_mode) == GET_MODE_SIZE (GET_MODE (op0))
             && targetm.vector_mode_supported_p (new_mode))
           break;
        if (new_mode != VOIDmode)
@@ -1266,9 +1297,8 @@ extract_bit_field_1 (rtx str_rtx, unsigned HOST_WIDE_INT bitsize,
        {
         if (MEM_P (op0))
           op0 = adjust_address (op0, imode, 0);
-       else
+       else if (imode != BLKmode)
           {
-           gcc_assert (imode != BLKmode);
             op0 = gen_lowpart (imode, op0);
  
             /* If we got a SUBREG, force it into a register since we
@@ -1276,6 +1306,24 @@ extract_bit_field_1 (rtx str_rtx, unsigned HOST_WIDE_INT bitsize,
             if (GET_CODE (op0) == SUBREG)
               op0 = force_reg (imode, op0);
           }
+       else if (REG_P (op0))
+         {
+           rtx reg, subreg;
+           imode = smallest_mode_for_size (GET_MODE_BITSIZE (GET_MODE (op0)),
+                                           MODE_INT);
+           reg = gen_reg_rtx (imode);
+           subreg = gen_lowpart_SUBREG (GET_MODE (op0), reg);
+           emit_move_insn (subreg, op0);
+           op0 = reg;
+           bitnum += SUBREG_BYTE (subreg) * BITS_PER_UNIT;
+         }
+       else
+         {
+           rtx mem = assign_stack_temp (GET_MODE (op0),
+                                        GET_MODE_SIZE (GET_MODE (op0)), 0);
+           emit_move_insn (mem, op0);
+           op0 = adjust_address (mem, BLKmode, 0);
+         }
        }
    }
  
@@ -1330,7 +1378,7 @@ extract_bit_field_1 (rtx str_rtx, unsigned HOST_WIDE_INT bitsize,
                ? bitpos + bitsize == BITS_PER_WORD
                : bitpos == 0)))
        && ((!MEM_P (op0)
-          && TRULY_NOOP_TRUNCATION (GET_MODE_BITSIZE (mode),
+          && TRULY_NOOP_TRUNCATION (GET_MODE_BITSIZE (mode1),
                                      GET_MODE_BITSIZE (GET_MODE (op0)))
            && GET_MODE_SIZE (mode1) != 0
            && byte_offset % GET_MODE_SIZE (mode1) == 0)
@@ -1339,18 +1387,15 @@ extract_bit_field_1 (rtx str_rtx, unsigned HOST_WIDE_INT bitsize,
                   || (offset * BITS_PER_UNIT % bitsize == 0
                       && MEM_ALIGN (op0) % bitsize == 0)))))
      {
-      if (mode1 != GET_MODE (op0))
+      if (MEM_P (op0))
+       op0 = adjust_address (op0, mode1, offset);
+      else if (mode1 != GET_MODE (op0))
         {
-         if (MEM_P (op0))
-           op0 = adjust_address (op0, mode1, offset);
-         else
-           {
-             rtx sub = simplify_gen_subreg (mode1, op0, GET_MODE (op0),
-                                            byte_offset);
-             if (sub == NULL)
-               goto no_subreg_mode_swap;
-             op0 = sub;
-           }
+         rtx sub = simplify_gen_subreg (mode1, op0, GET_MODE (op0),
+                                        byte_offset);
+         if (sub == NULL)
+           goto no_subreg_mode_swap;
+         op0 = sub;
         }
        if (mode1 != mode)
         return convert_to_mode (tmode, op0, unsignedp);
@@ -1374,7 +1419,7 @@ extract_bit_field_1 (rtx str_rtx, unsigned HOST_WIDE_INT bitsize,
         target = gen_reg_rtx (mode);
  
        /* Indicate for flow that the entire target reg is being set.  */
-      emit_insn (gen_rtx_CLOBBER (VOIDmode, target));
+      emit_clobber (target);
  
        for (i = 0; i < nwords; i++)
         {
@@ -1507,7 +1552,13 @@ extract_bit_field_1 (rtx str_rtx, unsigned HOST_WIDE_INT bitsize,
  
        if (GET_MODE (xtarget) != ext_mode)
         {
-         if (REG_P (xtarget))
+         /* Don't use LHS paradoxical subreg if explicit truncation is needed
+            between the mode of the extraction (word_mode) and the target
+            mode.  Instead, create a temporary and use convert_move to set
+            the target.  */
+         if (REG_P (xtarget)
+             && TRULY_NOOP_TRUNCATION (GET_MODE_BITSIZE (GET_MODE (xtarget)),
+                                       GET_MODE_BITSIZE (ext_mode)))
             {
               xtarget = gen_lowpart (ext_mode, xtarget);
               if (GET_MODE_SIZE (ext_mode)
@@ -1838,152 +1889,6 @@ lshift_value (enum machine_mode mode, rtx value, int bitpos, int bitsize)
    return immed_double_const (low, high, mode);
  }
  \f
-/* Extract a bit field from a memory by forcing the alignment of the
-   memory.  This efficient only if the field spans at least 4 boundaries.
-
-   OP0 is the MEM.
-   BITSIZE is the field width; BITPOS is the position of the first bit.
-   UNSIGNEDP is true if the result should be zero-extended.  */
-
-static rtx
-extract_force_align_mem_bit_field (rtx op0, unsigned HOST_WIDE_INT bitsize,
-                                  unsigned HOST_WIDE_INT bitpos,
-                                  int unsignedp)
-{
-  enum machine_mode mode, dmode;
-  unsigned int m_bitsize, m_size;
-  unsigned int sign_shift_up, sign_shift_dn;
-  rtx base, a1, a2, v1, v2, comb, shift, result, start;
-
-  /* Choose a mode that will fit BITSIZE.  */
-  mode = smallest_mode_for_size (bitsize, MODE_INT);
-  m_size = GET_MODE_SIZE (mode);
-  m_bitsize = GET_MODE_BITSIZE (mode);
-
-  /* Choose a mode twice as wide.  Fail if no such mode exists.  */
-  dmode = mode_for_size (m_bitsize * 2, MODE_INT, false);
-  if (dmode == BLKmode)
-    return NULL;
-
-  do_pending_stack_adjust ();
-  start = get_last_insn ();
-
-  /* At the end, we'll need an additional shift to deal with sign/zero
-     extension.  By default this will be a left+right shift of the
-     appropriate size.  But we may be able to eliminate one of them.  */
-  sign_shift_up = sign_shift_dn = m_bitsize - bitsize;
-
-  if (STRICT_ALIGNMENT)
-    {
-      base = plus_constant (XEXP (op0, 0), bitpos / BITS_PER_UNIT);
-      bitpos %= BITS_PER_UNIT;
-
-      /* We load two values to be concatenate.  There's an edge condition
-        that bears notice -- an aligned value at the end of a page can
-        only load one value lest we segfault.  So the two values we load
-        are at "base & -size" and "(base + size - 1) & -size".  If base
-        is unaligned, the addresses will be aligned and sequential; if
-        base is aligned, the addresses will both be equal to base.  */
-
-      a1 = expand_simple_binop (Pmode, AND, force_operand (base, NULL),
-                               GEN_INT (-(HOST_WIDE_INT)m_size),
-                               NULL, true, OPTAB_LIB_WIDEN);
-      mark_reg_pointer (a1, m_bitsize);
-      v1 = gen_rtx_MEM (mode, a1);
-      set_mem_align (v1, m_bitsize);
-      v1 = force_reg (mode, validize_mem (v1));
-
-      a2 = plus_constant (base, GET_MODE_SIZE (mode) - 1);
-      a2 = expand_simple_binop (Pmode, AND, force_operand (a2, NULL),
-                               GEN_INT (-(HOST_WIDE_INT)m_size),
-                               NULL, true, OPTAB_LIB_WIDEN);
-      v2 = gen_rtx_MEM (mode, a2);
-      set_mem_align (v2, m_bitsize);
-      v2 = force_reg (mode, validize_mem (v2));
-
-      /* Combine these two values into a double-word value.  */
-      if (m_bitsize == BITS_PER_WORD)
-       {
-         comb = gen_reg_rtx (dmode);
-         emit_insn (gen_rtx_CLOBBER (VOIDmode, comb));
-         emit_move_insn (gen_rtx_SUBREG (mode, comb, 0), v1);
-         emit_move_insn (gen_rtx_SUBREG (mode, comb, m_size), v2);
-       }
-      else
-       {
-         if (BYTES_BIG_ENDIAN)
-           comb = v1, v1 = v2, v2 = comb;
-         v1 = convert_modes (dmode, mode, v1, true);
-         if (v1 == NULL)
-           goto fail;
-         v2 = convert_modes (dmode, mode, v2, true);
-         v2 = expand_simple_binop (dmode, ASHIFT, v2, GEN_INT (m_bitsize),
-                                   NULL, true, OPTAB_LIB_WIDEN);
-         if (v2 == NULL)
-           goto fail;
-         comb = expand_simple_binop (dmode, IOR, v1, v2, NULL,
-                                     true, OPTAB_LIB_WIDEN);
-         if (comb == NULL)
-           goto fail;
-       }
-
-      shift = expand_simple_binop (Pmode, AND, base, GEN_INT (m_size - 1),
-                                  NULL, true, OPTAB_LIB_WIDEN);
-      shift = expand_mult (Pmode, shift, GEN_INT (BITS_PER_UNIT), NULL, 1);
-
-      if (bitpos != 0)
-       {
-         if (sign_shift_up <= bitpos)
-           bitpos -= sign_shift_up, sign_shift_up = 0;
-         shift = expand_simple_binop (Pmode, PLUS, shift, GEN_INT (bitpos),
-                                      NULL, true, OPTAB_LIB_WIDEN);
-       }
-    }
-  else
-    {
-      unsigned HOST_WIDE_INT offset = bitpos / BITS_PER_UNIT;
-      bitpos %= BITS_PER_UNIT;
-
-      /* When strict alignment is not required, we can just load directly
-        from memory without masking.  If the remaining BITPOS offset is
-        small enough, we may be able to do all operations in MODE as 
-        opposed to DMODE.  */
-      if (bitpos + bitsize <= m_bitsize)
-       dmode = mode;
-      comb = adjust_address (op0, dmode, offset);
-
-      if (sign_shift_up <= bitpos)
-       bitpos -= sign_shift_up, sign_shift_up = 0;
-      shift = GEN_INT (bitpos);
-    }
-
-  /* Shift down the double-word such that the requested value is at bit 0.  */
-  if (shift != const0_rtx)
-    comb = expand_simple_binop (dmode, unsignedp ? LSHIFTRT : ASHIFTRT,
-                               comb, shift, NULL, unsignedp, OPTAB_LIB_WIDEN);
-  if (comb == NULL)
-    goto fail;
-
-  /* If the field exactly matches MODE, then all we need to do is return the
-     lowpart.  Otherwise, shift to get the sign bits set properly.  */
-  result = force_reg (mode, gen_lowpart (mode, comb));
-
-  if (sign_shift_up)
-    result = expand_simple_binop (mode, ASHIFT, result,
-                                 GEN_INT (sign_shift_up),
-                                 NULL_RTX, 0, OPTAB_LIB_WIDEN);
-  if (sign_shift_dn)
-    result = expand_simple_binop (mode, unsignedp ? LSHIFTRT : ASHIFTRT,
-                                 result, GEN_INT (sign_shift_dn),
-                                 NULL_RTX, 0, OPTAB_LIB_WIDEN);
-
-  return result;
-
- fail:
-  delete_insns_since (start);
-  return NULL;
-}
-
  /* Extract a bit field that is split across two words
     and return an RTX for the result.
  
@@ -2005,16 +1910,7 @@ extract_split_bit_field (rtx op0, unsigned HOST_WIDE_INT bitsize,
    if (REG_P (op0) || GET_CODE (op0) == SUBREG)
      unit = BITS_PER_WORD;
    else
-    {
-      unit = MIN (MEM_ALIGN (op0), BITS_PER_WORD);
-      if (0 && bitsize / unit > 2)
-       {
-         rtx tmp = extract_force_align_mem_bit_field (op0, bitsize, bitpos,
-                                                      unsignedp);
-         if (tmp)
-           return tmp;
-       }
-    }
+    unit = MIN (MEM_ALIGN (op0), BITS_PER_WORD);
  
    while (bitsdone < bitsize)
      {
@@ -2101,6 +1997,80 @@ extract_split_bit_field (rtx op0, unsigned HOST_WIDE_INT bitsize,
                        NULL_RTX, 0);
  }
  \f
+/* Try to read the low bits of SRC as an rvalue of mode MODE, preserving
+   the bit pattern.  SRC_MODE is the mode of SRC; if this is smaller than
+   MODE, fill the upper bits with zeros.  Fail if the layout of either
+   mode is unknown (as for CC modes) or if the extraction would involve
+   unprofitable mode punning.  Return the value on success, otherwise
+   return null.
+
+   This is different from gen_lowpart* in these respects:
+
+     - the returned value must always be considered an rvalue
+
+     - when MODE is wider than SRC_MODE, the extraction involves
+       a zero extension
+
+     - when MODE is smaller than SRC_MODE, the extraction involves
+       a truncation (and is thus subject to TRULY_NOOP_TRUNCATION).
+
+   In other words, this routine performs a computation, whereas the
+   gen_lowpart* routines are conceptually lvalue or rvalue subreg
+   operations.  */
+
+rtx
+extract_low_bits (enum machine_mode mode, enum machine_mode src_mode, rtx src)
+{
+  enum machine_mode int_mode, src_int_mode;
+
+  if (mode == src_mode)
+    return src;  /* Modes already match; SRC is returned unchanged.  */
+
+  if (CONSTANT_P (src))
+    {
+      /* simplify_gen_subreg can't be used here: if simplify_subreg
+        fails, it will happily create (subreg (symbol_ref)) or similar
+        invalid SUBREGs.  So simplify first, then validate by hand.  */
+      unsigned int byte = subreg_lowpart_offset (mode, src_mode);
+      rtx ret = simplify_subreg (mode, src, src_mode, byte);
+      if (ret)
+       return ret;
+
+      if (GET_MODE (src) == VOIDmode
+         || !validate_subreg (mode, src_mode, src, byte))
+       return NULL_RTX;  /* VOIDmode constant or invalid SUBREG.  */
+
+      src = force_reg (GET_MODE (src), src);
+      return gen_rtx_SUBREG (mode, src, byte);
+    }
+
+  if (GET_MODE_CLASS (mode) == MODE_CC || GET_MODE_CLASS (src_mode) == MODE_CC)
+    return NULL_RTX;  /* Layout of CC modes is unknown.  */
+
+  if (GET_MODE_BITSIZE (mode) == GET_MODE_BITSIZE (src_mode)
+      && MODES_TIEABLE_P (mode, src_mode))
+    {
+      rtx x = gen_lowpart_common (mode, src);
+      if (x)
+        return x;
+    }
+
+  src_int_mode = int_mode_for_mode (src_mode);
+  int_mode = int_mode_for_mode (mode);
+  if (src_int_mode == BLKmode || int_mode == BLKmode)
+    return NULL_RTX;  /* Presumably no matching integer mode.  */
+
+  if (!MODES_TIEABLE_P (src_int_mode, src_mode))
+    return NULL_RTX;
+  if (!MODES_TIEABLE_P (int_mode, mode))
+    return NULL_RTX;
+
+  src = gen_lowpart (src_int_mode, src);
+  src = convert_modes (int_mode, src_int_mode, src, true);
+  src = gen_lowpart (mode, src);
+  return src;
+}
+\f
  /* Add INC into TARGET.  */
  
  void
@@ -2139,14 +2109,33 @@ expand_shift (enum tree_code code, enum machine_mode mode, rtx shifted,
    rtx op1, temp = 0;
    int left = (code == LSHIFT_EXPR || code == LROTATE_EXPR);
    int rotate = (code == LROTATE_EXPR || code == RROTATE_EXPR);
-  int try;
+  optab lshift_optab = ashl_optab;
+  optab rshift_arith_optab = ashr_optab;
+  optab rshift_uns_optab = lshr_optab;
+  optab lrotate_optab = rotl_optab;
+  optab rrotate_optab = rotr_optab;
+  enum machine_mode op1_mode;
+  int attempt;
+  bool speed = optimize_insn_for_speed_p ();
+
+  op1 = expand_normal (amount);
+  op1_mode = GET_MODE (op1);
+
+  /* Determine whether the shift/rotate amount is a vector, or scalar.  If the
+     shift amount is a vector, use the vector/vector shift patterns.  */
+  if (VECTOR_MODE_P (mode) && VECTOR_MODE_P (op1_mode))
+    {
+      lshift_optab = vashl_optab;
+      rshift_arith_optab = vashr_optab;
+      rshift_uns_optab = vlshr_optab;
+      lrotate_optab = vrotl_optab;
+      rrotate_optab = vrotr_optab;
+    }
  
    /* Previously detected shift-counts computed by NEGATE_EXPR
       and shifted in the other direction; but that does not work
       on all machines.  */
  
-  op1 = expand_normal (amount);
-
    if (SHIFT_COUNT_TRUNCATED)
      {
        if (GET_CODE (op1) == CONST_INT
@@ -2155,7 +2144,8 @@ expand_shift (enum tree_code code, enum machine_mode mode, rtx shifted,
         op1 = GEN_INT ((unsigned HOST_WIDE_INT) INTVAL (op1)
                        % GET_MODE_BITSIZE (mode));
        else if (GET_CODE (op1) == SUBREG
-              && subreg_lowpart_p (op1))
+              && subreg_lowpart_p (op1)
+              && INTEGRAL_MODE_P (GET_MODE (SUBREG_REG (op1))))
         op1 = SUBREG_REG (op1);
      }
  
@@ -2169,8 +2159,8 @@ expand_shift (enum tree_code code, enum machine_mode mode, rtx shifted,
        && INTVAL (op1) > 0
        && INTVAL (op1) < GET_MODE_BITSIZE (mode)
        && INTVAL (op1) < MAX_BITS_PER_WORD
-      && shift_cost[mode][INTVAL (op1)] > INTVAL (op1) * add_cost[mode]
-      && shift_cost[mode][INTVAL (op1)] != MAX_COST)
+      && shift_cost[speed][mode][INTVAL (op1)] > INTVAL (op1) * add_cost[speed][mode]
+      && shift_cost[speed][mode][INTVAL (op1)] != MAX_COST)
      {
        int i;
        for (i = 0; i < INTVAL (op1); i++)
@@ -2182,13 +2172,13 @@ expand_shift (enum tree_code code, enum machine_mode mode, rtx shifted,
        return shifted;
      }
  
-  for (try = 0; temp == 0 && try < 3; try++)
+  for (attempt = 0; temp == 0 && attempt < 3; attempt++)
      {
        enum optab_methods methods;
  
-      if (try == 0)
+      if (attempt == 0)
         methods = OPTAB_DIRECT;
-      else if (try == 1)
+      else if (attempt == 1)
         methods = OPTAB_WIDEN;
        else
         methods = OPTAB_LIB_WIDEN;
@@ -2236,12 +2226,12 @@ expand_shift (enum tree_code code, enum machine_mode mode, rtx shifted,
             }
  
           temp = expand_binop (mode,
-                              left ? rotl_optab : rotr_optab,
+                              left ? lrotate_optab : rrotate_optab,
                                shifted, op1, target, unsignedp, methods);
         }
        else if (unsignedp)
         temp = expand_binop (mode,
-                            left ? ashl_optab : lshr_optab,
+                            left ? lshift_optab : rshift_uns_optab,
                              shifted, op1, target, unsignedp, methods);
  
        /* Do arithmetic shifts.
@@ -2260,7 +2250,7 @@ expand_shift (enum tree_code code, enum machine_mode mode, rtx shifted,
           /* Arithmetic shift */
  
           temp = expand_binop (mode,
-                              left ? ashl_optab : ashr_optab,
+                              left ? lshift_optab : rshift_arith_optab,
                                shifted, op1, target, unsignedp, methods1);
         }
  
@@ -2364,6 +2354,9 @@ struct alg_hash_entry {
       Otherwise, the cost within which multiplication by T is
       impossible.  */
    struct mult_cost cost;
+ 
+  /* Optimized for speed?  */
+  bool speed;
  };
  
  /* The number of cache/hash entries.  */
@@ -2412,11 +2405,13 @@ synth_mult (struct algorithm *alg_out, unsigned HOST_WIDE_INT t,
    struct mult_cost best_cost;
    struct mult_cost new_limit;
    int op_cost, op_latency;
+  unsigned HOST_WIDE_INT orig_t = t;
    unsigned HOST_WIDE_INT q;
    int maxm = MIN (BITS_PER_WORD, GET_MODE_BITSIZE (mode));
    int hash_index;
    bool cache_hit = false;
    enum alg_code cache_alg = alg_zero;
+  bool speed = optimize_insn_for_speed_p ();
  
    /* Indicate that no algorithm is yet found.  If no algorithm
       is found, this value will be returned and indicate failure.  */
@@ -2444,13 +2439,13 @@ synth_mult (struct algorithm *alg_out, unsigned HOST_WIDE_INT t,
       fail now.  */
    if (t == 0)
      {
-      if (MULT_COST_LESS (cost_limit, zero_cost))
+      if (MULT_COST_LESS (cost_limit, zero_cost[speed]))
         return;
        else
         {
           alg_out->ops = 1;
-         alg_out->cost.cost = zero_cost;
-         alg_out->cost.latency = zero_cost;
+         alg_out->cost.cost = zero_cost[speed];
+         alg_out->cost.latency = zero_cost[speed];
           alg_out->op[0] = alg_zero;
           return;
         }
@@ -2458,16 +2453,18 @@ synth_mult (struct algorithm *alg_out, unsigned HOST_WIDE_INT t,
  
    /* We'll be needing a couple extra algorithm structures now.  */
  
-  alg_in = alloca (sizeof (struct algorithm));
-  best_alg = alloca (sizeof (struct algorithm));
+  alg_in = XALLOCA (struct algorithm);
+  best_alg = XALLOCA (struct algorithm);
    best_cost = *cost_limit;
  
    /* Compute the hash index.  */
-  hash_index = (t ^ (unsigned int) mode) % NUM_ALG_HASH_ENTRIES;
+  hash_index = (t ^ (unsigned int) mode ^ (speed * 256)) % NUM_ALG_HASH_ENTRIES;
  
    /* See if we already know what to do for T.  */
    if (alg_hash[hash_index].t == t
        && alg_hash[hash_index].mode == mode
+      && alg_hash[hash_index].mode == mode
+      && alg_hash[hash_index].speed == speed
        && alg_hash[hash_index].alg != alg_unknown)
      {
        cache_alg = alg_hash[hash_index].alg;
@@ -2536,10 +2533,10 @@ synth_mult (struct algorithm *alg_out, unsigned HOST_WIDE_INT t,
           q = t >> m;
           /* The function expand_shift will choose between a shift and
              a sequence of additions, so the observed cost is given as
-            MIN (m * add_cost[mode], shift_cost[mode][m]).  */
-         op_cost = m * add_cost[mode];
-         if (shift_cost[mode][m] < op_cost)
-           op_cost = shift_cost[mode][m];
+            MIN (m * add_cost[speed][mode], shift_cost[speed][mode][m]).  */
+         op_cost = m * add_cost[speed][mode];
+         if (shift_cost[speed][mode][m] < op_cost)
+           op_cost = shift_cost[speed][mode][m];
           new_limit.cost = best_cost.cost - op_cost;
           new_limit.latency = best_cost.latency - op_cost;
           synth_mult (alg_in, q, &new_limit, mode);
@@ -2554,6 +2551,38 @@ synth_mult (struct algorithm *alg_out, unsigned HOST_WIDE_INT t,
               best_alg->log[best_alg->ops] = m;
               best_alg->op[best_alg->ops] = alg_shift;
             }
+
+         /* See if treating ORIG_T as a signed number yields a better
+            sequence.  Try this sequence only for a negative ORIG_T
+            as it would be useless for a non-negative ORIG_T.  */
+         if ((HOST_WIDE_INT) orig_t < 0)
+           {
+             /* Shift ORIG_T as follows because a right shift of a
+                negative-valued signed type is implementation
+                defined.  */
+             q = ~(~orig_t >> m);
+             /* The function expand_shift will choose between a shift
+                and a sequence of additions, so the observed cost is
+                given as MIN (m * add_cost[speed][mode],
+                shift_cost[speed][mode][m]).  */
+             op_cost = m * add_cost[speed][mode];
+             if (shift_cost[speed][mode][m] < op_cost)
+               op_cost = shift_cost[speed][mode][m];
+             new_limit.cost = best_cost.cost - op_cost;
+             new_limit.latency = best_cost.latency - op_cost;
+             synth_mult (alg_in, q, &new_limit, mode);
+
+             alg_in->cost.cost += op_cost;
+             alg_in->cost.latency += op_cost;
+             if (CHEAPER_MULT_COST (&alg_in->cost, &best_cost))
+               {
+                 struct algorithm *x;
+                 best_cost = alg_in->cost;
+                 x = alg_in, alg_in = best_alg, best_alg = x;
+                 best_alg->log[best_alg->ops] = m;
+                 best_alg->op[best_alg->ops] = alg_shift;
+               }
+           }
         }
        if (cache_hit)
         goto done;
@@ -2580,7 +2609,7 @@ synth_mult (struct algorithm *alg_out, unsigned HOST_WIDE_INT t,
         {
           /* T ends with ...111.  Multiply by (T + 1) and subtract 1.  */
  
-         op_cost = add_cost[mode];
+         op_cost = add_cost[speed][mode];
           new_limit.cost = best_cost.cost - op_cost;
           new_limit.latency = best_cost.latency - op_cost;
           synth_mult (alg_in, t + 1, &new_limit, mode);
@@ -2600,7 +2629,7 @@ synth_mult (struct algorithm *alg_out, unsigned HOST_WIDE_INT t,
         {
           /* T ends with ...01 or ...011.  Multiply by (T - 1) and add 1.  */
  
-         op_cost = add_cost[mode];
+         op_cost = add_cost[speed][mode];
           new_limit.cost = best_cost.cost - op_cost;
           new_limit.latency = best_cost.latency - op_cost;
           synth_mult (alg_in, t - 1, &new_limit, mode);
@@ -2616,6 +2645,29 @@ synth_mult (struct algorithm *alg_out, unsigned HOST_WIDE_INT t,
               best_alg->op[best_alg->ops] = alg_add_t_m2;
             }
         }
+
+      /* We may be able to calculate a * -7, a * -15, a * -31, etc.
+        quickly with a - a * n for some appropriate constant n.  */
+      m = exact_log2 (-orig_t + 1);
+      if (m >= 0 && m < maxm)
+       {
+         op_cost = shiftsub1_cost[speed][mode][m];
+         new_limit.cost = best_cost.cost - op_cost;
+         new_limit.latency = best_cost.latency - op_cost;
+         synth_mult (alg_in, (unsigned HOST_WIDE_INT) (-orig_t + 1) >> m, &new_limit, mode);
+
+         alg_in->cost.cost += op_cost;
+         alg_in->cost.latency += op_cost;
+         if (CHEAPER_MULT_COST (&alg_in->cost, &best_cost))
+           {
+             struct algorithm *x;
+             best_cost = alg_in->cost;
+             x = alg_in, alg_in = best_alg, best_alg = x;
+             best_alg->log[best_alg->ops] = m;
+             best_alg->op[best_alg->ops] = alg_sub_t_m2;
+           }
+       }
+
        if (cache_hit)
         goto done;
      }
@@ -2645,14 +2697,14 @@ synth_mult (struct algorithm *alg_out, unsigned HOST_WIDE_INT t,
              equal to its cost, otherwise assume that on superscalar
              hardware the shift may be executed concurrently with the
              earlier steps in the algorithm.  */
-         op_cost = add_cost[mode] + shift_cost[mode][m];
-         if (shiftadd_cost[mode][m] < op_cost)
+         op_cost = add_cost[speed][mode] + shift_cost[speed][mode][m];
+         if (shiftadd_cost[speed][mode][m] < op_cost)
             {
-             op_cost = shiftadd_cost[mode][m];
+             op_cost = shiftadd_cost[speed][mode][m];
               op_latency = op_cost;
             }
           else
-           op_latency = add_cost[mode];
+           op_latency = add_cost[speed][mode];
  
           new_limit.cost = best_cost.cost - op_cost;
           new_limit.latency = best_cost.latency - op_latency;
@@ -2684,14 +2736,14 @@ synth_mult (struct algorithm *alg_out, unsigned HOST_WIDE_INT t,
              equal to it's cost, otherwise assume that on superscalar
              hardware the shift may be executed concurrently with the
              earlier steps in the algorithm.  */
-         op_cost = add_cost[mode] + shift_cost[mode][m];
-         if (shiftsub_cost[mode][m] < op_cost)
+         op_cost = add_cost[speed][mode] + shift_cost[speed][mode][m];
+         if (shiftsub0_cost[speed][mode][m] < op_cost)
             {
-             op_cost = shiftsub_cost[mode][m];
+             op_cost = shiftsub0_cost[speed][mode][m];
               op_latency = op_cost;
             }
           else
-           op_latency = add_cost[mode];
+           op_latency = add_cost[speed][mode];
  
           new_limit.cost = best_cost.cost - op_cost;
           new_limit.latency = best_cost.latency - op_latency;
@@ -2725,7 +2777,7 @@ synth_mult (struct algorithm *alg_out, unsigned HOST_WIDE_INT t,
        m = exact_log2 (q);
        if (m >= 0 && m < maxm)
         {
-         op_cost = shiftadd_cost[mode][m];
+         op_cost = shiftadd_cost[speed][mode][m];
           new_limit.cost = best_cost.cost - op_cost;
           new_limit.latency = best_cost.latency - op_cost;
           synth_mult (alg_in, (t - 1) >> m, &new_limit, mode);
@@ -2750,7 +2802,7 @@ synth_mult (struct algorithm *alg_out, unsigned HOST_WIDE_INT t,
        m = exact_log2 (q);
        if (m >= 0 && m < maxm)
         {
-         op_cost = shiftsub_cost[mode][m];
+         op_cost = shiftsub0_cost[speed][mode][m];
           new_limit.cost = best_cost.cost - op_cost;
           new_limit.latency = best_cost.latency - op_cost;
           synth_mult (alg_in, (t + 1) >> m, &new_limit, mode);
@@ -2781,6 +2833,7 @@ synth_mult (struct algorithm *alg_out, unsigned HOST_WIDE_INT t,
          caller.  */
        alg_hash[hash_index].t = t;
        alg_hash[hash_index].mode = mode;
+      alg_hash[hash_index].speed = speed;
        alg_hash[hash_index].alg = alg_impossible;
        alg_hash[hash_index].cost = *cost_limit;
        return;
@@ -2791,6 +2844,7 @@ synth_mult (struct algorithm *alg_out, unsigned HOST_WIDE_INT t,
      {
        alg_hash[hash_index].t = t;
        alg_hash[hash_index].mode = mode;
+      alg_hash[hash_index].speed = speed;
        alg_hash[hash_index].alg = best_alg->op[best_alg->ops];
        alg_hash[hash_index].cost.cost = best_cost.cost;
        alg_hash[hash_index].cost.latency = best_cost.latency;
@@ -2830,6 +2884,7 @@ choose_mult_variant (enum machine_mode mode, HOST_WIDE_INT val,
    struct algorithm alg2;
    struct mult_cost limit;
    int op_cost;
+  bool speed = optimize_insn_for_speed_p ();
  
    /* Fail quickly for impossible bounds.  */
    if (mult_cost < 0)
@@ -2838,7 +2893,7 @@ choose_mult_variant (enum machine_mode mode, HOST_WIDE_INT val,
    /* Ensure that mult_cost provides a reasonable upper bound.
       Any constant multiplication can be performed with less
       than 2 * bits additions.  */
-  op_cost = 2 * GET_MODE_BITSIZE (mode) * add_cost[mode];
+  op_cost = 2 * GET_MODE_BITSIZE (mode) * add_cost[speed][mode];
    if (mult_cost > op_cost)
      mult_cost = op_cost;
  
@@ -2851,7 +2906,7 @@ choose_mult_variant (enum machine_mode mode, HOST_WIDE_INT val,
       `unsigned int' */
    if (HOST_BITS_PER_INT >= GET_MODE_BITSIZE (mode))
      {
-      op_cost = neg_cost[mode];
+      op_cost = neg_cost[speed][mode];
        if (MULT_COST_LESS (&alg->cost, mult_cost))
         {
           limit.cost = alg->cost.cost - op_cost;
@@ -2871,7 +2926,7 @@ choose_mult_variant (enum machine_mode mode, HOST_WIDE_INT val,
      }
  
    /* This proves very useful for division-by-constant.  */
-  op_cost = add_cost[mode];
+  op_cost = add_cost[speed][mode];
    if (MULT_COST_LESS (&alg->cost, mult_cost))
      {
        limit.cost = alg->cost.cost - op_cost;
@@ -3059,6 +3114,7 @@ expand_mult (enum machine_mode mode, rtx op0, rtx op1, rtx target,
    enum mult_variant variant;
    struct algorithm algorithm;
    int max_cost;
+  bool speed = optimize_insn_for_speed_p ();
  
    /* Handling const0_rtx here allows us to use zero as a rogue value for
       coeff below.  */
@@ -3100,8 +3156,8 @@ expand_mult (enum machine_mode mode, rtx op0, rtx op1, rtx target,
                  result is interpreted as an unsigned coefficient.
                  Exclude cost of op0 from max_cost to match the cost
                  calculation of the synth_mult.  */
-             max_cost = rtx_cost (gen_rtx_MULT (mode, fake_reg, op1), SET)
-                        - neg_cost[mode];
+             max_cost = rtx_cost (gen_rtx_MULT (mode, fake_reg, op1), SET, speed)
+                        - neg_cost[speed][mode];
               if (max_cost > 0
                   && choose_mult_variant (mode, -INTVAL (op1), &algorithm,
                                           &variant, max_cost))
@@ -3118,7 +3174,8 @@ expand_mult (enum machine_mode mode, rtx op0, rtx op1, rtx target,
         {
           /* If we are multiplying in DImode, it may still be a win
              to try to work with shifts and adds.  */
-         if (CONST_DOUBLE_HIGH (op1) == 0)
+         if (CONST_DOUBLE_HIGH (op1) == 0
+             && CONST_DOUBLE_LOW (op1) > 0)
             coeff = CONST_DOUBLE_LOW (op1);
           else if (CONST_DOUBLE_LOW (op1) == 0
                    && EXACT_POWER_OF_2_OR_ZERO_P (CONST_DOUBLE_HIGH (op1)))
@@ -3145,7 +3202,7 @@ expand_mult (enum machine_mode mode, rtx op0, rtx op1, rtx target,
  
           /* Exclude cost of op0 from max_cost to match the cost
              calculation of the synth_mult.  */
-         max_cost = rtx_cost (gen_rtx_MULT (mode, fake_reg, op1), SET);
+         max_cost = rtx_cost (gen_rtx_MULT (mode, fake_reg, op1), SET, speed);
           if (choose_mult_variant (mode, coeff, &algorithm, &variant,
                                    max_cost))
             return expand_mult_const (mode, op0, coeff, target,
@@ -3388,6 +3445,7 @@ expand_mult_highpart_optab (enum machine_mode mode, rtx op0, rtx op1,
    optab moptab;
    rtx tem;
    int size;
+  bool speed = optimize_insn_for_speed_p ();
  
    gcc_assert (!SCALAR_FLOAT_MODE_P (mode));
  
@@ -3396,7 +3454,7 @@ expand_mult_highpart_optab (enum machine_mode mode, rtx op0, rtx op1,
  
    /* Firstly, try using a multiplication insn that only generates the needed
       high part of the product, and in the sign flavor of unsignedp.  */
-  if (mul_highpart_cost[mode] < max_cost)
+  if (mul_highpart_cost[speed][mode] < max_cost)
      {
        moptab = unsignedp ? umul_highpart_optab : smul_highpart_optab;
        tem = expand_binop (mode, moptab, op0, narrow_op1, target,
@@ -3408,8 +3466,8 @@ expand_mult_highpart_optab (enum machine_mode mode, rtx op0, rtx op1,
    /* Secondly, same as above, but use sign flavor opposite of unsignedp.
       Need to adjust the result after the multiplication.  */
    if (size - 1 < BITS_PER_WORD
-      && (mul_highpart_cost[mode] + 2 * shift_cost[mode][size-1]
-         + 4 * add_cost[mode] < max_cost))
+      && (mul_highpart_cost[speed][mode] + 2 * shift_cost[speed][mode][size-1]
+         + 4 * add_cost[speed][mode] < max_cost))
      {
        moptab = unsignedp ? smul_highpart_optab : umul_highpart_optab;
        tem = expand_binop (mode, moptab, op0, narrow_op1, target,
@@ -3423,7 +3481,7 @@ expand_mult_highpart_optab (enum machine_mode mode, rtx op0, rtx op1,
    /* Try widening multiplication.  */
    moptab = unsignedp ? umul_widen_optab : smul_widen_optab;
    if (optab_handler (moptab, wider_mode)->insn_code != CODE_FOR_nothing
-      && mul_widen_cost[wider_mode] < max_cost)
+      && mul_widen_cost[speed][wider_mode] < max_cost)
      {
        tem = expand_binop (wider_mode, moptab, op0, narrow_op1, 0,
                           unsignedp, OPTAB_WIDEN);
@@ -3434,7 +3492,7 @@ expand_mult_highpart_optab (enum machine_mode mode, rtx op0, rtx op1,
    /* Try widening the mode and perform a non-widening multiplication.  */
    if (optab_handler (smul_optab, wider_mode)->insn_code != CODE_FOR_nothing
        && size - 1 < BITS_PER_WORD
-      && mul_cost[wider_mode] + shift_cost[mode][size-1] < max_cost)
+      && mul_cost[speed][wider_mode] + shift_cost[speed][mode][size-1] < max_cost)
      {
        rtx insns, wop0, wop1;
  
@@ -3461,8 +3519,8 @@ expand_mult_highpart_optab (enum machine_mode mode, rtx op0, rtx op1,
    moptab = unsignedp ? smul_widen_optab : umul_widen_optab;
    if (optab_handler (moptab, wider_mode)->insn_code != CODE_FOR_nothing
        && size - 1 < BITS_PER_WORD
-      && (mul_widen_cost[wider_mode] + 2 * shift_cost[mode][size-1]
-         + 4 * add_cost[mode] < max_cost))
+      && (mul_widen_cost[speed][wider_mode] + 2 * shift_cost[speed][mode][size-1]
+         + 4 * add_cost[speed][mode] < max_cost))
      {
        tem = expand_binop (wider_mode, moptab, op0, narrow_op1,
                           NULL_RTX, ! unsignedp, OPTAB_WIDEN);
@@ -3500,6 +3558,7 @@ expand_mult_highpart (enum machine_mode mode, rtx op0, rtx op1,
    enum mult_variant variant;
    struct algorithm alg;
    rtx tem;
+  bool speed = optimize_insn_for_speed_p ();
  
    gcc_assert (!SCALAR_FLOAT_MODE_P (mode));
    /* We can't support modes wider than HOST_BITS_PER_INT.  */
@@ -3515,13 +3574,13 @@ expand_mult_highpart (enum machine_mode mode, rtx op0, rtx op1,
      return expand_mult_highpart_optab (mode, op0, op1, target,
                                        unsignedp, max_cost);
  
-  extra_cost = shift_cost[mode][GET_MODE_BITSIZE (mode) - 1];
+  extra_cost = shift_cost[speed][mode][GET_MODE_BITSIZE (mode) - 1];
  
    /* Check whether we try to multiply by a negative constant.  */
    if (!unsignedp && ((cnst1 >> (GET_MODE_BITSIZE (mode) - 1)) & 1))
      {
        sign_adjust = true;
-      extra_cost += add_cost[mode];
+      extra_cost += add_cost[speed][mode];
      }
  
    /* See whether shift/add multiplication is cheap enough.  */
@@ -3563,8 +3622,8 @@ expand_smod_pow2 (enum machine_mode mode, rtx op0, HOST_WIDE_INT d)
    result = gen_reg_rtx (mode);
  
    /* Avoid conditional branches when they're expensive.  */
-  if (BRANCH_COST >= 2
-      && !optimize_size)
+  if (BRANCH_COST (optimize_insn_for_speed_p (), false) >= 2
+      && optimize_insn_for_speed_p ())
      {
        rtx signmask = emit_store_flag (result, LT, op0, const0_rtx,
                                       mode, 0, -1);
@@ -3581,7 +3640,7 @@ expand_smod_pow2 (enum machine_mode mode, rtx op0, HOST_WIDE_INT d)
  
           temp = gen_rtx_LSHIFTRT (mode, result, shift);
           if (optab_handler (lshr_optab, mode)->insn_code == CODE_FOR_nothing
-             || rtx_cost (temp, SET) > COSTS_N_INSNS (2))
+             || rtx_cost (temp, SET, optimize_insn_for_speed_p ()) > COSTS_N_INSNS (2))
             {
               temp = expand_binop (mode, xor_optab, op0, signmask,
                                    NULL_RTX, 1, OPTAB_LIB_WIDEN);
@@ -3663,7 +3722,9 @@ expand_sdiv_pow2 (enum machine_mode mode, rtx op0, HOST_WIDE_INT d)
    logd = floor_log2 (d);
    shift = build_int_cst (NULL_TREE, logd);
  
-  if (d == 2 && BRANCH_COST >= 1)
+  if (d == 2
+      && BRANCH_COST (optimize_insn_for_speed_p (),
+                     false) >= 1)
      {
        temp = gen_reg_rtx (mode);
        temp = emit_store_flag (temp, LT, op0, const0_rtx, mode, 0, 1);
@@ -3673,7 +3734,8 @@ expand_sdiv_pow2 (enum machine_mode mode, rtx op0, HOST_WIDE_INT d)
      }
  
  #ifdef HAVE_conditional_move
-  if (BRANCH_COST >= 2)
+  if (BRANCH_COST (optimize_insn_for_speed_p (), false)
+      >= 2)
      {
        rtx temp2;
  
@@ -3702,13 +3764,14 @@ expand_sdiv_pow2 (enum machine_mode mode, rtx op0, HOST_WIDE_INT d)
      }
  #endif
  
-  if (BRANCH_COST >= 2)
+  if (BRANCH_COST (optimize_insn_for_speed_p (),
+                  false) >= 2)
      {
        int ushift = GET_MODE_BITSIZE (mode) - logd;
  
        temp = gen_reg_rtx (mode);
        temp = emit_store_flag (temp, LT, op0, const0_rtx, mode, 0, -1);
-      if (shift_cost[mode][ushift] > COSTS_N_INSNS (1))
+      if (shift_cost[optimize_insn_for_speed_p ()][mode][ushift] > COSTS_N_INSNS (1))
         temp = expand_binop (mode, and_optab, temp, GEN_INT (d - 1),
                              NULL_RTX, 0, OPTAB_LIB_WIDEN);
        else
@@ -3781,6 +3844,7 @@ expand_divmod (int rem_flag, enum tree_code code, enum machine_mode mode,
    int max_cost, extra_cost;
    static HOST_WIDE_INT last_div_const = 0;
    static HOST_WIDE_INT ext_op1;
+  bool speed = optimize_insn_for_speed_p ();
  
    op1_is_constant = GET_CODE (op1) == CONST_INT;
    if (op1_is_constant)
@@ -3886,8 +3950,8 @@ expand_divmod (int rem_flag, enum tree_code code, enum machine_mode mode,
    if (compute_mode == VOIDmode)
      for (compute_mode = mode; compute_mode != VOIDmode;
          compute_mode = GET_MODE_WIDER_MODE (compute_mode))
-      if (optab_handler (optab1, compute_mode)->libfunc
-         || optab_handler (optab2, compute_mode)->libfunc)
+      if (optab_libfunc (optab1, compute_mode)
+         || optab_libfunc (optab2, compute_mode))
         break;
  
    /* If we still couldn't find a mode, use MODE, but expand_binop will
@@ -3911,10 +3975,10 @@ expand_divmod (int rem_flag, enum tree_code code, enum machine_mode mode,
    /* Only deduct something for a REM if the last divide done was
       for a different constant.   Then set the constant of the last
       divide.  */
-  max_cost = unsignedp ? udiv_cost[compute_mode] : sdiv_cost[compute_mode];
+  max_cost = unsignedp ? udiv_cost[speed][compute_mode] : sdiv_cost[speed][compute_mode];
    if (rem_flag && ! (last_div_const != 0 && op1_is_constant
                      && INTVAL (op1) == last_div_const))
-    max_cost -= mul_cost[compute_mode] + add_cost[compute_mode];
+    max_cost -= mul_cost[speed][compute_mode] + add_cost[speed][compute_mode];
  
    last_div_const = ! rem_flag && op1_is_constant ? INTVAL (op1) : 0;
  
@@ -3998,10 +4062,8 @@ expand_divmod (int rem_flag, enum tree_code code, enum machine_mode mode,
                       {
                         /* Most significant bit of divisor is set; emit an scc
                            insn.  */
-                       quotient = emit_store_flag (tquotient, GEU, op0, op1,
-                                                   compute_mode, 1, 1);
-                       if (quotient == 0)
-                         goto fail1;
+                       quotient = emit_store_flag_force (tquotient, GEU, op0, op1,
+                                                         compute_mode, 1, 1);
                       }
                     else
                       {
@@ -4033,9 +4095,9 @@ expand_divmod (int rem_flag, enum tree_code code, enum machine_mode mode,
                               goto fail1;
  
                             extra_cost
-                             = (shift_cost[compute_mode][post_shift - 1]
-                                + shift_cost[compute_mode][1]
-                                + 2 * add_cost[compute_mode]);
+                             = (shift_cost[speed][compute_mode][post_shift - 1]
+                                + shift_cost[speed][compute_mode][1]
+                                + 2 * add_cost[speed][compute_mode]);
                             t1 = expand_mult_highpart (compute_mode, op0, ml,
                                                        NULL_RTX, 1,
                                                        max_cost - extra_cost);
@@ -4069,8 +4131,8 @@ expand_divmod (int rem_flag, enum tree_code code, enum machine_mode mode,
                                build_int_cst (NULL_TREE, pre_shift),
                                NULL_RTX, 1);
                             extra_cost
-                             = (shift_cost[compute_mode][pre_shift]
-                                + shift_cost[compute_mode][post_shift]);
+                             = (shift_cost[speed][compute_mode][pre_shift]
+                                + shift_cost[speed][compute_mode][post_shift]);
                             t2 = expand_mult_highpart (compute_mode, t1, ml,
                                                        NULL_RTX, 1,
                                                        max_cost - extra_cost);
@@ -4130,8 +4192,8 @@ expand_divmod (int rem_flag, enum tree_code code, enum machine_mode mode,
                       goto fail1;
                   }
                 else if (EXACT_POWER_OF_2_OR_ZERO_P (d)
-                        && (rem_flag ? smod_pow2_cheap[compute_mode]
-                                     : sdiv_pow2_cheap[compute_mode])
+                        && (rem_flag ? smod_pow2_cheap[speed][compute_mode]
+                                     : sdiv_pow2_cheap[speed][compute_mode])
                          /* We assume that cheap metric is true if the
                             optab has an expander for this mode.  */
                          && ((optab_handler ((rem_flag ? smod_optab
@@ -4151,7 +4213,7 @@ expand_divmod (int rem_flag, enum tree_code code, enum machine_mode mode,
                           return gen_lowpart (mode, remainder);
                       }
  
-                   if (sdiv_pow2_cheap[compute_mode]
+                   if (sdiv_pow2_cheap[speed][compute_mode]
                         && ((optab_handler (sdiv_optab, compute_mode)->insn_code
                              != CODE_FOR_nothing)
                             || (optab_handler (sdivmod_optab, compute_mode)->insn_code
@@ -4200,9 +4262,9 @@ expand_divmod (int rem_flag, enum tree_code code, enum machine_mode mode,
                             || size - 1 >= BITS_PER_WORD)
                           goto fail1;
  
-                       extra_cost = (shift_cost[compute_mode][post_shift]
-                                     + shift_cost[compute_mode][size - 1]
-                                     + add_cost[compute_mode]);
+                       extra_cost = (shift_cost[speed][compute_mode][post_shift]
+                                     + shift_cost[speed][compute_mode][size - 1]
+                                     + add_cost[speed][compute_mode]);
                         t1 = expand_mult_highpart (compute_mode, op0, mlr,
                                                    NULL_RTX, 0,
                                                    max_cost - extra_cost);
@@ -4237,9 +4299,9 @@ expand_divmod (int rem_flag, enum tree_code code, enum machine_mode mode,
  
                         ml |= (~(unsigned HOST_WIDE_INT) 0) << (size - 1);
                         mlr = gen_int_mode (ml, compute_mode);
-                       extra_cost = (shift_cost[compute_mode][post_shift]
-                                     + shift_cost[compute_mode][size - 1]
-                                     + 2 * add_cost[compute_mode]);
+                       extra_cost = (shift_cost[speed][compute_mode][post_shift]
+                                     + shift_cost[speed][compute_mode][size - 1]
+                                     + 2 * add_cost[speed][compute_mode]);
                         t1 = expand_mult_highpart (compute_mode, op0, mlr,
                                                    NULL_RTX, 0,
                                                    max_cost - extra_cost);
@@ -4332,9 +4394,9 @@ expand_divmod (int rem_flag, enum tree_code code, enum machine_mode mode,
                            NULL_RTX, 0);
                         t2 = expand_binop (compute_mode, xor_optab, op0, t1,
                                            NULL_RTX, 0, OPTAB_WIDEN);
-                       extra_cost = (shift_cost[compute_mode][post_shift]
-                                     + shift_cost[compute_mode][size - 1]
-                                     + 2 * add_cost[compute_mode]);
+                       extra_cost = (shift_cost[speed][compute_mode][post_shift]
+                                     + shift_cost[speed][compute_mode][size - 1]
+                                     + 2 * add_cost[speed][compute_mode]);
                         t3 = expand_mult_highpart (compute_mode, t2, ml,
                                                    NULL_RTX, 1,
                                                    max_cost - extra_cost);
@@ -5145,8 +5207,9 @@ emit_store_flag (rtx target, enum rtx_code code, rtx op0, rtx op1,
    enum insn_code icode;
    enum machine_mode compare_mode;
    enum machine_mode target_mode = GET_MODE (target);
+  enum mode_class mclass;
    rtx tem;
-  rtx last = get_last_insn ();
+  rtx last;
    rtx pattern, comparison;
  
    if (unsignedp)
@@ -5280,117 +5343,41 @@ emit_store_flag (rtx target, enum rtx_code code, rtx op0, rtx op1,
        return op0;
      }
  
-  icode = setcc_gen_code[(int) code];
-
-  if (icode != CODE_FOR_nothing)
+  mclass = GET_MODE_CLASS (mode);
+  for (compare_mode = mode; compare_mode != VOIDmode;
+       compare_mode = GET_MODE_WIDER_MODE (compare_mode))
      {
-      insn_operand_predicate_fn pred;
-
-      /* We think we may be able to do this with a scc insn.  Emit the
-        comparison and then the scc insn.  */
-
-      do_pending_stack_adjust ();
-      last = get_last_insn ();
-
-      comparison
-       = compare_from_rtx (op0, op1, code, unsignedp, mode, NULL_RTX);
-      if (CONSTANT_P (comparison))
-       {
-         switch (GET_CODE (comparison))
-           {
-           case CONST_INT:
-             if (comparison == const0_rtx)
-               return const0_rtx;
-             break;
-             
-#ifdef FLOAT_STORE_FLAG_VALUE
-           case CONST_DOUBLE:
-             if (comparison == CONST0_RTX (GET_MODE (comparison)))
-               return const0_rtx;
-             break;
-#endif
-           default:
-             gcc_unreachable ();
-           }
-         
-         if (normalizep == 1)
-           return const1_rtx;
-         if (normalizep == -1)
-           return constm1_rtx;
-         return const_true_rtx;
-       }
-
-      /* The code of COMPARISON may not match CODE if compare_from_rtx
-        decided to swap its operands and reverse the original code.
-
-        We know that compare_from_rtx returns either a CONST_INT or
-        a new comparison code, so it is safe to just extract the
-        code from COMPARISON.  */
-      code = GET_CODE (comparison);
-
-      /* Get a reference to the target in the proper mode for this insn.  */
-      compare_mode = insn_data[(int) icode].operand[0].mode;
-      subtarget = target;
-      pred = insn_data[(int) icode].operand[0].predicate;
-      if (optimize || ! (*pred) (subtarget, compare_mode))
-       subtarget = gen_reg_rtx (compare_mode);
-
-      pattern = GEN_FCN (icode) (subtarget);
-      if (pattern)
-       {
-         emit_insn (pattern);
-         return emit_store_flag_1 (target, subtarget, compare_mode,
-                                   normalizep);
-       }
-    }
-  else
-    {
-      /* We don't have an scc insn, so try a cstore insn.  */
-
-      for (compare_mode = mode; compare_mode != VOIDmode;
-          compare_mode = GET_MODE_WIDER_MODE (compare_mode))
-       {
-         icode = optab_handler (cstore_optab, compare_mode)->insn_code;
-         if (icode != CODE_FOR_nothing)
-           break;
-       }
-
-      if (icode != CODE_FOR_nothing)
+     enum machine_mode optab_mode = mclass == MODE_CC ? CCmode : compare_mode;
+     icode = optab_handler (cstore_optab, optab_mode)->insn_code;
+     if (icode != CODE_FOR_nothing)
         {
+         rtx x, y;
           enum machine_mode result_mode
             = insn_data[(int) icode].operand[0].mode;
-         rtx cstore_op0 = op0;
-         rtx cstore_op1 = op1;
  
           do_pending_stack_adjust ();
           last = get_last_insn ();
  
-         if (compare_mode != mode)
+          x = prepare_operand (icode, op0, 2, mode, compare_mode, unsignedp);
+          y = prepare_operand (icode, op1, 3, mode, compare_mode, unsignedp);
+         comparison = gen_rtx_fmt_ee (code, result_mode, x, y);
+         if (!x || !y
+             || !insn_data[icode].operand[2].predicate
+                 (x, insn_data[icode].operand[2].mode)
+             || !insn_data[icode].operand[3].predicate
+                 (y, insn_data[icode].operand[3].mode)
+             || !insn_data[icode].operand[1].predicate (comparison, VOIDmode))
             {
-             cstore_op0 = convert_modes (compare_mode, mode, cstore_op0,
-                                         unsignedp);
-             cstore_op1 = convert_modes (compare_mode, mode, cstore_op1,
-                                         unsignedp);
+             delete_insns_since (last);
+             continue;
             }
-         
-         if (!insn_data[(int) icode].operand[2].predicate (cstore_op0,
-                                                           compare_mode))
-           cstore_op0 = copy_to_mode_reg (compare_mode, cstore_op0);
  
-         if (!insn_data[(int) icode].operand[3].predicate (cstore_op1,
-                                                           compare_mode))
-           cstore_op1 = copy_to_mode_reg (compare_mode, cstore_op1);
-
-         comparison = gen_rtx_fmt_ee (code, result_mode, cstore_op0,
-                                      cstore_op1);
           subtarget = target;
-
           if (optimize || !(insn_data[(int) icode].operand[0].predicate
                             (subtarget, result_mode)))
             subtarget = gen_reg_rtx (result_mode);
  
-         pattern = GEN_FCN (icode) (subtarget, comparison, cstore_op0,
-                                    cstore_op1);
+         pattern = GEN_FCN (icode) (subtarget, comparison, x, y);
  
           if (pattern)
             {
@@ -5398,10 +5385,13 @@ emit_store_flag (rtx target, enum rtx_code code, rtx op0, rtx op1,
               return emit_store_flag_1 (target, subtarget, result_mode,
                                         normalizep);
             }
+
+         delete_insns_since (last);
+         break;
         }
      }
  
-  delete_insns_since (last);
+  last = get_last_insn ();
  
    /* If optimizing, use different pseudo registers for each insn, instead
       of reusing the same pseudo.  This leads to better CSE, but slows
@@ -5416,7 +5406,8 @@ emit_store_flag (rtx target, enum rtx_code code, rtx op0, rtx op1,
       comparison with zero.  Don't do any of these cases if branches are
       very cheap.  */
  
-  if (BRANCH_COST > 0
+  if (BRANCH_COST (optimize_insn_for_speed_p (),
+                  false) > 0
        && GET_MODE_CLASS (mode) == MODE_INT && (code == EQ || code == NE)
        && op1 != const0_rtx)
      {
@@ -5439,10 +5430,12 @@ emit_store_flag (rtx target, enum rtx_code code, rtx op0, rtx op1,
       do LE and GT if branches are expensive since they are expensive on
       2-operand machines.  */
  
-  if (BRANCH_COST == 0
+  if (BRANCH_COST (optimize_insn_for_speed_p (),
+                  false) == 0
        || GET_MODE_CLASS (mode) != MODE_INT || op1 != const0_rtx
        || (code != EQ && code != NE
-         && (BRANCH_COST <= 1 || (code != LE && code != GT))))
+         && (BRANCH_COST (optimize_insn_for_speed_p (),
+                          false) <= 1 || (code != LE && code != GT))))
      return 0;
  
    /* See what we need to return.  We can only return a 1, -1, or the
@@ -5538,7 +5531,10 @@ emit_store_flag (rtx target, enum rtx_code code, rtx op0, rtx op1,
          that "or", which is an extra insn, so we only handle EQ if branches
          are expensive.  */
  
-      if (tem == 0 && (code == NE || BRANCH_COST > 1))
+      if (tem == 0
+         && (code == NE
+             || BRANCH_COST (optimize_insn_for_speed_p (),
+                             false) > 1))
         {
           if (rtx_equal_p (subtarget, op0))
             subtarget = 0;