/* Expand the basic unary and binary arithmetic operations, for GNU compiler.
Copyright (C) 1987, 1988, 1992, 1993, 1994, 1995, 1996, 1997, 1998,
- 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010
- Free Software Foundation, Inc.
+ 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010,
+ 2011, 2012 Free Software Foundation, Inc.
This file is part of GCC.
}
if (GET_RTX_CLASS (code) == RTX_UNARY)
- note = gen_rtx_fmt_e (code, GET_MODE (target), copy_rtx (op0));
+ switch (code)
+ {
+ case FFS:
+ case CLZ:
+ case CTZ:
+ case CLRSB:
+ case POPCOUNT:
+ case PARITY:
+ case BSWAP:
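+ /* For these operations the result mode may differ from the mode of
+ the operand, so express the note in the operand's mode and then
+ convert it; e.g. a POPCOUNT of a DImode operand whose result is
+ placed in an SImode TARGET is recorded as
+ (truncate:SI (popcount:DI ...)) rather than (popcount:SI ...). */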
+ if (GET_MODE (op0) != VOIDmode && GET_MODE (target) != GET_MODE (op0))
+ {
+ note = gen_rtx_fmt_e (code, GET_MODE (op0), copy_rtx (op0));
+ if (GET_MODE_SIZE (GET_MODE (op0))
+ > GET_MODE_SIZE (GET_MODE (target)))
+ note = simplify_gen_unary (TRUNCATE, GET_MODE (target),
+ note, GET_MODE (op0));
+ else
+ note = simplify_gen_unary (ZERO_EXTEND, GET_MODE (target),
+ note, GET_MODE (op0));
+ break;
+ }
+ /* FALLTHRU */
+ default:
+ note = gen_rtx_fmt_e (code, GET_MODE (target), copy_rtx (op0));
+ break;
+ }
else
note = gen_rtx_fmt_ee (code, GET_MODE (target), copy_rtx (op0), copy_rtx (op1));
return 1;
}
\f
+/* Given two input operands, OP0 and OP1, determine what the correct from_mode
+ for a widening operation would be. In most cases this is the mode of OP0,
+ but if OP0 is a constant its mode will be VOIDmode, which isn't useful. */
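+ /* For example, with TO_MODE == DImode, OP0 == (reg:SI x) and
+ OP1 == (const_int 5), the result is SImode: the VOIDmode constant
+ is ignored in favor of the register operand's mode. */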
+
+static enum machine_mode
+widened_mode (enum machine_mode to_mode, rtx op0, rtx op1)
+{
+ enum machine_mode m0 = GET_MODE (op0);
+ enum machine_mode m1 = GET_MODE (op1);
+ enum machine_mode result;
+
+ if (m0 == VOIDmode && m1 == VOIDmode)
+ return to_mode;
+ else if (m0 == VOIDmode || GET_MODE_SIZE (m0) < GET_MODE_SIZE (m1))
+ result = m1;
+ else
+ result = m0;
+
+ if (GET_MODE_SIZE (result) > GET_MODE_SIZE (to_mode))
+ return to_mode;
+
+ return result;
+}
+\f
+/* Find a widening optab even if it doesn't widen as much as we want.
+ E.g. if from_mode is HImode, and to_mode is DImode, and there is no
+ direct HI->DI insn, then return SI->DI, if that exists.
+ If PERMIT_NON_WIDENING is nonzero then this can also be used with
+ non-widening optabs. */
+
+enum insn_code
+find_widening_optab_handler_and_mode (optab op, enum machine_mode to_mode,
+ enum machine_mode from_mode,
+ int permit_non_widening,
+ enum machine_mode *found_mode)
+{
+ for (; (permit_non_widening || from_mode != to_mode)
+ && GET_MODE_SIZE (from_mode) <= GET_MODE_SIZE (to_mode)
+ && from_mode != VOIDmode;
+ from_mode = GET_MODE_WIDER_MODE (from_mode))
+ {
+ enum insn_code handler = widening_optab_handler (op, to_mode,
+ from_mode);
+
+ if (handler != CODE_FOR_nothing)
+ {
+ if (found_mode)
+ *found_mode = from_mode;
+ return handler;
+ }
+ }
+
+ return CODE_FOR_nothing;
+}
+\f
/* Widen OP to MODE and return the rtx for the widened operand. UNSIGNEDP
says whether OP is signed or unsigned. NO_EXTEND is nonzero if we need
not actually do a sign-extend or zero-extend, but can leave the
return TYPE_UNSIGNED (type) ?
vec_widen_umult_lo_optab : vec_widen_smult_lo_optab;
+ case VEC_WIDEN_LSHIFT_HI_EXPR:
+ return TYPE_UNSIGNED (type) ?
+ vec_widen_ushiftl_hi_optab : vec_widen_sshiftl_hi_optab;
+
+ case VEC_WIDEN_LSHIFT_LO_EXPR:
+ return TYPE_UNSIGNED (type) ?
+ vec_widen_ushiftl_lo_optab : vec_widen_sshiftl_lo_optab;
+
case VEC_UNPACK_HI_EXPR:
return TYPE_UNSIGNED (type) ?
vec_unpacku_hi_optab : vec_unpacks_hi_optab;
case ABS_EXPR:
return trapv ? absv_optab : abs_optab;
- case VEC_EXTRACT_EVEN_EXPR:
- return vec_extract_even_optab;
-
- case VEC_EXTRACT_ODD_EXPR:
- return vec_extract_odd_optab;
-
- case VEC_INTERLEAVE_HIGH_EXPR:
- return vec_interleave_high_optab;
-
- case VEC_INTERLEAVE_LOW_EXPR:
- return vec_interleave_low_optab;
-
default:
return NULL;
}
optab_for_tree_code (ops->code, TREE_TYPE (oprnd0), optab_default);
if (ops->code == WIDEN_MULT_PLUS_EXPR
|| ops->code == WIDEN_MULT_MINUS_EXPR)
- icode = optab_handler (widen_pattern_optab,
- TYPE_MODE (TREE_TYPE (ops->op2)));
+ icode = find_widening_optab_handler (widen_pattern_optab,
+ TYPE_MODE (TREE_TYPE (ops->op2)),
+ tmode0, 0);
else
icode = optab_handler (widen_pattern_optab, tmode0);
gcc_assert (icode != CODE_FOR_nothing);
calculated at compile time. The arguments and return value are
otherwise the same as for expand_binop. */
-static rtx
+rtx
simplify_expand_binop (enum machine_mode mode, optab binoptab,
rtx op0, rtx op1, rtx target, int unsignedp,
enum optab_methods methods)
return eops[0].value;
}
+/* Create a new vector value in VMODE with all elements set to OP. The
+ mode of OP must be the element mode of VMODE. If OP is a constant,
+ then the return value will be a constant. */
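+/* For example, broadcasting (const_int 7) into V4SImode yields the
+ CONST_VECTOR {7, 7, 7, 7}, while broadcasting (reg:SI x) emits the
+ target's vec_init pattern into a fresh register, or returns NULL if
+ vec_init_optab is not implemented for VMODE. */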
+
+static rtx
+expand_vector_broadcast (enum machine_mode vmode, rtx op)
+{
+ enum insn_code icode;
+ rtvec vec;
+ rtx ret;
+ int i, n;
+
+ gcc_checking_assert (VECTOR_MODE_P (vmode));
+
+ n = GET_MODE_NUNITS (vmode);
+ vec = rtvec_alloc (n);
+ for (i = 0; i < n; ++i)
+ RTVEC_ELT (vec, i) = op;
+
+ if (CONSTANT_P (op))
+ return gen_rtx_CONST_VECTOR (vmode, vec);
+
+ /* ??? If the target doesn't have a vec_init, then we have no easy way
+ of performing this operation. Most of this sort of generic support
+ is hidden away in the vector lowering support in gimple. */
+ icode = optab_handler (vec_init_optab, vmode);
+ if (icode == CODE_FOR_nothing)
+ return NULL;
+
+ ret = gen_reg_rtx (vmode);
+ emit_insn (GEN_FCN (icode) (ret, gen_rtx_PARALLEL (vmode, vec)));
+
+ return ret;
+}
+
/* This subroutine of expand_doubleword_shift handles the cases in which
the effective shift value is >= BITS_PER_WORD. The arguments and return
value are the same as for the parent routine, except that SUPERWORD_OP1
|| binoptab == umul_highpart_optab);
}
-/* X is to be used in mode MODE as an operand to BINOPTAB. If we're
+/* X is to be used in mode MODE as operand OPN to BINOPTAB. If we're
optimizing, and if the operand is a constant that costs more than
1 instruction, force the constant into a register and return that
register. Return X otherwise. UNSIGNEDP says whether X is unsigned. */
static rtx
avoid_expensive_constant (enum machine_mode mode, optab binoptab,
- rtx x, bool unsignedp)
+ int opn, rtx x, bool unsignedp)
{
bool speed = optimize_insn_for_speed_p ();
if (mode != VOIDmode
&& optimize
&& CONSTANT_P (x)
- && rtx_cost (x, binoptab->code, speed) > rtx_cost (x, SET, speed))
+ && rtx_cost (x, binoptab->code, opn, speed) > set_src_cost (x, speed))
{
if (CONST_INT_P (x))
{
rtx target, int unsignedp, enum optab_methods methods,
rtx last)
{
- enum insn_code icode = optab_handler (binoptab, mode);
+ enum machine_mode from_mode = widened_mode (mode, op0, op1);
+ enum insn_code icode = find_widening_optab_handler (binoptab, mode,
+ from_mode, 1);
enum machine_mode xmode0 = insn_data[(int) icode].operand[1].mode;
enum machine_mode xmode1 = insn_data[(int) icode].operand[2].mode;
enum machine_mode mode0, mode1, tmp_mode;
}
/* If we are optimizing, force expensive constants into a register. */
- xop0 = avoid_expensive_constant (xmode0, binoptab, xop0, unsignedp);
+ xop0 = avoid_expensive_constant (xmode0, binoptab, 0, xop0, unsignedp);
if (!shift_optab_p (binoptab))
- xop1 = avoid_expensive_constant (xmode1, binoptab, xop1, unsignedp);
+ xop1 = avoid_expensive_constant (xmode1, binoptab, 1, xop1, unsignedp);
/* In case the insn wants input operands in modes different from
those of the actual operands, convert the operands. It would
/* If we can do it with a three-operand insn, do so. */
if (methods != OPTAB_MUST_WIDEN
- && optab_handler (binoptab, mode) != CODE_FOR_nothing)
+ && find_widening_optab_handler (binoptab, mode,
+ widened_mode (mode, op0, op1), 1)
+ != CODE_FOR_nothing)
{
temp = expand_binop_directly (mode, binoptab, op0, op1, target,
unsignedp, methods, last);
{
optab otheroptab = (binoptab == rotl_optab ? rotr_optab : rotl_optab);
rtx newop1;
- unsigned int bits = GET_MODE_BITSIZE (mode);
+ unsigned int bits = GET_MODE_PRECISION (mode);
if (CONST_INT_P (op1))
newop1 = GEN_INT (bits - INTVAL (op1));
takes operands of this mode and makes a wider mode. */
if (binoptab == smul_optab
- && GET_MODE_WIDER_MODE (mode) != VOIDmode
- && (optab_handler ((unsignedp ? umul_widen_optab : smul_widen_optab),
- GET_MODE_WIDER_MODE (mode))
+ && GET_MODE_2XWIDER_MODE (mode) != VOIDmode
+ && (widening_optab_handler ((unsignedp ? umul_widen_optab
+ : smul_widen_optab),
+ GET_MODE_2XWIDER_MODE (mode), mode)
!= CODE_FOR_nothing))
{
- temp = expand_binop (GET_MODE_WIDER_MODE (mode),
+ temp = expand_binop (GET_MODE_2XWIDER_MODE (mode),
unsignedp ? umul_widen_optab : smul_widen_optab,
op0, op1, NULL_RTX, unsignedp, OPTAB_DIRECT);
if (temp != 0)
{
if (GET_MODE_CLASS (mode) == MODE_INT
- && TRULY_NOOP_TRUNCATION (GET_MODE_BITSIZE (mode),
- GET_MODE_BITSIZE (GET_MODE (temp))))
+ && TRULY_NOOP_TRUNCATION_MODES_P (mode, GET_MODE (temp)))
return gen_lowpart (mode, temp);
else
return convert_to_mode (mode, temp, unsignedp);
}
}
+ /* If this is a vector shift by a scalar, see if we can do a vector
+ shift by a vector. If so, broadcast the scalar into a vector. */
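+ /* E.g. an ashl_optab shift of a V4SImode value by (const_int 3) can
+ be retried as a vashl_optab shift by the broadcast vector
+ {3, 3, 3, 3}. */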
+ if (mclass == MODE_VECTOR_INT)
+ {
+ optab otheroptab = NULL;
+
+ if (binoptab == ashl_optab)
+ otheroptab = vashl_optab;
+ else if (binoptab == ashr_optab)
+ otheroptab = vashr_optab;
+ else if (binoptab == lshr_optab)
+ otheroptab = vlshr_optab;
+ else if (binoptab == rotl_optab)
+ otheroptab = vrotl_optab;
+ else if (binoptab == rotr_optab)
+ otheroptab = vrotr_optab;
+
+ if (otheroptab && optab_handler (otheroptab, mode) != CODE_FOR_nothing)
+ {
+ rtx vop1 = expand_vector_broadcast (mode, op1);
+ if (vop1)
+ {
+ temp = expand_binop_directly (mode, otheroptab, op0, vop1,
+ target, unsignedp, methods, last);
+ if (temp)
+ return temp;
+ }
+ }
+ }
+
/* Look for a wider mode of the same class for which we think we
can open-code the operation. Check for a widening multiply at the
wider mode as well. */
if (optab_handler (binoptab, wider_mode) != CODE_FOR_nothing
|| (binoptab == smul_optab
&& GET_MODE_WIDER_MODE (wider_mode) != VOIDmode
- && (optab_handler ((unsignedp ? umul_widen_optab
- : smul_widen_optab),
- GET_MODE_WIDER_MODE (wider_mode))
+ && (find_widening_optab_handler ((unsignedp
+ ? umul_widen_optab
+ : smul_widen_optab),
+ GET_MODE_WIDER_MODE (wider_mode),
+ mode, 0)
!= CODE_FOR_nothing)))
{
rtx xop0 = op0, xop1 = op1;
&& mclass == MODE_INT)
{
no_extend = 1;
- xop0 = avoid_expensive_constant (mode, binoptab,
+ xop0 = avoid_expensive_constant (mode, binoptab, 0,
xop0, unsignedp);
if (binoptab != ashl_optab)
- xop1 = avoid_expensive_constant (mode, binoptab,
+ xop1 = avoid_expensive_constant (mode, binoptab, 1,
xop1, unsignedp);
}
if (temp)
{
if (mclass != MODE_INT
- || !TRULY_NOOP_TRUNCATION (GET_MODE_BITSIZE (mode),
- GET_MODE_BITSIZE (wider_mode)))
+ || !TRULY_NOOP_TRUNCATION_MODES_P (mode, wider_mode))
{
if (target == 0)
target = gen_reg_rtx (mode);
&& mclass == MODE_INT
&& (CONST_INT_P (op1) || optimize_insn_for_speed_p ())
&& GET_MODE_SIZE (mode) == 2 * UNITS_PER_WORD
+ && GET_MODE_PRECISION (mode) == GET_MODE_BITSIZE (mode)
&& optab_handler (binoptab, word_mode) != CODE_FOR_nothing
&& optab_handler (ashl_optab, word_mode) != CODE_FOR_nothing
&& optab_handler (lshr_optab, word_mode) != CODE_FOR_nothing)
if ((binoptab == rotl_optab || binoptab == rotr_optab)
&& mclass == MODE_INT
&& CONST_INT_P (op1)
- && GET_MODE_SIZE (mode) == 2 * UNITS_PER_WORD
+ && GET_MODE_PRECISION (mode) == 2 * BITS_PER_WORD
&& optab_handler (ashl_optab, word_mode) != CODE_FOR_nothing
&& optab_handler (lshr_optab, word_mode) != CODE_FOR_nothing)
{
{
rtx temp = emit_move_insn (target, xtarget);
- set_unique_reg_note (temp,
- REG_EQUAL,
- gen_rtx_fmt_ee (binoptab->code, mode,
- copy_rtx (xop0),
- copy_rtx (xop1)));
+ set_dst_reg_note (temp, REG_EQUAL,
+ gen_rtx_fmt_ee (binoptab->code, mode,
+ copy_rtx (xop0),
+ copy_rtx (xop1)),
+ target);
}
else
target = xtarget;
&& optab_handler (add_optab, word_mode) != CODE_FOR_nothing)
{
rtx product = NULL_RTX;
-
- if (optab_handler (umul_widen_optab, mode) != CODE_FOR_nothing)
+ if (widening_optab_handler (umul_widen_optab, mode, word_mode)
+ != CODE_FOR_nothing)
{
product = expand_doubleword_mult (mode, op0, op1, target,
true, methods);
}
if (product == NULL_RTX
- && optab_handler (smul_widen_optab, mode) != CODE_FOR_nothing)
+ && widening_optab_handler (smul_widen_optab, mode, word_mode)
+ != CODE_FOR_nothing)
{
product = expand_doubleword_mult (mode, op0, op1, target,
false, methods);
if (optab_handler (mov_optab, mode) != CODE_FOR_nothing)
{
temp = emit_move_insn (target ? target : product, product);
- set_unique_reg_note (temp,
- REG_EQUAL,
- gen_rtx_fmt_ee (MULT, mode,
- copy_rtx (op0),
- copy_rtx (op1)));
+ set_dst_reg_note (temp,
+ REG_EQUAL,
+ gen_rtx_fmt_ee (MULT, mode,
+ copy_rtx (op0),
+ copy_rtx (op1)),
+ target ? target : product);
}
return product;
}
wider_mode != VOIDmode;
wider_mode = GET_MODE_WIDER_MODE (wider_mode))
{
- if (optab_handler (binoptab, wider_mode) != CODE_FOR_nothing
+ if (find_widening_optab_handler (binoptab, wider_mode, mode, 1)
+ != CODE_FOR_nothing
|| (methods == OPTAB_LIB
&& optab_libfunc (binoptab, wider_mode)))
{
if (temp)
{
if (mclass != MODE_INT
- || !TRULY_NOOP_TRUNCATION (GET_MODE_BITSIZE (mode),
- GET_MODE_BITSIZE (wider_mode)))
+ || !TRULY_NOOP_TRUNCATION_MODES_P (mode, wider_mode))
{
if (target == 0)
target = gen_reg_rtx (mode);
rtx xop0 = op0, xop1 = op1;
/* If we are optimizing, force expensive constants into a register. */
- xop0 = avoid_expensive_constant (mode0, binoptab, xop0, unsignedp);
- xop1 = avoid_expensive_constant (mode1, binoptab, xop1, unsignedp);
+ xop0 = avoid_expensive_constant (mode0, binoptab, 0, xop0, unsignedp);
+ xop1 = avoid_expensive_constant (mode1, binoptab, 1, xop1, unsignedp);
create_fixed_operand (&ops[0], targ0);
create_convert_operand_from (&ops[1], op0, mode, unsignedp);
if (target == 0)
target = gen_reg_rtx (mode);
- xop0 = widen_operand (op0, wider_mode, mode, true, false);
- temp = expand_unop (wider_mode, unoptab, xop0, NULL_RTX, true);
+ xop0 = widen_operand (op0, wider_mode, mode,
+ unoptab != clrsb_optab, false);
+ temp = expand_unop (wider_mode, unoptab, xop0, NULL_RTX,
+ unoptab != clrsb_optab);
if (temp != 0)
temp = expand_binop (wider_mode, sub_optab, temp,
- GEN_INT (GET_MODE_BITSIZE (wider_mode)
- - GET_MODE_BITSIZE (mode)),
+ GEN_INT (GET_MODE_PRECISION (wider_mode)
+ - GET_MODE_PRECISION (mode)),
target, true, OPTAB_DIRECT);
if (temp == 0)
delete_insns_since (last);
x = widen_operand (op0, wider_mode, mode, true, true);
x = expand_unop (wider_mode, bswap_optab, x, NULL_RTX, true);
+ gcc_assert (GET_MODE_PRECISION (wider_mode) == GET_MODE_BITSIZE (wider_mode)
+ && GET_MODE_PRECISION (mode) == GET_MODE_BITSIZE (mode));
if (x != 0)
x = expand_shift (RSHIFT_EXPR, wider_mode, x,
GET_MODE_BITSIZE (wider_mode)
}
/* Try calculating ctz(x) as K - clz(x & -x) ,
- where K is GET_MODE_BITSIZE(mode) - 1.
+ where K is GET_MODE_PRECISION(mode) - 1.
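+ For example, in QImode with x == 0x28 (binary 00101000), x & -x
+ is 0x08, clz of that is 4, and K - 4 == 7 - 4 == 3 == ctz (x).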
Both __builtin_ctz and __builtin_clz are undefined at zero, so we
don't have to worry about what the hardware does in that case. (If
if (temp)
temp = expand_unop_direct (mode, clz_optab, temp, NULL_RTX, true);
if (temp)
- temp = expand_binop (mode, sub_optab, GEN_INT (GET_MODE_BITSIZE (mode) - 1),
+ temp = expand_binop (mode, sub_optab, GEN_INT (GET_MODE_PRECISION (mode) - 1),
temp, target,
true, OPTAB_DIRECT);
if (temp == 0)
if (CLZ_DEFINED_VALUE_AT_ZERO (mode, val) == 2)
{
defined_at_zero = true;
- val = (GET_MODE_BITSIZE (mode) - 1) - val;
+ val = (GET_MODE_PRECISION (mode) - 1) - val;
}
}
else
gen_lowpart (imode, target), 1, OPTAB_LIB_WIDEN);
target = lowpart_subreg_maybe_copy (mode, temp, imode);
- set_unique_reg_note (get_last_insn (), REG_EQUAL,
- gen_rtx_fmt_e (code, mode, copy_rtx (op0)));
+ set_dst_reg_note (get_last_insn (), REG_EQUAL,
+ gen_rtx_fmt_e (code, mode, copy_rtx (op0)),
+ target);
}
return target;
if (temp)
{
if (mclass != MODE_INT
- || !TRULY_NOOP_TRUNCATION (GET_MODE_BITSIZE (mode),
- GET_MODE_BITSIZE (wider_mode)))
+ || !TRULY_NOOP_TRUNCATION_MODES_P (mode, wider_mode))
{
if (target == 0)
target = gen_reg_rtx (mode);
unsignedp);
/* If we are generating clz using wider mode, adjust the
- result. */
- if (unoptab == clz_optab && temp != 0)
+ result. Similarly for clrsb. */
+ if ((unoptab == clz_optab || unoptab == clrsb_optab)
+ && temp != 0)
temp = expand_binop (wider_mode, sub_optab, temp,
- GEN_INT (GET_MODE_BITSIZE (wider_mode)
- - GET_MODE_BITSIZE (mode)),
+ GEN_INT (GET_MODE_PRECISION (wider_mode)
+ - GET_MODE_PRECISION (mode)),
target, true, OPTAB_DIRECT);
if (temp)
false) >= 2)
{
rtx extended = expand_shift (RSHIFT_EXPR, mode, op0,
- GET_MODE_BITSIZE (mode) - 1,
+ GET_MODE_PRECISION (mode) - 1,
NULL_RTX, 0);
temp = expand_binop (mode, xor_optab, extended, op0, target, 0,
false) >= 2)
{
rtx extended = expand_shift (RSHIFT_EXPR, mode, op0,
- GET_MODE_BITSIZE (mode) - 1,
+ GET_MODE_PRECISION (mode) - 1,
NULL_RTX, 0);
temp = expand_binop (mode, xor_optab, extended, op0, target, 0,
}
last = emit_move_insn (target, result);
- if (optab_handler (mov_optab, GET_MODE (target)) != CODE_FOR_nothing)
- set_unique_reg_note (last, REG_EQUAL, copy_rtx (equiv));
+ set_dst_reg_note (last, REG_EQUAL, copy_rtx (equiv), target);
if (final_dest != target)
emit_move_insn (final_dest, target);
/* If we are optimizing, force expensive constants into a register. */
if (CONSTANT_P (x) && optimize
- && (rtx_cost (x, COMPARE, optimize_insn_for_speed_p ())
+ && (rtx_cost (x, COMPARE, 0, optimize_insn_for_speed_p ())
> COSTS_N_INSNS (1)))
x = force_reg (mode, x);
if (CONSTANT_P (y) && optimize
- && (rtx_cost (y, COMPARE, optimize_insn_for_speed_p ())
+ && (rtx_cost (y, COMPARE, 1, optimize_insn_for_speed_p ())
> COSTS_N_INSNS (1)))
y = force_reg (mode, y);
result against 1 in the biased case, and zero in the unbiased
case. For unsigned comparisons always compare against 1 after
biasing the unbiased result by adding 1. This gives us a way to
- represent LTU. */
+ represent LTU.
+ The comparisons in the fixed-point helper library are always
+ biased. */
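+ /* E.g. a biased helper in the style of libgcc's __cmpdi2 returns 0, 1
+ or 2 for "less than", "equal" and "greater than", so the result is
+ compared against 1 with the original comparison code; an unbiased
+ helper returns -1, 0 or 1 and is compared against zero, or against 1
+ after adding 1 in the unsigned case. */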
x = result;
y = const1_rtx;
- if (!TARGET_LIB_INT_CMP_BIASED)
+ if (!TARGET_LIB_INT_CMP_BIASED && !ALL_FIXED_POINT_MODE_P (mode))
{
if (unsignedp)
x = plus_constant (result, 1);
return CODE_FOR_nothing;
}
-static enum insn_code
+enum insn_code
can_float_p (enum machine_mode fltmode, enum machine_mode fixmode,
int unsignedp)
{
tab = unsignedp ? ufloat_optab : sfloat_optab;
return convert_optab_handler (tab, fltmode, fixmode);
}
+
+/* Function supportable_convert_operation
+
+ Check whether an operation represented by the code CODE is a
+ convert operation that is supported by the target platform in
+ vector form (i.e., when operating on arguments of type VECTYPE_IN
+ producing a result of type VECTYPE_OUT).
+
+ Convert operations we currently support directly are FIX_TRUNC and FLOAT.
+ This function checks if these operations are supported
+ by the target platform either directly (via vector tree-codes), or via
+ target builtins.
+
+ Output:
+ - CODE1 is code of vector operation to be used when
+ vectorizing the operation, if available.
+ - DECL is decl of target builtin functions to be used
+ when vectorizing the operation, if available. In this case,
+ CODE1 is CALL_EXPR. */
+
+bool
+supportable_convert_operation (enum tree_code code,
+ tree vectype_out, tree vectype_in,
+ tree *decl, enum tree_code *code1)
+{
+ enum machine_mode m1, m2;
+ int truncp;
+
+ m1 = TYPE_MODE (vectype_out);
+ m2 = TYPE_MODE (vectype_in);
+
+ /* First check if we can do the conversion directly. */
+ if ((code == FIX_TRUNC_EXPR
+ && can_fix_p (m1, m2, TYPE_UNSIGNED (vectype_out), &truncp)
+ != CODE_FOR_nothing)
+ || (code == FLOAT_EXPR
+ && can_float_p (m1, m2, TYPE_UNSIGNED (vectype_in))
+ != CODE_FOR_nothing))
+ {
+ *code1 = code;
+ return true;
+ }
+
+ /* Now check for builtin. */
+ if (targetm.vectorize.builtin_conversion
+ && targetm.vectorize.builtin_conversion (code, vectype_out, vectype_in))
+ {
+ *code1 = CALL_EXPR;
+ *decl = targetm.vectorize.builtin_conversion (code, vectype_out, vectype_in);
+ return true;
+ }
+ return false;
+}
+
\f
/* Generate code to convert FROM to floating point
and store in TO. FROM must be fixed point and not VOIDmode.
int doing_unsigned = unsignedp;
if (fmode != GET_MODE (to)
- && significand_size (fmode) < GET_MODE_BITSIZE (GET_MODE (from)))
+ && significand_size (fmode) < GET_MODE_PRECISION (GET_MODE (from)))
continue;
icode = can_float_p (fmode, imode, unsignedp);
for (fmode = GET_MODE (to); fmode != VOIDmode;
fmode = GET_MODE_WIDER_MODE (fmode))
- if (GET_MODE_BITSIZE (GET_MODE (from)) < GET_MODE_BITSIZE (fmode)
+ if (GET_MODE_PRECISION (GET_MODE (from)) < GET_MODE_BITSIZE (fmode)
&& can_float_p (fmode, GET_MODE (from), 0) != CODE_FOR_nothing)
break;
/* Avoid double-rounding when TO is narrower than FROM. */
if ((significand_size (fmode) + 1)
- < GET_MODE_BITSIZE (GET_MODE (from)))
+ < GET_MODE_PRECISION (GET_MODE (from)))
{
rtx temp1;
rtx neglabel = gen_label_rtx ();
0, label);
- real_2expN (&offset, GET_MODE_BITSIZE (GET_MODE (from)), fmode);
+ real_2expN (&offset, GET_MODE_PRECISION (GET_MODE (from)), fmode);
temp = expand_binop (fmode, add_optab, target,
CONST_DOUBLE_FROM_REAL_VALUE (offset, fmode),
target, 0, OPTAB_LIB_WIDEN);
2^63. The subtraction of 2^63 should not generate any rounding as it
simply clears out that bit. The rest is trivial. */
- if (unsignedp && GET_MODE_BITSIZE (GET_MODE (to)) <= HOST_BITS_PER_WIDE_INT)
+ if (unsignedp && GET_MODE_PRECISION (GET_MODE (to)) <= HOST_BITS_PER_WIDE_INT)
for (fmode = GET_MODE (from); fmode != VOIDmode;
fmode = GET_MODE_WIDER_MODE (fmode))
if (CODE_FOR_nothing != can_fix_p (GET_MODE (to), fmode, 0, &must_trunc)
&& (!DECIMAL_FLOAT_MODE_P (fmode)
- || GET_MODE_BITSIZE (fmode) > GET_MODE_BITSIZE (GET_MODE (to))))
+ || GET_MODE_BITSIZE (fmode) > GET_MODE_PRECISION (GET_MODE (to))))
{
int bitsize;
REAL_VALUE_TYPE offset;
rtx limit, lab1, lab2, insn;
- bitsize = GET_MODE_BITSIZE (GET_MODE (to));
+ bitsize = GET_MODE_PRECISION (GET_MODE (to));
real_2expN (&offset, bitsize - 1, fmode);
limit = CONST_DOUBLE_FROM_REAL_VALUE (offset, fmode);
lab1 = gen_label_rtx ();
{
/* Make a place for a REG_NOTE and add it. */
insn = emit_move_insn (to, to);
- set_unique_reg_note (insn,
- REG_EQUAL,
- gen_rtx_fmt_e (UNSIGNED_FIX,
- GET_MODE (to),
- copy_rtx (from)));
+ set_dst_reg_note (insn, REG_EQUAL,
+ gen_rtx_fmt_e (UNSIGNED_FIX, GET_MODE (to),
+ copy_rtx (from)),
+ to);
}
return;
init_optab (usashl_optab, US_ASHIFT);
init_optab (ashr_optab, ASHIFTRT);
init_optab (lshr_optab, LSHIFTRT);
+ init_optabv (vashl_optab, ASHIFT);
+ init_optabv (vashr_optab, ASHIFTRT);
+ init_optabv (vlshr_optab, LSHIFTRT);
init_optab (rotl_optab, ROTATE);
init_optab (rotr_optab, ROTATERT);
init_optab (smin_optab, SMIN);
init_optab (udot_prod_optab, UNKNOWN);
init_optab (vec_extract_optab, UNKNOWN);
- init_optab (vec_extract_even_optab, UNKNOWN);
- init_optab (vec_extract_odd_optab, UNKNOWN);
- init_optab (vec_interleave_high_optab, UNKNOWN);
- init_optab (vec_interleave_low_optab, UNKNOWN);
init_optab (vec_set_optab, UNKNOWN);
init_optab (vec_init_optab, UNKNOWN);
init_optab (vec_shl_optab, UNKNOWN);
init_optab (vec_widen_umult_lo_optab, UNKNOWN);
init_optab (vec_widen_smult_hi_optab, UNKNOWN);
init_optab (vec_widen_smult_lo_optab, UNKNOWN);
+ init_optab (vec_widen_ushiftl_hi_optab, UNKNOWN);
+ init_optab (vec_widen_ushiftl_lo_optab, UNKNOWN);
+ init_optab (vec_widen_sshiftl_hi_optab, UNKNOWN);
+ init_optab (vec_widen_sshiftl_lo_optab, UNKNOWN);
init_optab (vec_unpacks_hi_optab, UNKNOWN);
init_optab (vec_unpacks_lo_optab, UNKNOWN);
init_optab (vec_unpacku_hi_optab, UNKNOWN);
targetm.init_libfuncs ();
}
+/* A helper function for init_sync_libfuncs. Using the basename BASE,
+ install libfuncs named BASE_N into TAB for each power-of-two N with
+ 1 <= N <= MAX. */
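+/* For example, init_sync_libfuncs_1 (sync_old_add_optab,
+ "__sync_fetch_and_add", 8) registers __sync_fetch_and_add_1 for QImode
+ up through __sync_fetch_and_add_8 for DImode, stepping N through the
+ powers of two 1, 2, 4 and 8. */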
+
+static void
+init_sync_libfuncs_1 (optab tab, const char *base, int max)
+{
+ enum machine_mode mode;
+ char buf[64];
+ size_t len = strlen (base);
+ int i;
+
+ gcc_assert (max <= 8);
+ gcc_assert (len + 3 < sizeof (buf));
+
+ memcpy (buf, base, len);
+ buf[len] = '_';
+ buf[len + 1] = '0';
+ buf[len + 2] = '\0';
+
+ mode = QImode;
+ for (i = 1; i <= max; i *= 2)
+ {
+ buf[len + 1] = '0' + i;
+ set_optab_libfunc (tab, mode, buf);
+ mode = GET_MODE_2XWIDER_MODE (mode);
+ }
+}
+
+void
+init_sync_libfuncs (int max)
+{
+ init_sync_libfuncs_1 (sync_compare_and_swap_optab,
+ "__sync_val_compare_and_swap", max);
+ init_sync_libfuncs_1 (sync_lock_test_and_set_optab,
+ "__sync_lock_test_and_set", max);
+
+ init_sync_libfuncs_1 (sync_old_add_optab, "__sync_fetch_and_add", max);
+ init_sync_libfuncs_1 (sync_old_sub_optab, "__sync_fetch_and_sub", max);
+ init_sync_libfuncs_1 (sync_old_ior_optab, "__sync_fetch_and_or", max);
+ init_sync_libfuncs_1 (sync_old_and_optab, "__sync_fetch_and_and", max);
+ init_sync_libfuncs_1 (sync_old_xor_optab, "__sync_fetch_and_xor", max);
+ init_sync_libfuncs_1 (sync_old_nand_optab, "__sync_fetch_and_nand", max);
+
+ init_sync_libfuncs_1 (sync_new_add_optab, "__sync_add_and_fetch", max);
+ init_sync_libfuncs_1 (sync_new_sub_optab, "__sync_sub_and_fetch", max);
+ init_sync_libfuncs_1 (sync_new_ior_optab, "__sync_or_and_fetch", max);
+ init_sync_libfuncs_1 (sync_new_and_optab, "__sync_and_and_fetch", max);
+ init_sync_libfuncs_1 (sync_new_xor_optab, "__sync_xor_and_fetch", max);
+ init_sync_libfuncs_1 (sync_new_nand_optab, "__sync_nand_and_fetch", max);
+}
+
/* Print information about the current contents of the optabs on
STDERR. */
return gen_rtx_fmt_ee (rcode, VOIDmode, ops[0].value, ops[1].value);
}
-/* Return insn code for TYPE, the type of a VEC_COND_EXPR. */
+/* Return true if VEC_PERM_EXPR can be expanded using SIMD extensions
+ of the CPU. SEL may be NULL, which stands for an unknown constant. */
+
+bool
+can_vec_perm_p (enum machine_mode mode, bool variable,
+ const unsigned char *sel)
+{
+ enum machine_mode qimode;
+
+ /* If the target doesn't implement a vector mode for the vector type,
+ then no operations are supported. */
+ if (!VECTOR_MODE_P (mode))
+ return false;
+
+ if (!variable)
+ {
+ if (direct_optab_handler (vec_perm_const_optab, mode) != CODE_FOR_nothing
+ && (sel == NULL
+ || targetm.vectorize.vec_perm_const_ok == NULL
+ || targetm.vectorize.vec_perm_const_ok (mode, sel)))
+ return true;
+ }
+
+ if (direct_optab_handler (vec_perm_optab, mode) != CODE_FOR_nothing)
+ return true;
+
+ /* We allow fallback to a QI vector mode, and adjust the mask. */
+ if (GET_MODE_INNER (mode) == QImode)
+ return false;
+ qimode = mode_for_vector (QImode, GET_MODE_SIZE (mode));
+ if (!VECTOR_MODE_P (qimode))
+ return false;
+
+ /* ??? For completeness, we ought to check the QImode version of
+ vec_perm_const_optab. But all users of this implicit lowering
+ feature implement the variable vec_perm_optab. */
+ if (direct_optab_handler (vec_perm_optab, qimode) == CODE_FOR_nothing)
+ return false;
+
+ /* In order to support the lowering of variable permutations,
+ we need to support shifts and adds. */
+ if (variable)
+ {
+ if (GET_MODE_UNIT_SIZE (mode) > 2
+ && optab_handler (ashl_optab, mode) == CODE_FOR_nothing
+ && optab_handler (vashl_optab, mode) == CODE_FOR_nothing)
+ return false;
+ if (optab_handler (add_optab, qimode) == CODE_FOR_nothing)
+ return false;
+ }
+
+ return true;
+}
+
+/* A subroutine of expand_vec_perm for expanding one vec_perm insn. */
+
+static rtx
+expand_vec_perm_1 (enum insn_code icode, rtx target,
+ rtx v0, rtx v1, rtx sel)
+{
+ enum machine_mode tmode = GET_MODE (target);
+ enum machine_mode smode = GET_MODE (sel);
+ struct expand_operand ops[4];
+
+ create_output_operand (&ops[0], target, tmode);
+ create_input_operand (&ops[3], sel, smode);
+
+ /* Make an effort to preserve v0 == v1. The target expander is able to
+ rely on this to determine if we're permuting a single input operand. */
+ if (rtx_equal_p (v0, v1))
+ {
+ if (!insn_operand_matches (icode, 1, v0))
+ v0 = force_reg (tmode, v0);
+ gcc_checking_assert (insn_operand_matches (icode, 1, v0));
+ gcc_checking_assert (insn_operand_matches (icode, 2, v0));
+
+ create_fixed_operand (&ops[1], v0);
+ create_fixed_operand (&ops[2], v0);
+ }
+ else
+ {
+ create_input_operand (&ops[1], v0, tmode);
+ create_input_operand (&ops[2], v1, tmode);
+ }
+
+ if (maybe_expand_insn (icode, 4, ops))
+ return ops[0].value;
+ return NULL_RTX;
+}
+
+/* Generate instructions for vec_perm optab given its mode
+ and three operands. */
+
+rtx
+expand_vec_perm (enum machine_mode mode, rtx v0, rtx v1, rtx sel, rtx target)
+{
+ enum insn_code icode;
+ enum machine_mode qimode;
+ unsigned int i, w, e, u;
+ rtx tmp, sel_qi = NULL;
+ rtvec vec;
+
+ if (!target || GET_MODE (target) != mode)
+ target = gen_reg_rtx (mode);
+
+ w = GET_MODE_SIZE (mode);
+ e = GET_MODE_NUNITS (mode);
+ u = GET_MODE_UNIT_SIZE (mode);
+
+ /* Set QIMODE to a different vector mode with byte elements.
+ If no such mode, or if MODE already has byte elements, use VOIDmode. */
+ qimode = VOIDmode;
+ if (GET_MODE_INNER (mode) != QImode)
+ {
+ qimode = mode_for_vector (QImode, w);
+ if (!VECTOR_MODE_P (qimode))
+ qimode = VOIDmode;
+ }
+
+ /* If the input is a constant, expand it specially. */
+ gcc_assert (GET_MODE_CLASS (GET_MODE (sel)) == MODE_VECTOR_INT);
+ if (GET_CODE (sel) == CONST_VECTOR)
+ {
+ icode = direct_optab_handler (vec_perm_const_optab, mode);
+ if (icode != CODE_FOR_nothing)
+ {
+ tmp = expand_vec_perm_1 (icode, target, v0, v1, sel);
+ if (tmp)
+ return tmp;
+ }
+
+ /* Fall back to a constant byte-based permutation. */
+ if (qimode != VOIDmode)
+ {
+ vec = rtvec_alloc (w);
+ for (i = 0; i < e; ++i)
+ {
+ unsigned int j, this_e;
+
+ this_e = INTVAL (CONST_VECTOR_ELT (sel, i));
+ this_e &= 2 * e - 1;
+ this_e *= u;
+
+ for (j = 0; j < u; ++j)
+ RTVEC_ELT (vec, i * u + j) = GEN_INT (this_e + j);
+ }
+ sel_qi = gen_rtx_CONST_VECTOR (qimode, vec);
+
+ icode = direct_optab_handler (vec_perm_const_optab, qimode);
+ if (icode != CODE_FOR_nothing)
+ {
+ tmp = expand_vec_perm_1 (icode, gen_lowpart (qimode, target),
+ gen_lowpart (qimode, v0),
+ gen_lowpart (qimode, v1), sel_qi);
+ if (tmp)
+ return gen_lowpart (mode, tmp);
+ }
+ }
+ }
+
+ /* Otherwise expand as a fully variable permutation. */
+ icode = direct_optab_handler (vec_perm_optab, mode);
+ if (icode != CODE_FOR_nothing)
+ {
+ tmp = expand_vec_perm_1 (icode, target, v0, v1, sel);
+ if (tmp)
+ return tmp;
+ }
+
+ /* As a special case to aid several targets, lower the element-based
+ permutation to a byte-based permutation and try again. */
+ if (qimode == VOIDmode)
+ return NULL_RTX;
+ icode = direct_optab_handler (vec_perm_optab, qimode);
+ if (icode == CODE_FOR_nothing)
+ return NULL_RTX;
+
+ if (sel_qi == NULL)
+ {
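+ /* E.g. for a V4SImode selector {1, 0, 3, 2} (u == 4), the three
+ steps below produce, in memory order, the QImode selector
+ {4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11}. */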
+ /* Multiply each element by its byte size. */
+ enum machine_mode selmode = GET_MODE (sel);
+ if (u == 2)
+ sel = expand_simple_binop (selmode, PLUS, sel, sel,
+ sel, 0, OPTAB_DIRECT);
+ else
+ sel = expand_simple_binop (selmode, ASHIFT, sel,
+ GEN_INT (exact_log2 (u)),
+ sel, 0, OPTAB_DIRECT);
+ gcc_assert (sel != NULL);
+
+ /* Broadcast the low byte of each element into each of its bytes. */
+ vec = rtvec_alloc (w);
+ for (i = 0; i < w; ++i)
+ {
+ int this_e = i / u * u;
+ if (BYTES_BIG_ENDIAN)
+ this_e += u - 1;
+ RTVEC_ELT (vec, i) = GEN_INT (this_e);
+ }
+ tmp = gen_rtx_CONST_VECTOR (qimode, vec);
+ sel = gen_lowpart (qimode, sel);
+ sel = expand_vec_perm (qimode, sel, sel, tmp, NULL);
+ gcc_assert (sel != NULL);
+
+ /* Add the byte offset to each byte element. */
+ /* Note that the definition of the indices here is memory ordering,
+ so there should be no difference between big and little endian. */
+ vec = rtvec_alloc (w);
+ for (i = 0; i < w; ++i)
+ RTVEC_ELT (vec, i) = GEN_INT (i % u);
+ tmp = gen_rtx_CONST_VECTOR (qimode, vec);
+ sel_qi = expand_simple_binop (qimode, PLUS, sel, tmp,
+ sel, 0, OPTAB_DIRECT);
+ gcc_assert (sel_qi != NULL);
+ }
+
+ tmp = expand_vec_perm_1 (icode, gen_lowpart (qimode, target),
+ gen_lowpart (qimode, v0),
+ gen_lowpart (qimode, v1), sel_qi);
+ if (tmp)
+ tmp = gen_lowpart (mode, tmp);
+ return tmp;
+}
+
+/* Return insn code for a conditional operator with a comparison in
+ mode CMODE, unsigned if UNS is true, resulting in a value of mode VMODE. */
static inline enum insn_code
-get_vcond_icode (tree type, enum machine_mode mode)
+get_vcond_icode (enum machine_mode vmode, enum machine_mode cmode, bool uns)
{
enum insn_code icode = CODE_FOR_nothing;
-
- if (TYPE_UNSIGNED (type))
- icode = direct_optab_handler (vcondu_optab, mode);
+ if (uns)
+ icode = convert_optab_handler (vcondu_optab, vmode, cmode);
else
- icode = direct_optab_handler (vcond_optab, mode);
+ icode = convert_optab_handler (vcond_optab, vmode, cmode);
return icode;
}
/* Return TRUE iff, appropriate vector insns are available
- for vector cond expr with type TYPE in VMODE mode. */
+ for vector cond expr with vector type VALUE_TYPE and a comparison
+ with operand vector types in CMP_OP_TYPE. */
bool
-expand_vec_cond_expr_p (tree type, enum machine_mode vmode)
-{
- if (get_vcond_icode (type, vmode) == CODE_FOR_nothing)
+expand_vec_cond_expr_p (tree value_type, tree cmp_op_type)
+{
+ enum machine_mode value_mode = TYPE_MODE (value_type);
+ enum machine_mode cmp_op_mode = TYPE_MODE (cmp_op_type);
+ if (GET_MODE_SIZE (value_mode) != GET_MODE_SIZE (cmp_op_mode)
+ || GET_MODE_NUNITS (value_mode) != GET_MODE_NUNITS (cmp_op_mode)
+ || get_vcond_icode (TYPE_MODE (value_type), TYPE_MODE (cmp_op_type),
+ TYPE_UNSIGNED (cmp_op_type)) == CODE_FOR_nothing)
return false;
return true;
}
enum insn_code icode;
rtx comparison, rtx_op1, rtx_op2;
enum machine_mode mode = TYPE_MODE (vec_cond_type);
- bool unsignedp = TYPE_UNSIGNED (vec_cond_type);
+ enum machine_mode cmp_op_mode;
+ bool unsignedp;
- icode = get_vcond_icode (vec_cond_type, mode);
+ gcc_assert (COMPARISON_CLASS_P (op0));
+
+ unsignedp = TYPE_UNSIGNED (TREE_TYPE (TREE_OPERAND (op0, 0)));
+ cmp_op_mode = TYPE_MODE (TREE_TYPE (TREE_OPERAND (op0, 0)));
+
+ gcc_assert (GET_MODE_SIZE (mode) == GET_MODE_SIZE (cmp_op_mode)
+ && GET_MODE_NUNITS (mode) == GET_MODE_NUNITS (cmp_op_mode));
+
+ icode = get_vcond_icode (mode, cmp_op_mode, unsignedp);
if (icode == CODE_FOR_nothing)
return 0;
}
\f
-/* This is an internal subroutine of the other compare_and_swap expanders.
- MEM, OLD_VAL and NEW_VAL are as you'd expect for a compare-and-swap
- operation. TARGET is an optional place to store the value result of
- the operation. ICODE is the particular instruction to expand. Return
- the result of the operation. */
+/* Return true if there is a compare_and_swap pattern. */
-static rtx
-expand_val_compare_and_swap_1 (rtx mem, rtx old_val, rtx new_val,
- rtx target, enum insn_code icode)
+bool
+can_compare_and_swap_p (enum machine_mode mode, bool allow_libcall)
{
- struct expand_operand ops[4];
- enum machine_mode mode = GET_MODE (mem);
+ enum insn_code icode;
- create_output_operand (&ops[0], target, mode);
- create_fixed_operand (&ops[1], mem);
- /* OLD_VAL and NEW_VAL may have been promoted to a wider mode.
- Shrink them if so. */
- create_convert_operand_to (&ops[2], old_val, mode, true);
- create_convert_operand_to (&ops[3], new_val, mode, true);
- if (maybe_expand_insn (icode, 4, ops))
- return ops[0].value;
- return NULL_RTX;
+ /* Check for __atomic_compare_and_swap. */
+ icode = direct_optab_handler (atomic_compare_and_swap_optab, mode);
+ if (icode != CODE_FOR_nothing)
+ return true;
+
+ /* Check for __sync_compare_and_swap. */
+ icode = optab_handler (sync_compare_and_swap_optab, mode);
+ if (icode != CODE_FOR_nothing)
+ return true;
+ if (allow_libcall && optab_libfunc (sync_compare_and_swap_optab, mode))
+ return true;
+
+ /* No inline compare and swap. */
+ return false;
}
-/* Expand a compare-and-swap operation and return its value. */
+/* Return true if an atomic exchange can be performed. */
-rtx
-expand_val_compare_and_swap (rtx mem, rtx old_val, rtx new_val, rtx target)
+bool
+can_atomic_exchange_p (enum machine_mode mode, bool allow_libcall)
{
- enum machine_mode mode = GET_MODE (mem);
- enum insn_code icode
- = direct_optab_handler (sync_compare_and_swap_optab, mode);
+ enum insn_code icode;
- if (icode == CODE_FOR_nothing)
- return NULL_RTX;
+ /* Check for __atomic_exchange. */
+ icode = direct_optab_handler (atomic_exchange_optab, mode);
+ if (icode != CODE_FOR_nothing)
+ return true;
+
+ /* Don't check __sync_lock_test_and_set, as on some platforms that
+ has reduced functionality. Targets that really do support
+ a proper exchange should simply be updated to the __atomics. */
- return expand_val_compare_and_swap_1 (mem, old_val, new_val, target, icode);
+ return can_compare_and_swap_p (mode, allow_libcall);
}
+
/* Helper function to find the MODE_CC set in a sync_compare_and_swap
pattern. */
}
}
-/* Expand a compare-and-swap operation and store true into the result if
- the operation was successful and false otherwise. Return the result.
- Unlike other routines, TARGET is not optional. */
-
-rtx
-expand_bool_compare_and_swap (rtx mem, rtx old_val, rtx new_val, rtx target)
-{
- enum machine_mode mode = GET_MODE (mem);
- enum insn_code icode;
- rtx subtarget, seq, cc_reg;
-
- /* If the target supports a compare-and-swap pattern that simultaneously
- sets some flag for success, then use it. Otherwise use the regular
- compare-and-swap and follow that immediately with a compare insn. */
- icode = direct_optab_handler (sync_compare_and_swap_optab, mode);
- if (icode == CODE_FOR_nothing)
- return NULL_RTX;
-
- do_pending_stack_adjust ();
- do
- {
- start_sequence ();
- subtarget = expand_val_compare_and_swap_1 (mem, old_val, new_val,
- NULL_RTX, icode);
- cc_reg = NULL_RTX;
- if (subtarget == NULL_RTX)
- {
- end_sequence ();
- return NULL_RTX;
- }
-
- if (have_insn_for (COMPARE, CCmode))
- note_stores (PATTERN (get_last_insn ()), find_cc_set, &cc_reg);
- seq = get_insns ();
- end_sequence ();
-
- /* We might be comparing against an old value. Try again. :-( */
- if (!cc_reg && MEM_P (old_val))
- {
- seq = NULL_RTX;
- old_val = force_reg (mode, old_val);
- }
- }
- while (!seq);
-
- emit_insn (seq);
- if (cc_reg)
- return emit_store_flag_force (target, EQ, cc_reg, const0_rtx, VOIDmode, 0, 1);
- else
- return emit_store_flag_force (target, EQ, subtarget, old_val, VOIDmode, 1, 1);
-}
-
/* This is a helper function for the other atomic operations. This function
emits a loop that contains SEQ that iterates until a compare-and-swap
operation at the end succeeds. MEM is the memory to be modified. SEQ is
expand_compare_and_swap_loop (rtx mem, rtx old_reg, rtx new_reg, rtx seq)
{
enum machine_mode mode = GET_MODE (mem);
- enum insn_code icode;
- rtx label, cmp_reg, subtarget, cc_reg;
+ rtx label, cmp_reg, success, oldval;
/* The loop we want to generate looks like
label:
old_reg = cmp_reg;
seq;
- cmp_reg = compare-and-swap(mem, old_reg, new_reg)
- if (cmp_reg != old_reg)
+ (success, cmp_reg) = compare-and-swap(mem, old_reg, new_reg)
+ if (success)
goto label;
Note that we only do the plain load from memory once. Subsequent
if (seq)
emit_insn (seq);
- /* If the target supports a compare-and-swap pattern that simultaneously
- sets some flag for success, then use it. Otherwise use the regular
- compare-and-swap and follow that immediately with a compare insn. */
- icode = direct_optab_handler (sync_compare_and_swap_optab, mode);
- if (icode == CODE_FOR_nothing)
+ success = NULL_RTX;
+ oldval = cmp_reg;
+ if (!expand_atomic_compare_and_swap (&success, &oldval, mem, old_reg,
+ new_reg, false, MEMMODEL_SEQ_CST,
+ MEMMODEL_RELAXED))
return false;
- subtarget = expand_val_compare_and_swap_1 (mem, old_reg, new_reg,
- cmp_reg, icode);
- if (subtarget == NULL_RTX)
- return false;
+ if (oldval != cmp_reg)
+ emit_move_insn (cmp_reg, oldval);
- cc_reg = NULL_RTX;
- if (have_insn_for (COMPARE, CCmode))
- note_stores (PATTERN (get_last_insn ()), find_cc_set, &cc_reg);
- if (cc_reg)
- {
- cmp_reg = cc_reg;
- old_reg = const0_rtx;
- }
- else
+ /* ??? Mark this jump predicted not taken? */
+ emit_cmp_and_jump_insns (success, const0_rtx, EQ, const0_rtx,
+ GET_MODE (success), 1, label);
+ return true;
+}
+
+
+/* This function tries to emit an atomic_exchange instruction. VAL is written
+ to *MEM using memory model MODEL. The previous contents of *MEM are returned,
+ using TARGET if possible. */
+
+static rtx
+maybe_emit_atomic_exchange (rtx target, rtx mem, rtx val, enum memmodel model)
+{
+ enum machine_mode mode = GET_MODE (mem);
+ enum insn_code icode;
+
+ /* If the target supports the exchange directly, great. */
+ icode = direct_optab_handler (atomic_exchange_optab, mode);
+ if (icode != CODE_FOR_nothing)
{
- if (subtarget != cmp_reg)
- emit_move_insn (cmp_reg, subtarget);
+ struct expand_operand ops[4];
+
+ create_output_operand (&ops[0], target, mode);
+ create_fixed_operand (&ops[1], mem);
+ /* VAL may have been promoted to a wider mode. Shrink it if so. */
+ create_convert_operand_to (&ops[2], val, mode, true);
+ create_integer_operand (&ops[3], model);
+ if (maybe_expand_insn (icode, 4, ops))
+ return ops[0].value;
}
- /* ??? Mark this jump predicted not taken? */
- emit_cmp_and_jump_insns (cmp_reg, old_reg, NE, const0_rtx, GET_MODE (cmp_reg), 1,
- label);
- return true;
+ return NULL_RTX;
}
-/* This function generates the atomic operation MEM CODE= VAL. In this
- case, we do not care about any resulting value. Returns NULL if we
- cannot generate the operation. */
+/* This function tries to implement an atomic exchange operation using
+ __sync_lock_test_and_set. VAL is written to *MEM using memory model MODEL.
+ The previous contents of *MEM are returned, using TARGET if possible.
+ Since this instruction is an acquire barrier only, stronger memory
+ models may require additional barriers to be emitted. */
-rtx
-expand_sync_operation (rtx mem, rtx val, enum rtx_code code)
+static rtx
+maybe_emit_sync_lock_test_and_set (rtx target, rtx mem, rtx val,
+ enum memmodel model)
{
enum machine_mode mode = GET_MODE (mem);
enum insn_code icode;
- rtx insn;
+ rtx last_insn = get_last_insn ();
- /* Look to see if the target supports the operation directly. */
- switch (code)
+ icode = optab_handler (sync_lock_test_and_set_optab, mode);
+
+ /* Legacy sync_lock_test_and_set is an acquire barrier. If the pattern
+ exists, and the memory model is stronger than acquire, add a release
+ barrier before the instruction. */
+
+ if (model == MEMMODEL_SEQ_CST
+ || model == MEMMODEL_RELEASE
+ || model == MEMMODEL_ACQ_REL)
+ expand_mem_thread_fence (model);
+
+ if (icode != CODE_FOR_nothing)
{
- case PLUS:
- icode = direct_optab_handler (sync_add_optab, mode);
- break;
- case IOR:
- icode = direct_optab_handler (sync_ior_optab, mode);
- break;
- case XOR:
- icode = direct_optab_handler (sync_xor_optab, mode);
- break;
- case AND:
- icode = direct_optab_handler (sync_and_optab, mode);
- break;
- case NOT:
- icode = direct_optab_handler (sync_nand_optab, mode);
- break;
+ struct expand_operand ops[3];
+ create_output_operand (&ops[0], target, mode);
+ create_fixed_operand (&ops[1], mem);
+ /* VAL may have been promoted to a wider mode. Shrink it if so. */
+ create_convert_operand_to (&ops[2], val, mode, true);
+ if (maybe_expand_insn (icode, 3, ops))
+ return ops[0].value;
+ }
- case MINUS:
- icode = direct_optab_handler (sync_sub_optab, mode);
- if (icode == CODE_FOR_nothing || CONST_INT_P (val))
+ /* If an external test-and-set libcall is provided, use that instead of
+ any external compare-and-swap that we might get from the compare-and-
+ swap-loop expansion later. */
+ if (!can_compare_and_swap_p (mode, false))
+ {
+ rtx libfunc = optab_libfunc (sync_lock_test_and_set_optab, mode);
+ if (libfunc != NULL)
{
- icode = direct_optab_handler (sync_add_optab, mode);
- if (icode != CODE_FOR_nothing)
- {
- val = expand_simple_unop (mode, NEG, val, NULL_RTX, 1);
- code = PLUS;
- }
+ rtx addr;
+
+ addr = convert_memory_address (ptr_mode, XEXP (mem, 0));
+ return emit_library_call_value (libfunc, NULL_RTX, LCT_NORMAL,
+ mode, 2, addr, ptr_mode,
+ val, mode);
}
- break;
+ }
- default:
- gcc_unreachable ();
+ /* If the test_and_set can't be emitted, eliminate any barrier that might
+ have been emitted. */
+ delete_insns_since (last_insn);
+ return NULL_RTX;
+}
+
+/* This function tries to implement an atomic exchange operation using a
+ compare_and_swap loop. VAL is written to *MEM. The previous contents of
+ *MEM are returned, using TARGET if possible. No memory model is required
+ since a compare_and_swap loop is seq-cst. */
+
+static rtx
+maybe_emit_compare_and_swap_exchange_loop (rtx target, rtx mem, rtx val)
+{
+ enum machine_mode mode = GET_MODE (mem);
+
+ if (can_compare_and_swap_p (mode, true))
+ {
+ if (!target || !register_operand (target, mode))
+ target = gen_reg_rtx (mode);
+ if (GET_MODE (val) != VOIDmode && GET_MODE (val) != mode)
+ val = convert_modes (mode, GET_MODE (val), val, 1);
+ if (expand_compare_and_swap_loop (mem, target, val, NULL_RTX))
+ return target;
}
- /* Generate the direct operation, if present. */
- if (icode != CODE_FOR_nothing)
+ return NULL_RTX;
+}
+
+/* This function tries to implement an atomic test-and-set operation
+ using the atomic_test_and_set instruction pattern. A boolean value
+ is returned from the operation, using TARGET if possible. */
+
+#ifndef HAVE_atomic_test_and_set
+#define HAVE_atomic_test_and_set 0
+#define CODE_FOR_atomic_test_and_set CODE_FOR_nothing
+#endif
+
+static rtx
+maybe_emit_atomic_test_and_set (rtx target, rtx mem, enum memmodel model)
+{
+ enum machine_mode pat_bool_mode;
+ struct expand_operand ops[3];
+
+ if (!HAVE_atomic_test_and_set)
+ return NULL_RTX;
+
+ /* While we always get QImode from __atomic_test_and_set, we get
+ other memory modes from __sync_lock_test_and_set. Note that we
+ use no endian adjustment here. This matches the 4.6 behavior
+ in the Sparc backend. */
+ gcc_checking_assert
+ (insn_data[CODE_FOR_atomic_test_and_set].operand[1].mode == QImode);
+ if (GET_MODE (mem) != QImode)
+ mem = adjust_address_nv (mem, QImode, 0);
+
+ pat_bool_mode = insn_data[CODE_FOR_atomic_test_and_set].operand[0].mode;
+ create_output_operand (&ops[0], target, pat_bool_mode);
+ create_fixed_operand (&ops[1], mem);
+ create_integer_operand (&ops[2], model);
+
+ if (maybe_expand_insn (CODE_FOR_atomic_test_and_set, 3, ops))
+ return ops[0].value;
+ return NULL_RTX;
+}
+
+/* This function expands the legacy __sync_lock_test_and_set operation, which
+ is generally an atomic exchange. Some limited targets only allow the
+ constant 1 to be stored. This is an ACQUIRE operation.
+
+ TARGET is an optional place to stick the return value.
+ MEM is where VAL is stored. */
+
+rtx
+expand_sync_lock_test_and_set (rtx target, rtx mem, rtx val)
+{
+ rtx ret;
+
+ /* Try an atomic_exchange first. */
+ ret = maybe_emit_atomic_exchange (target, mem, val, MEMMODEL_ACQUIRE);
+ if (ret)
+ return ret;
+
+ ret = maybe_emit_sync_lock_test_and_set (target, mem, val, MEMMODEL_ACQUIRE);
+ if (ret)
+ return ret;
+
+ ret = maybe_emit_compare_and_swap_exchange_loop (target, mem, val);
+ if (ret)
+ return ret;
+
+ /* If there are no other options, try atomic_test_and_set if the value
+ being stored is 1. */
+ if (val == const1_rtx)
+ ret = maybe_emit_atomic_test_and_set (target, mem, MEMMODEL_ACQUIRE);
+
+ return ret;
+}
+
+/* This function expands the atomic test_and_set operation:
+ atomically store a boolean TRUE into MEM and return the previous value.
+
+ MEMMODEL is the memory model variant to use.
+ TARGET is an optional place to stick the return value. */
+
+rtx
+expand_atomic_test_and_set (rtx target, rtx mem, enum memmodel model)
+{
+ enum machine_mode mode = GET_MODE (mem);
+ rtx ret, trueval, subtarget;
+
+ ret = maybe_emit_atomic_test_and_set (target, mem, model);
+ if (ret)
+ return ret;
+
+ /* Be binary compatible with non-default settings of trueval, and different
+ cpu revisions. E.g. one revision may have atomic-test-and-set, but
+ another only has atomic-exchange. */
+ if (targetm.atomic_test_and_set_trueval == 1)
{
- struct expand_operand ops[2];
+ trueval = const1_rtx;
+ subtarget = target ? target : gen_reg_rtx (mode);
+ }
+ else
+ {
+ trueval = gen_int_mode (targetm.atomic_test_and_set_trueval, mode);
+ subtarget = gen_reg_rtx (mode);
+ }
- create_fixed_operand (&ops[0], mem);
- /* VAL may have been promoted to a wider mode. Shrink it if so. */
- create_convert_operand_to (&ops[1], val, mode, true);
- if (maybe_expand_insn (icode, 2, ops))
- return const0_rtx;
+ /* Try the atomic-exchange optab... */
+ ret = maybe_emit_atomic_exchange (subtarget, mem, trueval, model);
+
+ /* ... then an atomic-compare-and-swap loop ... */
+ if (!ret)
+ ret = maybe_emit_compare_and_swap_exchange_loop (subtarget, mem, trueval);
+
+ /* ... before trying the vaguely defined legacy lock_test_and_set. */
+ if (!ret)
+ ret = maybe_emit_sync_lock_test_and_set (subtarget, mem, trueval, model);
+
+ /* Recall that the legacy lock_test_and_set optab was allowed to do magic
+ things with the value 1. Thus we try again without trueval. */
+ if (!ret && targetm.atomic_test_and_set_trueval != 1)
+ ret = maybe_emit_sync_lock_test_and_set (subtarget, mem, const1_rtx, model);
+
+ /* Failing all else, assume a single threaded environment and simply
+ perform the operation. */
+ if (!ret)
+ {
+ emit_move_insn (subtarget, mem);
+ emit_move_insn (mem, trueval);
+ ret = subtarget;
}
- /* Failing that, generate a compare-and-swap loop in which we perform the
- operation with normal arithmetic instructions. */
- if (direct_optab_handler (sync_compare_and_swap_optab, mode)
- != CODE_FOR_nothing)
+ /* Recall that we have to return a boolean value; rectify if trueval
+ is not exactly one. */
+ if (targetm.atomic_test_and_set_trueval != 1)
+ ret = emit_store_flag_force (target, NE, ret, const0_rtx, mode, 0, 1);
+
+ return ret;
+}
+
+/* This function expands the atomic exchange operation:
+ atomically store VAL in MEM and return the previous value in MEM.
+
+ MEMMODEL is the memory model variant to use.
+ TARGET is an optional place to stick the return value. */
+
+rtx
+expand_atomic_exchange (rtx target, rtx mem, rtx val, enum memmodel model)
+{
+ rtx ret;
+
+ ret = maybe_emit_atomic_exchange (target, mem, val, model);
+
+ /* Next try a compare-and-swap loop for the exchange. */
+ if (!ret)
+ ret = maybe_emit_compare_and_swap_exchange_loop (target, mem, val);
+
+ return ret;
+}
+
+/* This function expands the atomic compare exchange operation:
+
+ *PTARGET_BOOL is an optional place to store the boolean success/failure.
+ *PTARGET_OVAL is an optional place to store the old value from memory.
+ Both target parameters may be NULL to indicate that we do not care about
+ that return value. Both target parameters are updated on success to
+ the actual location of the corresponding result.
+
+ MEMMODEL is the memory model variant to use.
+
+ The return value of the function is true for success. */
+
+bool
+expand_atomic_compare_and_swap (rtx *ptarget_bool, rtx *ptarget_oval,
+ rtx mem, rtx expected, rtx desired,
+ bool is_weak, enum memmodel succ_model,
+ enum memmodel fail_model)
+{
+ enum machine_mode mode = GET_MODE (mem);
+ struct expand_operand ops[8];
+ enum insn_code icode;
+ rtx target_oval, target_bool = NULL_RTX;
+ rtx libfunc;
+
+ /* Load expected into a register for the compare and swap. */
+ if (MEM_P (expected))
+ expected = copy_to_reg (expected);
+
+ /* Make sure we always have some place to put the return oldval.
+ Further, make sure that place is distinct from the input expected,
+ just in case we need that path down below. */
+ if (ptarget_oval == NULL
+ || (target_oval = *ptarget_oval) == NULL
+ || reg_overlap_mentioned_p (expected, target_oval))
+ target_oval = gen_reg_rtx (mode);
+
+ icode = direct_optab_handler (atomic_compare_and_swap_optab, mode);
+ if (icode != CODE_FOR_nothing)
{
- rtx t0 = gen_reg_rtx (mode), t1;
+ enum machine_mode bool_mode = insn_data[icode].operand[0].mode;
+
+ /* Make sure we always have a place for the bool operand. */
+ if (ptarget_bool == NULL
+ || (target_bool = *ptarget_bool) == NULL
+ || GET_MODE (target_bool) != bool_mode)
+ target_bool = gen_reg_rtx (bool_mode);
+
+ /* Emit the compare_and_swap. */
+ create_output_operand (&ops[0], target_bool, bool_mode);
+ create_output_operand (&ops[1], target_oval, mode);
+ create_fixed_operand (&ops[2], mem);
+ create_convert_operand_to (&ops[3], expected, mode, true);
+ create_convert_operand_to (&ops[4], desired, mode, true);
+ create_integer_operand (&ops[5], is_weak);
+ create_integer_operand (&ops[6], succ_model);
+ create_integer_operand (&ops[7], fail_model);
+ expand_insn (icode, 8, ops);
+
+ /* Return success/failure. */
+ target_bool = ops[0].value;
+ target_oval = ops[1].value;
+ goto success;
+ }
+
+ /* Otherwise fall back to the original __sync_val_compare_and_swap
+ which is always seq-cst. */
+ icode = optab_handler (sync_compare_and_swap_optab, mode);
+ if (icode != CODE_FOR_nothing)
+ {
+ rtx cc_reg;
- start_sequence ();
+ create_output_operand (&ops[0], target_oval, mode);
+ create_fixed_operand (&ops[1], mem);
+ create_convert_operand_to (&ops[2], expected, mode, true);
+ create_convert_operand_to (&ops[3], desired, mode, true);
+ if (!maybe_expand_insn (icode, 4, ops))
+ return false;
- t1 = t0;
- if (code == NOT)
+ target_oval = ops[0].value;
+
+ /* If the caller isn't interested in the boolean return value,
+ skip the computation of it. */
+ if (ptarget_bool == NULL)
+ goto success;
+
+ /* Otherwise, work out if the compare-and-swap succeeded. */
+ cc_reg = NULL_RTX;
+ if (have_insn_for (COMPARE, CCmode))
+ note_stores (PATTERN (get_last_insn ()), find_cc_set, &cc_reg);
+ if (cc_reg)
{
- t1 = expand_simple_binop (mode, AND, t1, val, NULL_RTX,
- true, OPTAB_LIB_WIDEN);
- t1 = expand_simple_unop (mode, code, t1, NULL_RTX, true);
+ target_bool = emit_store_flag_force (target_bool, EQ, cc_reg,
+ const0_rtx, VOIDmode, 0, 1);
+ goto success;
}
+ goto success_bool_from_val;
+ }
+
+ /* Also check for library support for __sync_val_compare_and_swap. */
+ libfunc = optab_libfunc (sync_compare_and_swap_optab, mode);
+ if (libfunc != NULL)
+ {
+ rtx addr = convert_memory_address (ptr_mode, XEXP (mem, 0));
+ target_oval = emit_library_call_value (libfunc, NULL_RTX, LCT_NORMAL,
+ mode, 3, addr, ptr_mode,
+ expected, mode, desired, mode);
+
+ /* Compute the boolean return value only if requested. */
+ if (ptarget_bool)
+ goto success_bool_from_val;
else
- t1 = expand_simple_binop (mode, code, t1, val, NULL_RTX,
- true, OPTAB_LIB_WIDEN);
- insn = get_insns ();
- end_sequence ();
+ goto success;
+ }
- if (t1 != NULL && expand_compare_and_swap_loop (mem, t0, t1, insn))
- return const0_rtx;
+ /* Failure. */
+ return false;
+
+ success_bool_from_val:
+ target_bool = emit_store_flag_force (target_bool, EQ, target_oval,
+ expected, VOIDmode, 1, 1);
+ success:
+ /* Make sure that the oval output winds up where the caller asked. */
+ if (ptarget_oval)
+ *ptarget_oval = target_oval;
+ if (ptarget_bool)
+ *ptarget_bool = target_bool;
+ return true;
+}
+
+/* Generate asm volatile("" : : : "memory") as the memory barrier. */
+
+static void
+expand_asm_memory_barrier (void)
+{
+ rtx asm_op, clob;
+
+ asm_op = gen_rtx_ASM_OPERANDS (VOIDmode, empty_string, empty_string, 0,
+ rtvec_alloc (0), rtvec_alloc (0),
+ rtvec_alloc (0), UNKNOWN_LOCATION);
+ MEM_VOLATILE_P (asm_op) = 1;
+
+ clob = gen_rtx_SCRATCH (VOIDmode);
+ clob = gen_rtx_MEM (BLKmode, clob);
+ clob = gen_rtx_CLOBBER (VOIDmode, clob);
+
+ emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, asm_op, clob)));
+}
+
+/* This routine will either emit the mem_thread_fence pattern or issue a
+ sync_synchronize to generate a fence for memory model MEMMODEL. */
+
+#ifndef HAVE_mem_thread_fence
+# define HAVE_mem_thread_fence 0
+# define gen_mem_thread_fence(x) (gcc_unreachable (), NULL_RTX)
+#endif
+#ifndef HAVE_memory_barrier
+# define HAVE_memory_barrier 0
+# define gen_memory_barrier() (gcc_unreachable (), NULL_RTX)
+#endif
+
+void
+expand_mem_thread_fence (enum memmodel model)
+{
+ if (HAVE_mem_thread_fence)
+ emit_insn (gen_mem_thread_fence (GEN_INT (model)));
+ else if (model != MEMMODEL_RELAXED)
+ {
+ if (HAVE_memory_barrier)
+ emit_insn (gen_memory_barrier ());
+ else if (synchronize_libfunc != NULL_RTX)
+ emit_library_call (synchronize_libfunc, LCT_NORMAL, VOIDmode, 0);
+ else
+ expand_asm_memory_barrier ();
}
+}
- return NULL_RTX;
+/* This routine will either emit the mem_signal_fence pattern or issue a
+ sync_synchronize to generate a fence for memory model MEMMODEL. */
+
+#ifndef HAVE_mem_signal_fence
+# define HAVE_mem_signal_fence 0
+# define gen_mem_signal_fence(x) (gcc_unreachable (), NULL_RTX)
+#endif
+
+void
+expand_mem_signal_fence (enum memmodel model)
+{
+ if (HAVE_mem_signal_fence)
+ emit_insn (gen_mem_signal_fence (GEN_INT (model)));
+ else if (model != MEMMODEL_RELAXED)
+ {
+ /* By default targets are coherent between a thread and the signal
+ handler running on the same thread. Thus this really becomes a
+ compiler barrier, in that stores must not be sunk past
+ (or raised above) a given point. */
+ expand_asm_memory_barrier ();
+ }
}
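+
+/* For illustration (hypothetical source):
+
+ __atomic_signal_fence (__ATOMIC_SEQ_CST);
+
+ only needs to order the thread against a signal handler running on the
+ same thread, so the compiler barrier above suffices even when the target
+ provides no mem_signal_fence pattern. */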
-/* This function generates the atomic operation MEM CODE= VAL. In this
- case, we do care about the resulting value: if AFTER is true then
- return the value MEM holds after the operation, if AFTER is false
- then return the value MEM holds before the operation. TARGET is an
- optional place for the result value to be stored. */
+/* This function expands the atomic load operation:
+ return the atomically loaded value in MEM.
+
+ MEMMODEL is the memory model variant to use.
+ TARGET is an optional place to store the return value. */
rtx
-expand_sync_fetch_operation (rtx mem, rtx val, enum rtx_code code,
- bool after, rtx target)
+expand_atomic_load (rtx target, rtx mem, enum memmodel model)
{
enum machine_mode mode = GET_MODE (mem);
- enum insn_code old_code, new_code, icode;
- bool compensate;
- rtx insn;
+ enum insn_code icode;
+
+ /* If the target supports the load directly, great. */
+ icode = direct_optab_handler (atomic_load_optab, mode);
+ if (icode != CODE_FOR_nothing)
+ {
+ struct expand_operand ops[3];
+
+ create_output_operand (&ops[0], target, mode);
+ create_fixed_operand (&ops[1], mem);
+ create_integer_operand (&ops[2], model);
+ if (maybe_expand_insn (icode, 3, ops))
+ return ops[0].value;
+ }
+
+ /* If the size of the object is greater than word size on this target,
+ then we assume that a load will not be atomic. */
+ if (GET_MODE_PRECISION (mode) > BITS_PER_WORD)
+ {
+ /* Issue val = compare_and_swap (mem, 0, 0).
+ This may cause the occasional harmless store of 0 when the value is
+ already 0, but that appears to be permitted by the standard. */
+ if (expand_atomic_compare_and_swap (NULL, &target, mem, const0_rtx,
+ const0_rtx, false, model, model))
+ return target;
+ else
+ /* Otherwise there is no atomic load, leave the library call. */
+ return NULL_RTX;
+ }
+
+ /* Otherwise assume loads are atomic, and emit the proper barriers. */
+ if (!target || target == const0_rtx)
+ target = gen_reg_rtx (mode);
+
+ /* Emit the appropriate barrier before the load. */
+ expand_mem_thread_fence (model);
+
+ emit_move_insn (target, mem);
+
+ /* For SEQ_CST, also emit a barrier after the load. */
+ if (model == MEMMODEL_SEQ_CST)
+ expand_mem_thread_fence (model);
+
+ return target;
+}
+
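+/* For illustration (hypothetical source, assuming the usual lowering):
+
+ long v = __atomic_load_n (&x, __ATOMIC_SEQ_CST);
+
+ reaches the routine above with MODEL == MEMMODEL_SEQ_CST; on a target with
+ no atomic_load pattern and a word-sized X this expands to fence, plain
+ load, fence, per the fallback path above. */
+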
+/* This function expands the atomic store operation:
+ Atomically store VAL in MEM.
+ MEMMODEL is the memory model variant to use.
+ USE_RELEASE is true if __sync_lock_release can be used as a fallback.
+ The function returns const0_rtx if a pattern was emitted. */
+
+rtx
+expand_atomic_store (rtx mem, rtx val, enum memmodel model, bool use_release)
+{
+ enum machine_mode mode = GET_MODE (mem);
+ enum insn_code icode;
+ struct expand_operand ops[3];
+
+ /* If the target supports the store directly, great. */
+ icode = direct_optab_handler (atomic_store_optab, mode);
+ if (icode != CODE_FOR_nothing)
+ {
+ create_fixed_operand (&ops[0], mem);
+ create_input_operand (&ops[1], val, mode);
+ create_integer_operand (&ops[2], model);
+ if (maybe_expand_insn (icode, 3, ops))
+ return const0_rtx;
+ }
+
+ /* If using __sync_lock_release is a viable alternative, try it. */
+ if (use_release)
+ {
+ icode = direct_optab_handler (sync_lock_release_optab, mode);
+ if (icode != CODE_FOR_nothing)
+ {
+ create_fixed_operand (&ops[0], mem);
+ create_input_operand (&ops[1], const0_rtx, mode);
+ if (maybe_expand_insn (icode, 2, ops))
+ {
+ /* lock_release is only a release barrier. */
+ if (model == MEMMODEL_SEQ_CST)
+ expand_mem_thread_fence (model);
+ return const0_rtx;
+ }
+ }
+ }
+
+ /* If the size of the object is greater than word size on this target,
+ a plain store will not be atomic. Try an atomic exchange and throw away
+ the result. If that doesn't work, don't do anything. */
+ if (GET_MODE_PRECISION (mode) > BITS_PER_WORD)
+ {
+ rtx target = maybe_emit_atomic_exchange (NULL_RTX, mem, val, model);
+ if (!target)
+ target = maybe_emit_compare_and_swap_exchange_loop (NULL_RTX, mem, val);
+ if (target)
+ return const0_rtx;
+ else
+ return NULL_RTX;
+ }
+
+ /* If there is no atomic store pattern, default to a plain move with barriers. */
+ if (model == MEMMODEL_SEQ_CST || model == MEMMODEL_RELEASE)
+ expand_mem_thread_fence (model);
+
+ emit_move_insn (mem, val);
+
+ /* For SEQ_CST, also emit a barrier after the store. */
+ if (model == MEMMODEL_SEQ_CST)
+ expand_mem_thread_fence (model);
+
+ return const0_rtx;
+}
+
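+/* For illustration (hypothetical source, assuming the usual lowering):
+
+ __atomic_store_n (&x, 0, __ATOMIC_RELEASE);
+
+ reaches the routine above with MODEL == MEMMODEL_RELEASE; without an
+ atomic_store pattern and with a word-sized X this expands to a release
+ fence followed by a plain store, per the fallback path above. */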
+
+/* Structure containing the pointers and values required to process the
+ various forms of the atomic_fetch_op and atomic_op_fetch builtins. */
+
+struct atomic_op_functions
+{
+ direct_optab mem_fetch_before;
+ direct_optab mem_fetch_after;
+ direct_optab mem_no_result;
+ optab fetch_before;
+ optab fetch_after;
+ direct_optab no_result;
+ enum rtx_code reverse_code;
+};
+
- /* Look to see if the target supports the operation directly. */
+/* Fill in the structure pointed to by OP with the various optab entries for
+ an operation of type CODE. */
+
+static void
+get_atomic_op_for_code (struct atomic_op_functions *op, enum rtx_code code)
+{
+ gcc_assert (op != NULL);
+
+ /* If SWITCHABLE_TARGET is defined, then subtargets can be switched
+ in the source code during compilation, and the optab entries are not
+ computable until runtime. Fill in the values at runtime. */
switch (code)
{
case PLUS:
- old_code = direct_optab_handler (sync_old_add_optab, mode);
- new_code = direct_optab_handler (sync_new_add_optab, mode);
+ op->mem_fetch_before = atomic_fetch_add_optab;
+ op->mem_fetch_after = atomic_add_fetch_optab;
+ op->mem_no_result = atomic_add_optab;
+ op->fetch_before = sync_old_add_optab;
+ op->fetch_after = sync_new_add_optab;
+ op->no_result = sync_add_optab;
+ op->reverse_code = MINUS;
break;
- case IOR:
- old_code = direct_optab_handler (sync_old_ior_optab, mode);
- new_code = direct_optab_handler (sync_new_ior_optab, mode);
+ case MINUS:
+ op->mem_fetch_before = atomic_fetch_sub_optab;
+ op->mem_fetch_after = atomic_sub_fetch_optab;
+ op->mem_no_result = atomic_sub_optab;
+ op->fetch_before = sync_old_sub_optab;
+ op->fetch_after = sync_new_sub_optab;
+ op->no_result = sync_sub_optab;
+ op->reverse_code = PLUS;
break;
case XOR:
- old_code = direct_optab_handler (sync_old_xor_optab, mode);
- new_code = direct_optab_handler (sync_new_xor_optab, mode);
+ op->mem_fetch_before = atomic_fetch_xor_optab;
+ op->mem_fetch_after = atomic_xor_fetch_optab;
+ op->mem_no_result = atomic_xor_optab;
+ op->fetch_before = sync_old_xor_optab;
+ op->fetch_after = sync_new_xor_optab;
+ op->no_result = sync_xor_optab;
+ op->reverse_code = XOR;
break;
case AND:
- old_code = direct_optab_handler (sync_old_and_optab, mode);
- new_code = direct_optab_handler (sync_new_and_optab, mode);
+ op->mem_fetch_before = atomic_fetch_and_optab;
+ op->mem_fetch_after = atomic_and_fetch_optab;
+ op->mem_no_result = atomic_and_optab;
+ op->fetch_before = sync_old_and_optab;
+ op->fetch_after = sync_new_and_optab;
+ op->no_result = sync_and_optab;
+ op->reverse_code = UNKNOWN;
+ break;
+ case IOR:
+ op->mem_fetch_before = atomic_fetch_or_optab;
+ op->mem_fetch_after = atomic_or_fetch_optab;
+ op->mem_no_result = atomic_or_optab;
+ op->fetch_before = sync_old_ior_optab;
+ op->fetch_after = sync_new_ior_optab;
+ op->no_result = sync_ior_optab;
+ op->reverse_code = UNKNOWN;
break;
case NOT:
- old_code = direct_optab_handler (sync_old_nand_optab, mode);
- new_code = direct_optab_handler (sync_new_nand_optab, mode);
+ op->mem_fetch_before = atomic_fetch_nand_optab;
+ op->mem_fetch_after = atomic_nand_fetch_optab;
+ op->mem_no_result = atomic_nand_optab;
+ op->fetch_before = sync_old_nand_optab;
+ op->fetch_after = sync_new_nand_optab;
+ op->no_result = sync_nand_optab;
+ op->reverse_code = UNKNOWN;
break;
+ default:
+ gcc_unreachable ();
+ }
+}
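+
+/* For example, for CODE == PLUS the routine above selects
+ atomic_fetch_add_optab, atomic_add_fetch_optab and atomic_add_optab for
+ the __atomic patterns, sync_old_add_optab, sync_new_add_optab and
+ sync_add_optab for the older __sync patterns, and records MINUS as the
+ reverse code, so a fetch_add result can be recovered from an add_fetch
+ pattern by subtracting VAL again. */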
- case MINUS:
- old_code = direct_optab_handler (sync_old_sub_optab, mode);
- new_code = direct_optab_handler (sync_new_sub_optab, mode);
- if ((old_code == CODE_FOR_nothing && new_code == CODE_FOR_nothing)
- || CONST_INT_P (val))
- {
- old_code = direct_optab_handler (sync_old_add_optab, mode);
- new_code = direct_optab_handler (sync_new_add_optab, mode);
- if (old_code != CODE_FOR_nothing || new_code != CODE_FOR_nothing)
- {
- val = expand_simple_unop (mode, NEG, val, NULL_RTX, 1);
- code = PLUS;
- }
+/* See if there is a better way to implement the operation "*MEM CODE VAL"
+ using memory order MODEL. If AFTER is true the operation needs to return
+ the value of *MEM after the operation, otherwise the previous value.
+ TARGET is an optional place to store the result; if it is const0_rtx the
+ result is unused.
+ Return the result if there is a better sequence, otherwise NULL_RTX. */
+
+static rtx
+maybe_optimize_fetch_op (rtx target, rtx mem, rtx val, enum rtx_code code,
+ enum memmodel model, bool after)
+{
+ /* If the value is prefetched, or not used, it may be possible to replace
+ the sequence with a native exchange operation. */
+ if (!after || target == const0_rtx)
+ {
+ /* fetch_and (&x, 0, m) can be replaced with exchange (&x, 0, m). */
+ if (code == AND && val == const0_rtx)
+ {
+ if (target == const0_rtx)
+ target = gen_reg_rtx (GET_MODE (mem));
+ return maybe_emit_atomic_exchange (target, mem, val, model);
}
- break;
- default:
- gcc_unreachable ();
+ /* fetch_or (&x, -1, m) can be replaced with exchange (&x, -1, m). */
+ if (code == IOR && val == constm1_rtx)
+ {
+ if (target == const0_rtx)
+ target = gen_reg_rtx (GET_MODE (mem));
+ return maybe_emit_atomic_exchange (target, mem, val, model);
+ }
}
- /* If the target does supports the proper new/old operation, great. But
- if we only support the opposite old/new operation, check to see if we
- can compensate. In the case in which the old value is supported, then
- we can always perform the operation again with normal arithmetic. In
- the case in which the new value is supported, then we can only handle
- this in the case the operation is reversible. */
- compensate = false;
- if (after)
+ return NULL_RTX;
+}
+
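+/* For illustration (hypothetical source): since an atomic exchange also
+ returns the previous value,
+
+ __atomic_fetch_and (&x, 0, __ATOMIC_SEQ_CST);
+
+ can be rewritten by the routine below as __atomic_exchange_n (&x, 0, ...),
+ and __atomic_fetch_or (&x, -1, ...) likewise as an exchange storing -1. */
+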
+/* Try to emit an instruction for a specific operation variation.
+ OPTAB contains the OP functions.
+ TARGET is an optional place to return the result. const0_rtx means unused.
+ MEM is the memory location to operate on.
+ VAL is the value to use in the operation.
+ USE_MEMMODEL is TRUE if the variation with a memory model should be tried.
+ MODEL is the memory model, if used.
+ AFTER is true if the returned result is the value after the operation. */
+
+static rtx
+maybe_emit_op (const struct atomic_op_functions *optab, rtx target, rtx mem,
+ rtx val, bool use_memmodel, enum memmodel model, bool after)
+{
+ enum machine_mode mode = GET_MODE (mem);
+ struct expand_operand ops[4];
+ enum insn_code icode;
+ int op_counter = 0;
+ int num_ops;
+
+ /* Check to see if there is a result returned. */
+ if (target == const0_rtx)
{
- icode = new_code;
- if (icode == CODE_FOR_nothing)
- {
- icode = old_code;
- if (icode != CODE_FOR_nothing)
- compensate = true;
+ if (use_memmodel)
+ {
+ icode = direct_optab_handler (optab->mem_no_result, mode);
+ create_integer_operand (&ops[2], model);
+ num_ops = 3;
+ }
+ else
+ {
+ icode = direct_optab_handler (optab->no_result, mode);
+ num_ops = 2;
}
}
+ /* Otherwise, we need to generate a result. */
else
{
- icode = old_code;
- if (icode == CODE_FOR_nothing
- && (code == PLUS || code == MINUS || code == XOR))
+ if (use_memmodel)
+ {
+ icode = direct_optab_handler (after ? optab->mem_fetch_after
+ : optab->mem_fetch_before, mode);
+ create_integer_operand (&ops[3], model);
+ num_ops = 4;
+ }
+ else
{
- icode = new_code;
- if (icode != CODE_FOR_nothing)
- compensate = true;
+ icode = optab_handler (after ? optab->fetch_after
+ : optab->fetch_before, mode);
+ num_ops = 3;
}
+ create_output_operand (&ops[op_counter++], target, mode);
}
+ if (icode == CODE_FOR_nothing)
+ return NULL_RTX;
- /* If we found something supported, great. */
- if (icode != CODE_FOR_nothing)
+ create_fixed_operand (&ops[op_counter++], mem);
+ /* VAL may have been promoted to a wider mode. Shrink it if so. */
+ create_convert_operand_to (&ops[op_counter++], val, mode, true);
+
+ if (maybe_expand_insn (icode, num_ops, ops))
+ return (target == const0_rtx ? const0_rtx : ops[0].value);
+
+ return NULL_RTX;
+}
+
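+/* As laid out above, the operands passed to the named patterns are, in
+ order: the output (when a result is produced), the memory location, the
+ value, and, for the __atomic variants, the memory model; e.g. an
+ atomic_fetch_add<mode> expander sees (result, mem, val, model) while the
+ older sync_old_add<mode> sees only (result, mem, val). */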
+
+/* This function expands an atomic fetch_OP or OP_fetch operation:
+ atomically fetch MEM, perform the operation with VAL and store the result
+ back to MEM.
+ TARGET is an optional place to store the return value; const0_rtx indicates
+ the result is unused.
+ CODE is the operation being performed (OP).
+ MEMMODEL is the memory model variant to use.
+ AFTER is true to return the result of the operation (OP_fetch);
+ AFTER is false to return the value before the operation (fetch_OP). */
+rtx
+expand_atomic_fetch_op (rtx target, rtx mem, rtx val, enum rtx_code code,
+ enum memmodel model, bool after)
+{
+ enum machine_mode mode = GET_MODE (mem);
+ struct atomic_op_functions optab;
+ rtx result;
+ bool unused_result = (target == const0_rtx);
+
+ get_atomic_op_for_code (&optab, code);
+
+ /* Check to see if there are any better instructions. */
+ result = maybe_optimize_fetch_op (target, mem, val, code, model, after);
+ if (result)
+ return result;
+
+ /* Check for the case where the result isn't used and try those patterns. */
+ if (unused_result)
{
- struct expand_operand ops[3];
+ /* Try the memory model variant first. */
+ result = maybe_emit_op (&optab, target, mem, val, true, model, true);
+ if (result)
+ return result;
- create_output_operand (&ops[0], target, mode);
- create_fixed_operand (&ops[1], mem);
- /* VAL may have been promoted to a wider mode. Shrink it if so. */
- create_convert_operand_to (&ops[2], val, mode, true);
- if (maybe_expand_insn (icode, 3, ops))
+ /* Next try the old-style patterns without a memory model. */
+ result = maybe_emit_op (&optab, target, mem, val, false, model, true);
+ if (result)
+ return result;
+
+ /* There is no no-result pattern, so try patterns with a result. */
+ target = NULL_RTX;
+ }
+
+ /* Try the __atomic version. */
+ result = maybe_emit_op (&optab, target, mem, val, true, model, after);
+ if (result)
+ return result;
+
+ /* Try the older __sync version. */
+ result = maybe_emit_op (&optab, target, mem, val, false, model, after);
+ if (result)
+ return result;
+
+ /* If the fetch value can be calculated from the other variation of fetch,
+ try that operation. */
+ if (after || unused_result || optab.reverse_code != UNKNOWN)
+ {
+ /* Try the __atomic version, then the older __sync version. */
+ result = maybe_emit_op (&optab, target, mem, val, true, model, !after);
+ if (!result)
+ result = maybe_emit_op (&optab, target, mem, val, false, model, !after);
+
+ if (result)
{
- target = ops[0].value;
- val = ops[2].value;
- /* If we need to compensate for using an operation with the
- wrong return value, do so now. */
- if (compensate)
+ /* If the result isn't used, no need to do compensation code. */
+ if (unused_result)
+ return result;
+
+ /* Issue compensation code. Fetch_after == fetch_before OP val.
+ Fetch_before == fetch_after REVERSE_OP val. */
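+ /* E.g. an add_fetch result is the fetch_add result plus VAL, and a
+ fetch_sub result is the sub_fetch result plus VAL (PLUS being the
+ recorded reverse code of MINUS). */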
+ if (!after)
+ code = optab.reverse_code;
+ if (code == NOT)
{
- if (!after)
- {
- if (code == PLUS)
- code = MINUS;
- else if (code == MINUS)
- code = PLUS;
- }
-
- if (code == NOT)
- {
- target = expand_simple_binop (mode, AND, target, val,
- NULL_RTX, true,
- OPTAB_LIB_WIDEN);
- target = expand_simple_unop (mode, code, target,
- NULL_RTX, true);
- }
- else
- target = expand_simple_binop (mode, code, target, val,
- NULL_RTX, true,
- OPTAB_LIB_WIDEN);
+ result = expand_simple_binop (mode, AND, result, val, NULL_RTX,
+ true, OPTAB_LIB_WIDEN);
+ result = expand_simple_unop (mode, NOT, result, target, true);
}
+ else
+ result = expand_simple_binop (mode, code, result, val, target,
+ true, OPTAB_LIB_WIDEN);
+ return result;
+ }
+ }
- return target;
+ /* Try the __sync libcalls only if we can't do compare-and-swap inline. */
+ if (!can_compare_and_swap_p (mode, false))
+ {
+ rtx libfunc;
+ bool fixup = false;
+
+ libfunc = optab_libfunc (after ? optab.fetch_after
+ : optab.fetch_before, mode);
+ if (libfunc == NULL
+ && (after || unused_result || optab.reverse_code != UNKNOWN))
+ {
+ fixup = true;
+ if (!after)
+ code = optab.reverse_code;
+ libfunc = optab_libfunc (after ? optab.fetch_before
+ : optab.fetch_after, mode);
+ }
+ if (libfunc != NULL)
+ {
+ rtx addr = convert_memory_address (ptr_mode, XEXP (mem, 0));
+ result = emit_library_call_value (libfunc, NULL, LCT_NORMAL, mode,
+ 2, addr, ptr_mode, val, mode);
+
+ if (!unused_result && fixup)
+ result = expand_simple_binop (mode, code, result, val, target,
+ true, OPTAB_LIB_WIDEN);
+ return result;
}
}
- /* Failing that, generate a compare-and-swap loop in which we perform the
- operation with normal arithmetic instructions. */
- if (direct_optab_handler (sync_compare_and_swap_optab, mode)
- != CODE_FOR_nothing)
+ /* If nothing else has succeeded, default to a compare and swap loop. */
+ if (can_compare_and_swap_p (mode, true))
{
+ rtx insn;
rtx t0 = gen_reg_rtx (mode), t1;
- if (!target || !register_operand (target, mode))
- target = gen_reg_rtx (mode);
-
start_sequence ();
- if (!after)
- emit_move_insn (target, t0);
+ /* If the result is used, get a register for it. */
+ if (!unused_result)
+ {
+ if (!target || !register_operand (target, mode))
+ target = gen_reg_rtx (mode);
+ /* If fetch_before, copy the value now. */
+ if (!after)
+ emit_move_insn (target, t0);
+ }
+ else
+ target = const0_rtx;
+
t1 = t0;
if (code == NOT)
- {
+ {
t1 = expand_simple_binop (mode, AND, t1, val, NULL_RTX,
true, OPTAB_LIB_WIDEN);
t1 = expand_simple_unop (mode, code, t1, NULL_RTX, true);
}
else
- t1 = expand_simple_binop (mode, code, t1, val, NULL_RTX,
- true, OPTAB_LIB_WIDEN);
- if (after)
- emit_move_insn (target, t1);
+ t1 = expand_simple_binop (mode, code, t1, val, NULL_RTX, true,
+ OPTAB_LIB_WIDEN);
+ /* For after, copy the value now. */
+ if (!unused_result && after)
+ emit_move_insn (target, t1);
insn = get_insns ();
end_sequence ();
if (t1 != NULL && expand_compare_and_swap_loop (mem, t0, t1, insn))
- return target;
- }
-
- return NULL_RTX;
-}
-
-/* This function expands a test-and-set operation. Ideally we atomically
- store VAL in MEM and return the previous value in MEM. Some targets
- may not support this operation and only support VAL with the constant 1;
- in this case while the return value will be 0/1, but the exact value
- stored in MEM is target defined. TARGET is an option place to stick
- the return value. */
-
-rtx
-expand_sync_lock_test_and_set (rtx mem, rtx val, rtx target)
-{
- enum machine_mode mode = GET_MODE (mem);
- enum insn_code icode;
-
- /* If the target supports the test-and-set directly, great. */
- icode = direct_optab_handler (sync_lock_test_and_set_optab, mode);
- if (icode != CODE_FOR_nothing)
- {
- struct expand_operand ops[3];
-
- create_output_operand (&ops[0], target, mode);
- create_fixed_operand (&ops[1], mem);
- /* VAL may have been promoted to a wider mode. Shrink it if so. */
- create_convert_operand_to (&ops[2], val, mode, true);
- if (maybe_expand_insn (icode, 3, ops))
- return ops[0].value;
- }
-
- /* Otherwise, use a compare-and-swap loop for the exchange. */
- if (direct_optab_handler (sync_compare_and_swap_optab, mode)
- != CODE_FOR_nothing)
- {
- if (!target || !register_operand (target, mode))
- target = gen_reg_rtx (mode);
- if (GET_MODE (val) != VOIDmode && GET_MODE (val) != mode)
- val = convert_modes (mode, GET_MODE (val), val, 1);
- if (expand_compare_and_swap_loop (mem, target, val, NULL_RTX))
- return target;
+ return target;
}
return NULL_RTX;
return true;
/* If the operand is a memory whose address has no side effects,
- try forcing the address into a register. The check for side
- effects is important because force_reg cannot handle things
- like auto-modified addresses. */
- if (insn_data[(int) icode].operand[opno].allows_mem
- && MEM_P (op->value)
- && !side_effects_p (XEXP (op->value, 0)))
- {
- rtx addr, mem, last;
-
- last = get_last_insn ();
- addr = force_reg (Pmode, XEXP (op->value, 0));
- mem = replace_equiv_address (op->value, addr);
- if (insn_operand_matches (icode, opno, mem))
+ try forcing the address into a non-virtual pseudo register.
+ The check for side effects is important because copy_to_mode_reg
+ cannot handle things like auto-modified addresses. */
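+ /* An auto-modified address is, e.g., the POST_INC form a hypothetical
+ "*p++" operand would use; side_effects_p flags such addresses and we
+ leave them untouched rather than copying them into a register. */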
+ if (insn_data[(int) icode].operand[opno].allows_mem && MEM_P (op->value))
+ {
+ rtx addr, mem;
+
+ mem = op->value;
+ addr = XEXP (mem, 0);
+ if (!(REG_P (addr) && REGNO (addr) > LAST_VIRTUAL_REGISTER)
+ && !side_effects_p (addr))
{
- op->value = mem;
- return true;
+ rtx last;
+ enum machine_mode mode;
+
+ last = get_last_insn ();
+ mode = targetm.addr_space.address_mode (MEM_ADDR_SPACE (mem));
+ mem = replace_equiv_address (mem, copy_to_mode_reg (mode, addr));
+ if (insn_operand_matches (icode, opno, mem))
+ {
+ op->value = mem;
+ return true;
+ }
+ delete_insns_since (last);
}
- delete_insns_since (last);
}
return false;
case 6:
return GEN_FCN (icode) (ops[0].value, ops[1].value, ops[2].value,
ops[3].value, ops[4].value, ops[5].value);
+ case 7:
+ return GEN_FCN (icode) (ops[0].value, ops[1].value, ops[2].value,
+ ops[3].value, ops[4].value, ops[5].value,
+ ops[6].value);
+ case 8:
+ return GEN_FCN (icode) (ops[0].value, ops[1].value, ops[2].value,
+ ops[3].value, ops[4].value, ops[5].value,
+ ops[6].value, ops[7].value);
}
gcc_unreachable ();
}